From 53d80b9e4914e773dfb87c4dbb6383927122337a Mon Sep 17 00:00:00 2001 From: Weng Xuetian Date: Fri, 5 Apr 2024 23:03:12 -0700 Subject: [PATCH] Add new API to separate loading a dict from updating the pinyin dictionary. --- src/libime/core/triedictionary.cpp | 6 ++ src/libime/core/triedictionary.h | 16 +++- src/libime/pinyin/pinyindictionary.cpp | 128 +++++++++++++------------ src/libime/pinyin/pinyindictionary.h | 10 ++ 4 files changed, 99 insertions(+), 61 deletions(-) diff --git a/src/libime/core/triedictionary.cpp b/src/libime/core/triedictionary.cpp index e1c51712..eaad4ea4 100644 --- a/src/libime/core/triedictionary.cpp +++ b/src/libime/core/triedictionary.cpp @@ -73,6 +73,12 @@ const TrieDictionary::TrieType *TrieDictionary::trie(size_t idx) const { return &d->tries_[idx]; } +void TrieDictionary::setTrie(size_t idx, TrieType trie) { + FCITX_D(); + *mutableTrie(idx) = std::move(trie); + emit(idx); +} + TrieDictionary::TrieType *TrieDictionary::mutableTrie(size_t idx) { FCITX_D(); return &d->tries_[idx]; diff --git a/src/libime/core/triedictionary.h b/src/libime/core/triedictionary.h index 80ed0dc7..a8f359cc 100644 --- a/src/libime/core/triedictionary.h +++ b/src/libime/core/triedictionary.h @@ -45,7 +45,19 @@ class LIBIMECORE_EXPORT TrieDictionary : public Dictionary, // Clear dictionary. void clear(size_t idx); - const DATrie *trie(size_t idx) const; + const TrieType *trie(size_t idx) const; + + /** + * Set trie from external source. + * + * There is no validation on the data within it; a subclass may expect + * the keys and values to be organized in a certain way. + * + * @param idx the index, which must be within [0, dictSize()) + * @param trie the new trie. + * @since 1.1.7 + */ + void setTrie(size_t idx, TrieType trie); // Total number to dictionary. 
size_t dictSize() const; @@ -54,7 +66,7 @@ class LIBIMECORE_EXPORT TrieDictionary : public Dictionary, FCITX_DECLARE_SIGNAL(TrieDictionary, dictSizeChanged, void(size_t)); protected: - DATrie *mutableTrie(size_t idx); + TrieType *mutableTrie(size_t idx); void addWord(size_t idx, std::string_view key, float cost = 0.0f); bool removeWord(size_t idx, std::string_view key); diff --git a/src/libime/pinyin/pinyindictionary.cpp b/src/libime/pinyin/pinyindictionary.cpp index 9a183aa2..a0e08e31 100644 --- a/src/libime/pinyin/pinyindictionary.cpp +++ b/src/libime/pinyin/pinyindictionary.cpp @@ -202,6 +202,66 @@ size_t fuzzyFactor(PinyinFuzzyFlags flags) { return factor; } +PinyinDictionary::TrieType loadTextImpl(std::istream &in) { + PinyinDictionary::TrieType trie; + + std::string buf; + auto isSpaceCheck = boost::is_any_of(" \n\t\r\v\f"); + while (!in.eof()) { + if (!std::getline(in, buf)) { + break; + } + + boost::trim_if(buf, isSpaceCheck); + std::vector tokens; + boost::split(tokens, buf, isSpaceCheck); + if (tokens.size() == 3 || tokens.size() == 2) { + const std::string &hanzi = tokens[0]; + std::string_view pinyin = tokens[1]; + float prob = 0.0F; + if (tokens.size() == 3) { + prob = std::stof(tokens[2]); + } + + try { + auto result = PinyinEncoder::encodeFullPinyinWithFlags( + pinyin, PinyinFuzzyFlag::VE_UE); + result.push_back(pinyinHanziSep); + result.insert(result.end(), hanzi.begin(), hanzi.end()); + trie.set(result.data(), result.size(), prob); + } catch (const std::invalid_argument &e) { + LIBIME_ERROR() + << "Failed to parse line: " << buf << ", skipping."; + } + } + } + return trie; +} + +PinyinDictionary::TrieType loadBinaryImpl(std::istream &in) { + PinyinDictionary::TrieType trie; + uint32_t magic = 0; + uint32_t version = 0; + throw_if_io_fail(unmarshall(in, magic)); + if (magic != pinyinBinaryFormatMagic) { + throw std::invalid_argument("Invalid pinyin magic."); + } + throw_if_io_fail(unmarshall(in, version)); + switch (version) { + case 0x1: + 
trie.load(in); + break; + case pinyinBinaryFormatVersion: + readZSTDCompressed( + in, [&trie](std::istream &compressIn) { trie.load(compressIn); }); + break; + default: + throw std::invalid_argument("Invalid pinyin version."); + break; + } + return trie; +} + } // namespace class PinyinMatchContext { @@ -747,77 +807,27 @@ void PinyinDictionary::load(size_t idx, const char *filename, void PinyinDictionary::load(size_t idx, std::istream &in, PinyinDictFormat format) { + setTrie(idx, load(in, format)); +} + +PinyinDictionary::TrieType PinyinDictionary::load(std::istream &in, + PinyinDictFormat format) { switch (format) { case PinyinDictFormat::Text: - loadText(idx, in); - break; + return loadTextImpl(in); case PinyinDictFormat::Binary: - loadBinary(idx, in); - break; + return loadBinaryImpl(in); default: throw std::invalid_argument("invalid format type"); } - emit(idx); } void PinyinDictionary::loadText(size_t idx, std::istream &in) { - DATrie trie; - - std::string buf; - auto isSpaceCheck = boost::is_any_of(" \n\t\r\v\f"); - while (!in.eof()) { - if (!std::getline(in, buf)) { - break; - } - - boost::trim_if(buf, isSpaceCheck); - std::vector tokens; - boost::split(tokens, buf, isSpaceCheck); - if (tokens.size() == 3 || tokens.size() == 2) { - const std::string &hanzi = tokens[0]; - std::string_view pinyin = tokens[1]; - float prob = 0.0F; - if (tokens.size() == 3) { - prob = std::stof(tokens[2]); - } - - try { - auto result = PinyinEncoder::encodeFullPinyinWithFlags( - pinyin, PinyinFuzzyFlag::VE_UE); - result.push_back(pinyinHanziSep); - result.insert(result.end(), hanzi.begin(), hanzi.end()); - trie.set(result.data(), result.size(), prob); - } catch (const std::invalid_argument &e) { - LIBIME_ERROR() - << "Failed to parse line: " << buf << ", skipping."; - } - } - } - *mutableTrie(idx) = std::move(trie); + *mutableTrie(idx) = loadTextImpl(in); } void PinyinDictionary::loadBinary(size_t idx, std::istream &in) { - DATrie trie; - uint32_t magic = 0; - uint32_t version 
= 0; - throw_if_io_fail(unmarshall(in, magic)); - if (magic != pinyinBinaryFormatMagic) { - throw std::invalid_argument("Invalid pinyin magic."); - } - throw_if_io_fail(unmarshall(in, version)); - switch (version) { - case 0x1: - trie.load(in); - break; - case pinyinBinaryFormatVersion: - readZSTDCompressed( - in, [&trie](std::istream &compressIn) { trie.load(compressIn); }); - break; - default: - throw std::invalid_argument("Invalid pinyin version."); - break; - } - *mutableTrie(idx) = std::move(trie); + *mutableTrie(idx) = loadBinaryImpl(in); } void PinyinDictionary::save(size_t idx, const char *filename, diff --git a/src/libime/pinyin/pinyindictionary.h b/src/libime/pinyin/pinyindictionary.h index c87b7a58..158c5161 100644 --- a/src/libime/pinyin/pinyindictionary.h +++ b/src/libime/pinyin/pinyindictionary.h @@ -72,6 +72,16 @@ class LIBIMEPINYIN_EXPORT PinyinDictionary : public TrieDictionary { void setFlags(size_t idx, PinyinDictFlags flags); + /** + * Load dictionary data in the given format and return it as a trie. + * + * @param in input stream + * @param format dictionary format (text or binary). + * @see TrieDictionary::setTrie + * @since 1.1.7 + */ + static TrieType load(std::istream &in, PinyinDictFormat format); + using dictionaryChanged = TrieDictionary::dictionaryChanged; protected: