Skip to content

Commit

Permalink
Add new API to separate loading dict and update pinyin dictionary.
Browse files Browse the repository at this point in the history
  • Loading branch information
wengxt committed Apr 6, 2024
1 parent 2e76aff commit 53d80b9
Show file tree
Hide file tree
Showing 4 changed files with 99 additions and 61 deletions.
6 changes: 6 additions & 0 deletions src/libime/core/triedictionary.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,12 @@ const TrieDictionary::TrieType *TrieDictionary::trie(size_t idx) const {
return &d->tries_[idx];
}

/**
 * Replace the trie stored at slot idx and notify listeners.
 *
 * The new trie is moved into place, then dictionaryChanged is emitted with
 * the same idx. No bounds check is performed in this function; idx is handed
 * straight to mutableTrie().
 *
 * @param idx slot to overwrite.
 * @param trie replacement trie; taken by value and moved from.
 */
void TrieDictionary::setTrie(size_t idx, TrieType trie) {
    // The private data is only reached through mutableTrie(), so no FCITX_D()
    // is needed here (it would just declare an unused local).
    *mutableTrie(idx) = std::move(trie);
    emit<TrieDictionary::dictionaryChanged>(idx);
}

TrieDictionary::TrieType *TrieDictionary::mutableTrie(size_t idx) {
FCITX_D();
return &d->tries_[idx];
Expand Down
16 changes: 14 additions & 2 deletions src/libime/core/triedictionary.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,19 @@ class LIBIMECORE_EXPORT TrieDictionary : public Dictionary,
// Clear dictionary.
void clear(size_t idx);

const DATrie<float> *trie(size_t idx) const;
const TrieType *trie(size_t idx) const;

/**
 * Replace the trie at the given index with one built externally.
 *
 * The data inside the trie is not validated; a subclass may expect the keys
 * and values to be organized in a particular way. The dictionaryChanged
 * signal is emitted for idx once the trie has been replaced.
 *
 * @param idx index of the trie to replace; must be within [0, dictSize()).
 * @param trie the new trie.
 * @since 1.1.7
 */
void setTrie(size_t idx, TrieType trie);

// Total number of dictionaries.
size_t dictSize() const;
Expand All @@ -54,7 +66,7 @@ class LIBIMECORE_EXPORT TrieDictionary : public Dictionary,
FCITX_DECLARE_SIGNAL(TrieDictionary, dictSizeChanged, void(size_t));

protected:
DATrie<float> *mutableTrie(size_t idx);
TrieType *mutableTrie(size_t idx);
void addWord(size_t idx, std::string_view key, float cost = 0.0f);
bool removeWord(size_t idx, std::string_view key);

Expand Down
128 changes: 69 additions & 59 deletions src/libime/pinyin/pinyindictionary.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,66 @@ size_t fuzzyFactor(PinyinFuzzyFlags flags) {
return factor;
}

/**
 * Build a pinyin trie from a text-format dictionary stream.
 *
 * Every line is trimmed and split on whitespace into "hanzi pinyin [prob]".
 * Lines that do not yield exactly two or three tokens are ignored; prob
 * defaults to 0 when it is omitted. The key stored in the trie is the result
 * of PinyinEncoder::encodeFullPinyinWithFlags(pinyin, VE_UE), followed by
 * pinyinHanziSep, followed by the bytes of hanzi; the value is prob.
 *
 * A line whose probability or pinyin cannot be parsed is logged and skipped,
 * so a single bad entry does not abort the whole load.
 *
 * @param in stream to read lines from.
 * @return trie holding every entry that was parsed successfully.
 */
PinyinDictionary::TrieType loadTextImpl(std::istream &in) {
    PinyinDictionary::TrieType trie;

    std::string buf;
    auto isSpaceCheck = boost::is_any_of(" \n\t\r\v\f");
    while (std::getline(in, buf)) {
        boost::trim_if(buf, isSpaceCheck);
        std::vector<std::string> tokens;
        boost::split(tokens, buf, isSpaceCheck);
        if (tokens.size() != 2 && tokens.size() != 3) {
            continue;
        }

        const std::string &hanzi = tokens[0];
        std::string_view pinyin = tokens[1];
        float prob = 0.0F;
        if (tokens.size() == 3) {
            try {
                prob = std::stof(tokens[2]);
            } catch (const std::logic_error &) {
                // std::stof signals bad input with std::invalid_argument or
                // std::out_of_range, both derived from std::logic_error.
                // Treat it like any other malformed line instead of letting
                // it escape and abort the load.
                LIBIME_ERROR()
                    << "Failed to parse line: " << buf << ", skipping.";
                continue;
            }
        }

        try {
            auto result = PinyinEncoder::encodeFullPinyinWithFlags(
                pinyin, PinyinFuzzyFlag::VE_UE);
            result.push_back(pinyinHanziSep);
            result.insert(result.end(), hanzi.begin(), hanzi.end());
            trie.set(result.data(), result.size(), prob);
        } catch (const std::invalid_argument &) {
            LIBIME_ERROR()
                << "Failed to parse line: " << buf << ", skipping.";
        }
    }
    return trie;
}

/**
 * Read a pinyin trie from a binary-format dictionary stream.
 *
 * The stream starts with a 32-bit magic followed by a 32-bit version.
 * Version 0x1 is loaded directly from the stream; pinyinBinaryFormatVersion
 * is loaded through readZSTDCompressed().
 *
 * @param in stream positioned at the start of the binary dictionary.
 * @return the trie read from the stream.
 * @throw std::invalid_argument if the magic or the version is not recognized.
 */
PinyinDictionary::TrieType loadBinaryImpl(std::istream &in) {
    uint32_t magic = 0;
    throw_if_io_fail(unmarshall(in, magic));
    if (magic != pinyinBinaryFormatMagic) {
        throw std::invalid_argument("Invalid pinyin magic.");
    }

    uint32_t version = 0;
    throw_if_io_fail(unmarshall(in, version));

    PinyinDictionary::TrieType result;
    if (version == 0x1) {
        // Legacy layout: trie payload follows the header as-is.
        result.load(in);
    } else if (version == pinyinBinaryFormatVersion) {
        // Current layout: payload is read through the ZSTD helper.
        readZSTDCompressed(in, [&result](std::istream &payload) {
            result.load(payload);
        });
    } else {
        throw std::invalid_argument("Invalid pinyin version.");
    }
    return result;
}

} // namespace

class PinyinMatchContext {
Expand Down Expand Up @@ -747,77 +807,27 @@ void PinyinDictionary::load(size_t idx, const char *filename,

void PinyinDictionary::load(size_t idx, std::istream &in,
PinyinDictFormat format) {
setTrie(idx, load(in, format));
}

PinyinDictionary::TrieType PinyinDictionary::load(std::istream &in,
PinyinDictFormat format) {
switch (format) {
case PinyinDictFormat::Text:
loadText(idx, in);
break;
return loadTextImpl(in);
case PinyinDictFormat::Binary:
loadBinary(idx, in);
break;
return loadBinaryImpl(in);
default:
throw std::invalid_argument("invalid format type");
}
emit<PinyinDictionary::dictionaryChanged>(idx);
}

void PinyinDictionary::loadText(size_t idx, std::istream &in) {
DATrie<float> trie;

std::string buf;
auto isSpaceCheck = boost::is_any_of(" \n\t\r\v\f");
while (!in.eof()) {
if (!std::getline(in, buf)) {
break;
}

boost::trim_if(buf, isSpaceCheck);
std::vector<std::string> tokens;
boost::split(tokens, buf, isSpaceCheck);
if (tokens.size() == 3 || tokens.size() == 2) {
const std::string &hanzi = tokens[0];
std::string_view pinyin = tokens[1];
float prob = 0.0F;
if (tokens.size() == 3) {
prob = std::stof(tokens[2]);
}

try {
auto result = PinyinEncoder::encodeFullPinyinWithFlags(
pinyin, PinyinFuzzyFlag::VE_UE);
result.push_back(pinyinHanziSep);
result.insert(result.end(), hanzi.begin(), hanzi.end());
trie.set(result.data(), result.size(), prob);
} catch (const std::invalid_argument &e) {
LIBIME_ERROR()
<< "Failed to parse line: " << buf << ", skipping.";
}
}
}
*mutableTrie(idx) = std::move(trie);
*mutableTrie(idx) = loadTextImpl(in);
}

void PinyinDictionary::loadBinary(size_t idx, std::istream &in) {
DATrie<float> trie;
uint32_t magic = 0;
uint32_t version = 0;
throw_if_io_fail(unmarshall(in, magic));
if (magic != pinyinBinaryFormatMagic) {
throw std::invalid_argument("Invalid pinyin magic.");
}
throw_if_io_fail(unmarshall(in, version));
switch (version) {
case 0x1:
trie.load(in);
break;
case pinyinBinaryFormatVersion:
readZSTDCompressed(
in, [&trie](std::istream &compressIn) { trie.load(compressIn); });
break;
default:
throw std::invalid_argument("Invalid pinyin version.");
break;
}
*mutableTrie(idx) = std::move(trie);
*mutableTrie(idx) = loadBinaryImpl(in);
}

void PinyinDictionary::save(size_t idx, const char *filename,
Expand Down
10 changes: 10 additions & 0 deletions src/libime/pinyin/pinyindictionary.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,16 @@ class LIBIMEPINYIN_EXPORT PinyinDictionary : public TrieDictionary {

void setFlags(size_t idx, PinyinDictFlags flags);

/**
 * Load a dictionary from a stream into a standalone trie.
 *
 * Both PinyinDictFormat::Text and PinyinDictFormat::Binary are accepted.
 * This is a static function, so no dictionary object is modified; the
 * returned trie can be installed afterwards with setTrie.
 *
 * @param in input stream
 * @param format dict format.
 * @return the trie built from the stream.
 * @throw std::invalid_argument if format is not a known format.
 * @see TrieDictionary::setTrie
 * @since 1.1.7
 */
static TrieType load(std::istream &in, PinyinDictFormat format);

using dictionaryChanged = TrieDictionary::dictionaryChanged;

protected:
Expand Down

0 comments on commit 53d80b9

Please sign in to comment.