-
Notifications
You must be signed in to change notification settings - Fork 251
Custom space symbol #83
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
c326265
ddf4da8
49cb0ff
bb6851d
b1294a6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -155,7 +155,8 @@ bool add_word_to_dictionary( | |
std::vector<int> int_word; | ||
|
||
for (auto &c : characters) { | ||
if (c == " ") { | ||
// if (c == " ") { | ||
if (c == "|") { // Gideon: replaced the space symbol " " => "|" | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This can't be hardcoded. You'll have to parameterize the space character. Though, looking at it more closely, it looks like you could probably just do a lookup based on the |
||
int_word.push_back(SPACE_ID); | ||
} else { | ||
auto int_c = char_map.find(c); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,7 +16,8 @@ using namespace lm::ngram; | |
Scorer::Scorer(double alpha, | ||
double beta, | ||
const std::string& lm_path, | ||
const std::vector<std::string>& vocab_list) { | ||
const std::vector<std::string>& vocab_list, | ||
const std::string &space_symbol) { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you please set a default arg here? |
||
this->alpha = alpha; | ||
this->beta = beta; | ||
|
||
|
@@ -28,7 +29,7 @@ Scorer::Scorer(double alpha, | |
dict_size_ = 0; | ||
SPACE_ID_ = -1; | ||
|
||
setup(lm_path, vocab_list); | ||
setup(lm_path, vocab_list, space_symbol); | ||
} | ||
|
||
Scorer::~Scorer() { | ||
|
@@ -41,11 +42,12 @@ Scorer::~Scorer() { | |
} | ||
|
||
void Scorer::setup(const std::string& lm_path, | ||
const std::vector<std::string>& vocab_list) { | ||
const std::vector<std::string>& vocab_list, | ||
const std::string &space_symbol) { | ||
// load language model | ||
load_lm(lm_path); | ||
// set char map for scorer | ||
set_char_map(vocab_list); | ||
set_char_map(vocab_list, space_symbol); | ||
// fill the dictionary for FST | ||
if (!is_character_based()) { | ||
fill_dictionary(true); | ||
|
@@ -79,10 +81,14 @@ double Scorer::get_log_cond_prob(const std::vector<std::string>& words) { | |
model->NullContextWrite(&state); | ||
for (size_t i = 0; i < words.size(); ++i) { | ||
lm::WordIndex word_index = model->BaseVocabulary().Index(words[i]); | ||
|
||
// encounter OOV | ||
if (word_index == 0) { | ||
return OOV_SCORE; | ||
} | ||
// Gideon: Alternatively, comment out above (but in fact, it doesn't seem to work better) | ||
// Rather than using hard-code OOV score, assign the language model <UNK> probability to the OOV words. | ||
// See: https://github.com/parlance/ctcdecode/issues/62 | ||
cond_prob = model->BaseScore(&state, word_index, &out_state); | ||
tmp_state = state; | ||
state = out_state; | ||
|
@@ -132,26 +138,28 @@ std::string Scorer::vec2str(const std::vector<int>& input) { | |
return word; | ||
} | ||
|
||
std::vector<std::string> Scorer::split_labels(const std::vector<int>& labels) { | ||
std::vector<std::string> Scorer::split_labels(const std::vector<int>& labels, const std::string &space_symbol) { | ||
if (labels.empty()) return {}; | ||
|
||
std::string s = vec2str(labels); | ||
std::vector<std::string> words; | ||
if (is_character_based_) { | ||
words = split_utf8_str(s); | ||
} else { | ||
words = split_str(s, " "); | ||
// words = split_str(s, " "); | ||
words = split_str(s, space_symbol); //Gideon: replaced the space character from " " to a custom string | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Delete dead code and comment. |
||
} | ||
return words; | ||
} | ||
|
||
void Scorer::set_char_map(const std::vector<std::string>& char_list) { | ||
void Scorer::set_char_map(const std::vector<std::string>& char_list, const std::string &space_symbol) { | ||
char_list_ = char_list; | ||
char_map_.clear(); | ||
|
||
for (size_t i = 0; i < char_list_.size(); i++) { | ||
if (char_list_[i] == " ") { | ||
SPACE_ID_ = i; | ||
//if (char_list_[i] == " ") { | ||
if (char_list_[i] == space_symbol) { //Gideon: replaced the space character from " " to a custom string | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Delete dead code and comment. |
||
SPACE_ID_ = i; | ||
} | ||
// The initial state of FST is state 0, hence the index of chars in | ||
// the FST should start from 1 to avoid the conflict with the initial | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Dead code tells no lies. Please remove this comment and the line of code you commented out below (line 41). That's what git history is for.