Skip to content

Custom space symbol #83

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions ctcdecode/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,22 @@

class CTCBeamDecoder(object):
def __init__(self, labels, model_path=None, alpha=0, beta=0, cutoff_top_n=40, cutoff_prob=1.0, beam_width=100,
num_processes=4, blank_id=0, log_probs_input=False):
num_processes=4, blank_id=0, log_probs_input=False,
space_symbol=" "):

self.cutoff_top_n = cutoff_top_n
self._beam_width = beam_width
self._scorer = None
self._num_processes = num_processes
self._labels = ''.join(labels).encode()
self._num_labels = len(labels)
self._blank_id = blank_id
self._space_symbol = space_symbol
self._log_probs = 1 if log_probs_input else 0

if model_path:
self._scorer = ctc_decode.paddle_get_scorer(alpha, beta, model_path.encode(), self._labels,
self._num_labels)
self._num_labels, self._space_symbol.encode())
self._cutoff_prob = cutoff_prob

def decode(self, probs, seq_lens=None):
Expand All @@ -33,10 +37,11 @@ def decode(self, probs, seq_lens=None):
if self._scorer:
ctc_decode.paddle_beam_decode_lm(probs, seq_lens, self._labels, self._num_labels, self._beam_width,
self._num_processes, self._cutoff_prob, self.cutoff_top_n, self._blank_id,
self._log_probs ,self._scorer, output, timesteps, scores, out_seq_len)
self._space_symbol.encode(),
self._log_probs, self._scorer, output, timesteps, scores, out_seq_len)
else:
ctc_decode.paddle_beam_decode(probs, seq_lens, self._labels, self._num_labels, self._beam_width, self._num_processes,
self._cutoff_prob, self.cutoff_top_n, self._blank_id, self._log_probs,
self._cutoff_prob, self.cutoff_top_n, self._blank_id, self._space_symbol.encode(), self._log_probs,
output, timesteps, scores, out_seq_len)

return output, scores, timesteps, out_seq_len
Expand Down
37 changes: 23 additions & 14 deletions ctcdecode/src/binding.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,13 +31,13 @@ int beam_decode(at::Tensor th_probs,
double cutoff_prob,
size_t cutoff_top_n,
size_t blank_id,
const std::string &space_symbol,
bool log_input,
void *scorer,
at::Tensor th_output,
at::Tensor th_timesteps,
at::Tensor th_scores,
at::Tensor th_out_length)
{
at::Tensor th_out_length){
std::vector<std::string> new_vocab;
utf8_to_utf8_char_vec(labels, new_vocab);
Scorer *ext_scorer = NULL;
Expand Down Expand Up @@ -66,7 +66,7 @@ int beam_decode(at::Tensor th_probs,
}

std::vector<std::vector<std::pair<double, Output>>> batch_results =
ctc_beam_search_decoder_batch(inputs, new_vocab, beam_size, num_processes, cutoff_prob, cutoff_top_n, blank_id, log_input, ext_scorer);
ctc_beam_search_decoder_batch(inputs, new_vocab, beam_size, num_processes, cutoff_prob, cutoff_top_n, blank_id, space_symbol, log_input, ext_scorer);
auto outputs_accessor = th_output.accessor<int, 3>();
auto timesteps_accessor = th_timesteps.accessor<int, 3>();
auto scores_accessor = th_scores.accessor<float, 2>();
Expand All @@ -89,7 +89,9 @@ int beam_decode(at::Tensor th_probs,
}
}
return 1;
}
}



int paddle_beam_decode(at::Tensor th_probs,
at::Tensor th_seq_lens,
Expand All @@ -100,15 +102,18 @@ int paddle_beam_decode(at::Tensor th_probs,
double cutoff_prob,
size_t cutoff_top_n,
size_t blank_id,
const char* space_symbol,
int log_input,
at::Tensor th_output,
at::Tensor th_timesteps,
at::Tensor th_scores,
at::Tensor th_out_length){

std::string space_symbol_string(space_symbol);
return beam_decode(th_probs, th_seq_lens, labels, vocab_size, beam_size, num_processes,
cutoff_prob, cutoff_top_n, blank_id, log_input, NULL, th_output, th_timesteps, th_scores, th_out_length);
}
cutoff_prob, cutoff_top_n, blank_id, space_symbol_string,
log_input, NULL, th_output, th_timesteps, th_scores, th_out_length);
}

int paddle_beam_decode_lm(at::Tensor th_probs,
at::Tensor th_seq_lens,
Expand All @@ -119,28 +124,32 @@ int paddle_beam_decode_lm(at::Tensor th_probs,
double cutoff_prob,
size_t cutoff_top_n,
size_t blank_id,
int log_input,
void *scorer,
const char* space_symbol,
bool log_input,
int *scorer,
at::Tensor th_output,
at::Tensor th_timesteps,
at::Tensor th_scores,
at::Tensor th_out_length){

std::string space_symbol_string(space_symbol);
return beam_decode(th_probs, th_seq_lens, labels, vocab_size, beam_size, num_processes,
cutoff_prob, cutoff_top_n, blank_id, log_input, scorer, th_output, th_timesteps, th_scores, th_out_length);
}

cutoff_prob, cutoff_top_n, blank_id, space_symbol_string, log_input, scorer, th_output, th_timesteps, th_scores, th_out_length);
}

void* paddle_get_scorer(double alpha,
double beta,
const char* lm_path,
const char* labels,
int vocab_size) {
int vocab_size,
const char* space_symbol) {
std::vector<std::string> new_vocab;
utf8_to_utf8_char_vec(labels, new_vocab);
Scorer* scorer = new Scorer(alpha, beta, lm_path, new_vocab);
// Create a string object from the char* space_symbol
std::string space_symbol_string(space_symbol);
Scorer* scorer = new Scorer(alpha, beta, lm_path, new_vocab, space_symbol_string);
return static_cast<void*>(scorer);
}
}

int is_character_based(void *scorer){
Scorer *ext_scorer = static_cast<Scorer *>(scorer);
Expand Down
7 changes: 5 additions & 2 deletions ctcdecode/src/binding.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ int paddle_beam_decode(THFloatTensor *th_probs,
double cutoff_prob,
size_t cutoff_top_n,
size_t blank_id,
const char* space_symbol,
int log_input,
THIntTensor *th_output,
THIntTensor *th_timesteps,
Expand All @@ -22,7 +23,8 @@ int paddle_beam_decode_lm(THFloatTensor *th_probs,
double cutoff_prob,
size_t cutoff_top_n,
size_t blank_id,
bool log_input,
const char* space_symbol,
bool log_input,
int *scorer,
THIntTensor *th_output,
THIntTensor *th_timesteps,
Expand All @@ -33,7 +35,8 @@ void* paddle_get_scorer(double alpha,
double beta,
const char* lm_path,
const char* labels,
int vocab_size);
int vocab_size,
const char* space_symbol);

int is_character_based(void *scorer);
size_t get_max_order(void *scorer);
Expand Down
9 changes: 7 additions & 2 deletions ctcdecode/src/ctc_beam_search_decoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ std::vector<std::pair<double, Output>> ctc_beam_search_decoder(
double cutoff_prob,
size_t cutoff_top_n,
size_t blank_id,
const std::string &space_symbol,
int log_input,
Scorer *ext_scorer) {
// dimension check
Expand All @@ -36,7 +37,9 @@ std::vector<std::pair<double, Output>> ctc_beam_search_decoder(
// size_t blank_id = vocabulary.size();

// assign space id
auto it = std::find(vocabulary.begin(), vocabulary.end(), " ");
// Changed by Gideon from the hard-coded space symbol " " to a custom symbol specified as an argument
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Dead code tells no lies. Please remove this comment and the line of code you commented out below (line 41). That's what git history is for.

auto it = std::find(vocabulary.begin(), vocabulary.end(), space_symbol);
//auto it = std::find(vocabulary.begin(), vocabulary.end(), " ");
int space_id = it - vocabulary.begin();
// if no space in vocabulary
if ((size_t)space_id >= vocabulary.size()) {
Expand Down Expand Up @@ -176,7 +179,7 @@ std::vector<std::pair<double, Output>> ctc_beam_search_decoder(
std::vector<int> timesteps;
prefixes[i]->get_path_vec(output, timesteps);
auto prefix_length = output.size();
auto words = ext_scorer->split_labels(output);
auto words = ext_scorer->split_labels(output, space_symbol);
// remove word insert
approx_ctc = approx_ctc - prefix_length * ext_scorer->beta;
// remove language model weight:
Expand All @@ -198,6 +201,7 @@ ctc_beam_search_decoder_batch(
double cutoff_prob,
size_t cutoff_top_n,
size_t blank_id,
const std::string &space_symbol,
int log_input,
Scorer *ext_scorer) {
VALID_CHECK_GT(num_processes, 0, "num_processes must be nonnegative!");
Expand All @@ -216,6 +220,7 @@ ctc_beam_search_decoder_batch(
cutoff_prob,
cutoff_top_n,
blank_id,
space_symbol,
log_input,
ext_scorer));
}
Expand Down
5 changes: 5 additions & 0 deletions ctcdecode/src/ctc_beam_search_decoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,16 @@
* in descending order.
*/

const std::string DEFAULT_SPACE_SYMBOL = std::string(" ");

std::vector<std::pair<double, Output>> ctc_beam_search_decoder(
const std::vector<std::vector<double>> &probs_seq,
const std::vector<std::string> &vocabulary,
size_t beam_size,
double cutoff_prob = 1.0,
size_t cutoff_top_n = 40,
size_t blank_id = 0,
const std::string &space_symbol = DEFAULT_SPACE_SYMBOL,
int log_input = 0,
Scorer *ext_scorer = nullptr);

Expand All @@ -45,6 +48,7 @@ std::vector<std::pair<double, Output>> ctc_beam_search_decoder(
* num_processes: Number of threads for beam search.
* cutoff_prob: Cutoff probability for pruning.
* cutoff_top_n: Cutoff number for pruning.
* space_symbol: The symbol used to indicate spaces, default is " ".
* ext_scorer: External scorer to evaluate a prefix, which consists of
* n-gram language model scoring and word insertion term.
* Default null, decoding the input sample without scorer.
Expand All @@ -61,6 +65,7 @@ ctc_beam_search_decoder_batch(
double cutoff_prob = 1.0,
size_t cutoff_top_n = 40,
size_t blank_id = 0,
const std::string &space_symbol = DEFAULT_SPACE_SYMBOL,
int log_input = 0,
Scorer *ext_scorer = nullptr);

Expand Down
3 changes: 2 additions & 1 deletion ctcdecode/src/decoder_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,8 @@ bool add_word_to_dictionary(
std::vector<int> int_word;

for (auto &c : characters) {
if (c == " ") {
// if (c == " ") {
if (c == "|") { // Gideon: replaced the space symbol " " => "|"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This can't be hardcoded. You'll have to parameterize the space character. Though, looking at it more closely, it looks like you could probably just do a lookup based on the SPACE_ID param...

int_word.push_back(SPACE_ID);
} else {
auto int_c = char_map.find(c);
Expand Down
26 changes: 17 additions & 9 deletions ctcdecode/src/scorer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@ using namespace lm::ngram;
Scorer::Scorer(double alpha,
double beta,
const std::string& lm_path,
const std::vector<std::string>& vocab_list) {
const std::vector<std::string>& vocab_list,
const std::string &space_symbol) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you please set a default arg here?

this->alpha = alpha;
this->beta = beta;

Expand All @@ -28,7 +29,7 @@ Scorer::Scorer(double alpha,
dict_size_ = 0;
SPACE_ID_ = -1;

setup(lm_path, vocab_list);
setup(lm_path, vocab_list, space_symbol);
}

Scorer::~Scorer() {
Expand All @@ -41,11 +42,12 @@ Scorer::~Scorer() {
}

void Scorer::setup(const std::string& lm_path,
const std::vector<std::string>& vocab_list) {
const std::vector<std::string>& vocab_list,
const std::string &space_symbol) {
// load language model
load_lm(lm_path);
// set char map for scorer
set_char_map(vocab_list);
set_char_map(vocab_list, space_symbol);
// fill the dictionary for FST
if (!is_character_based()) {
fill_dictionary(true);
Expand Down Expand Up @@ -79,10 +81,14 @@ double Scorer::get_log_cond_prob(const std::vector<std::string>& words) {
model->NullContextWrite(&state);
for (size_t i = 0; i < words.size(); ++i) {
lm::WordIndex word_index = model->BaseVocabulary().Index(words[i]);

// encounter OOV
if (word_index == 0) {
return OOV_SCORE;
}
// Gideon: Alternatively, comment out above (but in fact, it doesn't seem to work better)
// Rather than using a hard-coded OOV score, assign the language model <UNK> probability to the OOV words.
// See: https://github.com/parlance/ctcdecode/issues/62
cond_prob = model->BaseScore(&state, word_index, &out_state);
tmp_state = state;
state = out_state;
Expand Down Expand Up @@ -132,26 +138,28 @@ std::string Scorer::vec2str(const std::vector<int>& input) {
return word;
}

std::vector<std::string> Scorer::split_labels(const std::vector<int>& labels) {
std::vector<std::string> Scorer::split_labels(const std::vector<int>& labels, const std::string &space_symbol) {
if (labels.empty()) return {};

std::string s = vec2str(labels);
std::vector<std::string> words;
if (is_character_based_) {
words = split_utf8_str(s);
} else {
words = split_str(s, " ");
// words = split_str(s, " ");
words = split_str(s, space_symbol); //Gideon: replaced the space character from " " to a custom string
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Delete dead code and comment

}
return words;
}

void Scorer::set_char_map(const std::vector<std::string>& char_list) {
void Scorer::set_char_map(const std::vector<std::string>& char_list, const std::string &space_symbol) {
char_list_ = char_list;
char_map_.clear();

for (size_t i = 0; i < char_list_.size(); i++) {
if (char_list_[i] == " ") {
SPACE_ID_ = i;
//if (char_list_[i] == " ") {
if (char_list_[i] == space_symbol) { //Gideon: replaced the space character from " " to a custom string
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Delete dead code and comment

SPACE_ID_ = i;
}
// The initial state of FST is state 0, hence the index of chars in
// the FST should start from 1 to avoid the conflict with the initial
Expand Down
10 changes: 6 additions & 4 deletions ctcdecode/src/scorer.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,8 @@ class Scorer {
Scorer(double alpha,
double beta,
const std::string &lm_path,
const std::vector<std::string> &vocabulary);
const std::vector<std::string> &vocabulary,
const std::string &space_symbol);
~Scorer();

double get_log_cond_prob(const std::vector<std::string> &words);
Expand All @@ -67,7 +68,7 @@ class Scorer {

// transform the labels in index to the vector of words (word based lm) or
// the vector of characters (character based lm)
std::vector<std::string> split_labels(const std::vector<int> &labels);
std::vector<std::string> split_labels(const std::vector<int> &labels, const std::string &space_symbol);

// language model weight
double alpha;
Expand All @@ -80,7 +81,8 @@ class Scorer {
protected:
// necessary setup: load language model, set char map, fill FST's dictionary
void setup(const std::string &lm_path,
const std::vector<std::string> &vocab_list);
const std::vector<std::string> &vocab_list,
const std::string &space_symbo);

// load language model from given path
void load_lm(const std::string &lm_path);
Expand All @@ -89,7 +91,7 @@ class Scorer {
void fill_dictionary(bool add_space);

// set char map
void set_char_map(const std::vector<std::string> &char_list);
void set_char_map(const std::vector<std::string> &char_list, const std::string &space_symbol);

double get_log_prob(const std::vector<std::string> &words);

Expand Down