|
| 1 | +#include <iostream> |
| 2 | + |
| 3 | +#include <boost/program_options.hpp> |
| 4 | + |
| 5 | +#include <onmt/Tokenizer.h> |
| 6 | + |
| 7 | +namespace po = boost::program_options; |
| 8 | + |
| 9 | +int main(int argc, char* argv[]) |
| 10 | +{ |
| 11 | + po::options_description desc("Tokenization"); |
| 12 | + desc.add_options() |
| 13 | + ("help", "display available options") |
| 14 | + ("mode", po::value<std::string>()->default_value("conservative"), "Define how aggressive should the tokenization be: 'aggressive' only keeps sequences of letters/numbers, 'conservative' allows mix of alphanumeric as in: '2,000', 'E65', 'soft-landing'") |
| 15 | + ("joiner_annotate", po::bool_switch()->default_value(false), "include joiner annotation using 'joiner' character") |
| 16 | + ("joiner", po::value<std::string>()->default_value(onmt::Tokenizer::joiner_marker), "character used to annotate joiners") |
| 17 | + ("joiner_new", po::bool_switch()->default_value(false), "in joiner_annotate mode, 'joiner' is an independent token") |
| 18 | + ("case_feature", po::bool_switch()->default_value(false), "lowercase corpus and generate case feature") |
| 19 | + ("bpe_model", po::value<std::string>()->default_value(""), "path to the BPE model") |
| 20 | + ; |
| 21 | + |
| 22 | + po::variables_map vm; |
| 23 | + po::store(po::parse_command_line(argc, argv, desc), vm); |
| 24 | + po::notify(vm); |
| 25 | + |
| 26 | + if (vm.count("help")) |
| 27 | + { |
| 28 | + std::cerr << desc << std::endl; |
| 29 | + return 1; |
| 30 | + } |
| 31 | + |
| 32 | + onmt::ITokenizer* tokenizer = new onmt::Tokenizer(vm["mode"].as<std::string>() == "aggressive" |
| 33 | + ? onmt::Tokenizer::Mode::Aggressive |
| 34 | + : onmt::Tokenizer::Mode::Conservative, |
| 35 | + vm["bpe_model"].as<std::string>(), |
| 36 | + vm["case_feature"].as<bool>(), |
| 37 | + vm["joiner_annotate"].as<bool>(), |
| 38 | + vm["joiner_new"].as<bool>(), |
| 39 | + vm["joiner"].as<std::string>()); |
| 40 | + |
| 41 | + std::string line; |
| 42 | + |
| 43 | + while (std::getline(std::cin, line)) |
| 44 | + { |
| 45 | + if (!line.empty()) |
| 46 | + std::cout << tokenizer->tokenize(line); |
| 47 | + |
| 48 | + std::cout << std::endl; |
| 49 | + } |
| 50 | + |
| 51 | + return 0; |
| 52 | +} |
0 commit comments