Skip to content

Commit 619d023

Browse files
committed
external/cmd_line_parser
1 parent 98dec08 commit 619d023

11 files changed

+99
-174
lines changed

.clang-format

+1-1
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ AllowAllParametersOfDeclarationOnNextLine: true
1212
AllowShortBlocksOnASingleLine: false
1313
AllowShortCaseLabelsOnASingleLine: false
1414
AllowShortFunctionsOnASingleLine: Empty
15-
AllowShortIfStatementsOnASingleLine: false
15+
AllowShortIfStatementsOnASingleLine: true
1616
AllowShortLoopsOnASingleLine: true
1717
AlwaysBreakAfterDefinitionReturnType: None
1818
AlwaysBreakAfterReturnType: None

.gitmodules

+3
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,6 @@
44
[submodule "external/essentials"]
55
path = external/essentials
66
url = https://github.com/jermp/essentials.git
7+
[submodule "external/cmd_line_parser"]
8+
path = external/cmd_line_parser
9+
url = https://github.com/jermp/cmd_line_parser.git

CMakeLists.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
3434
endif ()
3535

3636
if (UNIX)
37-
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14")
37+
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17")
3838
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
3939
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")
4040
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb")

external/cmd_line_parser

Submodule cmd_line_parser added at 1776808

include/utils/pools.hpp

+1-3
Original file line numberDiff line numberDiff line change
@@ -108,9 +108,7 @@ struct grams_counts_pool {
108108
size_t gram_bytes = gram.second - gram.first;
109109

110110
if (gram_bytes) {
111-
if (m_strings_pool.size() + gram_bytes > m_max_bytes) {
112-
return false;
113-
}
111+
if (m_strings_pool.size() + gram_bytes > m_max_bytes) return false;
114112

115113
m_strings_pool.insert(m_strings_pool.end(), gram.first,
116114
gram.second);

include/utils/util.hpp

+3-3
Original file line numberDiff line numberDiff line change
@@ -486,7 +486,7 @@ void save(uint8_t header, T const& data_structure,
486486
}
487487

488488
template <typename T>
489-
size_t load(T& data_structure, char const* binary_filename) {
489+
size_t load(T& data_structure, std::string const& binary_filename) {
490490
std::ifstream is(binary_filename, std::ios::binary);
491491
if (!is.good()) {
492492
throw std::runtime_error(
@@ -501,7 +501,7 @@ size_t load(T& data_structure, char const* binary_filename) {
501501
return bytes;
502502
}
503503

504-
std::string get_model_type(char const* binary_filename) {
504+
std::string get_model_type(std::string const& binary_filename) {
505505
std::ifstream is(binary_filename, std::ios::binary);
506506
if (!is.good()) {
507507
throw std::runtime_error(
@@ -510,7 +510,7 @@ std::string get_model_type(char const* binary_filename) {
510510
uint8_t header = 0;
511511
essentials::load_pod(is, header);
512512
binary_header bin_header;
513-
bool verbose = true;
513+
static constexpr bool verbose = true;
514514
auto model_string_type = bin_header.parse(header, verbose);
515515
is.close();
516516
return model_string_type;

src/lookup_perf_test.cpp

+16-20
Original file line numberDiff line numberDiff line change
@@ -4,19 +4,20 @@
44
#include "utils/util.hpp"
55
#include "utils/pools.hpp"
66
#include "../external/essentials/include/essentials.hpp"
7+
#include "../external/cmd_line_parser/include/parser.hpp"
78

89
using namespace tongrams;
910

1011
template <typename Model>
11-
void perf_test(const char* query_filename, const char* binary_filename,
12-
uint32_t runs) {
12+
void perf_test(std::string const& index_filename,
13+
std::string const& query_filename, uint32_t runs) {
1314
strings_pool sp;
1415
std::vector<size_t> offsets;
1516
offsets.push_back(0);
1617

1718
essentials::logger("Loading strings in memory for faster lookup");
1819
{
19-
emphf::file_lines lines(query_filename);
20+
emphf::file_lines lines(query_filename.c_str());
2021
for (auto& l : lines) {
2122
auto br = bytes::split_upon_check_end(l, '\t');
2223
sp.append(br);
@@ -29,7 +30,7 @@ void perf_test(const char* query_filename, const char* binary_filename,
2930

3031
Model model;
3132
essentials::logger("Loading data structure");
32-
size_t file_size = util::load(model, binary_filename);
33+
size_t file_size = util::load(model, index_filename);
3334
std::cout << "\tTotal bytes: " << file_size << "\n";
3435
std::cout << "\tTotal ngrams: " << model.size() << "\n";
3536
std::cout << "\tBytes per gram: " << double(file_size) / model.size()
@@ -70,35 +71,30 @@ void perf_test(const char* query_filename, const char* binary_filename,
7071
}
7172

7273
int main(int argc, char** argv) {
73-
if (argc < 4 || building_util::request_help(argc, argv)) {
74-
building_util::display_legend();
75-
std::cout << "Usage " << argv[0] << ":\n"
76-
<< "\t" << style::bold << style::underline
77-
<< "binary_filename" << style::off << "\n"
78-
<< "\t" << style::bold << style::underline << "query_filename"
79-
<< style::off << "\n"
80-
<< "\t" << style::bold << style::underline << "runs"
81-
<< style::off << std::endl;
82-
return 1;
83-
}
74+
cmd_line_parser::parser parser(argc, argv);
75+
parser.add("index_filename", "Index filename.");
76+
parser.add("query_filename", "Query filename.");
77+
parser.add("runs",
78+
"Number of runs for the benchmark. Must be greater than 1.");
79+
if (!parser.parse()) return 1;
8480

85-
const char* binary_filename = argv[1];
86-
const char* query_filename = argv[2];
87-
uint32_t runs = std::atoi(argv[3]);
81+
auto index_filename = parser.get<std::string>("index_filename");
82+
auto query_filename = parser.get<std::string>("query_filename");
83+
auto runs = parser.get<uint32_t>("runs");
8884

8985
if (runs < 2) {
9086
std::cerr << "Error: number of runs must be greater than 1."
9187
<< std::endl;
9288
return 1;
9389
}
9490

95-
auto model_string_type = util::get_model_type(binary_filename);
91+
auto model_string_type = util::get_model_type(index_filename);
9692

9793
if (false) {
9894
#define LOOP_BODY(R, DATA, T) \
9995
} \
10096
else if (model_string_type == BOOST_PP_STRINGIZE(T)) { \
101-
perf_test<T>(query_filename, binary_filename, runs);
97+
perf_test<T>(index_filename, query_filename, runs);
10298

10399
BOOST_PP_SEQ_FOR_EACH(LOOP_BODY, _, SXLM_COUNT_TYPES);
104100
#undef LOOP_BODY

src/print_stats.cpp

+9-12
Original file line numberDiff line numberDiff line change
@@ -4,34 +4,31 @@
44
#include "utils/util.hpp"
55
#include "utils/stats.cpp"
66
#include "../external/essentials/include/essentials.hpp"
7+
#include "../external/cmd_line_parser/include/parser.hpp"
78

89
using namespace tongrams;
910

1011
template <typename T>
11-
void print_stats(const char* binary_filename) {
12+
void print_stats(std::string const& index_filename) {
1213
T model;
1314
essentials::logger("Loading data structure");
14-
size_t bytes = util::load(model, binary_filename);
15+
size_t bytes = util::load(model, index_filename);
1516
model.print_stats(bytes);
1617
}
1718

1819
int main(int argc, char** argv) {
19-
if (argc < 2 || building_util::request_help(argc, argv)) {
20-
building_util::display_legend();
21-
std::cerr << "Usage " << argv[0] << ":\n"
22-
<< "\t" << style::bold << style::underline
23-
<< "binary_filename" << style::off << std::endl;
24-
return 1;
25-
}
20+
cmd_line_parser::parser parser(argc, argv);
21+
parser.add("index_filename", "Index filename.");
22+
if (!parser.parse()) return 1;
2623

27-
const char* binary_filename = argv[1];
28-
auto model_string_type = util::get_model_type(binary_filename);
24+
auto index_filename = parser.get<std::string>("index_filename");
25+
auto model_string_type = util::get_model_type(index_filename);
2926

3027
if (false) {
3128
#define LOOP_BODY(R, DATA, T) \
3229
} \
3330
else if (model_string_type == BOOST_PP_STRINGIZE(T)) { \
34-
print_stats<T>(binary_filename);
31+
print_stats<T>(index_filename);
3532

3633
BOOST_PP_SEQ_FOR_EACH(LOOP_BODY, _, SXLM_TYPES);
3734
#undef LOOP_BODY

src/score.cpp

+13-18
Original file line numberDiff line numberDiff line change
@@ -6,16 +6,17 @@
66
#include "utils/iterators.hpp"
77
#include "lm_types.hpp"
88
#include "../external/essentials/include/essentials.hpp"
9+
#include "../external/cmd_line_parser/include/parser.hpp"
910

1011
using namespace tongrams;
1112

1213
template <typename Model>
13-
void score_corpus(const char* binary_filename, const char* corpus_filename) {
14+
void score_corpus(std::string const& index_filename,
15+
std::string const& corpus_filename) {
1416
Model model;
1517
essentials::logger("Loading data structure");
16-
util::load(model, binary_filename);
17-
18-
text_lines corpus(corpus_filename);
18+
util::load(model, index_filename);
19+
text_lines corpus(corpus_filename.c_str());
1920

2021
uint64_t tot_OOVs = 0;
2122
uint64_t corpus_tokens = 0;
@@ -91,26 +92,20 @@ void score_corpus(const char* binary_filename, const char* corpus_filename) {
9192
}
9293

9394
int main(int argc, char** argv) {
94-
if (argc < 3 || building_util::request_help(argc, argv)) {
95-
building_util::display_legend();
96-
std::cout << "Usage " << argv[0] << ":\n"
97-
<< "\t" << style::bold << style::underline
98-
<< "binary_filename" << style::off << "\n"
99-
<< "\t" << style::bold << style::underline
100-
<< "corpus_filename" << style::off << std::endl;
101-
return 1;
102-
}
103-
104-
const char* binary_filename = argv[1];
105-
const char* corpus_filename = argv[2];
95+
cmd_line_parser::parser parser(argc, argv);
96+
parser.add("index_filename", "Index filename.");
97+
parser.add("corpus_filename", "Corpus filename.");
98+
if (!parser.parse()) return 1;
10699

107-
auto model_string_type = util::get_model_type(binary_filename);
100+
auto index_filename = parser.get<std::string>("index_filename");
101+
auto corpus_filename = parser.get<std::string>("corpus_filename");
102+
auto model_string_type = util::get_model_type(index_filename);
108103

109104
if (false) {
110105
#define LOOP_BODY(R, DATA, T) \
111106
} \
112107
else if (model_string_type == BOOST_PP_STRINGIZE(T)) { \
113-
score_corpus<T>(binary_filename, corpus_filename);
108+
score_corpus<T>(index_filename, corpus_filename);
114109

115110
BOOST_PP_SEQ_FOR_EACH(LOOP_BODY, _, SXLM_SCORE_TYPES);
116111
#undef LOOP_BODY

src/sort_arpa.cpp

+27-59
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,12 @@
77
#include "utils/mph_tables.hpp"
88
#include "utils/pools.hpp"
99
#include "../external/essentials/include/essentials.hpp"
10+
#include "../external/cmd_line_parser/include/parser.hpp"
1011

1112
using namespace tongrams;
12-
size_t available_ram;
1313

14-
void build_vocabulary(const char* vocab_filename, single_valued_mpht64& vocab) {
14+
void build_vocabulary(char const* vocab_filename, single_valued_mpht64& vocab,
15+
size_t available_ram) {
1516
// assume unigrams fit in memory
1617
grams_counts_pool unigrams(available_ram);
1718
unigrams.load_from<grams_gzparser>(vocab_filename);
@@ -28,88 +29,55 @@ void build_vocabulary(const char* vocab_filename, single_valued_mpht64& vocab) {
2829
for (uint64_t id = 0; id < n; ++id) {
2930
cvb.push_back(id);
3031
}
31-
32-
// NOTE: build vocabulary excluding null terminators
33-
// from unigrams strings so that we can lookup
34-
// for any substring of a n-gram
35-
// without allocating a std::string
3632
single_valued_mpht64::builder builder(bytes, compact_vector(cvb),
3733
identity_adaptor());
3834
builder.build(vocab);
3935
}
4036

41-
// sort always in SUFFIX order
4237
int main(int argc, char** argv) {
43-
if (argc < 5 || building_util::request_help(argc, argv)) {
44-
building_util::display_legend();
45-
std::cerr << "Usage " << argv[0] << ":\n"
46-
<< "\t" << style::bold << style::underline << "order"
47-
<< style::off << "\n"
48-
<< "\t" << style::bold << style::underline << "arpa_filename"
49-
<< style::off << "\n"
50-
<< "\t" << style::bold << style::underline << "vocab_filename"
51-
<< style::off << "\n"
52-
<< "\t" << style::bold << style::underline
53-
<< "output_filename" << style::off << "\n"
54-
<< "\t[--t " << style::underline << "tmp_dir" << style::off
55-
<< "]\n"
56-
<< "\t[--ram " << style::underline << "percentage"
57-
<< style::off << "]" << std::endl;
58-
std::cerr << "---------------------------------------------------------"
59-
"-------\n"
60-
<< style::bold << style::underline << "tmp_dir" << style::off
61-
<< " is the directory for temporaries.\n"
62-
<< "If omitted is assumed to be the current directory.\n"
63-
<< "RAM percentage is expressed as real in (0.0, 100.0]."
64-
<< std::endl;
65-
return 1;
66-
}
38+
cmd_line_parser::parser parser(argc, argv);
39+
parser.add("order", "n-gram order. Must be larger than 0.");
40+
parser.add("arpa_filename", "ARPA filename.");
41+
parser.add("vocab_filename", "Vocabulary filename.");
42+
parser.add("output_filename", "Output filename.");
43+
parser.add("tmp_dir", "Temporary directory for sorting.", "--tmp", false);
44+
parser.add("ram", "Percentage of RAM to use. It must be in (0,100].",
45+
"--ram", false);
46+
if (!parser.parse()) return 1;
47+
48+
auto order = parser.get<uint32_t>("order");
49+
auto arpa_filename = parser.get<std::string>("arpa_filename");
50+
auto vocab_filename = parser.get<std::string>("vocab_filename");
51+
auto output_filename = parser.get<std::string>("output_filename");
6752

68-
uint32_t order = std::atoi(argv[1]);
69-
const char* arpa_filename = argv[2];
70-
const char* vocab_filename = argv[3];
71-
const char* output_filename = argv[4];
7253
std::string default_tmp_dir("./");
7354
std::string tmp_dir = default_tmp_dir;
55+
if (parser.parsed("tmp_dir")) {
56+
tmp_dir = parser.get<std::string>("tmp_dir");
57+
essentials::create_directory(tmp_dir);
58+
}
7459

75-
available_ram = sysconf(_SC_PAGESIZE) * sysconf(_SC_PHYS_PAGES);
60+
size_t available_ram = sysconf(_SC_PAGESIZE) * sysconf(_SC_PHYS_PAGES);
7661
size_t ram_percentage = available_ram;
7762
double perc = 100.0;
78-
79-
for (int i = 5; i < argc; ++i) {
80-
if (argv[i] == std::string("--ram")) {
81-
perc = std::stod(argv[++i]);
82-
if (perc <= 0.0 || perc > 100.0) {
83-
std::cerr << "percentage must be a vaue within (0.0, 100.0]"
84-
<< std::endl;
85-
return 1;
86-
}
87-
} else if (argv[i] == std::string("--t")) {
88-
tmp_dir = std::string(argv[++i]);
89-
essentials::create_directory(tmp_dir);
90-
} else {
91-
std::cerr << "unknown option: '" << argv[i] << "'" << std::endl;
92-
return 1;
93-
}
94-
}
95-
63+
if (parser.parsed("ram")) perc = parser.get<double>("ram");
9664
ram_percentage *= perc / 100;
9765
std::cout << "Sorting with " << perc << "\% of available RAM"
9866
<< " (" << ram_percentage << "/" << available_ram << ")"
9967
<< std::endl;
10068

10169
{
10270
std::vector<uint64_t> counts;
103-
arpa_parser ap(arpa_filename);
71+
arpa_parser ap(arpa_filename.c_str());
10472
ap.read_header(counts);
105-
if (!order or order > counts.size()) {
73+
if (order == 0 or order > counts.size()) {
10674
std::cerr << "invalid specified order" << std::endl;
10775
return 1;
10876
}
10977

11078
single_valued_mpht64 vocab;
11179
essentials::logger("Building vocabulary");
112-
build_vocabulary(vocab_filename, vocab);
80+
build_vocabulary(vocab_filename.c_str(), vocab, available_ram);
11381

11482
ap.read_line();
11583

@@ -124,7 +92,7 @@ int main(int argc, char** argv) {
12492
auto n = counts[order - 1];
12593
grams_probs_pool pool(n, ram_percentage);
12694

127-
// ngrams are sorted in SUFFIX order
95+
// NOTE: SUFFIX order
12896
typedef suffix_order_comparator(single_valued_mpht64,
12997
prob_backoff_record) comparator_type;
13098
comparator_type cmp(vocab);

0 commit comments

Comments
 (0)