Skip to content

Commit 1fdd4bf

Browse files
committed
minor aesthetic fixes; fixes to README
1 parent a49631c commit 1fdd4bf

File tree

6 files changed

+45
-41
lines changed

6 files changed

+45
-41
lines changed

README.md

+17-13
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
1-
`tongrams` - Tons of *N*-Grams
2-
----------
1+
Tongrams - Tons of *N*-Grams
2+
==============================
33

44
**NEWS: check the language model estimation library [here](https://github.com/jermp/tongrams_estimation)!**
55

6-
`tongrams` is a C++ library to index and query large language models
6+
Tongrams is a C++ library to index and query large language models
77
in compressed space. It is the result of some research papers [1,2] by Giulio Ermanno Pibiri
88
and Rossano Venturini.
99

@@ -99,53 +99,57 @@ their usage.
9999

100100
We now show some examples.
101101

102-
##### Example 1.
102+
##### Example 1
103103
The command
104104

105105
./build_trie ef_trie 5 count --dir ../test_data --out ef_trie.count.bin
106106

107107
builds an Elias-Fano trie
108+
108109
* of order 5;
109110
* that stores frequency counts;
110111
* from the *N*-gram counts files contained in the directory `test_data`;
111112
* with no context-based remapping (default);
112113
* whose counts ranks are encoded with the indexed codewords (IC) technique (default);
113114
* that is serialized to the binary file `ef_trie.count.bin`.
114115

115-
##### Example 2.
116+
##### Example 2
116117
The command
117118

118119
./build_trie pef_trie 5 count --dir ../test_data --remapping 1 --ranks PSEF --out pef_trie.count.out
119120

120121
builds a partitioned Elias-Fano trie
122+
121123
* of order 5;
122124
* that stores frequency counts;
123125
* from the *N*-gram counts files contained in the directory `test_data`;
124126
* with context-based remapping of order 1;
125127
* whose counts ranks are encoded with prefix sums (PS) + Elias-Fano (EF);
126128
* that is serialized to the binary file `pef_trie.count.out`.
127129

128-
##### Example 3.
130+
##### Example 3
129131
The command
130132

131133
./build_trie ef_trie 5 prob_backoff --remapping 2 --u -20.0 --p 8 --b 8 --arpa ../test_data/arpa --out ef_trie.prob_backoff.bin
132134

133135
builds an Elias-Fano trie
136+
134137
* of order 5;
135138
* that stores probabilities and backoffs;
136139
* with context-based remapping of order 2;
137140
* with `<unk>` probability of -20.0 and using 8 bits for quantizing probabilities (`--p`) and backoffs (`--b`);
138141
* from the arpa file named `arpa`;
139142
* that is serialized to the binary file `ef_trie.prob_backoff.bin`.
140143

141-
##### Example 4.
144+
##### Example 4
142145
The command
143146

144-
./build_hash 5 8 4 count --dir ../test_data --out hash.bin
147+
./build_hash 5 8 count --dir ../test_data --out hash.bin
145148

146149
builds a MPH-based model
150+
147151
* of order 5;
148-
* that uses 8 bytes per hash key and 4 bytes per unique count;
152+
* that uses 8 bytes per hash key;
149153
* that stores frequency counts;
150154
* from the *N*-gram counts files contained in the directory `test_data`;
151155
* that is serialized to the binary file `hash.bin`.
@@ -155,15 +159,15 @@ Tests
155159
The `test` directory contains the unit tests of some of the fundamental building blocks used by the implemented data structures. As usual, running the executables without any arguments will show the list of their expected input parameters.
156160
Examples:
157161

158-
./compact_vector_test 10000 13
159-
./fast_ef_sequence_test 1000000 128
162+
./test_compact_vector 10000 13
163+
./test_fast_ef_sequence 1000000 128
160164

161165
The directory also contains the unit test for the data structures storing frequency counts, named `check_count_model`, which validates the implementation by checking that each count stored in the data structure is the same as the one provided in the input files from which the data structure was previously built.
162166
Example:
163167

164-
./check_count_model count_data_structure.bin ../test_data
168+
./test_count_model ef_trie.count.bin ../test_data
165169

166-
where `count_data_structure.bin` is the name of the data structure binary file and `test_data` is the name of the folder containing the input *N*-gram counts files.
170+
where `ef_trie.count.bin` is the name of the data structure binary file (maybe built with the command shown in Example 1) and `test_data` is the name of the folder containing the input *N*-gram counts files.
167171

168172
Benchmarks
169173
----------

include/mph_count_lm.hpp

+8-8
Original file line numberDiff line numberDiff line change
@@ -20,16 +20,16 @@ struct mph_count_lm {
2020

2121
typename Values::builder counts_builder(m_order);
2222

23-
for (uint8_t order = 1; order <= m_order; ++order) {
23+
for (uint8_t ord = 1; ord <= m_order; ++ord) {
2424
std::string filename;
25-
util::input_filename(input_dir, order, filename);
25+
util::input_filename(input_dir, ord, filename);
2626
util::check_filename(filename);
2727
grams_gzparser gp(filename.c_str());
2828

2929
std::vector<uint64_t> counts;
3030
counts.reserve(gp.num_lines());
3131

32-
essentials::logger("Reading " + std::to_string(order) +
32+
essentials::logger("Reading " + std::to_string(ord) +
3333
"-grams counts");
3434
for (auto const& l : gp) {
3535
counts_builder.eat_value(l.count);
@@ -39,23 +39,23 @@ struct mph_count_lm {
3939
}
4040

4141
size_t available_ram = sysconf(_SC_PAGESIZE) * sysconf(_SC_PHYS_PAGES);
42-
for (uint8_t order = 1; order <= m_order; ++order) {
43-
essentials::logger("Building " + std::to_string(order) + "-grams");
42+
for (uint8_t ord = 1; ord <= m_order; ++ord) {
43+
essentials::logger("Building " + std::to_string(ord) + "-grams");
4444
grams_counts_pool pool(available_ram * 0.8);
4545
std::string filename;
46-
util::input_filename(input_dir, order, filename);
46+
util::input_filename(input_dir, ord, filename);
4747
pool.load_from<grams_gzparser>(filename.c_str());
4848

4949
auto& pool_index = pool.index();
5050
uint64_t n = pool_index.size();
5151
compact_vector::builder counts_ranks_cvb(
52-
n, util::ceil_log2(counts_builder.size(order - 1) + 1));
52+
n, util::ceil_log2(counts_builder.size(ord - 1) + 1));
5353

5454
std::vector<byte_range> byte_ranges;
5555
byte_ranges.reserve(n);
5656
for (auto const& record : pool_index) {
5757
byte_ranges.push_back(record.gram);
58-
uint64_t rank = counts_builder.rank(order - 1, record.count);
58+
uint64_t rank = counts_builder.rank(ord - 1, record.count);
5959
counts_ranks_cvb.push_back(rank);
6060
}
6161

include/mph_prob_lm.hpp

+12-13
Original file line numberDiff line numberDiff line change
@@ -37,21 +37,21 @@ struct mph_prob_lm {
3737
"specified order exceeds arpa file order");
3838
}
3939

40-
for (uint8_t order = 1; order <= m_order; ++order) {
40+
for (uint8_t ord = 1; ord <= m_order; ++ord) {
4141
std::vector<float> probs;
4242
std::vector<float> backoffs;
43-
uint64_t n = counts[order - 1];
43+
uint64_t n = counts[ord - 1];
4444
probs.reserve(n);
4545
backoffs.reserve(n);
4646

47-
ap.read_values(order, n, probs, backoffs);
47+
ap.read_values(ord, n, probs, backoffs);
4848
assert(probs.size() == n);
4949

50-
if (order !=
50+
if (ord !=
5151
1) { // need to scan unigrams anyway to set arpa offsets
5252
probs_builder.build_probs_sequence(probs,
5353
probs_quantization_bits);
54-
if (order != m_order) {
54+
if (ord != m_order) {
5555
backoffs_builder.build_backoffs_sequence(
5656
backoffs, backoffs_quantization_bits);
5757
}
@@ -66,11 +66,10 @@ struct mph_prob_lm {
6666

6767
size_t available_ram =
6868
sysconf(_SC_PAGESIZE) * sysconf(_SC_PHYS_PAGES);
69-
for (uint8_t order = 2; order <= m_order; ++order) {
70-
essentials::logger("Building " + std::to_string(order) +
69+
for (uint8_t ord = 2; ord <= m_order; ++ord) {
70+
essentials::logger("Building " + std::to_string(ord) +
7171
"-grams");
72-
arpa_iterator it(m_arpa_filename, order,
73-
arpa_offsets[order - 1]);
72+
arpa_iterator it(m_arpa_filename, ord, arpa_offsets[ord - 1]);
7473
uint64_t n = it.num_grams();
7574
grams_probs_pool pool(n, available_ram * 0.8);
7675

@@ -85,18 +84,18 @@ struct mph_prob_lm {
8584

8685
compact_vector::builder cvb(
8786
n, probs_quantization_bits +
88-
(order != m_order ? backoffs_quantization_bits : 0));
87+
(ord != m_order ? backoffs_quantization_bits : 0));
8988

9089
for (auto const& record : pool_index) {
9190
bytes.push_back(record.gram);
9291
float prob = record.prob;
9392
float backoff = record.backoff;
9493
// store interleaved ranks
95-
uint64_t prob_rank = probs_builder.rank(order - 2, prob, 0);
94+
uint64_t prob_rank = probs_builder.rank(ord - 2, prob, 0);
9695
uint64_t packed = prob_rank;
97-
if (order != m_order) {
96+
if (ord != m_order) {
9897
uint64_t backoff_rank =
99-
backoffs_builder.rank(order - 2, backoff, 1);
98+
backoffs_builder.rank(ord - 2, backoff, 1);
10099
packed |= backoff_rank << probs_quantization_bits;
101100
}
102101
cvb.push_back(packed);

include/sequences/fast_ef_sequence.hpp

+5-5
Original file line numberDiff line numberDiff line change
@@ -407,12 +407,12 @@ struct fast_ef_sequence {
407407
uint64_t nodes = uint64_t(1) << h;
408408
for (uint64_t i = 0; i < nodes; ++i) {
409409
auto r = ranges.pop_front();
410-
uint64_t lo = r.first;
411-
uint64_t hi = r.second;
412-
size_t mid = (lo + hi) >> 1;
410+
uint64_t l = r.first;
411+
uint64_t h = r.second;
412+
size_t mid = (l + h) >> 1;
413413
samplings.push_back(it[mid]);
414-
range_t left(lo, mid);
415-
range_t right(mid + 1, hi);
414+
range_t left(l, mid);
415+
range_t right(mid + 1, h);
416416
ranges.push_back(left);
417417
ranges.push_back(right);
418418
}

src/sort_arpa.cpp

+2-1
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,8 @@ int main(int argc, char** argv) {
5757
ap.read_line();
5858

5959
// skip to specified order
60-
for (uint8_t i = 0; i < order - 1; ++i) {
60+
assert(order > 0);
61+
for (uint32_t i = 0; i != order - 1; ++i) {
6162
while (ap.read_line())
6263
;
6364
ap.read_line();

0 commit comments

Comments (0)