
Query container #373

Open
wants to merge 27 commits into main from query-container
Changes from 1 commit
27 commits
12d5fb6
Query container
elshize Apr 26, 2020
1d3aa88
Query container parsing
elshize Apr 26, 2020
ce2b720
Merge branch 'master' into query-container
elshize Apr 27, 2020
cdc17f3
CLI test
elshize Apr 27, 2020
98fe8b1
Merge branch 'query-container' of github.com:pisa-engine/pisa into qu…
elshize Apr 27, 2020
b0e5d1a
Fix .travis.yml syntax
elshize Apr 27, 2020
6e2ab62
Fix .travis.yml syntax
elshize Apr 27, 2020
2cce2cd
Fix when CLI tests are executed
elshize Apr 27, 2020
982d316
Merge branch 'master' into query-container
elshize Apr 28, 2020
1838258
Refactor out common code from tool
elshize Apr 28, 2020
3ba4588
Merge branch 'master' into query-container
elshize Apr 29, 2020
7107f65
Small refactoring and term resolver tests
elshize May 1, 2020
ede9c98
Fix tool description
elshize May 1, 2020
b8f625c
Multiple thresholds per query
elshize May 3, 2020
78cf15c
Return exit code 1 on failure
elshize May 3, 2020
d4e63cf
Merge branch 'master' into query-container
elshize May 19, 2020
ebf1acb
Merge branch 'master' into query-container
elshize May 21, 2020
83f9c74
Fix merging issue
elshize May 22, 2020
4b0b05c
Merge branch 'master' into query-container
elshize Jun 1, 2020
a22f794
Merge branch 'master' into query-container
elshize Jun 2, 2020
e0b052a
Merge branch 'master' into query-container
elshize Jun 2, 2020
6382a38
Merge branch 'master' into query-container
elshize Jun 4, 2020
ca09597
Merge branch 'master' into query-container
elshize Jun 5, 2020
f173ea3
Merge branch 'master' into query-container
elshize Jun 5, 2020
104d310
Merge branch 'master' into query-container
elshize Jun 15, 2020
af1871d
Merge branch 'master' into query-container
elshize Jun 18, 2020
8e6da84
Merge branch 'master' into query-container
elshize Jun 24, 2020
Merge branch 'master' into query-container
elshize committed Jun 15, 2020
commit 104d3102b60a407b2539ab3171f49b41daeb2e9f
12 changes: 10 additions & 2 deletions .github/workflows/ci.yml
@@ -11,7 +11,7 @@ jobs:
runs-on: ${{ matrix.os }}
strategy:
matrix:
name: [linux-gcc-9, linux-gcc-7, linux-clang-10]
name: [linux-gcc-9, linux-gcc-7, linux-clang-6, linux-clang-10, macos-gcc-9, macos-xcode-11.3]
include:
- name: linux-gcc-9
os: ubuntu-latest
@@ -20,6 +20,10 @@ jobs:
- name: linux-gcc-7
os: ubuntu-latest
compiler: "gcc" # default on bionic: gcc-7
- name: linux-clang-6
os: ubuntu-latest
compiler: "clang"
version: "6.0"
- name: linux-clang-10
os: ubuntu-latest
compiler: "clang"
@@ -49,6 +53,10 @@ jobs:
if [ "${cc}" = "clang-10" ]; then
sudo add-apt-repository "deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-10 main"
fi
if [ "${cc}" = "clang-6.0" ]; then
# Problems compiling with Clang 6 against libstdc++-9
sudo apt-get remove libstdc++-9-dev libgcc-9-dev cpp-9
fi
sudo apt-get update
sudo apt-get install -y libtool m4 autoconf
if [ "${{ matrix.compiler }}" = "gcc" ]; then
@@ -89,4 +97,4 @@ jobs:
- name: Test
shell: bash
working-directory: ${{runner.workspace}}/build
run: ctest -j 4
run: ctest -VV -j 4
54 changes: 54 additions & 0 deletions .github/workflows/coverage.yml
@@ -0,0 +1,54 @@
name: Code coverage

on:
push:
branches: [ master ]
pull_request:
branches: [ master ]

jobs:
build:
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v2

- name: Install lcov
shell: bash
run: |
echo "::set-env name=CC::gcc-7"
echo "::set-env name=CXX::g++-7"
sudo apt-get install -y lcov
- name: Create Build Environment
shell: bash
run: cmake -E make_directory ${{runner.workspace}}/build

- name: Configure
shell: bash
working-directory: ${{runner.workspace}}/build
run: cmake $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=Debug -DPISA_BUILD_TOOLS=OFF -DENABLE_COVERAGE=ON ..

- name: Build
shell: bash
working-directory: ${{runner.workspace}}/build
run: cmake --build . --config Debug -- -j 4

- name: Test
shell: bash
working-directory: ${{runner.workspace}}/build
run: ctest -j 4

- name: Generate Coverage report
shell: bash
working-directory: ${{runner.workspace}}/build
run: |
lcov --capture --directory . --output-file coverage.info
lcov --remove coverage.info '/usr/*' --output-file coverage.info # filter system-files
lcov --remove coverage.info '**/external/*' --output-file coverage.info # filter external folder
lcov --remove coverage.info '**/test/*' --output-file coverage.info # filter tests
lcov --list coverage.info # debug info
- name: Upload to codecov.io
uses: codecov/codecov-action@v1
with:
token: ${{ secrets.CODECOV_TOKEN }}
file: ${{runner.workspace}}/build/coverage.info
20 changes: 20 additions & 0 deletions .github/workflows/docker.yml
@@ -0,0 +1,20 @@
name: Docker
on:
push:
branches: [ master ]

jobs:
build:
runs-on: ubuntu-latest

steps:
- name: Trigger docker build
shell: bash
run: |
curl -s -X POST \
-H "Content-Type:application/json" \
-H "Travis-API-Version:3" \
-H "Accept:application/json" \
-H "Authorization:token ${{secrets.TRAVIS_API_TOKEN}}" \
-d '{"request":{"branch":"master"}}' \
'https://api.travis-ci.com/repo/pisa-engine%2Fdocker/requests'
3 changes: 0 additions & 3 deletions .gitmodules
@@ -1,9 +1,6 @@
[submodule "external/FastPFor"]
path = external/FastPFor
url = https://github.com/lemire/FastPFor.git
[submodule "external/stxxl"]
path = external/stxxl
url = https://github.com/stxxl/stxxl.git
[submodule "external/CMake-codecov"]
path = external/CMake-codecov
url = https://github.com/RWTH-HPC/CMake-codecov.git
90 changes: 0 additions & 90 deletions .travis.yml

This file was deleted.

117 changes: 117 additions & 0 deletions docs/source/experiments/regression-robust04.md
@@ -0,0 +1,117 @@
# PISA: Regression Tests for [Disks 4 & 5](https://trec.nist.gov/data_disks.html) (Robust04)

## Indexing

First, we will create a directory where all the indexes are going to be stored:

```bash
mkdir robust04
```

### Parsing

```bash
gzip -dc $(find /path/to/disk45/ -type f -name '*.*z' \
\( -path '*/disk4/fr94/[0-9]*/*' -o -path '*/disk4/ft/ft*' \
-o -path '*/disk5/fbis/fb*' -o -path '*/disk5/latimes/la*' \)) \
| bin/parse_collection -f trectext -b 10000 --stemmer porter2 --content-parser html -o robust04/fwd
```

You can replace `gzip -dc` with `zcat` on Linux or `gzcat` on macOS.
The directory `/path/to/disk45/` should be the root directory of [TREC Disks 4 & 5](https://trec.nist.gov/data_disks.html).

### Inverting

```bash
/path/to/pisa/build/bin/invert \
-i robust04/fwd \
-o robust04/inv \
-b 400000
```

### Reordering

We apply the [Recursive Graph Bisection (BP) algorithm](https://dl.acm.org/doi/10.1145/2939672.2939862), currently the state of the art for minimizing the compressed space used by an inverted index (or graph) through document reordering.

```bash
/path/to/pisa/build/bin/recursive_graph_bisection \
-c robust04/inv \
-o robust04/inv.bp \
--documents robust04/fwd.doclex \
--reordered-documents \
robust04/fwd.bp.doclex
```
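
At a high level, BP searches for a permutation of document identifiers that minimizes the total log-gap cost of the index, a proxy for the compressed size of the d-gap-encoded posting lists. A sketch of the objective as stated in the BP paper (not necessarily PISA's exact cost function):

```latex
\min_{\pi} \sum_{t \in T} \sum_{i=1}^{|L_t| - 1}
    \log_2 \bigl( \pi(d_{t,i+1}) - \pi(d_{t,i}) \bigr)
```

Here $L_t = (d_{t,1}, d_{t,2}, \dots)$ is the posting list of term $t$ and $\pi$ is the document permutation being optimized.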

### Metadata

To run BM25 queries, we first need to build an additional file containing the information required to compute the score, such as the document lengths. The following command builds a metadata file with a block-max structure using fixed-size blocks of 64 postings:

```bash
/path/to/pisa/build/bin/create_wand_data \
-c robust04/inv.bp \
-b 64 \
-o robust04/inv.bm25.bmw \
-s bm25
```
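
For reference, a common formulation of BM25 (a sketch; PISA's exact variant, e.g. its choice of IDF, may differ) scores a document $d$ against a query $q$ as:

```latex
\mathrm{BM25}(d, q) = \sum_{t \in q} \mathrm{IDF}(t) \cdot
    \frac{f_{t,d} \, (k_1 + 1)}
         {f_{t,d} + k_1 \left( 1 - b + b \, \frac{|d|}{\mathrm{avgdl}} \right)}
```

The document lengths $|d|$ and their average $\mathrm{avgdl}$ are exactly the kind of per-document statistics stored in the metadata file built above.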

### Index Compression

```bash
/path/to/pisa/build/bin/create_freq_index \
-e block_simdbp \
-c robust04/inv.bp \
-o robust04/inv.block_simdbp \
--check
```
## Retrieval

Queries can be downloaded from NIST:
[TREC 2004 Robust Track (Topics 301-450 & 601-700)](http://trec.nist.gov/data/robust/04.testset.gz)

```bash
wget http://trec.nist.gov/data/robust/04.testset.gz
gunzip 04.testset.gz
/path/to/pisa/build/bin/extract_topics -i 04.testset -o topics.robust2004
```
The commands above download the topics from the NIST website, extract the archive, and parse the topics so that the `title`, `desc`, and `narr` fields are written to separate files.

```bash
/path/to/pisa/build/bin/evaluate_queries \
-e block_simdbp \
-a block_max_wand \
-i robust04/inv.block_simdbp \
-w robust04/inv.bm25.bmw \
--stemmer porter2 \
--documents robust04/fwd.bp.doclex \
--terms robust04/fwd.termlex \
-k 1000 \
--scorer bm25 \
-q topics.robust2004.title \
> run.robust2004.bm25.title.robust2004.txt
```

## Evaluation

Qrels can be downloaded from NIST:
[TREC 2004 Robust Track (Topics 301-450 & 601-700)](http://trec.nist.gov/data/robust/qrels.robust2004.txt)

```bash
wget http://trec.nist.gov/data/robust/qrels.robust2004.txt
```

[trec_eval](https://github.com/usnistgov/trec_eval) is the standard tool used by the TREC community for
evaluating an ad-hoc retrieval run, given a results file and a standard set of judged results (qrels).
It must be compiled and installed before the evaluation command below can be run.
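
One way to build and install it from source (a sketch, assuming the standard `Makefile` shipped in the trec_eval repository):

```bash
git clone https://github.com/usnistgov/trec_eval.git
cd trec_eval
make
sudo make install
```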

```bash
trec_eval -m map -m P.30 -m ndcg_cut.20 qrels.robust2004.txt run.robust2004.bm25.title.robust2004.txt
```

With the above commands, you should be able to replicate the following results:

```
map all 0.2543
P_30 all 0.3139
ndcg_cut_20 all 0.4250
```

## Replication Log

+ Results replicated by [@amallia](https://github.com/amallia) on 2020-04-03 (commit [2b0107](https://github.com/pisa-engine/pisa/commit/2b010731e6ea1b45a5f4a7caa9135a76219ed487))
7 changes: 6 additions & 1 deletion docs/source/getting_started.md
@@ -15,7 +15,7 @@ To build the code:
$ cd build
$ cmake .. -DCMAKE_BUILD_TYPE=Release
$ make

## Run unit tests

To run the unit tests simply perform a `make test`.
@@ -24,3 +24,8 @@ The directory `test/test_data` contains a small document collection used in the
unit tests. The binary format of the collection is described in a following
section.
An example set of queries can also be found in `test/test_data/queries`.


## PISA Regression Experiments

+ [Regressions for Disks 4 & 5 (Robust04)](experiments/regression-robust04.html)
18 changes: 0 additions & 18 deletions external/CMakeLists.txt
@@ -11,24 +11,6 @@ target_compile_options(FastPFor PRIVATE -Wno-cast-align)
set(CLI11_TESTING OFF CACHE BOOL "skip trecpp testing")
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/CLI11 EXCLUDE_FROM_ALL)

# stxxl
add_definitions(-DSTXXL_VERBOSE_LEVEL=-10) # suppress messages to stdout
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/stxxl)
# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${STXXL_CXX_FLAGS}")
include_directories(${STXXL_INCLUDE_DIRS})
set_target_properties(stxxl PROPERTIES
CXX_STANDARD 14
)
set_target_properties(${LIBSTXXL_SOURCES} PROPERTIES
CXX_STANDARD 14
)
set_target_properties(stxxl_tool test1 test2 PROPERTIES
CXX_STANDARD 14
)
target_compile_options(stxxl PRIVATE -Wno-deprecated-declarations
)


# Add streamvbyte
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/streamvbyte/include)
add_library(streamvbyte STATIC ${CMAKE_CURRENT_SOURCE_DIR}/streamvbyte/src/streamvbyte.c
1 change: 0 additions & 1 deletion external/stxxl
Submodule stxxl deleted from b9e44f
6 changes: 2 additions & 4 deletions include/pisa/index_types.hpp
@@ -17,7 +17,6 @@
#include "block_freq_index.hpp"

#include "freq_index.hpp"
#include "mixed_block.hpp"
#include "sequence/partitioned_sequence.hpp"
#include "sequence/positive_sequence.hpp"
#include "sequence/uniform_partitioned_sequence.hpp"
@@ -43,14 +42,13 @@ using block_qmx_index = block_freq_index<pisa::qmx_block>;
using block_simple8b_index = block_freq_index<pisa::simple8b_block>;
using block_simple16_index = block_freq_index<pisa::simple16_block>;
using block_simdbp_index = block_freq_index<pisa::simdbp_block>;
using block_mixed_index = block_freq_index<pisa::mixed_block>;

} // namespace pisa

#define PISA_INDEX_TYPES \
(ef)(single)(pefuniform)(pefopt)(block_optpfor)(block_varintg8iu)(block_streamvbyte)( \
block_maskedvbyte)(block_interpolative)(block_qmx)(block_varintgb)(block_simple8b)( \
block_simple16)(block_simdbp)(block_mixed)
block_simple16)(block_simdbp)
#define PISA_BLOCK_INDEX_TYPES \
(block_optpfor)(block_varintg8iu)(block_streamvbyte)(block_maskedvbyte)(block_interpolative)( \
block_qmx)(block_varintgb)(block_simple8b)(block_simple16)(block_simdbp)(block_mixed)
block_qmx)(block_varintgb)(block_simple8b)(block_simple16)(block_simdbp)
254 changes: 0 additions & 254 deletions include/pisa/mixed_block.hpp

This file was deleted.

65 changes: 50 additions & 15 deletions include/pisa/recursive_graph_bisection.hpp
@@ -8,6 +8,7 @@

#include "pstl/algorithm"
#include "pstl/execution"
#include "tbb/enumerable_thread_specific.h"
#include "tbb/task_group.h"

#include "forward_index.hpp"
@@ -23,6 +24,15 @@ const Log2<4096> log2;

namespace bp {

using ThreadLocalGains = tbb::enumerable_thread_specific<single_init_vector<double>>;
using ThreadLocalDegrees = tbb::enumerable_thread_specific<single_init_vector<size_t>>;

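// Per-thread scratch buffers for gain and degree computations, kept in TBB
// thread-local containers so each worker can reuse its buffers across
// partition steps instead of reallocating them.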
struct ThreadLocal {
ThreadLocalGains gains;
ThreadLocalDegrees left_degrees;
ThreadLocalDegrees right_degrees;
};

PISA_ALWAYSINLINE double expb(double logn1, double logn2, size_t deg1, size_t deg2)
{
__m128 _deg = _mm_cvtepi32_ps(_mm_set_epi32(deg1, deg1, deg2, deg2));
@@ -33,6 +43,20 @@ namespace bp {
return a[3] - a[2] + a[1] - a[0]; // Can we do it with SIMD?
};

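// Fetch this thread's instance from a TBB thread-local container, clearing it
// if it already exists and resizing it on first use; this replaces the
// function-static `thread_local` vectors removed below.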
template <typename ThreadLocalContainer>
[[nodiscard]] PISA_ALWAYSINLINE auto&
clear_or_init(ThreadLocalContainer&& container, std::size_t size)
{
bool exists = false;
auto& ref = container.local(exists);
if (exists) {
ref.clear();
} else {
ref.resize(size);
}
return ref;
}

} // namespace bp

template <class Iterator>
@@ -150,13 +174,13 @@ void compute_move_gains_caching(
const std::ptrdiff_t from_n,
const std::ptrdiff_t to_n,
const single_init_vector<size_t>& from_lex,
const single_init_vector<size_t>& to_lex)
const single_init_vector<size_t>& to_lex,
bp::ThreadLocal& thread_local_data)
{
const auto logn1 = log2(from_n);
const auto logn2 = log2(to_n);

thread_local single_init_vector<double> gain_cache(from_lex.size());
gain_cache.clear();
auto& gain_cache = bp::clear_or_init(thread_local_data.gains, from_lex.size());
auto compute_document_gain = [&](auto& d) {
double gain = 0.0;
auto terms = range.terms(d);
@@ -187,12 +211,15 @@ void compute_move_gains_caching(

template <class Iterator, class GainF>
void compute_gains(
document_partition<Iterator>& partition, const degree_map_pair& degrees, GainF gain_function)
document_partition<Iterator>& partition,
const degree_map_pair& degrees,
GainF gain_function,
bp::ThreadLocal& thread_local_data)
{
auto n1 = partition.left.size();
auto n2 = partition.right.size();
gain_function(partition.left, n1, n2, degrees.left, degrees.right);
gain_function(partition.right, n2, n1, degrees.right, degrees.left);
gain_function(partition.left, n1, n2, degrees.left, degrees.right, thread_local_data);
gain_function(partition.right, n2, n1, degrees.right, degrees.left, thread_local_data);
}

template <class Iterator>
@@ -226,18 +253,22 @@ void swap(document_partition<Iterator>& partition, degree_map_pair& degrees)
}

template <class Iterator, class GainF>
void process_partition(document_partition<Iterator>& partition, GainF gain_function, int iterations = 20)
void process_partition(
document_partition<Iterator>& partition,
GainF gain_function,
bp::ThreadLocal& thread_local_data,
int iterations = 20)
{
thread_local single_init_vector<size_t> left_degree(partition.left.term_count());
left_degree.clear();
thread_local single_init_vector<size_t> right_degree(partition.right.term_count());
right_degree.clear();
auto& left_degree =
bp::clear_or_init(thread_local_data.left_degrees, partition.left.term_count());
auto& right_degree =
bp::clear_or_init(thread_local_data.right_degrees, partition.right.term_count());
compute_degrees(partition.left, left_degree);
compute_degrees(partition.right, right_degree);
degree_map_pair degrees{left_degree, right_degree};

for (int iteration = 0; iteration < iterations; ++iteration) {
compute_gains(partition, degrees, gain_function);
compute_gains(partition, degrees, gain_function, thread_local_data);
tbb::parallel_invoke(
[&] {
std::sort(
@@ -261,13 +292,14 @@ template <class Iterator>
void recursive_graph_bisection(
document_range<Iterator> documents, size_t depth, size_t cache_depth, progress& p)
{
bp::ThreadLocal thread_local_data;
std::sort(documents.begin(), documents.end());
auto partition = documents.split();
if (cache_depth >= 1) {
process_partition(partition, compute_move_gains_caching<true, Iterator>);
process_partition(partition, compute_move_gains_caching<true, Iterator>, thread_local_data);
--cache_depth;
} else {
process_partition(partition, compute_move_gains_caching<false, Iterator>);
process_partition(partition, compute_move_gains_caching<false, Iterator>, thread_local_data);
}

p.update(documents.size());
@@ -289,6 +321,7 @@ void recursive_graph_bisection(
template <class Iterator>
void recursive_graph_bisection(std::vector<computation_node<Iterator>> nodes, progress& p)
{
bp::ThreadLocal thread_local_data;
std::sort(nodes.begin(), nodes.end());
auto first = nodes.begin();
auto end = nodes.end();
@@ -297,19 +330,21 @@ void recursive_graph_bisection(std::vector<computation_node<Iterator>> nodes, pr
first, end, [&first](const auto& node) { return node.level > first->level; });
bool last_level = last == end;
tbb::task_group level_group;
std::for_each(first, last, [&level_group, last_level, &p](auto& node) {
std::for_each(first, last, [&thread_local_data, &level_group, last_level, &p](auto& node) {
level_group.run([&]() {
std::sort(node.partition.left.begin(), node.partition.left.end());
std::sort(node.partition.right.begin(), node.partition.right.end());
if (node.cache) {
process_partition(
node.partition,
compute_move_gains_caching<true, Iterator>,
thread_local_data,
node.iteration_count);
} else {
process_partition(
node.partition,
compute_move_gains_caching<false, Iterator>,
thread_local_data,
node.iteration_count);
}
if (last_level) {
2 changes: 1 addition & 1 deletion include/pisa/util/single_init_vector.hpp
@@ -63,4 +63,4 @@ class single_init_vector: public std::vector<single_init_entry<T>> {
struct degree_map_pair {
single_init_vector<size_t>& left;
single_init_vector<size_t>& right;
};
};
999 changes: 999 additions & 0 deletions test/test_data/bp-node-config.txt

Large diffs are not rendered by default.

132 changes: 132 additions & 0 deletions test/test_recursive_graph_bisection.cpp
@@ -0,0 +1,132 @@
#define CATCH_CONFIG_MAIN

#include <catch2/catch.hpp>

#include "pisa/forward_index_builder.hpp"
#include "pisa/invert.hpp"
#include "pisa/reorder_docids.hpp"
#include "pisa/temporary_directory.hpp"
#include "pisa_config.hpp"

using namespace pisa;

using StrColl = std::vector<std::vector<std::pair<std::string, std::uint32_t>>>;

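// Map each posting list to sorted (document title, frequency) pairs via the
// document lexicon, so that two reorderings of the same collection can be
// compared independently of docid assignment.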
[[nodiscard]] auto coll_to_strings(std::string const& coll_file, std::string const& doclex_file)
-> StrColl
{
auto doclex_buf = Payload_Vector_Buffer::from_file(doclex_file);
pisa::Payload_Vector<> doclex(doclex_buf);
pisa::binary_freq_collection coll(coll_file.c_str());
StrColl strcoll;
for (auto posting_list: coll) {
std::vector<std::pair<std::string, std::uint32_t>> pl;
std::transform(
posting_list.docs.begin(),
posting_list.docs.end(),
posting_list.freqs.begin(),
std::back_inserter(pl),
[&doclex](auto&& doc, auto&& freq) {
return std::pair<std::string, std::uint32_t>(doclex[doc], freq);
});
std::sort(pl.begin(), pl.end());
strcoll.push_back(pl);
}
return strcoll;
}

void compare_strcolls(StrColl const& expected, StrColl const& actual)
{
REQUIRE(expected.size() == actual.size());
for (int list_idx = 0; list_idx < expected.size(); list_idx += 1) {
REQUIRE(expected[list_idx].size() == actual[list_idx].size());
for (int posting_idx = 0; posting_idx < expected[list_idx].size(); posting_idx += 1) {
REQUIRE(expected[list_idx][posting_idx].first == actual[list_idx][posting_idx].first);
REQUIRE(expected[list_idx][posting_idx].second == actual[list_idx][posting_idx].second);
}
}
}

TEST_CASE("Reorder documents with BP")
{
Temporary_Directory tmp;

auto next_record = [](std::istream& in) -> std::optional<Document_Record> {
Plaintext_Record record;
if (in >> record) {
return Document_Record(record.trecid(), record.content(), record.url());
}
return std::nullopt;
};
auto id = [] {
return [](std::string&& term) -> std::string { return std::forward<std::string>(term); };
};

auto fwd_path = (tmp.path() / "fwd").string();
auto inv_path = (tmp.path() / "inv").string();
auto bp_fwd_path = (tmp.path() / "fwd.bp").string();
auto bp_inv_path = (tmp.path() / "inv.bp").string();

GIVEN("Built a forward index and inverted")
{
std::string collection_input(PISA_SOURCE_DIR "/test/test_data/clueweb1k.plaintext");
REQUIRE(boost::filesystem::exists(boost::filesystem::path(collection_input)) == true);
int thread_count = 2;
int batch_size = 1000;

std::ifstream is(collection_input);
Forward_Index_Builder builder;
builder.build(
is, fwd_path, next_record, id, parse_plaintext_content, batch_size, thread_count);

pisa::invert::invert_forward_index(fwd_path, inv_path, batch_size, thread_count);

WHEN("Reordered documents with BP")
{
int code = recursive_graph_bisection(RecursiveGraphBisectionOptions{
.input_basename = inv_path,
.output_basename = bp_inv_path,
.output_fwd = std::nullopt,
.input_fwd = std::nullopt,
.document_lexicon = fmt::format("{}.doclex", fwd_path),
.reordered_document_lexicon = fmt::format("{}.doclex", bp_fwd_path),
.depth = std::nullopt,
.node_config = std::nullopt,
.min_length = 0,
.compress_fwd = false,
.print_args = false,
});
REQUIRE(code == 0);
THEN("Both collections are equal when mapped to strings")
{
auto expected = coll_to_strings(inv_path, fmt::format("{}.doclex", fwd_path));
auto actual = coll_to_strings(bp_inv_path, fmt::format("{}.doclex", bp_fwd_path));
compare_strcolls(expected, actual);
}
}

WHEN("Reordered documents with BP node version")
{
int code = recursive_graph_bisection(RecursiveGraphBisectionOptions{
.input_basename = inv_path,
.output_basename = bp_inv_path,
.output_fwd = std::nullopt,
.input_fwd = std::nullopt,
.document_lexicon = fmt::format("{}.doclex", fwd_path),
.reordered_document_lexicon = fmt::format("{}.doclex", bp_fwd_path),
.depth = std::nullopt,
.node_config = PISA_SOURCE_DIR "/test/test_data/bp-node-config.txt",
.min_length = 0,
.compress_fwd = false,
.print_args = false,
});
REQUIRE(code == 0);
THEN("Both collections are equal when mapped to strings")
{
auto expected = coll_to_strings(inv_path, fmt::format("{}.doclex", fwd_path));
auto actual = coll_to_strings(bp_inv_path, fmt::format("{}.doclex", bp_fwd_path));
compare_strcolls(expected, actual);
}
}
}
}
17 changes: 1 addition & 16 deletions tools/CMakeLists.txt
@@ -4,16 +4,6 @@ target_link_libraries(compress_inverted_index
CLI11
)

add_executable(optimal_hybrid_index optimal_hybrid_index.cpp)
target_include_directories(optimal_hybrid_index PRIVATE ${STXXL_INCLUDE_DIRS})
target_link_libraries(optimal_hybrid_index
${STXXL_LIBRARIES}
pisa
)
set_target_properties(optimal_hybrid_index PROPERTIES
CXX_STANDARD 14
)

add_executable(create_wand_data create_wand_data.cpp)
target_link_libraries(create_wand_data
pisa
@@ -43,11 +33,6 @@ target_link_libraries(profile_queries
pisa
)

add_executable(profile_decoding profile_decoding.cpp)
target_link_libraries(profile_decoding
pisa
)

add_executable(evaluate_collection_ordering evaluate_collection_ordering.cpp)
target_link_libraries(evaluate_collection_ordering
pisa
@@ -147,4 +132,4 @@ add_executable(kth_threshold kth_threshold.cpp)
target_link_libraries(kth_threshold
pisa
CLI11
)
)
500 changes: 0 additions & 500 deletions tools/optimal_hybrid_index.cpp

This file was deleted.

124 changes: 0 additions & 124 deletions tools/profile_decoding.cpp

This file was deleted.

2 changes: 2 additions & 0 deletions tools/reorder_docids.hpp
@@ -2,11 +2,13 @@

#include "app.hpp"
#include "pisa/reorder_docids.hpp"
#include "tbb/task_scheduler_init.h"

namespace pisa {

auto reorder_docids(ReorderDocuments args) -> int
{
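// Cap TBB's worker-thread pool at the requested thread count for this run.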
tbb::task_scheduler_init init(args.threads());
try {
if (args.bp()) {
return recursive_graph_bisection(RecursiveGraphBisectionOptions{