Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/main' into upperlower
Browse files Browse the repository at this point in the history
  • Loading branch information
Ami11111 committed Jul 22, 2024
2 parents cf28709 + b982062 commit 0f147e9
Show file tree
Hide file tree
Showing 10 changed files with 330 additions and 76 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ jobs:
- name: Build release version
run: |
sed -i "s/^version = \".*\"/version = \"$(echo $RELEASE_TAG | cut -c2-)\"/" pyproject.toml
sudo docker exec ${BUILDER_CONTAINER} bash -c "git config --global safe.directory \"*\" && cd /infinity && rm -fr cmake-build-release && mkdir -p cmake-build-release && cmake -G Ninja -DCMAKE_BUILD_TYPE=RelWithDebInfo -DARROW_BUILD_SHARED=OFF -DARROW_ENABLE_TIMING_TESTS=OFF -DARROW_GGDB_DEBUG=OFF -DARROW_PARQUET=ON -DARROW_DEPENDENCY_USE_SHARED=OFF -DCPACK_PACKAGE_VERSION=${{ env.RELEASE_TAG }} -DCPACK_DEBIAN_PACKAGE_ARCHITECTURE=amd64 -DCMAKE_JOB_POOLS:STRING='link=1' -S /infinity -B /infinity/cmake-build-release && cmake --build /infinity/cmake-build-release --target infinity"
sudo docker exec ${BUILDER_CONTAINER} bash -c "git config --global safe.directory \"*\" && cd /infinity && rm -fr cmake-build-release && mkdir -p cmake-build-release && cmake -G Ninja -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCPACK_PACKAGE_VERSION=${{ env.RELEASE_TAG }} -DCPACK_DEBIAN_PACKAGE_ARCHITECTURE=amd64 -DCMAKE_JOB_POOLS:STRING='link=1' -S /infinity -B /infinity/cmake-build-release && cmake --build /infinity/cmake-build-release --target infinity"
- name: Download resources
run: rm -rf resource && git clone --depth=1 https://github.com/infiniflow/resource.git
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/slow_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ jobs:
- name: Build release version
if: ${{ !cancelled() && !failure() }}
run: sudo docker exec ${BUILDER_CONTAINER} bash -c "git config --global safe.directory \"*\" && cd /infinity && rm -fr cmake-build-release && mkdir -p cmake-build-release && cmake -G Ninja -DCMAKE_BUILD_TYPE=RelWithDebInfo -DARROW_BUILD_SHARED=OFF -DARROW_ENABLE_TIMING_TESTS=OFF -DARROW_GGDB_DEBUG=OFF -DARROW_PARQUET=ON -DARROW_DEPENDENCY_USE_SHARED=OFF -DCMAKE_JOB_POOLS:STRING=link=4 -S /infinity -B /infinity/cmake-build-release && cmake --build /infinity/cmake-build-release"
run: sudo docker exec ${BUILDER_CONTAINER} bash -c "git config --global safe.directory \"*\" && cd /infinity && rm -fr cmake-build-release && mkdir -p cmake-build-release && cmake -G Ninja -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_JOB_POOLS:STRING=link=4 -S /infinity -B /infinity/cmake-build-release && cmake --build /infinity/cmake-build-release"

- name: Download resources
run: rm -rf resource && git clone --depth=1 https://github.com/infiniflow/resource.git
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ jobs:
- name: Build debug version
if: ${{ !cancelled() && !failure() }}
run: sudo docker exec ${BUILDER_CONTAINER} bash -c "git config --global safe.directory \"*\" && cd /infinity && rm -fr cmake-build-debug && mkdir -p cmake-build-debug && cmake -G Ninja -DCMAKE_BUILD_TYPE=Debug -DARROW_BUILD_SHARED=OFF -DARROW_ENABLE_TIMING_TESTS=OFF -DARROW_GGDB_DEBUG=OFF -DARROW_PARQUET=ON -DARROW_DEPENDENCY_USE_SHARED=OFF -DENABLE_JEMALLOC=OFF -DCMAKE_JOB_POOLS:STRING=link=4 -S /infinity -B /infinity/cmake-build-debug && cmake --build /infinity/cmake-build-debug --target infinity test_main"
run: sudo docker exec ${BUILDER_CONTAINER} bash -c "git config --global safe.directory \"*\" && cd /infinity && rm -fr cmake-build-debug && mkdir -p cmake-build-debug && cmake -G Ninja -DCMAKE_BUILD_TYPE=Debug -DCMAKE_JOB_POOLS:STRING=link=4 -S /infinity -B /infinity/cmake-build-debug && cmake --build /infinity/cmake-build-debug --target infinity test_main"

- name: Unit test debug version
if: ${{ !cancelled() && !failure() }}
Expand Down Expand Up @@ -146,7 +146,7 @@ jobs:
- name: Build release version
if: ${{ !cancelled() && !failure() }}
run: sudo docker exec ${BUILDER_CONTAINER} bash -c "git config --global safe.directory \"*\" && cd /infinity && rm -fr cmake-build-release && mkdir -p cmake-build-release && cmake -G Ninja -DCMAKE_BUILD_TYPE=RelWithDebInfo -DARROW_BUILD_SHARED=OFF -DARROW_ENABLE_TIMING_TESTS=OFF -DARROW_GGDB_DEBUG=OFF -DARROW_PARQUET=ON -DARROW_DEPENDENCY_USE_SHARED=OFF -DENABLE_JEMALLOC=OFF -DCMAKE_JOB_POOLS:STRING=link=4 -S /infinity -B /infinity/cmake-build-release && cmake --build /infinity/cmake-build-release --target infinity test_main knn_import_benchmark knn_query_benchmark"
run: sudo docker exec ${BUILDER_CONTAINER} bash -c "git config --global safe.directory \"*\" && cd /infinity && rm -fr cmake-build-release && mkdir -p cmake-build-release && cmake -G Ninja -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_JOB_POOLS:STRING=link=4 -S /infinity -B /infinity/cmake-build-release && cmake --build /infinity/cmake-build-release --target infinity test_main knn_import_benchmark knn_query_benchmark"

- name: Unit test release version
if: ${{ !cancelled() && !failure() }}
Expand Down
6 changes: 3 additions & 3 deletions docs/getstarted/build_from_source.md
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ git config --global --add safe.directory infinity
cd infinity && mkdir cmake-build-debug && cd cmake-build-debug
export CC=/usr/bin/clang-18
export CXX=/usr/bin/clang++-18
cmake -G Ninja -DCMAKE_BUILD_TYPE=Debug -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_VERBOSE_MAKEFILE=ON -DARROW_BUILD_SHARED=OFF -DARROW_ENABLE_TIMING_TESTS=OFF -DARROW_GGDB_DEBUG=OFF -DARROW_PARQUET=ON -DENABLE_JEMALLOC=OFF ..
cmake -G Ninja -DCMAKE_BUILD_TYPE=Debug -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_VERBOSE_MAKEFILE=ON ..
cmake --build .
```

Expand Down Expand Up @@ -151,7 +151,7 @@ git config --global --add safe.directory infinity
cd infinity && mkdir cmake-build-debug && cd cmake-build-debug
export CC=/usr/bin/clang-18
export CXX=/usr/bin/clang++-18
cmake -G Ninja -DCMAKE_BUILD_TYPE=Debug -DCMAKE_VERBOSE_MAKEFILE=ON -DARROW_BUILD_SHARED=OFF -DARROW_ENABLE_TIMING_TESTS=OFF -DARROW_GGDB_DEBUG=OFF -DARROW_PARQUET=ON -DENABLE_JEMALLOC=OFF ..
cmake -G Ninja -DCMAKE_BUILD_TYPE=Debug -DCMAKE_VERBOSE_MAKEFILE=ON ..
cmake --build .
```

Expand Down Expand Up @@ -196,7 +196,7 @@ git config --global --add safe.directory infinity
cd infinity && mkdir cmake-build-debug && cd cmake-build-debug
export CC=/usr/bin/clang-18
export CXX=/usr/bin/clang++-18
cmake -G Ninja -DCMAKE_BUILD_TYPE=Debug -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_VERBOSE_MAKEFILE=ON -DARROW_BUILD_SHARED=OFF -DARROW_ENABLE_TIMING_TESTS=OFF -DARROW_GGDB_DEBUG=OFF -DARROW_PARQUET=ON -DENABLE_JEMALLOC=OFF ..
cmake -G Ninja -DCMAKE_BUILD_TYPE=Debug -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_VERBOSE_MAKEFILE=ON ..
cmake --build .
```

Expand Down
160 changes: 133 additions & 27 deletions python/benchmark/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ Today there are many open-source vector databases and search engines, each offer
- GIST(1M, 960 dimensions)
- SIFT(1M, 128 dimensions)
- Cohere(1M/10M, 768 dimensions)
- Sparse Vector Search
- SPLADE(1M)
- Full-text Search
- ES benchmark(rally-tracks)
- Wikipedia(~21GB)
Expand All @@ -38,40 +40,144 @@ Today there are many open-source vector databases and search engines, each offer
- dbpedia(1M, 1536 dimensions)

## How to run the benchmark?
### Options

```
Usage: run_benchmark.py [OPTIONS]
Usage: run.py [OPTIONS]
Options:
--engines
all(default), qdrant, elasticSearch, infinity
--hardware
8c_16g(default)
4c_32g
--mode
test mode: all(default)
gist,
sift,
cohere_1M,
cohere_10M,
geonames,
pmc
--limit-ram
25Gb(default)
--limit-cpu
8(default), 1~your cpu core
--import
perform import data operation
--generate
whether to generate a query
--generate-query-num
1(default)
--generate-term-num
4(default)
default=False
     whether to generate a fulltext query set based on the dataset
--import
default=False
     whether to import the dataset into the database engine
--query
perform query operation
default=0
Run the query set only once using given number of clients with recording the result and latency. This is for result validation and latency analysis.
--query-express
default=0
Run the query set randomly using given number of clients without recording the result and latency. This is for QPS measurement.
--concurrency
default="mp"
Choose concurrency mechanism, one of: mp - multiprocessing(recommended), mt - multithreading.
--engine
default="infinity"
Choose database engine to benchmark:
infinity,
qdrant,
elasticsearch,
quickwit
--dataset
default="enwiki"
Choose dataset to benchmark:
gist,
sift,
geonames,
enwiki,
tantivy,
splade
--help
show this message.
```
*Notice that the values supplied to the [--query] and [--query-express] options are not the number of rounds over the query set or the number of queries. Instead, they indicate the number of clients/processes/threads.*

### Configurations
The mode configurations are saved as JSON files in the "configs" folder. The mode configuration is for setting up the benchmark being used, which includes the dataset, index parameters, queries, query parameters, etc. Users can customize these configurations, saving them as JSON files, for example, "elasticsearch_sift.json".
For example, if you want to run a qdrant benchmark on sift data set, with:
- Euclidean distance to measure similarities among vectors,
- HNSW index type with M = 16, ef_construct = 200,
- top 100 most similar results,
- precision of the results
```
qdrant_sift.json
{
"name": "qdrant_sift",
"app": "qdrant",
"app_path": "servers/qdrant/",
"connection_url": "http://localhost:6333",
"vector_size": 128,
"distance": "L2",
"mode": "vector",
"data_path": "datasets/sift/sift-128-euclidean.hdf5",
"data_link": "http://ann-benchmarks.com/sift-128-euclidean.hdf5",
"vector_name": "embeddings",
"topK": 100,
"vector_index": {
"type": "HNSW",
"index_params":{
"M": 16,
"ef_construct": 200
}
},
"query_type": "json",
"query_path": "datasets/sift/sift-128-euclidean.hdf5",
"query_link": "http://ann-benchmarks.com/sift-128-euclidean.hdf5",
"insert_batch_size": 8192,
"ground_truth_path": "datasets/sift/sift-128-euclidean.hdf5"
}
```

The hardware and mode configurations are saved as JSON files in the "configs" folder. The hardware configuration is for setting up the database server's hardware parameters, while the mode configuration is for setting up the benchmark being used, which includes the dataset, index parameters, queries, and query parameters. Users can customize these configurations, saving them as JSON files, for example, "elasticsearch_sift.json". To run a test, simply use `--engines elasticsearch --mode sift`.
Or, if you want to run an infinity benchmark on splade data set, with:
- IP to measure similarities among vectors,
- BMP index type with block_size = 16, compress_type = compress,
- alpha = 0.92, beta = 0.8,
- top 10 most similar results,
- precision of the results
```
infinity_splade.json
{
"name": "infinity_splade",
"app": "infinity",
"host": "127.0.0.1:23817",
"data_path": "datasets/SPLADE/base_1M.csr",
"use_import": "sparse_vector_csr",
"topK": 10,
"mode": "sparse_vector",
"schema": {
"col1": {"type": "sparse,30109,float,int16"}
},
"vector_name": "embeddings",
"metric_type": "ip",
"index": {
"col1": {
"type": "BMP",
"params": {
"block_size":8,
"compress_type":"compress"
}
}
},
"alpha": 0.92,
"beta": 0.8,
"query_path": "datasets/SPLADE/queries.dev.csr",
"batch_size": 8192,
"ground_truth_path": "datasets/SPLADE/base_1M.dev.gt"
}
```

### Steps to run a benchmark

#### Step 1: Prepare the data set
Download the data set for import, the query set for querying, and — if you want to validate the results and calculate precision — the ground truth set, and save them in the "datasets" folder. The benchmark framework also supports automatically downloading a data set if you provide its URL in the "data_link" field of your configuration file.
#### Step 2: Customize your configuration files
Customize your configuration files for the specific benchmark you want to run, just like the previous configuration section did. Your configuration should also include the correct data path where you saved the data set, and it should be named in the pattern "\<engine_name\>_\<dataset_name\>.json".
For example, if you want to run an infinity benchmark on splade data set, then the configuration file name should be "infinity_splade.json".
#### Step 3: Run the database engine
Download and run the database engine, make sure it is accessible when you run the benchmark.
#### Step 4: Import the data set
Import the data set into database engine using the option [--import].
For example, if you want to import splade data set into qdrant, then you shall use:
```commandline
python3 run.py --engine qdrant --dataset splade --import
```
#### Step 5: Run the query set
Finally, you could run the query set using the option [--query] or [--query-express].
For example, if you want to run a qdrant benchmark on splade data set with single process for result validation and latency analysis, then you shall use:
```commandline
python3 run.py --engine qdrant --dataset splade --query 1
```
Or, if you want to run an infinity benchmark on sift data set with 16 processes for QPS measurement, then you shall use:
```commandline
python3 run.py --engine infinity --dataset sift --query-express 16
```
4 changes: 2 additions & 2 deletions python/benchmark/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from generate_queries import generate_query_txt

ENGINES = ["infinity", "qdrant", "elasticsearch", "quickwit"]
DATA_SETS = ["gist", "sift", "geonames", "enwiki", "tantivy"]
DATA_SETS = ["gist", "sift", "geonames", "enwiki", "tantivy", "splade"]


def parse_args() -> argparse.Namespace:
Expand All @@ -35,7 +35,7 @@ def parse_args() -> argparse.Namespace:
type=int,
default=0,
dest="query",
help="Run the query set only once using given number of clients with recording the result and latency. This is for result validation and latency analysis",
help="Run the query set only once using given number of clients with recording the result and latency. This is for result validation and latency analysis.",
)
parser.add_argument(
"--query-express",
Expand Down
11 changes: 11 additions & 0 deletions python/test/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,17 @@
# limitations under the License.
import os
from shutil import copyfile
import sys

current_path = os.path.abspath(os.path.dirname(__file__))
parent = os.path.join(os.getcwd(), os.pardir)
pparent = os.path.join(parent, os.pardir)
local_infinity_path = os.path.abspath(pparent)

print(current_path, local_infinity_path)

if local_infinity_path in sys.path:
sys.path.remove(local_infinity_path)

import infinity
import pytest
Expand Down
Loading

0 comments on commit 0f147e9

Please sign in to comment.