predication test #3

Draft
wants to merge 37 commits into base: main

37 commits
52ef66d
found way to get row data as chunk during build
gropaul Feb 13, 2024
1662a5a
linear insertion now working
gropaul Feb 13, 2024
c78531b
linear probing is now also done during the probe phase
gropaul Feb 16, 2024
f0f43d1
fixed bug when probe side contains null values
gropaul Feb 19, 2024
dd2073b
added parallel insertion
gropaul Feb 19, 2024
5e68c55
bug fix: properly handle null values during probing
gropaul Feb 20, 2024
ada1356
bug fix: Only chain based on equi conditions
gropaul Feb 20, 2024
cff2c19
bug fix: column types misaligned in row matcher for column filter
gropaul Feb 20, 2024
cfc12dd
bug fix: parallel insertion fixed by properly using atomics
gropaul Feb 21, 2024
9ef8fc1
Merge branch 'main' into adapt-hash-join
gropaul Feb 21, 2024
f029c97
bug fix: added default constructor for aggr_ht_entry_t for atomic usage
gropaul Feb 21, 2024
4f643fb
added thread local state for reduced allocations
gropaul Feb 22, 2024
ff2f9d5
added thread local state for reduced allocations
gropaul Feb 22, 2024
4e5faa9
first try
gropaul Feb 27, 2024
fba2ab1
improvement: increasing until match in the end
gropaul Feb 27, 2024
a16f4d9
improvement: only process key-comparison rows in the loop
gropaul Feb 27, 2024
58736cd
improvement: cache ht entries
gropaul Feb 27, 2024
8182be7
removed caching but kept loop optimization
gropaul Feb 28, 2024
746982d
optimization: simplified probing logic
gropaul Feb 28, 2024
0da5369
optimization: simplified probing logic
gropaul Feb 28, 2024
c4bfd72
Merge remote-tracking branch 'origin/adapt-hash-join-predication' int…
gropaul Feb 28, 2024
1b192a8
improvement: selecting non empty rows during probing
gropaul Feb 29, 2024
457d8ef
improvement: hacky allocation mitigations
gropaul Feb 29, 2024
f742018
added some debug prints
gropaul Feb 29, 2024
040ecc0
Merge remote-tracking branch 'origin/adapt-hash-join-predication' int…
gropaul Feb 29, 2024
25fb23c
bugfix: validity mask drag
gropaul Mar 4, 2024
9507dd5
bugfix: row index neglected for constant hash vectors
gropaul Mar 4, 2024
d281127
bugfix: different keys in same chain because of simultaneous insertion
gropaul Mar 4, 2024
f7c9b47
formatting
gropaul Mar 4, 2024
dd56278
Merge branch 'main' into adapt-hash-join-predication
gropaul Mar 4, 2024
ed9f6a0
tmp commit
gropaul Mar 5, 2024
2eaf0dc
Merge branch 'main' into adapt-hash-join-predication
gropaul Mar 5, 2024
6abb903
all tests from make unittest run locally
gropaul Mar 5, 2024
4a2fe1d
bugfix: insert into empty
gropaul Mar 5, 2024
08aaa6a
improvements: smaller improvements during probing
gropaul Mar 6, 2024
159a28d
improvements: insertion now similar to probing
gropaul Mar 6, 2024
752d8a5
test: really strict predication approach
gropaul Mar 6, 2024
3 changes: 3 additions & 0 deletions .github/regression/join.csv
@@ -0,0 +1,3 @@
benchmark/micro/join/asof_join.benchmark
benchmark/micro/join/blockwise_nl_join.benchmark
benchmark/micro/join/range_join_small_rhs.benchmark
117 changes: 117 additions & 0 deletions scripts/local_regression_test_runner.py
@@ -0,0 +1,117 @@
import subprocess
import os
import sys

# Builds the benchmark runner of the current and the main branch and runs the benchmarks.


DEFAULT_RUNNER_PATH = "build/release/benchmark/benchmark_runner"  # this is what gets built by default
NEW_RUNNER_PATH = "build/release/benchmark/benchmark_runner_new"  # from local branch
OLD_RUNNER_PATH = "build/release/benchmark/benchmark_runner_old"  # from main branch


def build(stash_changes: bool = False):
    original_branch = get_current_branch()

    # Execute git status with the --porcelain option
    output = subprocess.check_output(['git', 'status', '--porcelain']).decode('utf-8')

    # Filter out empty lines and lines that start with "??" (untracked files)
    changes = [line for line in output.strip().split('\n') if line and not line.startswith('??')]
    auto_stashed = False
    if changes and stash_changes:
        print("Stashing changes")
        subprocess.check_output(['git', 'stash'])
        auto_stashed = True
    elif changes:
        print("There are uncommitted changes. Please commit or stash them, or use --stash to stash them automatically")
        exit(1)

    # checkout the main branch and build the runner
    subprocess.check_output(['git', 'checkout', 'main'])
    print("Building runner on main branch...")
    build_runner()
    subprocess.check_output(['cp', DEFAULT_RUNNER_PATH, OLD_RUNNER_PATH])

    # checkout the original branch and build the runner
    subprocess.check_output(['git', 'checkout', original_branch])

    if auto_stashed:
        print("Unstashing changes")
        subprocess.check_output(['git', 'stash', 'pop'])

    print(f"Building runner on branch {original_branch}...")
    build_runner()
    subprocess.check_output(['cp', DEFAULT_RUNNER_PATH, NEW_RUNNER_PATH])


def get_current_branch():
    return subprocess.check_output(['git', 'rev-parse', '--abbrev-ref', 'HEAD']).decode('utf-8').strip()


def build_runner():
    # set the env variables that enable the benchmark, TPC-H, and httpfs builds
    env = {"BUILD_BENCHMARK": "1", "BUILD_TPCH": "1", "BUILD_HTTPFS": "1"}
    # Add the current environment
    env.update(os.environ)
    subprocess.run(["make"], env=env, check=True)


def run_benchmark(old_runner, new_runner, benchmark_file):
    """Delegates to scripts/regression_test_runner.py with the given runners and benchmark list."""

    if not os.path.isfile(old_runner):
        print(f"Failed to find old runner {old_runner}")
        exit(1)

    if not os.path.isfile(new_runner):
        print(f"Failed to find new runner {new_runner}")
        exit(1)

    command = [
        'python3',
        'scripts/regression_test_runner.py',
        f'--old={old_runner}',
        f'--new={new_runner}',
        f'--benchmarks={benchmark_file}',
        '--threads=4',
    ]

    print(f"Running command: {' '.join(command)}")

    # start the existing runner, making sure its output goes to the console
    subprocess.run(command, check=True)


def main():
    benchmark_file = None
    stash_changes = False
    for arg in sys.argv[1:]:
        if arg.startswith("--benchmarks="):
            benchmark_file = arg.replace("--benchmarks=", "")
        elif arg == "--stash":
            stash_changes = True
        elif arg == "--help":
            print("Expected usage: python3 scripts/local_regression_test_runner.py --benchmarks=/benchmark/list.csv")
            print("Optional: --stash: Stash changes before running the benchmarks")
            exit(0)

    # make sure that we are in the root directory of the project
    if not os.path.isfile("scripts/local_regression_test_runner.py"):
        print("Please run this script from the root directory of the project")
        exit(1)

    if benchmark_file is None:
        print("Expected usage: python3 scripts/local_regression_test_runner.py --benchmarks=.github/regression/imdb.csv")
        exit(1)

    build(stash_changes)
    run_benchmark(OLD_RUNNER_PATH, NEW_RUNNER_PATH, benchmark_file)


if __name__ == "__main__":
    main()
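Example invocation, assuming a checkout at the repository root and the benchmark list added in this PR:

    python3 scripts/local_regression_test_runner.py --benchmarks=.github/regression/join.csv --stash

The script builds the benchmark runner twice, once on main and once on the current branch (stashing local changes first), and then hands both binaries to scripts/regression_test_runner.py for the actual comparison.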
35 changes: 33 additions & 2 deletions src/common/row_operations/row_matcher.cpp
@@ -197,12 +197,43 @@ void RowMatcher::Initialize(const bool no_match_sel, const TupleDataLayout &layout
    }
}

void RowMatcher::Initialize(const bool no_match_sel, const TupleDataLayout &layout, const Predicates &predicates,
                            vector<column_t> &column_ids_p) {

    // The column_ids must have the same size as the predicates vector
    D_ASSERT(column_ids_p.size() == predicates.size());

    // The largest column_id must be smaller than the number of columns in the layout
    D_ASSERT(*max_element(column_ids_p.begin(), column_ids_p.end()) < layout.ColumnCount());

    column_ids = make_uniq<vector<column_t>>(column_ids_p);

    match_functions.reserve(predicates.size());
    for (idx_t idx = 0; idx < predicates.size(); idx++) {
        column_t col_idx = (*column_ids)[idx];
        match_functions.push_back(GetMatchFunction(no_match_sel, layout.GetTypes()[col_idx], predicates[idx]));
    }
}

idx_t RowMatcher::Match(DataChunk &lhs, const vector<TupleDataVectorFormat> &lhs_formats, SelectionVector &sel,
                        idx_t count, const TupleDataLayout &rhs_layout, Vector &rhs_row_locations,
                        SelectionVector *no_match_sel, idx_t &no_match_count) {
    D_ASSERT(!match_functions.empty());

    if (column_ids) {
        // The column_ids must have the same size as the match_functions vector
        D_ASSERT(column_ids->size() == match_functions.size());

        // The largest column_id must be smaller than the number of columns in the lhs
        D_ASSERT(*max_element(column_ids->begin(), column_ids->end()) < lhs.ColumnCount());
    }

    for (idx_t fun_idx = 0; fun_idx < match_functions.size(); fun_idx++) {
        // if we only care about specific columns, we need to use the column_ids to get the correct column index;
        // otherwise, we just use fun_idx
        const auto col_idx = column_ids ? (*column_ids)[fun_idx] : fun_idx;

        const auto &match_function = match_functions[fun_idx];
        count =
            match_function.function(lhs.data[col_idx], lhs_formats[col_idx], sel, count, rhs_layout, rhs_row_locations,
                                    col_idx, match_function.child_functions, no_match_sel, no_match_count);
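For context, here is a minimal usage sketch of the new column-aware overloads. It is a sketch only: layout, lhs, lhs_formats, sel, count, rhs_row_locations, and no_match_sel are assumed to be set up as in the existing join code, and Predicates is assumed to be a vector of comparison ExpressionTypes.

    // Compare only layout columns 0 and 2 (e.g. the equi-join keys),
    // leaving the remaining payload columns out of the match.
    vector<column_t> column_ids {0, 2};
    Predicates predicates {ExpressionType::COMPARE_EQUAL, ExpressionType::COMPARE_EQUAL};

    RowMatcher matcher;
    matcher.Initialize(true, layout, predicates, column_ids);

    // lhs.data[0] and lhs.data[2] are compared against columns 0 and 2 of the
    // rows pointed to by rhs_row_locations; rows that fail are collected in no_match_sel.
    idx_t no_match_count = 0;
    count = matcher.Match(lhs, lhs_formats, sel, count, layout, rhs_row_locations, &no_match_sel, no_match_count);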
11 changes: 11 additions & 0 deletions src/common/types/row/tuple_data_collection.cpp
@@ -405,6 +405,17 @@ void TupleDataCollection::InitializeChunk(DataChunk &chunk) const {
    chunk.Initialize(allocator->GetAllocator(), layout.GetTypes());
}

void TupleDataCollection::InitializeChunk(DataChunk &chunk, const vector<column_t> &column_ids) const {
    vector<LogicalType> chunk_types(column_ids.size());
    // keep the order of the columns
    for (idx_t i = 0; i < column_ids.size(); i++) {
        auto column_idx = column_ids[i];
        D_ASSERT(column_idx < layout.ColumnCount());
        chunk_types[i] = layout.GetTypes()[column_idx];
    }
    chunk.Initialize(allocator->GetAllocator(), chunk_types);
}

void TupleDataCollection::InitializeScanChunk(TupleDataScanState &state, DataChunk &chunk) const {
    auto &column_ids = state.chunk_state.column_ids;
    D_ASSERT(!column_ids.empty());
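A minimal sketch of how the new overload can be used (collection here is a hypothetical TupleDataCollection whose layout has at least three columns):

    // Initialize a chunk that holds only layout columns 2 and 0, in that order.
    vector<column_t> column_ids {2, 0};
    DataChunk chunk;
    collection.InitializeChunk(chunk, column_ids);
    // chunk.data[0] now has the type of layout column 2, chunk.data[1] that of layout column 0.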