From 5cb6a0e048e0575c2d6e10f702ed24cf56d3a272 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Tue, 7 Jan 2025 12:35:36 +0100 Subject: [PATCH 01/24] Backup in the middle. Signed-off-by: Johannes Kalmbach --- src/engine/CMakeLists.txt | 2 +- src/engine/ExistsScan.cpp | 118 ++++++++++++++++++ src/engine/ExistsScan.h | 55 ++++++++ src/engine/GroupBy.cpp | 22 +--- .../sparqlExpressions/ExistsExpression.cpp | 5 + .../sparqlExpressions/ExistsExpression.h | 39 ++++++ .../sparqlExpressions/SparqlExpression.cpp | 14 +++ .../sparqlExpressions/SparqlExpression.h | 10 ++ 8 files changed, 248 insertions(+), 17 deletions(-) create mode 100644 src/engine/ExistsScan.cpp create mode 100644 src/engine/ExistsScan.h create mode 100644 src/engine/sparqlExpressions/ExistsExpression.cpp create mode 100644 src/engine/sparqlExpressions/ExistsExpression.h diff --git a/src/engine/CMakeLists.txt b/src/engine/CMakeLists.txt index be22a64d5d..c724a8fb39 100644 --- a/src/engine/CMakeLists.txt +++ b/src/engine/CMakeLists.txt @@ -14,5 +14,5 @@ add_library(engine CartesianProductJoin.cpp TextIndexScanForWord.cpp TextIndexScanForEntity.cpp TextLimit.cpp LazyGroupBy.cpp GroupByHashMapOptimization.cpp SpatialJoin.cpp CountConnectedSubgraphs.cpp SpatialJoinAlgorithms.cpp PathSearch.cpp ExecuteUpdate.cpp - Describe.cpp) + Describe.cpp ExistsScan.cpp) qlever_target_link_libraries(engine util index parser sparqlExpressions http SortPerformanceEstimator Boost::iostreams s2) diff --git a/src/engine/ExistsScan.cpp b/src/engine/ExistsScan.cpp new file mode 100644 index 0000000000..f42da68f3d --- /dev/null +++ b/src/engine/ExistsScan.cpp @@ -0,0 +1,118 @@ +// Copyright 2023, University of Freiburg, +// Chair of Algorithms and Data Structures. 
+// Author: Johannes Kalmbach + +#include "engine/ExistsScan.h" + +#include "util/JoinAlgorithms/JoinAlgorithms.h" + +// _____________________________________________________________________________ +ExistsScan::ExistsScan(QueryExecutionContext* qec, + std::shared_ptr left, + std::shared_ptr right, + Variable existsVariable) + : Operation{qec}, + left_{std::move(left)}, + right_{std::move(right)}, + existsVariable_{std::move(existsVariable)}, + joinColumns_{QueryExecutionTree::getJoinColumns(*left_, *right_)} {} + +// _____________________________________________________________________________ +string ExistsScan::getCacheKeyImpl() const { + return absl::StrCat("EXISTS SCAN left: ", left_->getCacheKey(), + " right: ", right_->getCacheKey()); +} + +// _____________________________________________________________________________ +string ExistsScan::getDescriptor() const { return "EXISTS scan"; } + +// ____________________________________________________________________________ +VariableToColumnMap ExistsScan::computeVariableToColumnMap() const { + auto res = left_->getVariableColumns(); + AD_CONTRACT_CHECK( + !res.contains(existsVariable_), + "The target variable of an exists scan must be a new variable"); + res[existsVariable_] = makeAlwaysDefinedColumn(getResultWidth() - 1); + return res; +} + +// ____________________________________________________________________________ +size_t ExistsScan::getResultWidth() const { + // We add one column to the input. 
+ return left_->getResultWidth() + 1; +} + +// ____________________________________________________________________________ +vector ExistsScan::resultSortedOn() const { + return left_->resultSortedOn(); +} + +// ____________________________________________________________________________ +float ExistsScan::getMultiplicity(size_t col) { + if (col < getResultWidth() - 1) { + return left_->getMultiplicity(col); + } + // The multiplicity of the boolean column can be a dummy value, as it should + // be never used for joins etc. + return 1; +} + +// ____________________________________________________________________________ +uint64_t ExistsScan::getSizeEstimateBeforeLimit() { + return left_->getSizeEstimate(); +} + +// ____________________________________________________________________________ +size_t ExistsScan::getCostEstimate() { + return left_->getCostEstimate() + right_->getCostEstimate() + + left_->getSizeEstimate() + right_->getSizeEstimate(); +} + +// ____________________________________________________________________________ +ProtoResult ExistsScan::computeResult([[maybe_unused]] bool requestLaziness) { + auto leftRes = left_->getResult(); + auto rightRes = right_->getResult(); + const auto& left = leftRes->idTable(); + const auto& right = rightRes->idTable(); + + ad_utility::JoinColumnMapping joinColumnData{joinColumns_, left.numColumns(), + right.numColumns()}; + + IdTableView<0> joinColumnsLeft = + left.asColumnSubsetView(joinColumnData.jcsLeft()); + IdTableView<0> joinColumnsRight = + right.asColumnSubsetView(joinColumnData.jcsRight()); + + checkCancellation(); + + auto noopRowAdder = [](auto&&...) {}; + + // TODO Memory limit. 
+ std::vector notExistsIndices; + auto actionForNotExisting = + [¬ExistsIndices, begin = joinColumnsLeft.begin()](const auto& itLeft) { + notExistsIndices.push_back(itLeft - begin); + }; + + // TODO Handle UNDEF values correctly (and efficiently) + auto findUndefDispatch = [](const auto& row, It begin, auto end, + bool& outOfOrder) { + return std::array{}; + }; + + auto checkCancellationLambda = [this] { checkCancellation(); }; + [[maybe_unused]] auto numOutOfOrder = ad_utility::zipperJoinWithUndef( + joinColumnsLeft, joinColumnsRight, ql::ranges::lexicographical_compare, + noopRowAdder, findUndefDispatch, findUndefDispatch, actionForNotExisting, + checkCancellationLambda); + + // Set up the result; + IdTable result = left.clone(); + result.addEmptyColumn(); + decltype(auto) existsCol = result.getColumn(getResultWidth() - 1); + ql::ranges::fill(existsCol, Id::makeFromBool(true)); + for (size_t notExistsIndex : notExistsIndices) { + existsCol[notExistsIndex] = Id::makeFromBool(false); + } + return {std::move(result), resultSortedOn(), leftRes->getCopyOfLocalVocab()}; +} diff --git a/src/engine/ExistsScan.h b/src/engine/ExistsScan.h new file mode 100644 index 0000000000..b08e06c542 --- /dev/null +++ b/src/engine/ExistsScan.h @@ -0,0 +1,55 @@ +// Copyright 2025, University of Freiburg, +// Chair of Algorithms and Data Structures. 
+// Author: Johannes Kalmbach + +#pragma once + +#include "engine/Operation.h" +#include "engine/QueryExecutionTree.h" + +class ExistsScan : public Operation { + private: + std::shared_ptr left_; + std::shared_ptr right_; + std::vector> joinColumns_; + + Variable existsVariable_; + + vector _multiplicities; + std::vector> _matchedColumns; + + public: + ExistsScan(QueryExecutionContext* qec, + std::shared_ptr left, + std::shared_ptr right, + Variable existsVariable); + + protected: + string getCacheKeyImpl() const override; + + public: + string getDescriptor() const override; + + size_t getResultWidth() const override; + + vector resultSortedOn() const override; + + bool knownEmptyResult() override { return left_->knownEmptyResult(); } + + float getMultiplicity(size_t col) override; + + private: + uint64_t getSizeEstimateBeforeLimit() override; + + public: + size_t getCostEstimate() override; + + vector getChildren() override { + return {left_.get(), right_.get()}; + } + + private: + ProtoResult computeResult([[maybe_unused]] bool requestLaziness) override; + + VariableToColumnMap computeVariableToColumnMap() const override; +}; diff --git a/src/engine/GroupBy.cpp b/src/engine/GroupBy.cpp index 6fdeca1833..a6ff49bbe1 100644 --- a/src/engine/GroupBy.cpp +++ b/src/engine/GroupBy.cpp @@ -366,8 +366,6 @@ ProtoResult GroupBy::computeResult(bool requestLaziness) { } if (useHashMapOptimization) { - // Helper lambda that calls `computeGroupByForHashMapOptimization` for the - // given `subresults`. auto computeWithHashMap = [this, &metadataForUnsequentialData, &groupByCols](auto&& subresults) { auto doCompute = [&] { @@ -378,10 +376,9 @@ ProtoResult GroupBy::computeResult(bool requestLaziness) { return ad_utility::callFixedSize(groupByCols.size(), doCompute); }; - // Now call `computeWithHashMap` and return the result. It expects a range - // of results, so if the result is fully materialized, we create an array - // with a single element. 
if (subresult->isFullyMaterialized()) { + // `computeWithHashMap` takes a range, so we artificially create one with + // a single input. return computeWithHashMap( std::array{std::pair{std::cref(subresult->idTable()), std::cref(subresult->localVocab())}}); @@ -1509,36 +1506,29 @@ Result GroupBy::computeGroupByForHashMapOptimization( NUM_GROUP_COLUMNS == 0); LocalVocab localVocab; - // Initialize the data for the aggregates of the GROUP BY operation. + // Initialize aggregation data HashMapAggregationData aggregationData( getExecutionContext()->getAllocator(), aggregateAliases, columnIndices.size()); - // Process the input blocks (pairs of `IdTable` and `LocalVocab`) one after - // the other. ad_utility::Timer lookupTimer{ad_utility::Timer::Stopped}; ad_utility::Timer aggregationTimer{ad_utility::Timer::Stopped}; for (const auto& [inputTableRef, inputLocalVocabRef] : subresults) { + // Also support `std::reference_wrapper` as the input. const IdTable& inputTable = inputTableRef; const LocalVocab& inputLocalVocab = inputLocalVocabRef; - // Merge the local vocab of each input block. - // - // NOTE: If the input blocks have very similar or even identical non-empty - // local vocabs, no deduplication is performed. localVocab.mergeWith(std::span{&inputLocalVocab, 1}); - - // Setup the `EvaluationContext` for this input block. + // Initialize evaluation context sparqlExpression::EvaluationContext evaluationContext( *getExecutionContext(), _subtree->getVariableColumns(), inputTable, getExecutionContext()->getAllocator(), localVocab, cancellationHandle_, deadline_); + evaluationContext._groupedVariables = ad_utility::HashSet{ _groupByVariables.begin(), _groupByVariables.end()}; evaluationContext._isPartOfGroupBy = true; - // Iterate of the rows of this input block. Process (up to) - // `GROUP_BY_HASH_MAP_BLOCK_SIZE` rows at a time. 
for (size_t i = 0; i < inputTable.size(); i += GROUP_BY_HASH_MAP_BLOCK_SIZE) { checkCancellation(); diff --git a/src/engine/sparqlExpressions/ExistsExpression.cpp b/src/engine/sparqlExpressions/ExistsExpression.cpp new file mode 100644 index 0000000000..6737d3ed7b --- /dev/null +++ b/src/engine/sparqlExpressions/ExistsExpression.cpp @@ -0,0 +1,5 @@ +// +// Created by kalmbacj on 1/7/25. +// + +#include "ExistsExpression.h" diff --git a/src/engine/sparqlExpressions/ExistsExpression.h b/src/engine/sparqlExpressions/ExistsExpression.h new file mode 100644 index 0000000000..5ec68acd61 --- /dev/null +++ b/src/engine/sparqlExpressions/ExistsExpression.h @@ -0,0 +1,39 @@ +// +// Created by kalmbacj on 1/7/25. +// + +#pragma once + +#include + +#include "engine/sparqlExpressions/SparqlExpression.h" +#include "parser/ParsedQuery.h" + +namespace sparqlExpression { +class ExistsExpression : public SparqlExpression { + private: + std::variant argument_; + + public: + auto& argument() { return argument_; } + ExistsExpression(ParsedQuery query) : argument_{std::move(query)} {} + + ExpressionResult evaluate(EvaluationContext* context) const override { + AD_CONTRACT_CHECK(std::holds_alternative(argument_)); + return std::get(argument_); + } + + //_________________________________________________________________________ + [[nodiscard]] string getCacheKey( + const VariableToColumnMap& varColMap) const override { + // TODO get a proper cache key here + AD_CONTRACT_CHECK(std::holds_alternative(argument_)); + return absl::StrCat( + "EXISTS WITH COLUMN ", + varColMap.at(std::get(argument_)).columnIndex_); + } + + private: + std::span childrenImpl() override { return {}; } +}; +} // namespace sparqlExpression diff --git a/src/engine/sparqlExpressions/SparqlExpression.cpp b/src/engine/sparqlExpressions/SparqlExpression.cpp index b5ec3aa0f7..00864b998d 100644 --- a/src/engine/sparqlExpressions/SparqlExpression.cpp +++ b/src/engine/sparqlExpressions/SparqlExpression.cpp @@ -180,4 +180,18 @@ 
bool SparqlExpression::isInsideAggregate() const { } return isInsideAggregate_; } + +// ________________________________________________________________ +bool SparqlExpression::isExistsExpression() const { return false; } + +// ________________________________________________________________ +void SparqlExpression::getExistsExpressions( + std::vector& result) { + if (isExistsExpression()) { + result.push_back(this); + } + for (auto& child : children()) { + child->getExistsExpressions(result); + } +} } // namespace sparqlExpression diff --git a/src/engine/sparqlExpressions/SparqlExpression.h b/src/engine/sparqlExpressions/SparqlExpression.h index 1378f10520..d5f7248daf 100644 --- a/src/engine/sparqlExpressions/SparqlExpression.h +++ b/src/engine/sparqlExpressions/SparqlExpression.h @@ -123,6 +123,16 @@ class SparqlExpression { // implementation returns `false`. virtual bool isStrExpression() const; + // Returns true iff this expression is an EXISTS(...) expression. Default + // implementation returns `false`. + virtual bool isExistsExpression() const; + + // Return non-null pointers to all `EXISTS` expressions in the subtree. + // The result is passed in as a reference to simplify the recursive + // implementation. + virtual void getExistsExpressions( + std::vector& result) final; + // __________________________________________________________________________ virtual ~SparqlExpression() = default; From e356ee1c831d00aeb74ac17094cffcec1f4b55d1 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Tue, 7 Jan 2025 12:57:41 +0100 Subject: [PATCH 02/24] Add some parsing and add some thoughts. 
Signed-off-by: Johannes Kalmbach --- .../sparqlParser/SparqlQleverVisitor.cpp | 18 +++++++++++++++--- src/parser/sparqlParser/SparqlQleverVisitor.h | 4 ++-- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/src/parser/sparqlParser/SparqlQleverVisitor.cpp b/src/parser/sparqlParser/SparqlQleverVisitor.cpp index f23530f820..99a943d350 100644 --- a/src/parser/sparqlParser/SparqlQleverVisitor.cpp +++ b/src/parser/sparqlParser/SparqlQleverVisitor.cpp @@ -15,6 +15,7 @@ #include "absl/time/time.h" #include "engine/sparqlExpressions/CountStarExpression.h" +#include "engine/sparqlExpressions/ExistsExpression.h" #include "engine/sparqlExpressions/GroupConcatExpression.h" #include "engine/sparqlExpressions/LiteralExpression.h" #include "engine/sparqlExpressions/NaryExpression.h" @@ -1366,6 +1367,7 @@ SparqlFilter Visitor::visit(Parser::FilterRContext* ctx) { // expression contains unbound variables, because the variables of the FILTER // might be bound after the filter appears in the query (which is perfectly // legal). + auto pimpl = visitExpressionPimpl(ctx->constraint()); return SparqlFilter{visitExpressionPimpl(ctx->constraint())}; } @@ -2229,6 +2231,10 @@ ExpressionPtr Visitor::visit([[maybe_unused]] Parser::BuiltInCallContext* ctx) { return visit(ctx->substringExpression()); } else if (ctx->strReplaceExpression()) { return visit(ctx->strReplaceExpression()); + } else if (ctx->existsFunc()) { + return visit(ctx->existsFunc()); + } else if (ctx->notExistsFunc()) { + return visit(ctx->notExistsFunc()); } // Get the function name and the arguments. 
Note that we do not have to check // the number of arguments like for `processIriFunctionCall`, since the number @@ -2418,12 +2424,18 @@ SparqlExpression::Ptr Visitor::visit(Parser::StrReplaceExpressionContext* ctx) { } // ____________________________________________________________________________________ -void Visitor::visit(const Parser::ExistsFuncContext* ctx) { - reportNotSupported(ctx, "The EXISTS function is"); +ExpressionPtr Visitor::visit(Parser::ExistsFuncContext* ctx) { + auto queryBackup = std::exchange(parsedQuery_, ParsedQuery{}); + auto group = visit(ctx->groupGraphPattern()); + ParsedQuery query = std::exchange(parsedQuery_, std::move(queryBackup)); + query.selectClause().setAsterisk(); + query._rootGraphPattern = std::move(group); + return std::make_unique(std::move(query)); } // ____________________________________________________________________________________ -void Visitor::visit(const Parser::NotExistsFuncContext* ctx) { +ExpressionPtr Visitor::visit(Parser::NotExistsFuncContext* ctx) { + // TODO Implement this without duplicating the code for EXISTS. 
reportNotSupported(ctx, "The NOT EXISTS function is"); } diff --git a/src/parser/sparqlParser/SparqlQleverVisitor.h b/src/parser/sparqlParser/SparqlQleverVisitor.h index fb1cb9c05c..5fb4c95a08 100644 --- a/src/parser/sparqlParser/SparqlQleverVisitor.h +++ b/src/parser/sparqlParser/SparqlQleverVisitor.h @@ -444,9 +444,9 @@ class SparqlQleverVisitor { ExpressionPtr visit(Parser::StrReplaceExpressionContext* ctx); - [[noreturn]] static void visit(const Parser::ExistsFuncContext* ctx); + ExpressionPtr visit(Parser::ExistsFuncContext* ctx); - [[noreturn]] static void visit(const Parser::NotExistsFuncContext* ctx); + ExpressionPtr visit(Parser::NotExistsFuncContext* ctx); ExpressionPtr visit(Parser::AggregateContext* ctx); From fc2017479677c4bc9a04a4f5d5259c3fe1d6d0de Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Tue, 7 Jan 2025 15:49:29 +0100 Subject: [PATCH 03/24] Also implement NOT EXISTS Signed-off-by: Johannes Kalmbach --- src/engine/ExistsScan.cpp | 6 +++-- src/engine/Filter.cpp | 18 +++++++++++++ src/engine/QueryExecutionTree.h | 3 +++ .../sparqlExpressions/ExistsExpression.h | 24 +++++++++++------- .../sparqlParser/SparqlQleverVisitor.cpp | 25 +++++++++++++++---- src/parser/sparqlParser/SparqlQleverVisitor.h | 5 ++++ 6 files changed, 65 insertions(+), 16 deletions(-) diff --git a/src/engine/ExistsScan.cpp b/src/engine/ExistsScan.cpp index f42da68f3d..1604e353ad 100644 --- a/src/engine/ExistsScan.cpp +++ b/src/engine/ExistsScan.cpp @@ -95,8 +95,10 @@ ProtoResult ExistsScan::computeResult([[maybe_unused]] bool requestLaziness) { }; // TODO Handle UNDEF values correctly (and efficiently) - auto findUndefDispatch = [](const auto& row, It begin, auto end, - bool& outOfOrder) { + auto findUndefDispatch = []([[maybe_unused]] const auto& row, + [[maybe_unused]] It begin, + [[maybe_unused]] auto end, + [[maybe_unused]] bool& outOfOrder) { return std::array{}; }; diff --git a/src/engine/Filter.cpp b/src/engine/Filter.cpp index 9ecdd85f7a..519c0d9da5 100644 --- 
a/src/engine/Filter.cpp +++ b/src/engine/Filter.cpp @@ -10,10 +10,13 @@ #include "backports/algorithm.h" #include "engine/CallFixedSize.h" +#include "engine/ExistsScan.h" #include "engine/QueryExecutionTree.h" +#include "engine/QueryPlanner.h" #include "engine/sparqlExpressions/SparqlExpression.h" #include "engine/sparqlExpressions/SparqlExpressionGenerators.h" #include "engine/sparqlExpressions/SparqlExpressionValueGetters.h" +#include "sparqlExpressions/ExistsExpression.h" using std::endl; using std::string; @@ -28,6 +31,21 @@ Filter::Filter(QueryExecutionContext* qec, : Operation(qec), _subtree(std::move(subtree)), _expression{std::move(expression)} { + std::vector existsExpressions; + _expression.getPimpl()->getExistsExpressions(existsExpressions); + for (auto* expr : existsExpressions) { + const auto& exists = + dynamic_cast(*expr); + QueryPlanner qp{getExecutionContext(), cancellationHandle_}; + // TODO This can be done by the expression itself, then it is + // automatically duplicated. + auto pq = exists.argument(); + auto tree = + std::make_shared(qp.createExecutionTree(pq)); + _subtree = ad_utility::makeExecutionTree( + getExecutionContext(), std::move(_subtree), std::move(tree), + exists.variable()); + } setPrefilterExpressionForChildren(); } diff --git a/src/engine/QueryExecutionTree.h b/src/engine/QueryExecutionTree.h index 0eac785f16..3c074d6c47 100644 --- a/src/engine/QueryExecutionTree.h +++ b/src/engine/QueryExecutionTree.h @@ -25,7 +25,10 @@ class QueryExecutionTree { std::shared_ptr operation) : QueryExecutionTree(qec) { rootOperation_ = std::move(operation); + // TODO This currently fails for EXISTS but it is also unneeded. 
+ /* readFromCache(); + */ } std::string getCacheKey() const; diff --git a/src/engine/sparqlExpressions/ExistsExpression.h b/src/engine/sparqlExpressions/ExistsExpression.h index 5ec68acd61..d5eff23ba8 100644 --- a/src/engine/sparqlExpressions/ExistsExpression.h +++ b/src/engine/sparqlExpressions/ExistsExpression.h @@ -12,27 +12,33 @@ namespace sparqlExpression { class ExistsExpression : public SparqlExpression { private: - std::variant argument_; + ParsedQuery argument_; + static inline std::atomic indexCounter_ = 0; + size_t index_ = ++indexCounter_; + Variable variable_{absl::StrCat("?ql_internal_exists_", index_)}; public: - auto& argument() { return argument_; } + const auto& argument() const { return argument_; } + const auto& variable() const { return variable_; } ExistsExpression(ParsedQuery query) : argument_{std::move(query)} {} ExpressionResult evaluate(EvaluationContext* context) const override { - AD_CONTRACT_CHECK(std::holds_alternative(argument_)); - return std::get(argument_); + AD_CONTRACT_CHECK(context->_variableToColumnMap.contains(variable_)); + return variable_; } - //_________________________________________________________________________ + //____________________________________________________________________________ [[nodiscard]] string getCacheKey( const VariableToColumnMap& varColMap) const override { // TODO get a proper cache key here - AD_CONTRACT_CHECK(std::holds_alternative(argument_)); - return absl::StrCat( - "EXISTS WITH COLUMN ", - varColMap.at(std::get(argument_)).columnIndex_); + AD_CONTRACT_CHECK(varColMap.contains(variable_)); + return absl::StrCat("EXISTS WITH COL ", + varColMap.at(variable_).columnIndex_); } + // ____________________________________________________________________________ + bool isExistsExpression() const override { return true; } + private: std::span childrenImpl() override { return {}; } }; diff --git a/src/parser/sparqlParser/SparqlQleverVisitor.cpp b/src/parser/sparqlParser/SparqlQleverVisitor.cpp index 
99a943d350..903544c96a 100644 --- a/src/parser/sparqlParser/SparqlQleverVisitor.cpp +++ b/src/parser/sparqlParser/SparqlQleverVisitor.cpp @@ -26,6 +26,7 @@ #include "engine/sparqlExpressions/SampleExpression.h" #include "engine/sparqlExpressions/StdevExpression.h" #include "engine/sparqlExpressions/UuidExpressions.h" +#include "generated/SparqlAutomaticParser.h" #include "global/Constants.h" #include "global/RuntimeParameters.h" #include "parser/GraphPatternOperation.h" @@ -2424,19 +2425,33 @@ SparqlExpression::Ptr Visitor::visit(Parser::StrReplaceExpressionContext* ctx) { } // ____________________________________________________________________________________ -ExpressionPtr Visitor::visit(Parser::ExistsFuncContext* ctx) { +ExpressionPtr Visitor::visitExists(Parser::GroupGraphPatternContext* pattern, + bool negate) { auto queryBackup = std::exchange(parsedQuery_, ParsedQuery{}); - auto group = visit(ctx->groupGraphPattern()); + auto visibleVariablesSoFar = std::move(visibleVariables_); + visibleVariables_.clear(); + auto group = visit(pattern); ParsedQuery query = std::exchange(parsedQuery_, std::move(queryBackup)); query.selectClause().setAsterisk(); query._rootGraphPattern = std::move(group); - return std::make_unique(std::move(query)); + visibleVariables_ = std::move(visibleVariablesSoFar); + auto exists = + std::make_unique(std::move(query)); + if (negate) { + return sparqlExpression::makeUnaryNegateExpression(std::move(exists)); + } else { + return exists; + } +} + +// ____________________________________________________________________________________ +ExpressionPtr Visitor::visit(Parser::ExistsFuncContext* ctx) { + return visitExists(ctx->groupGraphPattern(), false); } // ____________________________________________________________________________________ ExpressionPtr Visitor::visit(Parser::NotExistsFuncContext* ctx) { - // TODO Implement this without duplicating the code for EXISTS. 
- reportNotSupported(ctx, "The NOT EXISTS function is"); + return visitExists(ctx->groupGraphPattern(), true); } // ____________________________________________________________________________________ diff --git a/src/parser/sparqlParser/SparqlQleverVisitor.h b/src/parser/sparqlParser/SparqlQleverVisitor.h index 5fb4c95a08..3e7b63c3ad 100644 --- a/src/parser/sparqlParser/SparqlQleverVisitor.h +++ b/src/parser/sparqlParser/SparqlQleverVisitor.h @@ -444,6 +444,11 @@ class SparqlQleverVisitor { ExpressionPtr visit(Parser::StrReplaceExpressionContext* ctx); + // The common implementation of the parsing of `EXISTS` and `NOT EXISTS`. + // The second argument is `true` for `NOT EXISTS`. + ExpressionPtr visitExists(Parser::GroupGraphPatternContext* pattern, + bool negate); + ExpressionPtr visit(Parser::ExistsFuncContext* ctx); ExpressionPtr visit(Parser::NotExistsFuncContext* ctx); From dde296b052dee3c267acdeaec514a3e3b47e5cb9 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Tue, 7 Jan 2025 16:27:14 +0100 Subject: [PATCH 04/24] Fix a small warning, to feed this to the tool. 
Signed-off-by: Johannes Kalmbach --- src/engine/ExistsScan.cpp | 4 ++-- src/engine/sparqlExpressions/ExistsExpression.h | 13 +++++++++---- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/src/engine/ExistsScan.cpp b/src/engine/ExistsScan.cpp index 1604e353ad..651e8e61cb 100644 --- a/src/engine/ExistsScan.cpp +++ b/src/engine/ExistsScan.cpp @@ -14,8 +14,8 @@ ExistsScan::ExistsScan(QueryExecutionContext* qec, : Operation{qec}, left_{std::move(left)}, right_{std::move(right)}, - existsVariable_{std::move(existsVariable)}, - joinColumns_{QueryExecutionTree::getJoinColumns(*left_, *right_)} {} + joinColumns_{QueryExecutionTree::getJoinColumns(*left_, *right_)}, + existsVariable_{std::move(existsVariable)} {} // _____________________________________________________________________________ string ExistsScan::getCacheKeyImpl() const { diff --git a/src/engine/sparqlExpressions/ExistsExpression.h b/src/engine/sparqlExpressions/ExistsExpression.h index d5eff23ba8..343c195e82 100644 --- a/src/engine/sparqlExpressions/ExistsExpression.h +++ b/src/engine/sparqlExpressions/ExistsExpression.h @@ -30,10 +30,15 @@ class ExistsExpression : public SparqlExpression { //____________________________________________________________________________ [[nodiscard]] string getCacheKey( const VariableToColumnMap& varColMap) const override { - // TODO get a proper cache key here - AD_CONTRACT_CHECK(varColMap.contains(variable_)); - return absl::StrCat("EXISTS WITH COL ", - varColMap.at(variable_).columnIndex_); + if (varColMap.contains(variable_)) { + return absl::StrCat("EXISTS WITH COL ", + varColMap.at(variable_).columnIndex_); + } else { + // This means that the necessary `ExistsScan` hasn't been set up yet. + // It is not possible to cache such incomplete operations, so we return + // a random cache key. 
+ return std::to_string(ad_utility::FastRandomIntGenerator{}()); + } } // ____________________________________________________________________________ From 0d1c788e11f3a2d2b6bb2dfea6bbbc6fba7f1bc3 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Wed, 8 Jan 2025 09:07:04 +0100 Subject: [PATCH 05/24] Some cleanups and fixes. Signed-off-by: Johannes Kalmbach --- src/engine/Bind.cpp | 10 +++ src/engine/Bind.h | 6 +- src/engine/ExistsScan.cpp | 75 +++++++++++++++---- src/engine/ExistsScan.h | 5 ++ src/engine/Filter.cpp | 18 +---- src/engine/GroupBy.cpp | 7 ++ src/engine/MultiColumnJoin.cpp | 14 ++-- src/engine/QueryExecutionTree.h | 3 - .../sparqlExpressions/SparqlExpression.cpp | 2 +- .../sparqlExpressions/SparqlExpression.h | 2 +- src/util/JoinAlgorithms/FindUndefRanges.h | 33 ++++++++ 11 files changed, 128 insertions(+), 47 deletions(-) diff --git a/src/engine/Bind.cpp b/src/engine/Bind.cpp index 95de8a4dfe..230ca1cb68 100644 --- a/src/engine/Bind.cpp +++ b/src/engine/Bind.cpp @@ -5,12 +5,22 @@ #include "Bind.h" #include "engine/CallFixedSize.h" +#include "engine/ExistsScan.h" #include "engine/QueryExecutionTree.h" #include "engine/sparqlExpressions/SparqlExpression.h" #include "engine/sparqlExpressions/SparqlExpressionGenerators.h" #include "util/ChunkedForLoop.h" #include "util/Exception.h" +// _____________________________________________________________________________ +Bind::Bind(QueryExecutionContext* qec, + std::shared_ptr subtree, parsedQuery::Bind b) + : Operation(qec), _subtree(std::move(subtree)), _bind(std::move(b)) { + _subtree = ExistsScan::addExistsScansToSubtree( + _bind._expression, std::move(subtree), getExecutionContext(), + cancellationHandle_); +} + // BIND adds exactly one new column size_t Bind::getResultWidth() const { return _subtree->getResultWidth() + 1; } diff --git a/src/engine/Bind.h b/src/engine/Bind.h index 34c515fb54..3336e0ddbc 100644 --- a/src/engine/Bind.h +++ b/src/engine/Bind.h @@ -8,14 +8,14 @@ #include 
"engine/sparqlExpressions/SparqlExpressionPimpl.h" #include "parser/ParsedQuery.h" -/// BIND operation, currently only supports a very limited subset of expressions +// BIND operation. class Bind : public Operation { public: static constexpr size_t CHUNK_SIZE = 10'000; + // ____________________________________________________________________________ Bind(QueryExecutionContext* qec, std::shared_ptr subtree, - parsedQuery::Bind b) - : Operation(qec), _subtree(std::move(subtree)), _bind(std::move(b)) {} + parsedQuery::Bind b); private: std::shared_ptr _subtree; diff --git a/src/engine/ExistsScan.cpp b/src/engine/ExistsScan.cpp index 651e8e61cb..26fde12984 100644 --- a/src/engine/ExistsScan.cpp +++ b/src/engine/ExistsScan.cpp @@ -4,6 +4,9 @@ #include "engine/ExistsScan.h" +#include "engine/QueryPlanner.h" +#include "engine/sparqlExpressions/ExistsExpression.h" +#include "engine/sparqlExpressions/SparqlExpression.h" #include "util/JoinAlgorithms/JoinAlgorithms.h" // _____________________________________________________________________________ @@ -15,7 +18,10 @@ ExistsScan::ExistsScan(QueryExecutionContext* qec, left_{std::move(left)}, right_{std::move(right)}, joinColumns_{QueryExecutionTree::getJoinColumns(*left_, *right_)}, - existsVariable_{std::move(existsVariable)} {} + existsVariable_{std::move(existsVariable)} { + std::tie(left_, right_) = QueryExecutionTree::createSortedTrees( + std::move(left_), std::move(right_), joinColumns_); +} // _____________________________________________________________________________ string ExistsScan::getCacheKeyImpl() const { @@ -85,28 +91,41 @@ ProtoResult ExistsScan::computeResult([[maybe_unused]] bool requestLaziness) { checkCancellation(); + // `isCheap` is true iff there are no UNDEF values in the join columns. In + // this case we can use a much cheaper algorithm. + // TODO There are many other cases where a cheaper implementation can + // be chosen, but we leave those for another PR, this is the most common case. 
+ namespace stdr = ql::ranges; + size_t numJoinColumns = joinColumnsLeft.size(); + AD_CORRECTNESS_CHECK(numJoinColumns == joinColumnsRight.size()); + bool isCheap = stdr::none_of( + ad_utility::integerRange(numJoinColumns), [&](const auto& col) { + return (stdr::any_of(joinColumnsRight.getColumn(col), + &Id::isUndefined)) || + (stdr::any_of(joinColumnsLeft.getColumn(col), &Id::isUndefined)); + }); + auto noopRowAdder = [](auto&&...) {}; - // TODO Memory limit. - std::vector notExistsIndices; + std::vector> notExistsIndices{ + allocator()}; auto actionForNotExisting = [¬ExistsIndices, begin = joinColumnsLeft.begin()](const auto& itLeft) { notExistsIndices.push_back(itLeft - begin); }; - // TODO Handle UNDEF values correctly (and efficiently) - auto findUndefDispatch = []([[maybe_unused]] const auto& row, - [[maybe_unused]] It begin, - [[maybe_unused]] auto end, - [[maybe_unused]] bool& outOfOrder) { - return std::array{}; - }; - auto checkCancellationLambda = [this] { checkCancellation(); }; - [[maybe_unused]] auto numOutOfOrder = ad_utility::zipperJoinWithUndef( - joinColumnsLeft, joinColumnsRight, ql::ranges::lexicographical_compare, - noopRowAdder, findUndefDispatch, findUndefDispatch, actionForNotExisting, - checkCancellationLambda); + auto runZipperJoin = [&](auto findUndef) { + [[maybe_unused]] auto numOutOfOrder = ad_utility::zipperJoinWithUndef( + joinColumnsLeft, joinColumnsRight, ql::ranges::lexicographical_compare, + noopRowAdder, findUndef, findUndef, actionForNotExisting, + checkCancellationLambda); + }; + if (isCheap) { + runZipperJoin(ad_utility::noop); + } else { + runZipperJoin(ad_utility::findSmallerUndefRanges); + } // Set up the result; IdTable result = left.clone(); @@ -118,3 +137,29 @@ ProtoResult ExistsScan::computeResult([[maybe_unused]] bool requestLaziness) { } return {std::move(result), resultSortedOn(), leftRes->getCopyOfLocalVocab()}; } + +// _____________________________________________________________________________ +std::shared_ptr 
ExistsScan::addExistsScansToSubtree( + const sparqlExpression::SparqlExpressionPimpl& expression, + std::shared_ptr subtree, QueryExecutionContext* qec, + const ad_utility::SharedCancellationHandle& cancellationHandle) { + std::vector existsExpressions; + expression.getPimpl()->getExistsExpressions(existsExpressions); + for (auto* expr : existsExpressions) { + const auto& exists = + dynamic_cast(*expr); + // Currently some FILTERs are applied multiple times especially when there + // are OPTIONAL joins in the query. In these cases we have to make sure that + // the `ExistsScan` is added only once. + if (subtree->isVariableCovered(exists.variable())) { + continue; + } + QueryPlanner qp{qec, cancellationHandle}; + auto pq = exists.argument(); + auto tree = + std::make_shared(qp.createExecutionTree(pq)); + subtree = ad_utility::makeExecutionTree( + qec, std::move(subtree), std::move(tree), exists.variable()); + } + return subtree; +} diff --git a/src/engine/ExistsScan.h b/src/engine/ExistsScan.h index b08e06c542..dbd947d302 100644 --- a/src/engine/ExistsScan.h +++ b/src/engine/ExistsScan.h @@ -24,6 +24,11 @@ class ExistsScan : public Operation { std::shared_ptr right, Variable existsVariable); + static std::shared_ptr addExistsScansToSubtree( + const sparqlExpression::SparqlExpressionPimpl& expression, + std::shared_ptr subtree, QueryExecutionContext* qec, + const ad_utility::SharedCancellationHandle& cancellationHandle); + protected: string getCacheKeyImpl() const override; diff --git a/src/engine/Filter.cpp b/src/engine/Filter.cpp index 519c0d9da5..ff8edc1fc1 100644 --- a/src/engine/Filter.cpp +++ b/src/engine/Filter.cpp @@ -31,21 +31,9 @@ Filter::Filter(QueryExecutionContext* qec, : Operation(qec), _subtree(std::move(subtree)), _expression{std::move(expression)} { - std::vector existsExpressions; - _expression.getPimpl()->getExistsExpressions(existsExpressions); - for (auto* expr : existsExpressions) { - const auto& exists = - dynamic_cast(*expr); - QueryPlanner 
qp{getExecutionContext(), cancellationHandle_}; - // TODO This can be done by the expression itself, then it is - // automatically duplicated. - auto pq = exists.argument(); - auto tree = - std::make_shared(qp.createExecutionTree(pq)); - _subtree = ad_utility::makeExecutionTree( - getExecutionContext(), std::move(_subtree), std::move(tree), - exists.variable()); - } + _subtree = ExistsScan::addExistsScansToSubtree( + _expression, std::move(_subtree), getExecutionContext(), + cancellationHandle_); setPrefilterExpressionForChildren(); } diff --git a/src/engine/GroupBy.cpp b/src/engine/GroupBy.cpp index a6ff49bbe1..0fe65fd00e 100644 --- a/src/engine/GroupBy.cpp +++ b/src/engine/GroupBy.cpp @@ -9,6 +9,7 @@ #include #include "engine/CallFixedSize.h" +#include "engine/ExistsScan.h" #include "engine/IndexScan.h" #include "engine/Join.h" #include "engine/LazyGroupBy.h" @@ -52,6 +53,12 @@ GroupBy::GroupBy(QueryExecutionContext* qec, vector groupByVariables, ql::ranges::sort(_groupByVariables, std::less<>{}, &Variable::name); auto sortColumns = computeSortColumns(subtree.get()); + + for (const auto& alias : _aliases) { + _subtree = ExistsScan::addExistsScansToSubtree( + alias._expression, std::move(subtree), getExecutionContext(), + cancellationHandle_); + } _subtree = QueryExecutionTree::createSortedTree(std::move(subtree), sortColumns); } diff --git a/src/engine/MultiColumnJoin.cpp b/src/engine/MultiColumnJoin.cpp index bb3e4e5995..b605616ecb 100644 --- a/src/engine/MultiColumnJoin.cpp +++ b/src/engine/MultiColumnJoin.cpp @@ -237,17 +237,11 @@ void MultiColumnJoin::computeMultiColumnJoin( rowAdder.addRow(itLeft - beginLeft, itRight - beginRight); }; - auto findUndef = [](const auto& row, auto begin, auto end, - bool& resultMightBeUnsorted) { - return ad_utility::findSmallerUndefRanges(row, begin, end, - resultMightBeUnsorted); - }; - // `isCheap` is true iff there are no UNDEF values in the join columns. In // this case we can use a much cheaper algorithm. 
// TODO There are many other cases where a cheaper implementation can // be chosen, but we leave those for another PR, this is the most common case. - namespace stdr = std::ranges; + namespace stdr = ql::ranges; bool isCheap = stdr::none_of(joinColumns, [&](const auto& jcs) { auto [leftCol, rightCol] = jcs; return (stdr::any_of(right.getColumn(rightCol), &Id::isUndefined)) || @@ -265,8 +259,10 @@ void MultiColumnJoin::computeMultiColumnJoin( } else { return ad_utility::zipperJoinWithUndef( leftJoinColumns, rightJoinColumns, - ql::ranges::lexicographical_compare, addRow, findUndef, findUndef, - ad_utility::noop, checkCancellationLambda); + ql::ranges::lexicographical_compare, addRow, + ad_utility::findSmallerUndefRanges, + ad_utility::findSmallerUndefRanges, ad_utility::noop, + checkCancellationLambda); } }(); *result = std::move(rowAdder).resultTable(); diff --git a/src/engine/QueryExecutionTree.h b/src/engine/QueryExecutionTree.h index 3c074d6c47..0eac785f16 100644 --- a/src/engine/QueryExecutionTree.h +++ b/src/engine/QueryExecutionTree.h @@ -25,10 +25,7 @@ class QueryExecutionTree { std::shared_ptr operation) : QueryExecutionTree(qec) { rootOperation_ = std::move(operation); - // TODO This currently fails for EXISTS but it is also unneeded. 
- /* readFromCache(); - */ } std::string getCacheKey() const; diff --git a/src/engine/sparqlExpressions/SparqlExpression.cpp b/src/engine/sparqlExpressions/SparqlExpression.cpp index 00864b998d..099933020f 100644 --- a/src/engine/sparqlExpressions/SparqlExpression.cpp +++ b/src/engine/sparqlExpressions/SparqlExpression.cpp @@ -186,7 +186,7 @@ bool SparqlExpression::isExistsExpression() const { return false; } // ________________________________________________________________ void SparqlExpression::getExistsExpressions( - std::vector& result) { + std::vector& result) const { if (isExistsExpression()) { result.push_back(this); } diff --git a/src/engine/sparqlExpressions/SparqlExpression.h b/src/engine/sparqlExpressions/SparqlExpression.h index d5f7248daf..7f5c551127 100644 --- a/src/engine/sparqlExpressions/SparqlExpression.h +++ b/src/engine/sparqlExpressions/SparqlExpression.h @@ -131,7 +131,7 @@ class SparqlExpression { // The result is passed in as a reference to simplify the recursive // implementation. virtual void getExistsExpressions( - std::vector& result) final; + std::vector& result) const final; // __________________________________________________________________________ virtual ~SparqlExpression() = default; diff --git a/src/util/JoinAlgorithms/FindUndefRanges.h b/src/util/JoinAlgorithms/FindUndefRanges.h index 7b3f3296cb..cbdbc1b4fd 100644 --- a/src/util/JoinAlgorithms/FindUndefRanges.h +++ b/src/util/JoinAlgorithms/FindUndefRanges.h @@ -165,6 +165,38 @@ auto findSmallerUndefRangesArbitrary(const auto& row, It begin, It end, // have additional information about the input (most notably which of the join // columns contain no UNDEF at all) and therefore a more specialized routine // should be chosen. 
+struct FindSmallerUndefRanges { + template + auto operator()(const auto& row, It begin, It end, + bool& resultMightBeUnsorted) -> cppcoro::generator { + size_t numLastUndefined = 0; + assert(row.size() > 0); + auto it = ql::ranges::rbegin(row); + auto rend = ql::ranges::rend(row); + for (; it < rend; ++it) { + if (*it != Id::makeUndefined()) { + break; + } + ++numLastUndefined; + } + + for (; it < rend; ++it) { + if (*it == Id::makeUndefined()) { + return findSmallerUndefRangesArbitrary(row, begin, end, + resultMightBeUnsorted); + } + } + if (numLastUndefined == 0) { + return findSmallerUndefRangesForRowsWithoutUndef(row, begin, end, + resultMightBeUnsorted); + } else { + return findSmallerUndefRangesForRowsWithUndefInLastColumns( + row, numLastUndefined, begin, end, resultMightBeUnsorted); + } + } +}; +constexpr FindSmallerUndefRanges findSmallerUndefRanges; +/* template auto findSmallerUndefRanges(const auto& row, It begin, It end, bool& resultMightBeUnsorted) @@ -194,4 +226,5 @@ auto findSmallerUndefRanges(const auto& row, It begin, It end, row, numLastUndefined, begin, end, resultMightBeUnsorted); } } +*/ } // namespace ad_utility From 7ff49c97404cd9604bc16fe0e775a61b8b0ef6b3 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Wed, 8 Jan 2025 09:10:10 +0100 Subject: [PATCH 06/24] Fix compilation. 
Signed-off-by: Johannes Kalmbach --- src/util/JoinAlgorithms/FindUndefRanges.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/util/JoinAlgorithms/FindUndefRanges.h b/src/util/JoinAlgorithms/FindUndefRanges.h index cbdbc1b4fd..bf15685f37 100644 --- a/src/util/JoinAlgorithms/FindUndefRanges.h +++ b/src/util/JoinAlgorithms/FindUndefRanges.h @@ -168,7 +168,7 @@ auto findSmallerUndefRangesArbitrary(const auto& row, It begin, It end, struct FindSmallerUndefRanges { template auto operator()(const auto& row, It begin, It end, - bool& resultMightBeUnsorted) -> cppcoro::generator { + bool& resultMightBeUnsorted) const -> cppcoro::generator { size_t numLastUndefined = 0; assert(row.size() > 0); auto it = ql::ranges::rbegin(row); From 7ec8947c759514efdbd0a533a5c00a545d5ecc4c Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Wed, 8 Jan 2025 09:24:02 +0100 Subject: [PATCH 07/24] Fix the many many segfaults. Signed-off-by: Johannes Kalmbach --- src/engine/Bind.cpp | 2 +- src/engine/GroupBy.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/engine/Bind.cpp b/src/engine/Bind.cpp index 230ca1cb68..bdccf14488 100644 --- a/src/engine/Bind.cpp +++ b/src/engine/Bind.cpp @@ -17,7 +17,7 @@ Bind::Bind(QueryExecutionContext* qec, std::shared_ptr subtree, parsedQuery::Bind b) : Operation(qec), _subtree(std::move(subtree)), _bind(std::move(b)) { _subtree = ExistsScan::addExistsScansToSubtree( - _bind._expression, std::move(subtree), getExecutionContext(), + _bind._expression, std::move(_subtree), getExecutionContext(), cancellationHandle_); } diff --git a/src/engine/GroupBy.cpp b/src/engine/GroupBy.cpp index 0fe65fd00e..cfa8621709 100644 --- a/src/engine/GroupBy.cpp +++ b/src/engine/GroupBy.cpp @@ -55,7 +55,7 @@ GroupBy::GroupBy(QueryExecutionContext* qec, vector groupByVariables, auto sortColumns = computeSortColumns(subtree.get()); for (const auto& alias : _aliases) { - _subtree = ExistsScan::addExistsScansToSubtree( + subtree = 
ExistsScan::addExistsScansToSubtree( alias._expression, std::move(subtree), getExecutionContext(), cancellationHandle_); } From c03f3e59f2097c3f14bb9cb214eb6ddfadec2992 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Wed, 8 Jan 2025 09:34:56 +0100 Subject: [PATCH 08/24] Fix another bug. Signed-off-by: Johannes Kalmbach --- src/engine/ExistsScan.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/engine/ExistsScan.cpp b/src/engine/ExistsScan.cpp index 26fde12984..c416d1dc41 100644 --- a/src/engine/ExistsScan.cpp +++ b/src/engine/ExistsScan.cpp @@ -96,8 +96,8 @@ ProtoResult ExistsScan::computeResult([[maybe_unused]] bool requestLaziness) { // TODO There are many other cases where a cheaper implementation can // be chosen, but we leave those for another PR, this is the most common case. namespace stdr = ql::ranges; - size_t numJoinColumns = joinColumnsLeft.size(); - AD_CORRECTNESS_CHECK(numJoinColumns == joinColumnsRight.size()); + size_t numJoinColumns = joinColumnsLeft.numColumns(); + AD_CORRECTNESS_CHECK(numJoinColumns == joinColumnsRight.numColumns()); bool isCheap = stdr::none_of( ad_utility::integerRange(numJoinColumns), [&](const auto& col) { return (stdr::any_of(joinColumnsRight.getColumn(col), From 2da52abc6aea83dac6eb55dc536ef1f4e184fb1b Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Wed, 8 Jan 2025 10:55:21 +0100 Subject: [PATCH 09/24] Fix another bug. 
Signed-off-by: Johannes Kalmbach --- src/engine/Bind.cpp | 4 +- src/engine/CMakeLists.txt | 2 +- src/engine/{ExistsScan.cpp => ExistsJoin.cpp} | 26 ++++++------- src/engine/{ExistsScan.h => ExistsJoin.h} | 4 +- src/engine/Filter.cpp | 4 +- src/engine/GroupBy.cpp | 4 +- src/util/JoinAlgorithms/FindUndefRanges.h | 31 --------------- test/QueryPlannerTest.cpp | 13 ++++++- test/QueryPlannerTestHelpers.h | 7 ++++ test/SparqlAntlrParserTest.cpp | 39 +++++++++++++++++++ 10 files changed, 80 insertions(+), 54 deletions(-) rename src/engine/{ExistsScan.cpp => ExistsJoin.cpp} (90%) rename src/engine/{ExistsScan.h => ExistsJoin.h} (95%) diff --git a/src/engine/Bind.cpp b/src/engine/Bind.cpp index bdccf14488..276f04e9fc 100644 --- a/src/engine/Bind.cpp +++ b/src/engine/Bind.cpp @@ -5,7 +5,7 @@ #include "Bind.h" #include "engine/CallFixedSize.h" -#include "engine/ExistsScan.h" +#include "engine/ExistsJoin.h" #include "engine/QueryExecutionTree.h" #include "engine/sparqlExpressions/SparqlExpression.h" #include "engine/sparqlExpressions/SparqlExpressionGenerators.h" @@ -16,7 +16,7 @@ Bind::Bind(QueryExecutionContext* qec, std::shared_ptr subtree, parsedQuery::Bind b) : Operation(qec), _subtree(std::move(subtree)), _bind(std::move(b)) { - _subtree = ExistsScan::addExistsScansToSubtree( + _subtree = ExistsJoin::addExistsScansToSubtree( _bind._expression, std::move(_subtree), getExecutionContext(), cancellationHandle_); } diff --git a/src/engine/CMakeLists.txt b/src/engine/CMakeLists.txt index c724a8fb39..a3750a07e5 100644 --- a/src/engine/CMakeLists.txt +++ b/src/engine/CMakeLists.txt @@ -14,5 +14,5 @@ add_library(engine CartesianProductJoin.cpp TextIndexScanForWord.cpp TextIndexScanForEntity.cpp TextLimit.cpp LazyGroupBy.cpp GroupByHashMapOptimization.cpp SpatialJoin.cpp CountConnectedSubgraphs.cpp SpatialJoinAlgorithms.cpp PathSearch.cpp ExecuteUpdate.cpp - Describe.cpp ExistsScan.cpp) + Describe.cpp ExistsJoin.cpp) qlever_target_link_libraries(engine util index parser 
sparqlExpressions http SortPerformanceEstimator Boost::iostreams s2) diff --git a/src/engine/ExistsScan.cpp b/src/engine/ExistsJoin.cpp similarity index 90% rename from src/engine/ExistsScan.cpp rename to src/engine/ExistsJoin.cpp index c416d1dc41..d8d3f564d1 100644 --- a/src/engine/ExistsScan.cpp +++ b/src/engine/ExistsJoin.cpp @@ -2,7 +2,7 @@ // Chair of Algorithms and Data Structures. // Author: Johannes Kalmbach -#include "engine/ExistsScan.h" +#include "engine/ExistsJoin.h" #include "engine/QueryPlanner.h" #include "engine/sparqlExpressions/ExistsExpression.h" @@ -10,7 +10,7 @@ #include "util/JoinAlgorithms/JoinAlgorithms.h" // _____________________________________________________________________________ -ExistsScan::ExistsScan(QueryExecutionContext* qec, +ExistsJoin::ExistsJoin(QueryExecutionContext* qec, std::shared_ptr left, std::shared_ptr right, Variable existsVariable) @@ -24,16 +24,16 @@ ExistsScan::ExistsScan(QueryExecutionContext* qec, } // _____________________________________________________________________________ -string ExistsScan::getCacheKeyImpl() const { +string ExistsJoin::getCacheKeyImpl() const { return absl::StrCat("EXISTS SCAN left: ", left_->getCacheKey(), " right: ", right_->getCacheKey()); } // _____________________________________________________________________________ -string ExistsScan::getDescriptor() const { return "EXISTS scan"; } +string ExistsJoin::getDescriptor() const { return "EXISTS scan"; } // ____________________________________________________________________________ -VariableToColumnMap ExistsScan::computeVariableToColumnMap() const { +VariableToColumnMap ExistsJoin::computeVariableToColumnMap() const { auto res = left_->getVariableColumns(); AD_CONTRACT_CHECK( !res.contains(existsVariable_), @@ -43,18 +43,18 @@ VariableToColumnMap ExistsScan::computeVariableToColumnMap() const { } // ____________________________________________________________________________ -size_t ExistsScan::getResultWidth() const { +size_t 
ExistsJoin::getResultWidth() const { // We add one column to the input. return left_->getResultWidth() + 1; } // ____________________________________________________________________________ -vector ExistsScan::resultSortedOn() const { +vector ExistsJoin::resultSortedOn() const { return left_->resultSortedOn(); } // ____________________________________________________________________________ -float ExistsScan::getMultiplicity(size_t col) { +float ExistsJoin::getMultiplicity(size_t col) { if (col < getResultWidth() - 1) { return left_->getMultiplicity(col); } @@ -64,18 +64,18 @@ float ExistsScan::getMultiplicity(size_t col) { } // ____________________________________________________________________________ -uint64_t ExistsScan::getSizeEstimateBeforeLimit() { +uint64_t ExistsJoin::getSizeEstimateBeforeLimit() { return left_->getSizeEstimate(); } // ____________________________________________________________________________ -size_t ExistsScan::getCostEstimate() { +size_t ExistsJoin::getCostEstimate() { return left_->getCostEstimate() + right_->getCostEstimate() + left_->getSizeEstimate() + right_->getSizeEstimate(); } // ____________________________________________________________________________ -ProtoResult ExistsScan::computeResult([[maybe_unused]] bool requestLaziness) { +ProtoResult ExistsJoin::computeResult([[maybe_unused]] bool requestLaziness) { auto leftRes = left_->getResult(); auto rightRes = right_->getResult(); const auto& left = leftRes->idTable(); @@ -139,7 +139,7 @@ ProtoResult ExistsScan::computeResult([[maybe_unused]] bool requestLaziness) { } // _____________________________________________________________________________ -std::shared_ptr ExistsScan::addExistsScansToSubtree( +std::shared_ptr ExistsJoin::addExistsScansToSubtree( const sparqlExpression::SparqlExpressionPimpl& expression, std::shared_ptr subtree, QueryExecutionContext* qec, const ad_utility::SharedCancellationHandle& cancellationHandle) { @@ -158,7 +158,7 @@ std::shared_ptr 
ExistsScan::addExistsScansToSubtree( auto pq = exists.argument(); auto tree = std::make_shared(qp.createExecutionTree(pq)); - subtree = ad_utility::makeExecutionTree( + subtree = ad_utility::makeExecutionTree( qec, std::move(subtree), std::move(tree), exists.variable()); } return subtree; diff --git a/src/engine/ExistsScan.h b/src/engine/ExistsJoin.h similarity index 95% rename from src/engine/ExistsScan.h rename to src/engine/ExistsJoin.h index dbd947d302..9b9c7483ce 100644 --- a/src/engine/ExistsScan.h +++ b/src/engine/ExistsJoin.h @@ -7,7 +7,7 @@ #include "engine/Operation.h" #include "engine/QueryExecutionTree.h" -class ExistsScan : public Operation { +class ExistsJoin : public Operation { private: std::shared_ptr left_; std::shared_ptr right_; @@ -19,7 +19,7 @@ class ExistsScan : public Operation { std::vector> _matchedColumns; public: - ExistsScan(QueryExecutionContext* qec, + ExistsJoin(QueryExecutionContext* qec, std::shared_ptr left, std::shared_ptr right, Variable existsVariable); diff --git a/src/engine/Filter.cpp b/src/engine/Filter.cpp index ff8edc1fc1..9da7c12724 100644 --- a/src/engine/Filter.cpp +++ b/src/engine/Filter.cpp @@ -10,7 +10,7 @@ #include "backports/algorithm.h" #include "engine/CallFixedSize.h" -#include "engine/ExistsScan.h" +#include "engine/ExistsJoin.h" #include "engine/QueryExecutionTree.h" #include "engine/QueryPlanner.h" #include "engine/sparqlExpressions/SparqlExpression.h" @@ -31,7 +31,7 @@ Filter::Filter(QueryExecutionContext* qec, : Operation(qec), _subtree(std::move(subtree)), _expression{std::move(expression)} { - _subtree = ExistsScan::addExistsScansToSubtree( + _subtree = ExistsJoin::addExistsScansToSubtree( _expression, std::move(_subtree), getExecutionContext(), cancellationHandle_); setPrefilterExpressionForChildren(); diff --git a/src/engine/GroupBy.cpp b/src/engine/GroupBy.cpp index cfa8621709..3e8af1cb29 100644 --- a/src/engine/GroupBy.cpp +++ b/src/engine/GroupBy.cpp @@ -9,7 +9,7 @@ #include #include 
"engine/CallFixedSize.h" -#include "engine/ExistsScan.h" +#include "engine/ExistsJoin.h" #include "engine/IndexScan.h" #include "engine/Join.h" #include "engine/LazyGroupBy.h" @@ -55,7 +55,7 @@ GroupBy::GroupBy(QueryExecutionContext* qec, vector groupByVariables, auto sortColumns = computeSortColumns(subtree.get()); for (const auto& alias : _aliases) { - subtree = ExistsScan::addExistsScansToSubtree( + subtree = ExistsJoin::addExistsScansToSubtree( alias._expression, std::move(subtree), getExecutionContext(), cancellationHandle_); } diff --git a/src/util/JoinAlgorithms/FindUndefRanges.h b/src/util/JoinAlgorithms/FindUndefRanges.h index bf15685f37..6313bea887 100644 --- a/src/util/JoinAlgorithms/FindUndefRanges.h +++ b/src/util/JoinAlgorithms/FindUndefRanges.h @@ -196,35 +196,4 @@ struct FindSmallerUndefRanges { } }; constexpr FindSmallerUndefRanges findSmallerUndefRanges; -/* -template -auto findSmallerUndefRanges(const auto& row, It begin, It end, - bool& resultMightBeUnsorted) - -> cppcoro::generator { - size_t numLastUndefined = 0; - assert(row.size() > 0); - auto it = ql::ranges::rbegin(row); - auto rend = ql::ranges::rend(row); - for (; it < rend; ++it) { - if (*it != Id::makeUndefined()) { - break; - } - ++numLastUndefined; - } - - for (; it < rend; ++it) { - if (*it == Id::makeUndefined()) { - return findSmallerUndefRangesArbitrary(row, begin, end, - resultMightBeUnsorted); - } - } - if (numLastUndefined == 0) { - return findSmallerUndefRangesForRowsWithoutUndef(row, begin, end, - resultMightBeUnsorted); - } else { - return findSmallerUndefRangesForRowsWithUndefInLastColumns( - row, numLastUndefined, begin, end, resultMightBeUnsorted); - } -} -*/ } // namespace ad_utility diff --git a/test/QueryPlannerTest.cpp b/test/QueryPlannerTest.cpp index 90462f3cc3..c7d806319e 100644 --- a/test/QueryPlannerTest.cpp +++ b/test/QueryPlannerTest.cpp @@ -2906,10 +2906,21 @@ TEST(QueryPlanner, Describe) { } // 
____________________________________________________________________________ -TEST(QueryPlanner, GroupByRedundanteParensAndVariables) { +TEST(QueryPlanner, GroupByRedundantParensAndVariables) { auto matcher = h::GroupBy({Variable{"?x"}}, {}, h::IndexScanFromStrings("?x", "?y", "?z")); h::expect("SELECT ?x { ?x ?y ?z} GROUP BY (?x)", matcher); h::expect("SELECT ?x { ?x ?y ?z} GROUP BY ?x ?x", matcher); h::expect("SELECT ?x { ?x ?y ?z} GROUP BY ?x ?x (?x)", matcher); } + +// ____________________________________________________________________________ +TEST(QueryPlanner, Exists) { + auto xyz = h::IndexScanFromStrings("?x", "?y", "?z"); + auto a = h::IndexScanFromStrings("?x", "?y", "?z"); + h::expect( + "SELECT * { ?x ?y ?z FILTER EXISTS {?a ?b ?c}}", + h::Filter("EXISTS {?a ?b ?c}", + h::ExistsJoin(h::IndexScanFromStrings("?x", "?y", "?z"), + h::IndexScanFromStrings("?a", "?b", "?c")))); +} diff --git a/test/QueryPlannerTestHelpers.h b/test/QueryPlannerTestHelpers.h index c300bf0d5f..f53f30c5bb 100644 --- a/test/QueryPlannerTestHelpers.h +++ b/test/QueryPlannerTestHelpers.h @@ -15,6 +15,7 @@ #include "engine/CartesianProductJoin.h" #include "engine/CountAvailablePredicates.h" #include "engine/Describe.h" +#include "engine/ExistsJoin.h" #include "engine/Filter.h" #include "engine/GroupBy.h" #include "engine/IndexScan.h" @@ -405,6 +406,12 @@ inline QetMatcher Describe( AD_PROPERTY(::Describe, getDescribe, describeMatcher))); } +// Match an `ExistsJoin` +inline QetMatcher ExistsJoin(const QetMatcher& leftChild, + const QetMatcher& rightChild) { + return RootOperation<::ExistsJoin>(AllOf(children(leftChild, rightChild))); +} + // inline QetMatcher QetWithWarnings( const std::vector& warningSubstrings, diff --git a/test/SparqlAntlrParserTest.cpp b/test/SparqlAntlrParserTest.cpp index 0803f96f03..f5a65169b2 100644 --- a/test/SparqlAntlrParserTest.cpp +++ b/test/SparqlAntlrParserTest.cpp @@ -4,6 +4,7 @@ // Julian Mundhahs // Hannah Bast +#include #include #include @@ -14,6 
+15,7 @@ #include "./SparqlExpressionTestHelpers.h" #include "./util/GTestHelpers.h" #include "./util/TripleComponentTestHelpers.h" +#include "QueryPlannerTestHelpers.h" #include "SparqlAntlrParserTestHelpers.h" #include "engine/sparqlExpressions/CountStarExpression.h" #include "engine/sparqlExpressions/GroupConcatExpression.h" @@ -1860,6 +1862,43 @@ TEST(SparqlParser, binaryStringExpressions) { expectBuiltInCall("STRBEFORE(?x, ?y)", makeMatcher(&makeStrBeforeExpression)); } +// Matchers for EXISTS and NOT EXISTS functions. +namespace existsTestHelpers { +using namespace sparqlExpression; +using namespace ::testing; + +// Match an EXISTS function +auto existsMatcher(Matcher pattern) { + return Pointee(WhenDynamicCastTo( + AD_PROPERTY(ExistsExpression, argument, pattern))); +} +// Match a NOT EXISTS function +auto notExistsMatcher(Matcher pattern) { + return builtInCallTestHelpers::matchNaryWithChildrenMatchers( + &makeUnaryNegateExpression, existsMatcher(pattern)); +} +} // namespace existsTestHelpers + +// _____________________________________________________________________________ +TEST(SparqlParser, Exists) { + using namespace existsTestHelpers; + auto expectBuiltInCall = ExpectCompleteParse<&Parser::builtInCall>{}; + // A matcher that matches the query `SELECT * { ?x ?foo}`, where the + // FROM and FROM NAMED clauses can still be specified via arguments. 
+ using Graphs = ScanSpecificationAsTripleComponent::Graphs; + auto selectABarFooMatcher = [](Graphs defaultGraphs = std::nullopt, + Graphs namedGraphs = std::nullopt) { + return testing::AllOf(m::SelectQuery( + m::AsteriskSelect(), + m::GraphPattern(m::Triples({{Var{"?a"}, "", Var{"?foo"}}})), + defaultGraphs, namedGraphs)); + }; + expectBuiltInCall("EXISTS {?a ?foo}", + existsMatcher(selectABarFooMatcher())); + expectBuiltInCall("NOT EXISTS {?a ?foo}", + notExistsMatcher(selectABarFooMatcher())); +} + namespace aggregateTestHelpers { using namespace sparqlExpression; From cbbc771c64251f3ec69b342bbcda02fc691a5c74 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Wed, 8 Jan 2025 11:00:59 +0100 Subject: [PATCH 10/24] Fix another bug. Signed-off-by: Johannes Kalmbach --- test/QueryPlannerTest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/QueryPlannerTest.cpp b/test/QueryPlannerTest.cpp index c7d806319e..8d68a4b1e5 100644 --- a/test/QueryPlannerTest.cpp +++ b/test/QueryPlannerTest.cpp @@ -2917,7 +2917,7 @@ TEST(QueryPlanner, GroupByRedundantParensAndVariables) { // ____________________________________________________________________________ TEST(QueryPlanner, Exists) { auto xyz = h::IndexScanFromStrings("?x", "?y", "?z"); - auto a = h::IndexScanFromStrings("?x", "?y", "?z"); + auto ab = h::IndexScanFromStrings("?x", "?y", "?z"); h::expect( "SELECT * { ?x ?y ?z FILTER EXISTS {?a ?b ?c}}", h::Filter("EXISTS {?a ?b ?c}", From 91e5802c33d798e1b9cb49326079a9ddba1b902a Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Wed, 8 Jan 2025 11:03:12 +0100 Subject: [PATCH 11/24] blub. 
Signed-off-by: Johannes Kalmbach --- test/QueryPlannerTest.cpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/test/QueryPlannerTest.cpp b/test/QueryPlannerTest.cpp index 8d68a4b1e5..6f8f40d47e 100644 --- a/test/QueryPlannerTest.cpp +++ b/test/QueryPlannerTest.cpp @@ -2917,10 +2917,7 @@ TEST(QueryPlanner, GroupByRedundantParensAndVariables) { // ____________________________________________________________________________ TEST(QueryPlanner, Exists) { auto xyz = h::IndexScanFromStrings("?x", "?y", "?z"); - auto ab = h::IndexScanFromStrings("?x", "?y", "?z"); - h::expect( - "SELECT * { ?x ?y ?z FILTER EXISTS {?a ?b ?c}}", - h::Filter("EXISTS {?a ?b ?c}", - h::ExistsJoin(h::IndexScanFromStrings("?x", "?y", "?z"), - h::IndexScanFromStrings("?a", "?b", "?c")))); + auto abc = h::IndexScanFromStrings("?a", "?b", "?c"); + h::expect("SELECT * { ?x ?y ?z FILTER EXISTS {?a ?b ?c}}", + h::Filter("EXISTS {?a ?b ?c}", h::ExistsJoin(xyz, abc))); } From c3a9a7df4b46ac5e0e720c2ca4a40e9d1f5a0b0e Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Wed, 8 Jan 2025 12:31:54 +0100 Subject: [PATCH 12/24] Added some more tests. 
Signed-off-by: Johannes Kalmbach --- .../sparqlParser/SparqlQleverVisitor.cpp | 14 +++++---- src/parser/sparqlParser/SparqlQleverVisitor.h | 1 + test/QueryPlannerTest.cpp | 29 +++++++++++++++++++ 3 files changed, 39 insertions(+), 5 deletions(-) diff --git a/src/parser/sparqlParser/SparqlQleverVisitor.cpp b/src/parser/sparqlParser/SparqlQleverVisitor.cpp index 903544c96a..32b050db9b 100644 --- a/src/parser/sparqlParser/SparqlQleverVisitor.cpp +++ b/src/parser/sparqlParser/SparqlQleverVisitor.cpp @@ -268,6 +268,7 @@ ParsedQuery Visitor::visit(Parser::ConstructQueryContext* ctx) { ParsedQuery query; query.datasetClauses_ = parsedQuery::DatasetClauses::fromClauses( visitVector(ctx->datasetClause())); + activeDatasetClauses_ = query.datasetClauses_; if (ctx->constructTemplate()) { query._clause = visit(ctx->constructTemplate()) .value_or(parsedQuery::ConstructClause{}); @@ -303,9 +304,9 @@ ParsedQuery Visitor::visit(Parser::DescribeQueryContext* ctx) { } // Parse the FROM and FROM NAMED clauses. - auto datasetClauses = parsedQuery::DatasetClauses::fromClauses( + activeDatasetClauses_ = parsedQuery::DatasetClauses::fromClauses( visitVector(ctx->datasetClause())); - describeClause.datasetClauses_ = datasetClauses; + describeClause.datasetClauses_ = activeDatasetClauses_; // Parse the WHERE clause and construct a SELECT query from it. For `DESCRIBE // *`, add each visible variable as a resource to describe. 
@@ -336,7 +337,7 @@ ParsedQuery Visitor::visit(Parser::DescribeQueryContext* ctx) { parsedQuery_.addSolutionModifiers(visit(ctx->solutionModifier())); parsedQuery_._rootGraphPattern._graphPatterns.emplace_back( std::move(describeClause)); - parsedQuery_.datasetClauses_ = datasetClauses; + parsedQuery_.datasetClauses_ = activeDatasetClauses_; auto constructClause = ParsedQuery::ConstructClause{}; using G = GraphTerm; using V = Variable; @@ -352,6 +353,7 @@ ParsedQuery Visitor::visit(Parser::AskQueryContext* ctx) { parsedQuery_._clause = ParsedQuery::AskClause{}; parsedQuery_.datasetClauses_ = parsedQuery::DatasetClauses::fromClauses( visitVector(ctx->datasetClause())); + activeDatasetClauses_ = parsedQuery_.datasetClauses_; visitWhereClause(ctx->whereClause(), parsedQuery_); // NOTE: It can make sense to have solution modifiers with an ASK query, for // example, a GROUP BY with a HAVING. @@ -595,6 +597,8 @@ ParsedQuery Visitor::visit(Parser::ModifyContext* ctx) { }; AD_CORRECTNESS_CHECK(visibleVariables_.empty()); auto graphPattern = visit(ctx->groupGraphPattern()); + parsedQuery_.datasetClauses_ = + parsedQuery::DatasetClauses::fromClauses(visitVector(ctx->usingClause())); parsedQuery_._rootGraphPattern = std::move(graphPattern); parsedQuery_.registerVariablesVisibleInQueryBody(visibleVariables_); visibleVariables_.clear(); @@ -605,8 +609,6 @@ ParsedQuery Visitor::visit(Parser::ModifyContext* ctx) { checkTriples(op.toDelete_); visitIf(&op.with_, ctx->iri()); parsedQuery_._clause = parsedQuery::UpdateClause{op}; - parsedQuery_.datasetClauses_ = - parsedQuery::DatasetClauses::fromClauses(visitVector(ctx->usingClause())); return parsedQuery_; } @@ -1174,6 +1176,7 @@ ParsedQuery Visitor::visit(Parser::SelectQueryContext* ctx) { parsedQuery_._clause = visit(ctx->selectClause()); parsedQuery_.datasetClauses_ = parsedQuery::DatasetClauses::fromClauses( visitVector(ctx->datasetClause())); + activeDatasetClauses_ = parsedQuery_.datasetClauses_; 
visitWhereClause(ctx->whereClause(), parsedQuery_); parsedQuery_.addSolutionModifiers(visit(ctx->solutionModifier())); return parsedQuery_; @@ -2434,6 +2437,7 @@ ExpressionPtr Visitor::visitExists(Parser::GroupGraphPatternContext* pattern, ParsedQuery query = std::exchange(parsedQuery_, std::move(queryBackup)); query.selectClause().setAsterisk(); query._rootGraphPattern = std::move(group); + query.datasetClauses_ = activeDatasetClauses_; visibleVariables_ = std::move(visibleVariablesSoFar); auto exists = std::make_unique(std::move(query)); diff --git a/src/parser/sparqlParser/SparqlQleverVisitor.h b/src/parser/sparqlParser/SparqlQleverVisitor.h index 3e7b63c3ad..3d7aa0dd86 100644 --- a/src/parser/sparqlParser/SparqlQleverVisitor.h +++ b/src/parser/sparqlParser/SparqlQleverVisitor.h @@ -78,6 +78,7 @@ class SparqlQleverVisitor { // query. This may contain duplicates. A variable is added via // `addVisibleVariable`. std::vector visibleVariables_{}; + ParsedQuery::DatasetClauses activeDatasetClauses_; PrefixMap prefixMap_{}; // We need to remember the prologue (prefix declarations) when we encounter it // because we need it when we encounter a SERVICE query. When there is no diff --git a/test/QueryPlannerTest.cpp b/test/QueryPlannerTest.cpp index 6f8f40d47e..518833bb02 100644 --- a/test/QueryPlannerTest.cpp +++ b/test/QueryPlannerTest.cpp @@ -2918,6 +2918,35 @@ TEST(QueryPlanner, GroupByRedundantParensAndVariables) { TEST(QueryPlanner, Exists) { auto xyz = h::IndexScanFromStrings("?x", "?y", "?z"); auto abc = h::IndexScanFromStrings("?a", "?b", "?c"); + using V = Variable; + // Simple tests for EXISTS with FILTER, BIND, and GROUP BY. 
h::expect("SELECT * { ?x ?y ?z FILTER EXISTS {?a ?b ?c}}", h::Filter("EXISTS {?a ?b ?c}", h::ExistsJoin(xyz, abc))); + h::expect("SELECT * { ?x ?y ?z BIND(EXISTS {?a ?b ?c} as ?bound)}", + h::Bind(h::ExistsJoin(xyz, abc), "EXISTS {?a ?b ?c}", + Variable("?bound"))); + h::expect( + "SELECT ?x (SAMPLE(EXISTS{?a ?b ?c}) as ?s) { ?x ?y ?z } GROUP BY ?x", + h::GroupBy({V{"?x"}}, {"(SAMPLE(EXISTS{?a ?b ?c}) as ?s)"}, + h::ExistsJoin(xyz, abc))); + + // Test the interaction of FROM [NAMED] with EXISTS. + + using H = ad_utility::HashSet; + auto xyzg = h::IndexScanFromStrings("?x", "?y", "?z", {}, H{""}); + auto abcg = h::IndexScanFromStrings("?a", "?b", "?c", {}, H{""}); + + auto existsJoin = h::ExistsJoin(xyzg, abcg); + auto filter = h::Filter("EXISTS {?a ?b ?c}", existsJoin); + + // Test all different kinds of queries. + // TODO There is a more elegant way to reduce the code duplication + // (use a lambda that only changes the beginning of the query). + h::expect("SELECT * FROM { ?x ?y ?z FILTER EXISTS {?a ?b ?c}}", filter); + h::expect("ASK FROM { ?x ?y ?z FILTER EXISTS {?a ?b ?c}}", filter); + h::expect( + "CONSTRUCT { } FROM { ?x ?y ?z FILTER EXISTS {?a ?b ?c}}", + filter); + h::expect("Describe ?x FROM { ?x ?y ?z FILTER EXISTS {?a ?b ?c}}", + h::Describe(::testing::_, filter)); } From 0adbfa609e5a22c799e7ec6c737a58637697c198 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Wed, 8 Jan 2025 17:25:35 +0100 Subject: [PATCH 13/24] Add some tests at least for the parser and query planner. 
Signed-off-by: Johannes Kalmbach --- .../sparqlParser/SparqlQleverVisitor.cpp | 1 - test/QueryPlannerTest.cpp | 18 +++++++++++++----- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/src/parser/sparqlParser/SparqlQleverVisitor.cpp b/src/parser/sparqlParser/SparqlQleverVisitor.cpp index 32b050db9b..41e297120c 100644 --- a/src/parser/sparqlParser/SparqlQleverVisitor.cpp +++ b/src/parser/sparqlParser/SparqlQleverVisitor.cpp @@ -7,7 +7,6 @@ #include "parser/sparqlParser/SparqlQleverVisitor.h" -#include #include #include diff --git a/test/QueryPlannerTest.cpp b/test/QueryPlannerTest.cpp index 518833bb02..89601732e8 100644 --- a/test/QueryPlannerTest.cpp +++ b/test/QueryPlannerTest.cpp @@ -2930,17 +2930,13 @@ TEST(QueryPlanner, Exists) { h::GroupBy({V{"?x"}}, {"(SAMPLE(EXISTS{?a ?b ?c}) as ?s)"}, h::ExistsJoin(xyz, abc))); - // Test the interaction of FROM [NAMED] with EXISTS. - + // Test the interaction of FROM with EXISTS. using H = ad_utility::HashSet; auto xyzg = h::IndexScanFromStrings("?x", "?y", "?z", {}, H{""}); auto abcg = h::IndexScanFromStrings("?a", "?b", "?c", {}, H{""}); auto existsJoin = h::ExistsJoin(xyzg, abcg); auto filter = h::Filter("EXISTS {?a ?b ?c}", existsJoin); - - // Test all different kinds of queries. - // TODO There is a more elegant way to reduce the code duplication // (use a lambda that only changes the beginning of the query). 
h::expect("SELECT * FROM { ?x ?y ?z FILTER EXISTS {?a ?b ?c}}", filter); h::expect("ASK FROM { ?x ?y ?z FILTER EXISTS {?a ?b ?c}}", filter); @@ -2949,4 +2945,16 @@ TEST(QueryPlanner, Exists) { filter); h::expect("Describe ?x FROM { ?x ?y ?z FILTER EXISTS {?a ?b ?c}}", h::Describe(::testing::_, filter)); + + // Test the interaction of FROM NAMES with EXISTS + auto varG = std::vector{Variable{"?g"}}; + std::vector graphCol{ADDITIONAL_COLUMN_GRAPH_ID}; + auto uvcg = + h::IndexScanFromStrings("?u", "?v", "?c", {}, H{""}, varG, graphCol); + existsJoin = h::ExistsJoin(xyzg, h::UnorderedJoins(abcg, uvcg)); + filter = h::Filter("EXISTS {?a ?b ?c. GRAPH ?g { ?u ?v ?c}}", existsJoin); + h::expect( + "SELECT * FROM FROM NAMED { ?x ?y ?z FILTER EXISTS {?a ?b ?c. " + "GRAPH ?g { ?u ?v ?c}}}", + filter); } From babd2940a203258cd95fb9dc332c93e24476ebc1 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Thu, 9 Jan 2025 09:38:21 +0100 Subject: [PATCH 14/24] Some more tests. As a next step, I want to write some comments. 
Signed-off-by: Johannes Kalmbach --- test/engine/CMakeLists.txt | 1 + test/engine/ExistsJoinTest.cpp | 94 ++++++++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+) create mode 100644 test/engine/ExistsJoinTest.cpp diff --git a/test/engine/CMakeLists.txt b/test/engine/CMakeLists.txt index fef9ffed39..41b2b463ad 100644 --- a/test/engine/CMakeLists.txt +++ b/test/engine/CMakeLists.txt @@ -12,3 +12,4 @@ addLinkAndDiscoverTest(BindTest engine) addLinkAndRunAsSingleTest(SpatialJoinAlgorithmsTest engine) addLinkAndDiscoverTestSerial(QueryExecutionTreeTest engine) addLinkAndDiscoverTestSerial(DescribeTest engine) +addLinkAndDiscoverTestSerial(ExistsJoinTest engine) diff --git a/test/engine/ExistsJoinTest.cpp b/test/engine/ExistsJoinTest.cpp new file mode 100644 index 0000000000..af72e5fbb6 --- /dev/null +++ b/test/engine/ExistsJoinTest.cpp @@ -0,0 +1,94 @@ +// Copyright 2024, University of Freiburg +// Chair of Algorithms and Data Structures +// Author: Johannes Kalmbach + +#include + +#include "../util/GTestHelpers.h" +#include "../util/IdTableHelpers.h" +#include "../util/IndexTestHelpers.h" +#include "engine/ExistsJoin.h" +#include "engine/IndexScan.h" +#include "engine/NeutralElementOperation.h" +#include "engine/QueryExecutionTree.h" + +using namespace ad_utility::testing; + +namespace { +void testExists(const VectorTable& leftInput, const VectorTable& rightInput, + std::vector expectedAsBool, size_t numJoinColumns) { + AD_CORRECTNESS_CHECK(leftInput.size() == expectedAsBool.size()); + auto left = makeIdTableFromVector(leftInput); + auto right = makeIdTableFromVector(rightInput); + AD_CORRECTNESS_CHECK(left.numColumns() >= numJoinColumns); + AD_CORRECTNESS_CHECK(right.numColumns() >= numJoinColumns); + + auto qec = getQec(); + using V = Variable; + using Vars = std::vector>; + + // TODO Support more than one join column. + // TODO also randomly permute the join columns. 
+ + auto joinCol = [](size_t i) { return V{absl::StrCat("?joinCol_", i)}; }; + auto nonJoinCol = [i = 0]() mutable { + return V{absl::StrCat("?nonJoinCol_", i++)}; + }; + + auto makeChild = [&](const IdTable& input) { + Vars vars; + for (size_t i : ad_utility::integerRange(numJoinColumns)) { + vars.push_back(joinCol(i)); + }; + for ([[maybe_unused]] size_t i : + ql::views::iota(numJoinColumns, input.numColumns())) { + vars.push_back(nonJoinCol()); + } + return ad_utility::makeExecutionTree(qec, input.clone(), + vars); + }; + + auto exists = + ExistsJoin{qec, makeChild(left), makeChild(right), V{"?exists"}}; + + EXPECT_EQ(exists.getResultWidth(), left.numColumns() + 1); + + auto res = exists.computeResultOnlyForTesting(); + const auto& table = res.idTable(); + ASSERT_EQ(table.numRows(), left.size()); + IdTable expected = left.clone(); + expected.addEmptyColumn(); + ql::ranges::transform(expectedAsBool, expected.getColumn(2).begin(), + &Id::makeFromBool); + EXPECT_THAT(table, matchesIdTable(expected)); +} +} // namespace + +TEST(Exists, computeResult) { + // Single join column. + testExists({{3, 6}, {4, 7}, {5, 8}}, {{3, 15}, {3, 19}, {5, 37}}, + {true, false, true}, 1); + + // UNDEF matches everything + auto U = Id::makeUndefined(); + testExists({{U, 13}, {3, 6}, {4, 7}, {5, 8}}, {{3, 15}, {3, 19}, {5, 37}}, + {true, true, false, true}, 1); + testExists({{3, 6}, {4, 7}, {5, 8}}, {{U, 15}}, {true, true, true}, 1); + + // Two join columns + testExists({{3, 6}, {4, 7}, {5, 8}}, {{3, 15}, {3, 19}, {5, 37}}, + {false, false, false}, 2); + testExists({{3, 6}, {4, 7}, {5, 8}}, + {{3, 6, 11}, {3, 19, 7}, {4, 8, 0}, {5, 8, 37}}, + {true, false, true}, 2); + + // Two join columns with UNDEF + testExists({{2, 2}, {3, U}, {4, 8}, {5, 8}}, + {{U, 8}, {3, 15}, {3, 19}, {5, U}, {5, 37}}, + {false, true, true, true}, 2); + testExists({{U, U}}, {{13, 17}}, {true}, 2); + testExists({{13, 17}, {25, 38}}, {{U, U}}, {true, true}, 2); + + // TODO Add tests with unsorted inputs. 
+ // TODO Test empty inputs on one side. +} From 6766af39ca5e073d1669807dfa3e832a29fe964c Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Thu, 9 Jan 2025 10:35:08 +0100 Subject: [PATCH 15/24] Added some comments. Signed-off-by: Johannes Kalmbach --- src/engine/Bind.cpp | 2 +- src/engine/ExistsJoin.cpp | 31 +++++++++++++++---- src/engine/ExistsJoin.h | 22 ++++++++++--- src/engine/Filter.cpp | 4 +-- src/engine/GroupBy.cpp | 2 +- .../sparqlExpressions/ExistsExpression.cpp | 5 --- .../sparqlExpressions/ExistsExpression.h | 29 +++++++++++------ 7 files changed, 66 insertions(+), 29 deletions(-) delete mode 100644 src/engine/sparqlExpressions/ExistsExpression.cpp diff --git a/src/engine/Bind.cpp b/src/engine/Bind.cpp index 276f04e9fc..ed98495d72 100644 --- a/src/engine/Bind.cpp +++ b/src/engine/Bind.cpp @@ -16,7 +16,7 @@ Bind::Bind(QueryExecutionContext* qec, std::shared_ptr subtree, parsedQuery::Bind b) : Operation(qec), _subtree(std::move(subtree)), _bind(std::move(b)) { - _subtree = ExistsJoin::addExistsScansToSubtree( + _subtree = ExistsJoin::addExistsJoinsToSubtree( _bind._expression, std::move(_subtree), getExecutionContext(), cancellationHandle_); } diff --git a/src/engine/ExistsJoin.cpp b/src/engine/ExistsJoin.cpp index d8d3f564d1..7ca230c799 100644 --- a/src/engine/ExistsJoin.cpp +++ b/src/engine/ExistsJoin.cpp @@ -1,4 +1,4 @@ -// Copyright 2023, University of Freiburg, +// Copyright 2025, University of Freiburg, // Chair of Algorithms and Data Structures. 
// Author: Johannes Kalmbach @@ -25,12 +25,12 @@ ExistsJoin::ExistsJoin(QueryExecutionContext* qec, // _____________________________________________________________________________ string ExistsJoin::getCacheKeyImpl() const { - return absl::StrCat("EXISTS SCAN left: ", left_->getCacheKey(), + return absl::StrCat("EXISTS JOIN left: ", left_->getCacheKey(), " right: ", right_->getCacheKey()); } // _____________________________________________________________________________ -string ExistsJoin::getDescriptor() const { return "EXISTS scan"; } +string ExistsJoin::getDescriptor() const { return "Exists Join"; } // ____________________________________________________________________________ VariableToColumnMap ExistsJoin::computeVariableToColumnMap() const { @@ -70,6 +70,7 @@ uint64_t ExistsJoin::getSizeEstimateBeforeLimit() { // ____________________________________________________________________________ size_t ExistsJoin::getCostEstimate() { + // The implementation is a linear zipper join. return left_->getCostEstimate() + right_->getCostEstimate() + left_->getSizeEstimate() + right_->getSizeEstimate(); } @@ -81,9 +82,16 @@ ProtoResult ExistsJoin::computeResult([[maybe_unused]] bool requestLaziness) { const auto& left = leftRes->idTable(); const auto& right = rightRes->idTable(); + // We reuse the generic `zipperJoinWithUndef` utility in the following way: + // It has (among others) two callbacks: One for each matching pair of rows + // from left and right, and one for rows in the left input that have no + // matching counterpart in the right input. The first callback can be a noop, + // and the second callback gives us exactly `NOT EXISTS`. + + // Only extract the join columns from both inputs to make the following code + // easier. 
ad_utility::JoinColumnMapping joinColumnData{joinColumns_, left.numColumns(), right.numColumns()}; - IdTableView<0> joinColumnsLeft = left.asColumnSubsetView(joinColumnData.jcsLeft()); IdTableView<0> joinColumnsRight = @@ -105,15 +113,20 @@ ProtoResult ExistsJoin::computeResult([[maybe_unused]] bool requestLaziness) { (stdr::any_of(joinColumnsLeft.getColumn(col), &Id::isUndefined)); }); - auto noopRowAdder = [](auto&&...) {}; + // Nothing to do for the actual matches. + auto noopRowAdder = ad_utility::noop; + // Store the indices of rows for which `exists` is `false`. std::vector> notExistsIndices{ allocator()}; + // The callback is called with iterators, so we convert them back to indices. auto actionForNotExisting = [¬ExistsIndices, begin = joinColumnsLeft.begin()](const auto& itLeft) { notExistsIndices.push_back(itLeft - begin); }; + // Run the actual zipper join, with the possible optimization if we know, that + // there can be no UNDEF values. auto checkCancellationLambda = [this] { checkCancellation(); }; auto runZipperJoin = [&](auto findUndef) { [[maybe_unused]] auto numOutOfOrder = ad_utility::zipperJoinWithUndef( @@ -135,16 +148,22 @@ ProtoResult ExistsJoin::computeResult([[maybe_unused]] bool requestLaziness) { for (size_t notExistsIndex : notExistsIndices) { existsCol[notExistsIndex] = Id::makeFromBool(false); } + + // The result is a copy of the left input + and additional columns with only + // boolean values, so the local vocab of the left input is sufficient. 
return {std::move(result), resultSortedOn(), leftRes->getCopyOfLocalVocab()}; } // _____________________________________________________________________________ -std::shared_ptr ExistsJoin::addExistsScansToSubtree( +std::shared_ptr ExistsJoin::addExistsJoinsToSubtree( const sparqlExpression::SparqlExpressionPimpl& expression, std::shared_ptr subtree, QueryExecutionContext* qec, const ad_utility::SharedCancellationHandle& cancellationHandle) { + // First extract all the `EXISTS` functions from the expression. std::vector existsExpressions; expression.getPimpl()->getExistsExpressions(existsExpressions); + + // For each of the EXISTS functions add one `ExistsJoin` for (auto* expr : existsExpressions) { const auto& exists = dynamic_cast(*expr); diff --git a/src/engine/ExistsJoin.h b/src/engine/ExistsJoin.h index 9b9c7483ce..4ff44fe94c 100644 --- a/src/engine/ExistsJoin.h +++ b/src/engine/ExistsJoin.h @@ -7,28 +7,42 @@ #include "engine/Operation.h" #include "engine/QueryExecutionTree.h" +// The implementation of the SPARQL `EXISTS` function. It takes two subtrees, +// and returns the left subtree with an additional boolean column that is `true` +// iff at least one matching row is contained in the right subtree. class ExistsJoin : public Operation { private: + // The left and right child. std::shared_ptr left_; std::shared_ptr right_; std::vector> joinColumns_; + // The variable of the added result column. Variable existsVariable_; - vector _multiplicities; - std::vector> _matchedColumns; - public: + // Constructor. The `existsVariable` (the variable for the added boolean + // column) must not yet be bound by `left`. ExistsJoin(QueryExecutionContext* qec, std::shared_ptr left, std::shared_ptr right, Variable existsVariable); - static std::shared_ptr addExistsScansToSubtree( + // For a given subtree and a given expression, extract all the + // `ExistsExpressions` from the expression and add one `ExistsJoin` per + // `ExistsExpression` to the subtree. 
The left side of the `ExistsJoin` is the + // input subtree, the right hand side of the `ExistsJoin` as well as the + // variable to which the result is bound are extracted from the + // `ExistsExpression`. The returned subtree can then be used to evaluate the + // `expression`. Note: `ExistsExpression` is a simple dummy that only reads + // the values of the column that is added by the `ExistsJoin`. + static std::shared_ptr addExistsJoinsToSubtree( const sparqlExpression::SparqlExpressionPimpl& expression, std::shared_ptr subtree, QueryExecutionContext* qec, const ad_utility::SharedCancellationHandle& cancellationHandle); + // All following functions are inherited from `Operation`, see there for + // comments. protected: string getCacheKeyImpl() const override; diff --git a/src/engine/Filter.cpp b/src/engine/Filter.cpp index 9da7c12724..08393d9fb5 100644 --- a/src/engine/Filter.cpp +++ b/src/engine/Filter.cpp @@ -12,11 +12,9 @@ #include "engine/CallFixedSize.h" #include "engine/ExistsJoin.h" #include "engine/QueryExecutionTree.h" -#include "engine/QueryPlanner.h" #include "engine/sparqlExpressions/SparqlExpression.h" #include "engine/sparqlExpressions/SparqlExpressionGenerators.h" #include "engine/sparqlExpressions/SparqlExpressionValueGetters.h" -#include "sparqlExpressions/ExistsExpression.h" using std::endl; using std::string; @@ -31,7 +29,7 @@ Filter::Filter(QueryExecutionContext* qec, : Operation(qec), _subtree(std::move(subtree)), _expression{std::move(expression)} { - _subtree = ExistsJoin::addExistsScansToSubtree( + _subtree = ExistsJoin::addExistsJoinsToSubtree( _expression, std::move(_subtree), getExecutionContext(), cancellationHandle_); setPrefilterExpressionForChildren(); diff --git a/src/engine/GroupBy.cpp b/src/engine/GroupBy.cpp index 3e8af1cb29..65c7b85d11 100644 --- a/src/engine/GroupBy.cpp +++ b/src/engine/GroupBy.cpp @@ -55,7 +55,7 @@ GroupBy::GroupBy(QueryExecutionContext* qec, vector groupByVariables, auto sortColumns = 
computeSortColumns(subtree.get()); for (const auto& alias : _aliases) { - subtree = ExistsJoin::addExistsScansToSubtree( + subtree = ExistsJoin::addExistsJoinsToSubtree( alias._expression, std::move(subtree), getExecutionContext(), cancellationHandle_); } diff --git a/src/engine/sparqlExpressions/ExistsExpression.cpp b/src/engine/sparqlExpressions/ExistsExpression.cpp deleted file mode 100644 index 6737d3ed7b..0000000000 --- a/src/engine/sparqlExpressions/ExistsExpression.cpp +++ /dev/null @@ -1,5 +0,0 @@ -// -// Created by kalmbacj on 1/7/25. -// - -#include "ExistsExpression.h" diff --git a/src/engine/sparqlExpressions/ExistsExpression.h b/src/engine/sparqlExpressions/ExistsExpression.h index 343c195e82..1313b342b0 100644 --- a/src/engine/sparqlExpressions/ExistsExpression.h +++ b/src/engine/sparqlExpressions/ExistsExpression.h @@ -1,6 +1,6 @@ -// -// Created by kalmbacj on 1/7/25. -// +// Copyright 2025, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Johannes Kalmbach #pragma once @@ -9,19 +9,28 @@ #include "engine/sparqlExpressions/SparqlExpression.h" #include "parser/ParsedQuery.h" +// The expression that corresponds to the `EXISTS` function. +// The implementation only reads the value of a precomputed variable. The actual +// computation of EXISTS is done by the `ExistsJoin` class. namespace sparqlExpression { class ExistsExpression : public SparqlExpression { private: + // The argument (a group graph pattern) of the EXISTS. This is set during the + // parsing and is required and read by the `ExistsJoin` class. ParsedQuery argument_; + + // Each `ExistsExpression` has a unique index and a unique variable name that + // is used to communicate between the `ExistsExpression` and the `ExistsJoin`. 
static inline std::atomic indexCounter_ = 0; size_t index_ = ++indexCounter_; Variable variable_{absl::StrCat("?ql_internal_exists_", index_)}; public: + explicit ExistsExpression(ParsedQuery query) : argument_{std::move(query)} {} const auto& argument() const { return argument_; } const auto& variable() const { return variable_; } - ExistsExpression(ParsedQuery query) : argument_{std::move(query)} {} + // Evaluate only reads the variable which is written by the `ExistsJoin`. ExpressionResult evaluate(EvaluationContext* context) const override { AD_CONTRACT_CHECK(context->_variableToColumnMap.contains(variable_)); return variable_; @@ -31,17 +40,19 @@ class ExistsExpression : public SparqlExpression { [[nodiscard]] string getCacheKey( const VariableToColumnMap& varColMap) const override { if (varColMap.contains(variable_)) { - return absl::StrCat("EXISTS WITH COL ", + return absl::StrCat("ExistsExpression col# ", varColMap.at(variable_).columnIndex_); } else { - // This means that the necessary `ExistsScan` hasn't been set up yet. - // It is not possible to cache such incomplete operations, so we return - // a random cache key. + // This means that the necessary `ExistsJoin` hasn't been set up yet. This + // can for example happen if the parsing (which sets up the + // `ExistsExpression`) is completed, but the query planning (which sets up + // the `ExistsJoin` is still in progress). It is not possible to cache + // such incomplete operations, so we return a random cache key. return std::to_string(ad_utility::FastRandomIntGenerator{}()); } } - // ____________________________________________________________________________ + // This is in fact an `ExistsExpression`. bool isExistsExpression() const override { return true; } private: From 3a574eab1a8ad78482ff2f781bb6ecad108abc7d Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Thu, 9 Jan 2025 10:55:08 +0100 Subject: [PATCH 16/24] This is commented and very clean. 
The only thing that is missing, is some corner case tests, and maybe cleaning up the parsing of the active dataset clauses. Signed-off-by: Johannes Kalmbach --- src/engine/GroupBy.cpp | 21 +++++++++++++----- .../sparqlParser/SparqlQleverVisitor.cpp | 22 ++++++++++++------- src/parser/sparqlParser/SparqlQleverVisitor.h | 3 +++ 3 files changed, 32 insertions(+), 14 deletions(-) diff --git a/src/engine/GroupBy.cpp b/src/engine/GroupBy.cpp index 65c7b85d11..46ff7a410a 100644 --- a/src/engine/GroupBy.cpp +++ b/src/engine/GroupBy.cpp @@ -373,6 +373,8 @@ ProtoResult GroupBy::computeResult(bool requestLaziness) { } if (useHashMapOptimization) { + // Helper lambda that calls `computeGroupByForHashMapOptimization` for the + // given `subresults`. auto computeWithHashMap = [this, &metadataForUnsequentialData, &groupByCols](auto&& subresults) { auto doCompute = [&] { @@ -383,9 +385,10 @@ ProtoResult GroupBy::computeResult(bool requestLaziness) { return ad_utility::callFixedSize(groupByCols.size(), doCompute); }; + // Now call `computeWithHashMap` and return the result. It expects a range + // of results, so if the result is fully materialized, we create an array + // with a single element. if (subresult->isFullyMaterialized()) { - // `computeWithHashMap` takes a range, so we artificially create one with - // a single input. return computeWithHashMap( std::array{std::pair{std::cref(subresult->idTable()), std::cref(subresult->localVocab())}}); @@ -1513,29 +1516,35 @@ Result GroupBy::computeGroupByForHashMapOptimization( NUM_GROUP_COLUMNS == 0); LocalVocab localVocab; - // Initialize aggregation data + // Initialize the data for the aggregates of the GROUP BY operation. HashMapAggregationData aggregationData( getExecutionContext()->getAllocator(), aggregateAliases, columnIndices.size()); + // Process the input blocks (pairs of `IdTable` and `LocalVocab`) one after + // the other. 
ad_utility::Timer lookupTimer{ad_utility::Timer::Stopped}; ad_utility::Timer aggregationTimer{ad_utility::Timer::Stopped}; for (const auto& [inputTableRef, inputLocalVocabRef] : subresults) { - // Also support `std::reference_wrapper` as the input. const IdTable& inputTable = inputTableRef; const LocalVocab& inputLocalVocab = inputLocalVocabRef; + // Merge the local vocab of each input block. + // + // NOTE: If the input blocks have very similar or even identical non-empty + // local vocabs, no deduplication is performed. localVocab.mergeWith(std::span{&inputLocalVocab, 1}); - // Initialize evaluation context + // Setup the `EvaluationContext` for this input block. sparqlExpression::EvaluationContext evaluationContext( *getExecutionContext(), _subtree->getVariableColumns(), inputTable, getExecutionContext()->getAllocator(), localVocab, cancellationHandle_, deadline_); - evaluationContext._groupedVariables = ad_utility::HashSet{ _groupByVariables.begin(), _groupByVariables.end()}; evaluationContext._isPartOfGroupBy = true; + // Iterate of the rows of this input block. Process (up to) + // `GROUP_BY_HASH_MAP_BLOCK_SIZE` rows at a time. 
for (size_t i = 0; i < inputTable.size(); i += GROUP_BY_HASH_MAP_BLOCK_SIZE) { checkCancellation(); diff --git a/src/parser/sparqlParser/SparqlQleverVisitor.cpp b/src/parser/sparqlParser/SparqlQleverVisitor.cpp index 41e297120c..6c1bf6d7eb 100644 --- a/src/parser/sparqlParser/SparqlQleverVisitor.cpp +++ b/src/parser/sparqlParser/SparqlQleverVisitor.cpp @@ -25,7 +25,6 @@ #include "engine/sparqlExpressions/SampleExpression.h" #include "engine/sparqlExpressions/StdevExpression.h" #include "engine/sparqlExpressions/UuidExpressions.h" -#include "generated/SparqlAutomaticParser.h" #include "global/Constants.h" #include "global/RuntimeParameters.h" #include "parser/GraphPatternOperation.h" @@ -1370,7 +1369,6 @@ SparqlFilter Visitor::visit(Parser::FilterRContext* ctx) { // expression contains unbound variables, because the variables of the FILTER // might be bound after the filter appears in the query (which is perfectly // legal). - auto pimpl = visitExpressionPimpl(ctx->constraint()); return SparqlFilter{visitExpressionPimpl(ctx->constraint())}; } @@ -2429,17 +2427,25 @@ SparqlExpression::Ptr Visitor::visit(Parser::StrReplaceExpressionContext* ctx) { // ____________________________________________________________________________________ ExpressionPtr Visitor::visitExists(Parser::GroupGraphPatternContext* pattern, bool negate) { + // The argument of the EXISTS is a completely independent GroupGraphPattern + // (except for the FROM [NAMED] clauses), so we have to back up and restore + // all global state when parsing EXISTS. auto queryBackup = std::exchange(parsedQuery_, ParsedQuery{}); auto visibleVariablesSoFar = std::move(visibleVariables_); visibleVariables_.clear(); + + // Parse the argument of EXISTS. 
auto group = visit(pattern); - ParsedQuery query = std::exchange(parsedQuery_, std::move(queryBackup)); - query.selectClause().setAsterisk(); - query._rootGraphPattern = std::move(group); - query.datasetClauses_ = activeDatasetClauses_; + ParsedQuery argumentOfExists = + std::exchange(parsedQuery_, std::move(queryBackup)); + argumentOfExists.selectClause().setAsterisk(); + argumentOfExists._rootGraphPattern = std::move(group); + + // EXISTS inherits the FROM [NAMED] clauses from the outer argumentOfExists. + argumentOfExists.datasetClauses_ = activeDatasetClauses_; visibleVariables_ = std::move(visibleVariablesSoFar); - auto exists = - std::make_unique(std::move(query)); + auto exists = std::make_unique( + std::move(argumentOfExists)); if (negate) { return sparqlExpression::makeUnaryNegateExpression(std::move(exists)); } else { diff --git a/src/parser/sparqlParser/SparqlQleverVisitor.h b/src/parser/sparqlParser/SparqlQleverVisitor.h index 3d7aa0dd86..2fd0d6bc9b 100644 --- a/src/parser/sparqlParser/SparqlQleverVisitor.h +++ b/src/parser/sparqlParser/SparqlQleverVisitor.h @@ -78,6 +78,9 @@ class SparqlQleverVisitor { // query. This may contain duplicates. A variable is added via // `addVisibleVariable`. std::vector visibleVariables_{}; + + // The FROM [NAMED] clauses of the query that is currently being parsed. + // Those are currently needed when parsing an EXISTS clause inside the query. ParsedQuery::DatasetClauses activeDatasetClauses_; PrefixMap prefixMap_{}; // We need to remember the prologue (prefix declarations) when we encounter it From 256e38a1bc9207514b269c0dc27376a14779a044 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Thu, 9 Jan 2025 13:20:02 +0100 Subject: [PATCH 17/24] In the middle of patching these things. 
Signed-off-by: Johannes Kalmbach --- src/engine/CheckUsePatternTrick.cpp | 106 ++++++++++++++++++++++++---- 1 file changed, 94 insertions(+), 12 deletions(-) diff --git a/src/engine/CheckUsePatternTrick.cpp b/src/engine/CheckUsePatternTrick.cpp index e7da58ea14..7c21893f46 100644 --- a/src/engine/CheckUsePatternTrick.cpp +++ b/src/engine/CheckUsePatternTrick.cpp @@ -82,13 +82,94 @@ bool isVariableContainedInGraphPatternOperation( }); } +using ValuesClause = parsedQuery::Values; +// TODO How many possible return values do we need here. +bool addValuesClauseToPattern(const Variable& variable, + parsedQuery::GraphPatternOperation& operation, + const SparqlTriple* tripleToIgnore, + const ValuesClause& clause); + +// __________________________________________________________________________ +bool addValuesClause(const Variable& variable, + ParsedQuery::GraphPattern& graphPattern, + const SparqlTriple* tripleToIgnore, + const ValuesClause& result) { + bool containedInFilter = ql::ranges::any_of( + graphPattern._filters, [&variable](const SparqlFilter& filter) { + return filter.expression_.isVariableContained(variable); + }); + auto check = [&](const parsedQuery::GraphPatternOperation& op) { + return addValuesClauseToPattern(variable, op, tripleToIgnore, result); + }; + if (ql::ranges::any_of(graphPattern._graphPatterns, check) || + containedInFilter) { + graphPattern._graphPatterns.insert(graphPattern._graphPatterns.begin(), + result); + } + // Does this need to return false? 
+ return false; +} + +// __________________________________________________________________________ +bool addValuesClauseToPattern(const Variable& variable, + parsedQuery::GraphPatternOperation& operation, + const SparqlTriple* tripleToIgnore, + const ValuesClause& result) { + auto check = [&](parsedQuery::GraphPattern& pattern) { + return addValuesClause(variable, pattern, tripleToIgnore, result); + }; + return operation.visit([&](auto&& arg) -> bool { + using T = std::decay_t; + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v) { + return check(arg._child); + } else if constexpr (std::is_same_v) { + return check(arg._child1) || check(arg._child2); + } else if constexpr (std::is_same_v) { + // Subqueries always are SELECT clauses. + const auto& selectClause = arg.get().selectClause(); + return ad_utility::contains(selectClause.getSelectedVariables(), + variable); + } else if constexpr (std::is_same_v) { + return ad_utility::contains(arg.containedVariables(), variable); + } else if constexpr (std::is_same_v) { + return ad_utility::contains_if( + arg._triples, [&](const SparqlTriple& triple) { + if (&triple == tripleToIgnore) { + return false; + } + return (triple.s_ == variable || + // Complex property paths are not allowed to contain + // variables in SPARQL, so this check is sufficient. + // TODO Still make the interface of the + // `PropertyPath` class typesafe. + triple.p_.asString() == variable.name() || + triple.o_ == variable); + }); + } else if constexpr (std::is_same_v) { + return (&arg != &result) && + ad_utility::contains(arg._inlineValues._variables, variable); + } else if constexpr (std::is_same_v) { + return ad_utility::contains(arg.visibleVariables_, variable); + } else { + static_assert( + std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v); + // The `TransPath` is set up later in the query planning, when this + // function should not be called anymore. + AD_FAIL(); + } + }); +} + // Internal helper function. 
-// Modify the `triples` s.t. the patterns for `subAndPred.subject_` will appear -// in a column with the variable `subAndPred.predicate_` when evaluating and -// joining all the triples. This can be either done by retrieving one of the -// additional columns where the patterns are stored in the PSO and POS -// permutation or, if no triple suitable for adding this column exists, by -// adding a triple `?subject ql:has-pattern ?predicate`. +// Modify the `triples` s.t. the patterns for `subAndPred.subject_` will +// appear in a column with the variable `subAndPred.predicate_` when +// evaluating and joining all the triples. This can be either done by +// retrieving one of the additional columns where the patterns are stored in +// the PSO and POS permutation or, if no triple suitable for adding this +// column exists, by adding a triple `?subject ql:has-pattern ?predicate`. static void rewriteTriplesForPatternTrick(const PatternTrickTuple& subAndPred, std::vector& triples) { // The following lambda tries to find a triple in the `triples` that has the @@ -96,8 +177,8 @@ static void rewriteTriplesForPatternTrick(const PatternTrickTuple& subAndPred, // either the subject or the object) and a fixed predicate (no variable). If // such a triple is found, it is modified s.t. it also scans the // `additionalScanColumn` which has to be the index of the column where the - // patterns of the `triplePosition` are stored in the POS and PSO permutation. - // Return true iff such a triple was found and replaced. + // patterns of the `triplePosition` are stored in the POS and PSO + // permutation. Return true iff such a triple was found and replaced. 
auto findAndRewriteMatchingTriple = [&subAndPred, &triples]( auto triplePosition, size_t additionalScanColumn) { @@ -133,8 +214,9 @@ static void rewriteTriplesForPatternTrick(const PatternTrickTuple& subAndPred, // Check if any of the triples in the `graphPattern` has the form `?s // ql:has-predicate ?p` or `?s ?p ?o` and that the other conditions for the // pattern trick are fulfilled (nameley that the variables `?p` and if present -// `?o` don't appear elsewhere in the `parsedQuery`. If such a triple is found, -// the query is modified such that it behaves as if the triple was replace by +// `?o` don't appear elsewhere in the `parsedQuery`. If such a triple is +// found, the query is modified such that it behaves as if the triple was +// replace by // `?s ql:has-pattern ?p`. See the documentation of // `rewriteTriplesForPatternTrick` above. static std::optional findPatternTrickTuple( @@ -183,8 +265,8 @@ std::optional checkUsePatternTrick( } // We currently accept the pattern trick triple anywhere in the query. - // TODO This loop can be made much easier using ranges and view once - // they are supported by clang. + // TODO This loop can be made much easier using ranges and view + // once they are supported by clang. for (auto& pattern : parsedQuery->children()) { auto* curPattern = std::get_if(&pattern); if (!curPattern) { From f29efc6c129d4e3cb7a734102b38dcf1fbd00837 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Thu, 9 Jan 2025 14:58:26 +0100 Subject: [PATCH 18/24] Also account for the filters when counting the subgraphs. 
Signed-off-by: Johannes Kalmbach --- src/engine/QueryPlanner.cpp | 23 ++++++++++++++++++++--- src/engine/QueryPlanner.h | 5 +++-- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/src/engine/QueryPlanner.cpp b/src/engine/QueryPlanner.cpp index 9dd6b5599c..1df7f6d917 100644 --- a/src/engine/QueryPlanner.cpp +++ b/src/engine/QueryPlanner.cpp @@ -1338,13 +1338,28 @@ QueryPlanner::runDynamicProgrammingOnConnectedComponent( // _____________________________________________________________________________ size_t QueryPlanner::countSubgraphs( - std::vector graph, size_t budget) { + std::vector graph, + const std::vector& filters, size_t budget) { // Remove duplicate plans from `graph`. auto getId = [](const SubtreePlan* v) { return v->_idsOfIncludedNodes; }; ql::ranges::sort(graph, ql::ranges::less{}, getId); graph.erase(std::ranges::unique(graph, ql::ranges::equal_to{}, getId).begin(), graph.end()); + std::vector dummyPlansForFilter; + for (const auto& filter : filters) { + const auto& vars = filter.expression_.containedVariables(); + parsedQuery::SparqlValues values; + for (auto* var : vars) { + values._variables.push_back(*var); + } + dummyPlansForFilter.push_back( + makeSubtreePlan(_qec, std::move(values))); + } + for (const auto& filterPlan : dummyPlansForFilter) { + graph.push_back(&filterPlan); + } + // Qlever currently limits the number of triples etc. per group to be <= 64 // anyway, so we can simply assert here. 
AD_CORRECTNESS_CHECK(graph.size() <= 64, @@ -1366,7 +1381,9 @@ size_t QueryPlanner::countSubgraphs( g.push_back(v); } - return countConnectedSubgraphs::countSubgraphs(g, budget); + auto result = countConnectedSubgraphs::countSubgraphs(g, budget); + LOG(INFO) << "number of subgraphs inside a component " << result << std::endl; + return result; } // _____________________________________________________________________________ @@ -1424,7 +1441,7 @@ vector> QueryPlanner::fillDpTab( g.push_back(&plan); } const size_t budget = RuntimeParameters().get<"query-planning-budget">(); - bool useGreedyPlanning = countSubgraphs(g, budget) > budget; + bool useGreedyPlanning = countSubgraphs(g, filters, budget) > budget; if (useGreedyPlanning) { LOG(INFO) << "Using the greedy query planner for a large connected component" diff --git a/src/engine/QueryPlanner.h b/src/engine/QueryPlanner.h index 52ee540a0a..6271236d58 100644 --- a/src/engine/QueryPlanner.h +++ b/src/engine/QueryPlanner.h @@ -485,8 +485,9 @@ class QueryPlanner { // if the number of subgraphs is `> budget`. This is used to analyze the // complexity of the query graph and to choose between the DP and the greedy // query planner see above. - static size_t countSubgraphs(std::vector graph, - size_t budget); + size_t countSubgraphs(std::vector graph, + const std::vector& filters, + size_t budget); // Creates a SubtreePlan for the given text leaf node in the triple graph. // While doing this the TextLimitMetaObjects are created and updated according From 740d18609f1584dcb44a7cc71a3dce120a975c9b Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Thu, 9 Jan 2025 15:33:14 +0100 Subject: [PATCH 19/24] Added some comments. 
Signed-off-by: Johannes Kalmbach --- src/engine/QueryPlanner.cpp | 9 ++++++--- src/engine/QueryPlanner.h | 2 ++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/engine/QueryPlanner.cpp b/src/engine/QueryPlanner.cpp index 1df7f6d917..50c6213e10 100644 --- a/src/engine/QueryPlanner.cpp +++ b/src/engine/QueryPlanner.cpp @@ -1346,9 +1346,14 @@ size_t QueryPlanner::countSubgraphs( graph.erase(std::ranges::unique(graph, ql::ranges::equal_to{}, getId).begin(), graph.end()); + // We also have to consider the `filters`. To make life easy, we temporarily + // create simple `SubtreePlans` for them which just have the correct + // variables. std::vector dummyPlansForFilter; for (const auto& filter : filters) { const auto& vars = filter.expression_.containedVariables(); + // We use a `VALUES` clause as the dummy because this operation is the + // easiest to setup for a number of given variables. parsedQuery::SparqlValues values; for (auto* var : vars) { values._variables.push_back(*var); @@ -1381,9 +1386,7 @@ size_t QueryPlanner::countSubgraphs( g.push_back(v); } - auto result = countConnectedSubgraphs::countSubgraphs(g, budget); - LOG(INFO) << "number of subgraphs inside a component " << result << std::endl; - return result; + return countConnectedSubgraphs::countSubgraphs(g, budget); } // _____________________________________________________________________________ diff --git a/src/engine/QueryPlanner.h b/src/engine/QueryPlanner.h index 6271236d58..11dbaedb50 100644 --- a/src/engine/QueryPlanner.h +++ b/src/engine/QueryPlanner.h @@ -485,6 +485,8 @@ class QueryPlanner { // if the number of subgraphs is `> budget`. This is used to analyze the // complexity of the query graph and to choose between the DP and the greedy // query planner see above. + // Note: We also need the added filters, because they behave like additional + // graph nodes wrt the performance of the DP based query planner. 
size_t countSubgraphs(std::vector graph, const std::vector& filters, size_t budget); From 2050af81fffd2d109786c42c356d2b1225c642c2 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Thu, 9 Jan 2025 20:02:46 +0100 Subject: [PATCH 20/24] A small fix. Signed-off-by: Johannes Kalmbach --- src/engine/QueryPlanner.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/engine/QueryPlanner.cpp b/src/engine/QueryPlanner.cpp index 50c6213e10..4bad1b26cc 100644 --- a/src/engine/QueryPlanner.cpp +++ b/src/engine/QueryPlanner.cpp @@ -1361,6 +1361,8 @@ size_t QueryPlanner::countSubgraphs( dummyPlansForFilter.push_back( makeSubtreePlan(_qec, std::move(values))); } + + const size_t numPlansWithoutFilters = graph.size(); for (const auto& filterPlan : dummyPlansForFilter) { graph.push_back(&filterPlan); } @@ -1378,7 +1380,11 @@ size_t QueryPlanner::countSubgraphs( for (size_t i = 0; i < graph.size(); ++i) { countConnectedSubgraphs::Node v{0}; for (size_t k = 0; k < graph.size(); ++k) { + // Don't connect nodes to themselves, don't connect filters with other + // filters, otherwise connect `i` and `k` if they have at least one + // variable in common. if ((k != i) && + (k < numPlansWithoutFilters || i < numPlansWithoutFilters) && !QueryPlanner::getJoinColumns(*graph.at(k), *graph.at(i)).empty()) { v.neighbors_ |= (1ULL << k); } From 52943570743cb6c9db292f6331e1b8304ab32379 Mon Sep 17 00:00:00 2001 From: Hannah Bast Date: Fri, 10 Jan 2025 03:59:43 +0100 Subject: [PATCH 21/24] Made a pass over `ExistsJoin.h` and `ExistsJoin.cpp` --- src/engine/ExistsJoin.cpp | 69 ++++++++++++++++++++++++--------------- src/engine/ExistsJoin.h | 28 +++++++++------- 2 files changed, 58 insertions(+), 39 deletions(-) diff --git a/src/engine/ExistsJoin.cpp b/src/engine/ExistsJoin.cpp index 7ca230c799..4e0b3b5bde 100644 --- a/src/engine/ExistsJoin.cpp +++ b/src/engine/ExistsJoin.cpp @@ -1,6 +1,6 @@ -// Copyright 2025, University of Freiburg, -// Chair of Algorithms and Data Structures. 
-// Author: Johannes Kalmbach +// Copyright 2025, University of Freiburg +// Chair of Algorithms and Data Structures +// Author: Johannes Kalmbach #include "engine/ExistsJoin.h" @@ -19,6 +19,7 @@ ExistsJoin::ExistsJoin(QueryExecutionContext* qec, right_{std::move(right)}, joinColumns_{QueryExecutionTree::getJoinColumns(*left_, *right_)}, existsVariable_{std::move(existsVariable)} { + // Make sure that the left and right input are sorted on the join columns. std::tie(left_, right_) = QueryExecutionTree::createSortedTrees( std::move(left_), std::move(right_), joinColumns_); } @@ -37,7 +38,7 @@ VariableToColumnMap ExistsJoin::computeVariableToColumnMap() const { auto res = left_->getVariableColumns(); AD_CONTRACT_CHECK( !res.contains(existsVariable_), - "The target variable of an exists scan must be a new variable"); + "The target variable of an EXISTS join must be a new variable"); res[existsVariable_] = makeAlwaysDefinedColumn(getResultWidth() - 1); return res; } @@ -50,16 +51,20 @@ size_t ExistsJoin::getResultWidth() const { // ____________________________________________________________________________ vector ExistsJoin::resultSortedOn() const { + // We add one column to `left_`, but do not change the order of the rows. return left_->resultSortedOn(); } // ____________________________________________________________________________ float ExistsJoin::getMultiplicity(size_t col) { + // The multiplicities of all columns except the last one are the same as in + // `left_`. if (col < getResultWidth() - 1) { return left_->getMultiplicity(col); } - // The multiplicity of the boolean column can be a dummy value, as it should - // be never used for joins etc. + // For the added (Boolean) column we take a dummy value, assuming that it + // will not be used for subsequent joins or other operations that make use of + // the multiplicities. 
return 1; } @@ -82,13 +87,17 @@ ProtoResult ExistsJoin::computeResult([[maybe_unused]] bool requestLaziness) { const auto& left = leftRes->idTable(); const auto& right = rightRes->idTable(); - // We reuse the generic `zipperJoinWithUndef` utility in the following way: - // It has (among others) two callbacks: One for each matching pair of rows - // from left and right, and one for rows in the left input that have no - // matching counterpart in the right input. The first callback can be a noop, - // and the second callback gives us exactly `NOT EXISTS`. - - // Only extract the join columns from both inputs to make the following code + // We reuse the generic `zipperJoinWithUndef` function, which has two + // callbacks: one for each matching pair of rows from `left` and `right`, and + // one for rows in the left input that have no matching counterpart in the + // right input. The first callback can be a noop, and the second callback + // gives us exactly those rows, where the value in the to-be-added result + // column should be `false`. + // + // That is, the second callback computes `NOT EXISTS`, which is the inverse + // of the value needed for the added Boolean column. + + // Extract the join columns from both inputs to make the following code // easier. ad_utility::JoinColumnMapping joinColumnData{joinColumns_, left.numColumns(), right.numColumns()}; @@ -96,11 +105,11 @@ ProtoResult ExistsJoin::computeResult([[maybe_unused]] bool requestLaziness) { left.asColumnSubsetView(joinColumnData.jcsLeft()); IdTableView<0> joinColumnsRight = right.asColumnSubsetView(joinColumnData.jcsRight()); - checkCancellation(); - // `isCheap` is true iff there are no UNDEF values in the join columns. In - // this case we can use a much cheaper algorithm. + // Compute `isCheap`, which is true iff there are no UNDEF values in the join + // columns (in which case we can use a simpler and cheaper join algorithm).
+ // // TODO There are many other cases where a cheaper implementation can // be chosen, but we leave those for another PR, this is the most common case. namespace stdr = ql::ranges; @@ -116,7 +125,8 @@ ProtoResult ExistsJoin::computeResult([[maybe_unused]] bool requestLaziness) { // Nothing to do for the actual matches. auto noopRowAdder = ad_utility::noop; - // Store the indices of rows for which `exists` is `false`. + // Store the indices of rows for which the value of the `EXISTS` (in the added + // Boolean column) should be `false`. std::vector> notExistsIndices{ allocator()}; // The callback is called with iterators, so we convert them back to indices. @@ -125,8 +135,9 @@ ProtoResult ExistsJoin::computeResult([[maybe_unused]] bool requestLaziness) { notExistsIndices.push_back(itLeft - begin); }; - // Run the actual zipper join, with the possible optimization if we know, that - // there can be no UNDEF values. + // Run `zipperJoinWithUndef` with the described callbacks and the mentioned + // optimization in case we know that there are no UNDEF values in the join + // columns. auto checkCancellationLambda = [this] { checkCancellation(); }; auto runZipperJoin = [&](auto findUndef) { [[maybe_unused]] auto numOutOfOrder = ad_utility::zipperJoinWithUndef( @@ -140,7 +151,8 @@ ProtoResult ExistsJoin::computeResult([[maybe_unused]] bool requestLaziness) { runZipperJoin(ad_utility::findSmallerUndefRanges); } - // Set up the result; + // Add the result column from the computed `notExistsIndices` (which tell us + // where the value should be `false`). 
IdTable result = left.clone(); result.addEmptyColumn(); decltype(auto) existsCol = result.getColumn(getResultWidth() - 1); @@ -149,8 +161,8 @@ ProtoResult ExistsJoin::computeResult([[maybe_unused]] bool requestLaziness) { existsCol[notExistsIndex] = Id::makeFromBool(false); } - // The result is a copy of the left input + and additional columns with only - // boolean values, so the local vocab of the left input is sufficient. + // The added column only contains Boolean values, and adds no new words to the + // local vocabulary, so we can simply copy the local vocab from `leftRes`. return {std::move(result), resultSortedOn(), leftRes->getCopyOfLocalVocab()}; } @@ -159,17 +171,20 @@ std::shared_ptr ExistsJoin::addExistsJoinsToSubtree( const sparqlExpression::SparqlExpressionPimpl& expression, std::shared_ptr subtree, QueryExecutionContext* qec, const ad_utility::SharedCancellationHandle& cancellationHandle) { - // First extract all the `EXISTS` functions from the expression. + // Extract all `EXISTS` functions from the given `expression`. std::vector existsExpressions; expression.getPimpl()->getExistsExpressions(existsExpressions); - // For each of the EXISTS functions add one `ExistsJoin` + // For each `EXISTS` function, add the corresponding `ExistsJoin`. for (auto* expr : existsExpressions) { const auto& exists = dynamic_cast(*expr); - // Currently some FILTERs are applied multiple times especially when there - // are OPTIONAL joins in the query. In these cases we have to make sure that - // the `ExistsScan` is added only once. + // Currently some FILTERs are applied multiple times (in particular, this + // happens when there are OPTIONAL joins in the query). In these cases we + // have to make sure that the `ExistsJoin` is added only once. + // + // TODO(question from Hannah's review): Why does the following implement + // what the preceding comment says? 
if (subtree->isVariableCovered(exists.variable())) { continue; } diff --git a/src/engine/ExistsJoin.h b/src/engine/ExistsJoin.h index 4ff44fe94c..b319c304c9 100644 --- a/src/engine/ExistsJoin.h +++ b/src/engine/ExistsJoin.h @@ -1,13 +1,14 @@ -// Copyright 2025, University of Freiburg, -// Chair of Algorithms and Data Structures. -// Author: Johannes Kalmbach +// Copyright 2025, University of Freiburg +// Chair of Algorithms and Data Structures +// Author: Johannes Kalmbach #pragma once #include "engine/Operation.h" #include "engine/QueryExecutionTree.h" -// The implementation of the SPARQL `EXISTS` function. It takes two subtrees, +// The implementation of an "EXISTS join", which we use to realize the semantics +// of the SPARQL `EXISTS` function. The join takes two subtrees as input, // and returns the left subtree with an additional boolean column that is `true` // iff at least one matching row is contained in the right subtree. class ExistsJoin : public Operation { @@ -17,25 +18,28 @@ class ExistsJoin : public Operation { std::shared_ptr right_; std::vector> joinColumns_; - // The variable of the added result column. + // The variable of the added (Boolean) result column. Variable existsVariable_; public: - // Constructor. The `existsVariable` (the variable for the added boolean - // column) must not yet be bound by `left`. + // Constructor. The `existsVariable` (the variable for the added column) must + // not yet be bound by `left`. ExistsJoin(QueryExecutionContext* qec, std::shared_ptr left, std::shared_ptr right, Variable existsVariable); // For a given subtree and a given expression, extract all the - // `ExistsExpressions` from the expression and add one `ExistsJoin` per - // `ExistsExpression` to the subtree. The left side of the `ExistsJoin` is the - // input subtree, the right hand side of the `ExistsJoin` as well as the + // `ExistsExpression`s from the expression and add one `ExistsJoin` per + // `ExistsExpression` to the subtree.
The left side of the `ExistsJoin` is + // the input subtree, the right hand side of the `ExistsJoin` as well as the // variable to which the result is bound are extracted from the // `ExistsExpression`. The returned subtree can then be used to evaluate the - // `expression`. Note: `ExistsExpression` is a simple dummy that only reads - // the values of the column that is added by the `ExistsJoin`. + // `expression`. + // + // NOTE: `ExistsExpression` is a dummy that only reads the values of the + // column that is added by the `ExistsJoin`. The main work is done by the + // latter and not by the former. static std::shared_ptr addExistsJoinsToSubtree( const sparqlExpression::SparqlExpressionPimpl& expression, std::shared_ptr subtree, QueryExecutionContext* qec, From 982cff756dc2d031e44813db62ea73aeb6ac33a7 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Fri, 10 Jan 2025 08:19:43 +0100 Subject: [PATCH 22/24] Clean up, this should work with a reasonable threshold for the query-planning-budget. Signed-off-by: Johannes Kalmbach --- src/engine/QueryPlanner.cpp | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/engine/QueryPlanner.cpp b/src/engine/QueryPlanner.cpp index 4bad1b26cc..c27d185a4a 100644 --- a/src/engine/QueryPlanner.cpp +++ b/src/engine/QueryPlanner.cpp @@ -1348,18 +1348,27 @@ size_t QueryPlanner::countSubgraphs( // We also have to consider the `filters`. To make life easy, we temporarily // create simple `SubtreePlans` for them which just have the correct - // variables. + // variables. We only create one subtree plan for each set of variables that + // is contained in the `filters`, because this will bring the estimate of this + // function closer to the actual behavior of the DP query planner (it always + // applies either all possible filters at once, or none of them). 
std::vector dummyPlansForFilter; + ad_utility::HashSet> + deduplicatedFilterVariables; for (const auto& filter : filters) { const auto& vars = filter.expression_.containedVariables(); + ad_utility::HashSet varSet; // We use a `VALUES` clause as the dummy because this operation is the // easiest to setup for a number of given variables. parsedQuery::SparqlValues values; for (auto* var : vars) { values._variables.push_back(*var); + varSet.insert(*var); + } + if (deduplicatedFilterVariables.insert(std::move(varSet)).second) { + dummyPlansForFilter.push_back( + makeSubtreePlan(_qec, std::move(values))); } - dummyPlansForFilter.push_back( - makeSubtreePlan(_qec, std::move(values))); } const size_t numPlansWithoutFilters = graph.size(); From eced22bce36fc2227b3ed253f224a4a47a98327b Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Fri, 10 Jan 2025 09:42:49 +0100 Subject: [PATCH 23/24] A first working version of this values thing. There is a lot to do: * testing * commenting * possibly getting rid of redundant copies. * limiting the size. Signed-off-by: Johannes Kalmbach --- src/engine/CheckUsePatternTrick.cpp | 122 +++++++++++++++++++--------- src/engine/CheckUsePatternTrick.h | 6 ++ src/engine/QueryPlanner.cpp | 3 + 3 files changed, 92 insertions(+), 39 deletions(-) diff --git a/src/engine/CheckUsePatternTrick.cpp b/src/engine/CheckUsePatternTrick.cpp index 7c21893f46..0cae765d5f 100644 --- a/src/engine/CheckUsePatternTrick.cpp +++ b/src/engine/CheckUsePatternTrick.cpp @@ -82,42 +82,73 @@ bool isVariableContainedInGraphPatternOperation( }); } -using ValuesClause = parsedQuery::Values; +using ValuesClause = std::optional; // TODO How many possible return values do we need here.
-bool addValuesClauseToPattern(const Variable& variable, - parsedQuery::GraphPatternOperation& operation, - const SparqlTriple* tripleToIgnore, +bool addValuesClauseToPattern(parsedQuery::GraphPatternOperation& operation, const ValuesClause& clause); // __________________________________________________________________________ -bool addValuesClause(const Variable& variable, - ParsedQuery::GraphPattern& graphPattern, - const SparqlTriple* tripleToIgnore, - const ValuesClause& result) { +void addValuesClause(ParsedQuery::GraphPattern& graphPattern, + const ValuesClause& values, bool recurse) { + // TODO Do we want to do this, or do we only want this if the values + // clause hasn't been handled downstream. + /* bool containedInFilter = ql::ranges::any_of( - graphPattern._filters, [&variable](const SparqlFilter& filter) { - return filter.expression_.isVariableContained(variable); + graphPattern._filters, [&values](const SparqlFilter& filter) { + return ql::ranges::any_of( + values._inlineValues._variables, [&filter](const Variable& var) { + return filter.expression_.isVariableContained(var); + }); }); - auto check = [&](const parsedQuery::GraphPatternOperation& op) { - return addValuesClauseToPattern(variable, op, tripleToIgnore, result); + */ + [[maybe_unused]] const bool containedInFilter = false; + auto check = [&](parsedQuery::GraphPatternOperation& op) { + return addValuesClauseToPattern(op, values); }; - if (ql::ranges::any_of(graphPattern._graphPatterns, check) || - containedInFilter) { - graphPattern._graphPatterns.insert(graphPattern._graphPatterns.begin(), - result); + // TODO We have to figure out the correct positioning of the values + // clause, s.t. we don't get cartesian products because of optimization + // barriers like bind/Optional/Minus etc. 
+ std::optional insertPosition; + if (values.has_value()) { + for (const auto& [i, pattern] : + ::ranges::views::enumerate(graphPattern._graphPatterns)) { + if (check(pattern)) { + insertPosition = i; + } + } + } + + if (!recurse) { + return; + } + if (insertPosition.has_value()) { + graphPattern._graphPatterns.insert( + graphPattern._graphPatterns.begin() + insertPosition.value(), + values.value()); + } + + std::vector foundClauses; + for (const auto& pattern : graphPattern._graphPatterns) { + if (auto* foundValues = std::get_if(&pattern)) { + foundClauses.push_back(*foundValues); + } + } + for (const auto& foundValue : foundClauses) { + addValuesClause(graphPattern, foundValue, false); } - // Does this need to return false? - return false; } // __________________________________________________________________________ -bool addValuesClauseToPattern(const Variable& variable, - parsedQuery::GraphPatternOperation& operation, - const SparqlTriple* tripleToIgnore, +bool addValuesClauseToPattern(parsedQuery::GraphPatternOperation& operation, const ValuesClause& result) { auto check = [&](parsedQuery::GraphPattern& pattern) { - return addValuesClause(variable, pattern, tripleToIgnore, result); + addValuesClause(pattern, result); + return false; }; + // TODO Don't pass an optional to this function. + AD_CORRECTNESS_CHECK(result.has_value()); + const auto& variables = result.value()._inlineValues._variables; + auto anyVar = [&](auto f) { return ql::ranges::any_of(variables, f); }; return operation.visit([&](auto&& arg) -> bool { using T = std::decay_t; if constexpr (std::is_same_v || @@ -125,33 +156,46 @@ bool addValuesClauseToPattern(const Variable& variable, std::is_same_v) { return check(arg._child); } else if constexpr (std::is_same_v) { - return check(arg._child1) || check(arg._child2); + check(arg._child1); + check(arg._child2); + return false; } else if constexpr (std::is_same_v) { // Subqueries always are SELECT clauses. 
const auto& selectClause = arg.get().selectClause(); - return ad_utility::contains(selectClause.getSelectedVariables(), - variable); + + if (anyVar([&selectClause](const auto& var) { + return ad_utility::contains(selectClause.getSelectedVariables(), + var); + })) { + return check(arg.get()._rootGraphPattern); + } else { + return false; + } } else if constexpr (std::is_same_v) { - return ad_utility::contains(arg.containedVariables(), variable); + return ql::ranges::any_of(variables, [&](const auto& variable) { + return ad_utility::contains(arg.containedVariables(), variable); + }); } else if constexpr (std::is_same_v) { return ad_utility::contains_if( arg._triples, [&](const SparqlTriple& triple) { - if (&triple == tripleToIgnore) { - return false; - } - return (triple.s_ == variable || - // Complex property paths are not allowed to contain - // variables in SPARQL, so this check is sufficient. - // TODO Still make the interface of the - // `PropertyPath` class typesafe. - triple.p_.asString() == variable.name() || - triple.o_ == variable); + return anyVar([&](const auto& variable) { + return (triple.s_ == variable || + // Complex property paths are not allowed to contain + // variables in SPARQL, so this check is sufficient. + // TODO Still make the interface of the + // `PropertyPath` class typesafe. 
+ triple.p_.asString() == variable.name() || + triple.o_ == variable); + }); }); } else if constexpr (std::is_same_v) { - return (&arg != &result) && - ad_utility::contains(arg._inlineValues._variables, variable); + return anyVar([&](const auto& variable) { + return ad_utility::contains(arg._inlineValues._variables, variable); + }); } else if constexpr (std::is_same_v) { - return ad_utility::contains(arg.visibleVariables_, variable); + return anyVar([&](const auto& variable) { + return ad_utility::contains(arg.visibleVariables_, variable); + }); } else { static_assert( std::is_same_v || std::is_same_v || diff --git a/src/engine/CheckUsePatternTrick.h b/src/engine/CheckUsePatternTrick.h index 47db399638..52ddfc2924 100644 --- a/src/engine/CheckUsePatternTrick.h +++ b/src/engine/CheckUsePatternTrick.h @@ -57,4 +57,10 @@ bool isVariableContainedInGraphPatternOperation( const parsedQuery::GraphPatternOperation& operation, const SparqlTriple* tripleToIgnore); +// __________________________________________________________________________ +void addValuesClause( + ParsedQuery::GraphPattern& graphPattern, + const std::optional& values = std::nullopt, + bool recurse = true); + } // namespace checkUsePatternTrick diff --git a/src/engine/QueryPlanner.cpp b/src/engine/QueryPlanner.cpp index 9dd6b5599c..8a6e302de3 100644 --- a/src/engine/QueryPlanner.cpp +++ b/src/engine/QueryPlanner.cpp @@ -208,6 +208,9 @@ std::vector QueryPlanner::createExecutionTrees( QueryExecutionTree QueryPlanner::createExecutionTree(ParsedQuery& pq, bool isSubquery) { try { + if (!isSubquery) { + checkUsePatternTrick::addValuesClause(pq._rootGraphPattern); + } auto lastRow = createExecutionTrees(pq, isSubquery); auto minInd = findCheapestExecutionTree(lastRow); LOG(DEBUG) << "Done creating execution plan" << std::endl; From 70319f7b9c3dd858bf23101eb7820f1713bcd602 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Fri, 10 Jan 2025 15:45:23 +0100 Subject: [PATCH 24/24] Try out if that fixes the 
error... Signed-off-by: Johannes Kalmbach --- src/engine/CheckUsePatternTrick.cpp | 22 ++++++++++++++++------ src/engine/QueryPlanner.cpp | 7 +++++-- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/src/engine/CheckUsePatternTrick.cpp b/src/engine/CheckUsePatternTrick.cpp index 0cae765d5f..b4c9a76447 100644 --- a/src/engine/CheckUsePatternTrick.cpp +++ b/src/engine/CheckUsePatternTrick.cpp @@ -136,6 +136,12 @@ void addValuesClause(ParsedQuery::GraphPattern& graphPattern, for (const auto& foundValue : foundClauses) { addValuesClause(graphPattern, foundValue, false); } + + if (foundClauses.empty()) { + for (auto& pattern : graphPattern._graphPatterns) { + addValuesClauseToPattern(pattern, std::nullopt); + } + } } // __________________________________________________________________________ @@ -145,9 +151,9 @@ bool addValuesClauseToPattern(parsedQuery::GraphPatternOperation& operation, addValuesClause(pattern, result); return false; }; - // TODO Don't pass an optional to this function. - AD_CORRECTNESS_CHECK(result.has_value()); - const auto& variables = result.value()._inlineValues._variables; + const std::vector emptyVars{}; + const auto& variables = + result.has_value() ? result.value()._inlineValues._variables : emptyVars; auto anyVar = [&](auto f) { return ql::ranges::any_of(variables, f); }; return operation.visit([&](auto&& arg) -> bool { using T = std::decay_t; @@ -169,6 +175,9 @@ bool addValuesClauseToPattern(parsedQuery::GraphPatternOperation& operation, })) { return check(arg.get()._rootGraphPattern); } else { + // Also recurse into the subquery, but not with the given `VALUES` + // clause. 
+ addValuesClause(arg.get()._rootGraphPattern, std::nullopt); return false; } } else if constexpr (std::is_same_v) { @@ -200,9 +209,10 @@ bool addValuesClauseToPattern(parsedQuery::GraphPatternOperation& operation, static_assert( std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v); - // The `TransPath` is set up later in the query planning, when this - // function should not be called anymore. - AD_FAIL(); + // TODO This is just an optimization, so we can always just omit + // it, but it would be nice to also apply this optimization for those + // types of queries. + return false; } }); } diff --git a/src/engine/QueryPlanner.cpp b/src/engine/QueryPlanner.cpp index 8a6e302de3..d03000abf4 100644 --- a/src/engine/QueryPlanner.cpp +++ b/src/engine/QueryPlanner.cpp @@ -205,12 +205,15 @@ std::vector QueryPlanner::createExecutionTrees( } // _____________________________________________________________________________ -QueryExecutionTree QueryPlanner::createExecutionTree(ParsedQuery& pq, +QueryExecutionTree QueryPlanner::createExecutionTree(ParsedQuery& pqIn, bool isSubquery) { try { + ParsedQuery copy; if (!isSubquery) { - checkUsePatternTrick::addValuesClause(pq._rootGraphPattern); + copy = pqIn; + checkUsePatternTrick::addValuesClause(copy._rootGraphPattern); } + auto& pq = isSubquery ? pqIn : copy; auto lastRow = createExecutionTrees(pq, isSubquery); auto minInd = findCheapestExecutionTree(lastRow); LOG(DEBUG) << "Done creating execution plan" << std::endl;