From 3213257bf7fc9dd18817b9c5072ab732fd6274f0 Mon Sep 17 00:00:00 2001 From: Felix Meisen <85636111+Flixtastic@users.noreply.github.com> Date: Wed, 22 Jan 2025 08:59:44 +0100 Subject: [PATCH 1/2] Refactor the parsing of the text index builder (#1695) Split up large functions, modernize code, choose better names and add some documentation --- src/global/IndexTypes.h | 1 + src/index/IndexImpl.Text.cpp | 139 ++++++++-------- src/index/IndexImpl.h | 18 ++- src/parser/CMakeLists.txt | 2 +- src/parser/ContextFileParser.cpp | 46 ------ src/parser/ContextFileParser.h | 45 ------ src/parser/WordsAndDocsFileParser.cpp | 61 +++++++ src/parser/WordsAndDocsFileParser.h | 192 +++++++++++++++++++++++ test/CMakeLists.txt | 2 +- test/ContextFileParserTest.cpp | 59 ------- test/WordsAndDocsFileLineCreator.h | 22 +++ test/WordsAndDocsFileParserTest.cpp | 165 +++++++++++++++++++ test/engine/TextIndexScanForWordTest.cpp | 83 +++++----- test/engine/TextIndexScanTestHelpers.h | 14 -- 14 files changed, 571 insertions(+), 278 deletions(-) delete mode 100644 src/parser/ContextFileParser.cpp delete mode 100644 src/parser/ContextFileParser.h create mode 100644 src/parser/WordsAndDocsFileParser.cpp create mode 100644 src/parser/WordsAndDocsFileParser.h delete mode 100644 test/ContextFileParserTest.cpp create mode 100644 test/WordsAndDocsFileLineCreator.h create mode 100644 test/WordsAndDocsFileParserTest.cpp diff --git a/src/global/IndexTypes.h b/src/global/IndexTypes.h index 08ee960d00..4868e59694 100644 --- a/src/global/IndexTypes.h +++ b/src/global/IndexTypes.h @@ -16,3 +16,4 @@ using LocalVocabIndex = const LocalVocabEntry*; using TextRecordIndex = ad_utility::TypedIndex; using WordVocabIndex = ad_utility::TypedIndex; using BlankNodeIndex = ad_utility::TypedIndex; +using DocumentIndex = ad_utility::TypedIndex; diff --git a/src/index/IndexImpl.Text.cpp b/src/index/IndexImpl.Text.cpp index 76c0015974..d1e6a70777 100644 --- a/src/index/IndexImpl.Text.cpp +++ b/src/index/IndexImpl.Text.cpp @@ 
-17,39 +17,22 @@ #include "backports/algorithm.h" #include "engine/CallFixedSize.h" #include "index/FTSAlgorithms.h" -#include "parser/ContextFileParser.h" +#include "parser/WordsAndDocsFileParser.h" #include "util/Conversions.h" #include "util/Simple8bCode.h" -namespace { - -// Custom delimiter class for tokenization of literals using `absl::StrSplit`. -// The `Find` function returns the next delimiter in `text` after the given -// `pos` or an empty substring if there is no next delimiter. -struct LiteralsTokenizationDelimiter { - absl::string_view Find(absl::string_view text, size_t pos) { - auto isWordChar = [](char c) -> bool { return std::isalnum(c); }; - auto found = std::find_if_not(text.begin() + pos, text.end(), isWordChar); - if (found == text.end()) return text.substr(text.size()); - return {found, found + 1}; - } -}; - -} // namespace - // _____________________________________________________________________________ -cppcoro::generator IndexImpl::wordsInTextRecords( - const std::string& contextFile, bool addWordsFromLiterals) { +cppcoro::generator IndexImpl::wordsInTextRecords( + std::string contextFile, bool addWordsFromLiterals) const { auto localeManager = textVocab_.getLocaleManager(); // ROUND 1: If context file aka wordsfile is not empty, read words from there. // Remember the last context id for the (optional) second round. 
TextRecordIndex contextId = TextRecordIndex::make(0); if (!contextFile.empty()) { - ContextFileParser::Line line; - ContextFileParser p(contextFile, localeManager); + WordsFileParser p(contextFile, localeManager); ad_utility::HashSet items; - while (p.getLine(line)) { - contextId = line._contextId; + for (auto& line : p) { + contextId = line.contextId_; co_yield line; } if (contextId > TextRecordIndex::make(0)) { @@ -65,15 +48,13 @@ cppcoro::generator IndexImpl::wordsInTextRecords( if (!isLiteral(text)) { continue; } - ContextFileParser::Line entityLine{text, true, contextId, 1, true}; + WordsFileLine entityLine{text, true, contextId, 1, true}; co_yield entityLine; std::string_view textView = text; textView = textView.substr(0, textView.rfind('"')); textView.remove_prefix(1); - for (auto word : absl::StrSplit(textView, LiteralsTokenizationDelimiter{}, - absl::SkipEmpty{})) { - auto wordNormalized = localeManager.getLowercaseUtf8(word); - ContextFileParser::Line wordLine{wordNormalized, false, contextId, 1}; + for (auto word : tokenizeAndNormalizeText(textView, localeManager)) { + WordsFileLine wordLine{std::move(word), false, contextId, 1}; co_yield wordLine; } contextId = contextId.incremented(); @@ -81,6 +62,56 @@ cppcoro::generator IndexImpl::wordsInTextRecords( } } +// _____________________________________________________________________________ +void IndexImpl::processEntityCaseDuringInvertedListProcessing( + const WordsFileLine& line, + ad_utility::HashMap& entitiesInContext, size_t& nofLiterals, + size_t& entityNotFoundErrorMsgCount) const { + VocabIndex eid; + // TODO Currently only IRIs and strings from the vocabulary can + // be tagged entities in the text index (no doubles, ints, etc). + if (getVocab().getId(line.word_, &eid)) { + // Note that `entitiesInContext` is a HashMap, so the `Id`s don't have + // to be contiguous. 
+ entitiesInContext[Id::makeFromVocabIndex(eid)] += line.score_; + if (line.isLiteralEntity_) { + ++nofLiterals; + } + } else { + logEntityNotFound(line.word_, entityNotFoundErrorMsgCount); + } +} + +// _____________________________________________________________________________ +void IndexImpl::processWordCaseDuringInvertedListProcessing( + const WordsFileLine& line, + ad_utility::HashMap& wordsInContext) const { + // TODO Let the `textVocab_` return a `WordIndex` directly. + WordVocabIndex vid; + bool ret = textVocab_.getId(line.word_, &vid); + WordIndex wid = vid.get(); + if (!ret) { + LOG(ERROR) << "ERROR: word \"" << line.word_ << "\" " + << "not found in textVocab. Terminating\n"; + AD_FAIL(); + } + wordsInContext[wid] += line.score_; +} + +// _____________________________________________________________________________ +void IndexImpl::logEntityNotFound(const string& word, + size_t& entityNotFoundErrorMsgCount) const { + if (entityNotFoundErrorMsgCount < 20) { + LOG(WARN) << "Entity from text not in KB: " << word << '\n'; + if (++entityNotFoundErrorMsgCount == 20) { + LOG(WARN) << "There are more entities not in the KB..." 
+ << " suppressing further warnings...\n"; + } + } else { + entityNotFoundErrorMsgCount++; + } +} + // _____________________________________________________________________________ void IndexImpl::addTextFromContextFile(const string& contextFile, bool addWordsFromLiterals) { @@ -214,12 +245,12 @@ size_t IndexImpl::processWordsForVocabulary(string const& contextFile, for (auto line : wordsInTextRecords(contextFile, addWordsFromLiterals)) { ++numLines; // LOG(INFO) << "LINE: " - // << std::setw(50) << line._word << " " - // << line._isEntity << "\t" - // << line._contextId.get() << "\t" - // << line._score << std::endl; - if (!line._isEntity) { - distinctWords.insert(line._word); + // << std::setw(50) << line.word_ << " " + // << line.isEntity_ << "\t" + // << line.contextId_.get() << "\t" + // << line.score_ << std::endl; + if (!line.isEntity_) { + distinctWords.insert(line.word_); } } textVocab_.createFromSet(distinctWords, onDiskBase_ + ".text.vocabulary"); @@ -243,49 +274,21 @@ void IndexImpl::processWordsForInvertedLists(const string& contextFile, size_t nofLiterals = 0; for (auto line : wordsInTextRecords(contextFile, addWordsFromLiterals)) { - if (line._contextId != currentContext) { + if (line.contextId_ != currentContext) { ++nofContexts; addContextToVector(writer, currentContext, wordsInContext, entitiesInContext); - currentContext = line._contextId; + currentContext = line.contextId_; wordsInContext.clear(); entitiesInContext.clear(); } - if (line._isEntity) { + if (line.isEntity_) { ++nofEntityPostings; - // TODO Currently only IRIs and strings from the vocabulary can - // be tagged entities in the text index (no doubles, ints, etc). - VocabIndex eid; - if (getVocab().getId(line._word, &eid)) { - // Note that `entitiesInContext` is a HashMap, so the `Id`s don't have - // to be contiguous. 
- entitiesInContext[Id::makeFromVocabIndex(eid)] += line._score; - if (line._isLiteralEntity) { - ++nofLiterals; - } - } else { - if (entityNotFoundErrorMsgCount < 20) { - LOG(WARN) << "Entity from text not in KB: " << line._word << '\n'; - if (++entityNotFoundErrorMsgCount == 20) { - LOG(WARN) << "There are more entities not in the KB..." - << " suppressing further warnings...\n"; - } - } else { - entityNotFoundErrorMsgCount++; - } - } + processEntityCaseDuringInvertedListProcessing( + line, entitiesInContext, nofLiterals, entityNotFoundErrorMsgCount); } else { ++nofWordPostings; - // TODO Let the `textVocab_` return a `WordIndex` directly. - WordVocabIndex vid; - bool ret = textVocab_.getId(line._word, &vid); - WordIndex wid = vid.get(); - if (!ret) { - LOG(ERROR) << "ERROR: word \"" << line._word << "\" " - << "not found in textVocab. Terminating\n"; - AD_FAIL(); - } - wordsInContext[wid] += line._score; + processWordCaseDuringInvertedListProcessing(line, wordsInContext); } } if (entityNotFoundErrorMsgCount > 0) { diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h index d9ec19eb14..ac0003db87 100644 --- a/src/index/IndexImpl.h +++ b/src/index/IndexImpl.h @@ -29,9 +29,9 @@ #include "index/TextMetaData.h" #include "index/Vocabulary.h" #include "index/VocabularyMerger.h" -#include "parser/ContextFileParser.h" #include "parser/RdfParser.h" #include "parser/TripleComponent.h" +#include "parser/WordsAndDocsFileParser.h" #include "util/BufferedVector.h" #include "util/CancellationHandle.h" #include "util/File.h" @@ -521,8 +521,20 @@ class IndexImpl { // TODO: So far, this is limited to the internal vocabulary (still in the // testing phase, once it works, it should be easy to include the IRIs and // literals from the external vocabulary as well). 
- cppcoro::generator wordsInTextRecords( - const std::string& contextFile, bool addWordsFromLiterals); + cppcoro::generator wordsInTextRecords( + std::string contextFile, bool addWordsFromLiterals) const; + + void processEntityCaseDuringInvertedListProcessing( + const WordsFileLine& line, + ad_utility::HashMap& entitiesInContxt, size_t& nofLiterals, + size_t& entityNotFoundErrorMsgCount) const; + + void processWordCaseDuringInvertedListProcessing( + const WordsFileLine& line, + ad_utility::HashMap& wordsInContext) const; + + void logEntityNotFound(const string& word, + size_t& entityNotFoundErrorMsgCount) const; size_t processWordsForVocabulary(const string& contextFile, bool addWordsFromLiterals); diff --git a/src/parser/CMakeLists.txt b/src/parser/CMakeLists.txt index be4b3db44c..6fa123a793 100644 --- a/src/parser/CMakeLists.txt +++ b/src/parser/CMakeLists.txt @@ -10,7 +10,7 @@ add_library(parser ParsedQuery.cpp RdfParser.cpp Tokenizer.cpp - ContextFileParser.cpp + WordsAndDocsFileParser.cpp TurtleTokenId.h ParallelBuffer.cpp SparqlParserHelpers.cpp diff --git a/src/parser/ContextFileParser.cpp b/src/parser/ContextFileParser.cpp deleted file mode 100644 index 523bde486b..0000000000 --- a/src/parser/ContextFileParser.cpp +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2015, University of Freiburg, -// Chair of Algorithms and Data Structures. 
-// Author: Björn Buchhold (buchhold@informatik.uni-freiburg.de) - -#include "./ContextFileParser.h" - -#include - -#include "../util/Exception.h" -#include "../util/StringUtils.h" - -// _____________________________________________________________________________ -ContextFileParser::ContextFileParser(const string& contextFile, - LocaleManager localeManager) - : _in(contextFile), _localeManager(std::move(localeManager)) {} - -// _____________________________________________________________________________ -ContextFileParser::~ContextFileParser() { _in.close(); } - -// _____________________________________________________________________________ -bool ContextFileParser::getLine(ContextFileParser::Line& line) { - string l; - if (std::getline(_in, l)) { - size_t i = l.find('\t'); - assert(i != string::npos); - size_t j = i + 2; - assert(j + 3 < l.size()); - size_t k = l.find('\t', j + 2); - assert(k != string::npos); - line._isEntity = (l[i + 1] == '1'); - line._word = - (line._isEntity ? l.substr(0, i) - : _localeManager.getLowercaseUtf8(l.substr(0, i))); - line._contextId = - TextRecordIndex::make(atol(l.substr(j + 1, k - j - 1).c_str())); - line._score = static_cast(atol(l.substr(k + 1).c_str())); -#ifndef NDEBUG - if (_lastCId > line._contextId) { - AD_THROW("ContextFile has to be sorted by context Id."); - } - _lastCId = line._contextId; -#endif - return true; - } - return false; -} diff --git a/src/parser/ContextFileParser.h b/src/parser/ContextFileParser.h deleted file mode 100644 index ba8d7bac9c..0000000000 --- a/src/parser/ContextFileParser.h +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2015, University of Freiburg, -// Chair of Algorithms and Data Structures. 
-// Author: Björn Buchhold (buchhold@informatik.uni-freiburg.de) - -#pragma once - -#include - -#include -#include - -#include "../global/Id.h" -#include "../index/StringSortComparator.h" - -using std::string; - -class ContextFileParser { - public: - struct Line { - string _word; - bool _isEntity; - TextRecordIndex _contextId; - Score _score; - bool _isLiteralEntity = false; - }; - - explicit ContextFileParser(const string& contextFile, - LocaleManager localeManager); - ~ContextFileParser(); - // Don't allow copy & assignment - explicit ContextFileParser(const ContextFileParser& other) = delete; - ContextFileParser& operator=(const ContextFileParser& other) = delete; - - // Get the next line from the file. - // Returns true if something was stored. - bool getLine(Line&); - - private: - std::ifstream _in; -#ifndef NDEBUG - // Only used for sanity checks in debug builds - TextRecordIndex _lastCId = TextRecordIndex::make(0); -#endif - LocaleManager _localeManager; -}; diff --git a/src/parser/WordsAndDocsFileParser.cpp b/src/parser/WordsAndDocsFileParser.cpp new file mode 100644 index 0000000000..e7d36974c6 --- /dev/null +++ b/src/parser/WordsAndDocsFileParser.cpp @@ -0,0 +1,61 @@ +// Copyright 2015, University of Freiburg, +// Chair of Algorithms and Data Structures. 
+// Author: Björn Buchhold (buchhold@informatik.uni-freiburg.de) +// Felix Meisen (fesemeisen@outlook.de) + +#include "parser/WordsAndDocsFileParser.h" + +#include + +#include "util/Exception.h" +#include "util/StringUtils.h" + +// _____________________________________________________________________________ +WordsAndDocsFileParser::WordsAndDocsFileParser( + const string& wordsOrDocsFile, const LocaleManager& localeManager) + : in_(wordsOrDocsFile), localeManager_(localeManager) {} + +// _____________________________________________________________________________ +ad_utility::InputRangeFromGet::Storage WordsFileParser::get() { + WordsFileLine line; + string l; + if (!std::getline(getInputStream(), l)) { + return std::nullopt; + } + std::string_view lineView(l); + size_t i = lineView.find('\t'); + assert(i != string::npos); + size_t j = i + 2; + assert(j + 3 < lineView.size()); + size_t k = lineView.find('\t', j + 2); + assert(k != string::npos); + line.isEntity_ = (lineView[i + 1] == '1'); + line.word_ = + (line.isEntity_ + ? 
lineView.substr(0, i) + : getLocaleManager().getLowercaseUtf8(lineView.substr(0, i))); + line.contextId_ = + TextRecordIndex::make(atol(lineView.substr(j + 1, k - j - 1).data())); + line.score_ = static_cast(atol(lineView.substr(k + 1).data())); +#ifndef NDEBUG + if (lastCId_ > line.contextId_) { + AD_THROW("ContextFile has to be sorted by context Id."); + } + lastCId_ = line.contextId_; +#endif + return line; +} + +// _____________________________________________________________________________ +ad_utility::InputRangeFromGet::Storage DocsFileParser::get() { + string l; + if (!std::getline(getInputStream(), l)) { + return std::nullopt; + } + DocsFileLine line; + size_t i = l.find('\t'); + assert(i != string::npos); + line.docId_ = DocumentIndex::make(atol(l.substr(0, i).c_str())); + line.docContent_ = l.substr(i + 1); + return line; +} diff --git a/src/parser/WordsAndDocsFileParser.h b/src/parser/WordsAndDocsFileParser.h new file mode 100644 index 0000000000..1fc80523ff --- /dev/null +++ b/src/parser/WordsAndDocsFileParser.h @@ -0,0 +1,192 @@ +// Copyright 2015, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Björn Buchhold (buchhold@informatik.uni-freiburg.de) +// Felix Meisen (fesemeisen@outlook.de) + +#pragma once + +#include +#include + +#include +#include + +#include "global/Id.h" +#include "index/StringSortComparator.h" +#include "util/Iterators.h" +#include "util/Views.h" + +using std::string; + +/** + * @brief Represents a line in the words file. + * + * This struct holds information about a word or entity as it appears in the + * words file. + * + * The Fields are ordered in the same way the values follow in a line. + * Short field overview: string word_, bool isEntity, TextRecordIndex contextId, + * Score score_, bool isLiteralEntity (not found in + * wordsfile) + * + * @details + * + * Fields: + * - string word_: The string of the word, if it is an entity it will be + * . 
+ * - bool isEntity_: True if the given word is an entity, false if it's a word. + * - TextRecordIndex contextId_: When creating the wordsfile docs from the + * docsfile get split into so called contexts. + * Those contexts overlap, meaning words and + * entities are covered multiple times. Each + * contextId corresponds to the next bigger or + * equal docId. + * - Score score_: Either 1 or 0 if isEntity is false. 0, 1, 100, 150 if + * isEntity is true. (this info is only constructed on the + * scientists.wordsfile.tsv) The score in the wordsfile is only + * relevant for the counting scoring metric. Because of the + * overlap of contexts the score is 1 if the word really has + * been seen for the first time and 0 if not. If a doc contains + * multiple mentions of a word there should be exactly as many + * wordsfile lines of that word with score 1 as there are + * mentions. The score for entities seems rather random and + * since no clear explanation of the creation of wordsfiles + * has been found yet they will stay rather random. + * - bool isLiteralEntity_: This does not directly stem from the wordsfile. + * When building the text index with literals, for + * every literal there will be WordsFileLines for all + * words in that literal. Additionally the whole + * literal itself will be added as word with isEntity + * being true. The need to count this comes only from + * a trick used in testing right now. To be specific + * the method getTextRecordFromResultTable + */ +struct WordsFileLine { + string word_; + bool isEntity_; + TextRecordIndex contextId_; + Score score_; + bool isLiteralEntity_ = false; +}; + +/** + * @brief Represents a line from the docsfile.tsv. + * + * This struct stores everything given in a line of the docsfile.tsv. + * + * The Fields are ordered in the same way the values follow in a line. 
+ * Short field overview: DocumentIndex docId_, string docContent_ + * + * @details + * + * Fields: + * - DocumentIndex docId_: The docId is needed to build inverted indices for + * scoring and building of the docsDB. It is also used + * to return actual texts when searching for a word. + * The word (and entity) search returns a table with + * TextRecordIndex as type of one column. Those get + * mapped to the next bigger or equal docId which is + * then used to extract the text from the docsDB. + * - string docContent_: The whole text given after the first tab of a line of + * docsfile. + */ +struct DocsFileLine { + DocumentIndex docId_; + string docContent_; +}; + +// Custom delimiter class for tokenization of literals using `absl::StrSplit`. +// The `Find` function returns the next delimiter in `text` after the given +// `pos` or an empty substring if there is no next delimiter. +struct LiteralsTokenizationDelimiter { + absl::string_view Find(absl::string_view text, size_t pos) const { + auto isWordChar = [](char c) -> bool { return std::isalnum(c); }; + auto found = std::find_if_not(text.begin() + pos, text.end(), isWordChar); + if (found == text.end()) return text.substr(text.size()); + return {found, found + 1}; + } +}; + +/** + * @brief A function that can be used to tokenize and normalize a given text. + * @warning Both params are const refs where the original objects have to be + * kept alive during the usage of the returned object. + * @param text The text to be tokenized and normalized. + * @param localeManager The localeManager to be used for normalization. 
+ * @details This function can be used in the following way: + * for (auto normalizedWord : tokenizeAndNormalizeText(text, localeManager)) { + * code; + * } + */ +inline auto tokenizeAndNormalizeText(std::string_view text, + const LocaleManager& localeManager) { + std::vector split{ + absl::StrSplit(text, LiteralsTokenizationDelimiter{}, absl::SkipEmpty{})}; + return ql::views::transform(ad_utility::OwningView{std::move(split)}, + [&localeManager](const auto& str) { + return localeManager.getLowercaseUtf8(str); + }); +} +/** + * @brief This class is the parent class of WordsFileParser and DocsFileParser + * + * @details It exists to reduce code duplication since the only difference + * between the child classes is the line type returned. + */ +class WordsAndDocsFileParser { + public: + explicit WordsAndDocsFileParser(const string& wordsOrDocsFile, + const LocaleManager& localeManager); + explicit WordsAndDocsFileParser(const WordsAndDocsFileParser& other) = delete; + WordsAndDocsFileParser& operator=(const WordsAndDocsFileParser& other) = + delete; + + protected: + std::ifstream& getInputStream() { return in_; } + const LocaleManager& getLocaleManager() const { return localeManager_; } + + private: + std::ifstream in_; + LocaleManager localeManager_; +}; + +/** + * @brief This class takes in the a pathToWordsFile and a localeManager. It then + * can be used to iterate the wordsFile while already normalizing the words + * using the localeManager. 
(If words are entities it doesn't normalize them) + * + * @details An object of this class can be iterated as follows: + * for (auto wordsFileLine : WordsFileParser{wordsFile, localeManager}) { + * code; + * } + * The type of the value returned when iterating is WordsFileLine + */ +class WordsFileParser : public WordsAndDocsFileParser, + public ad_utility::InputRangeFromGet { + public: + using WordsAndDocsFileParser::WordsAndDocsFileParser; + Storage get() override; + +#ifndef NDEBUG + private: + // Only used for sanity checks in debug builds + TextRecordIndex lastCId_ = TextRecordIndex::make(0); +#endif +}; + +/** + * @brief This class takes in the a pathToDocsFile and a localeManager. It then + * can be used to iterate over the docsFile to get the lines. + * + * @details An object of this class can be iterated as follows: + * for (auto docsFileLine : DocsFileParser{docsFile, localeManager}) { + * code; + * } + * The type of the value returned when iterating is DocsFileLine + */ +class DocsFileParser : public WordsAndDocsFileParser, + public ad_utility::InputRangeFromGet { + public: + using WordsAndDocsFileParser::WordsAndDocsFileParser; + Storage get() override; +}; diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index bd375f4826..b9581312e8 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -137,7 +137,7 @@ addLinkAndDiscoverTestSerial(FileTest) addLinkAndDiscoverTest(Simple8bTest) -addLinkAndDiscoverTest(ContextFileParserTest parser) +addLinkAndDiscoverTest(WordsAndDocsFileParserTest parser) addLinkAndDiscoverTest(IndexMetaDataTest index) diff --git a/test/ContextFileParserTest.cpp b/test/ContextFileParserTest.cpp deleted file mode 100644 index 2b27c0f34d..0000000000 --- a/test/ContextFileParserTest.cpp +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright 2015, University of Freiburg, -// Chair of Algorithms and Data Structures. 
-// Author: Björn Buchhold (buchhold@informatik.uni-freiburg.de) - -#include - -#include -#include - -#include "../src/parser/ContextFileParser.h" - -TEST(ContextFileParserTest, getLineTest) { - char* locale = setlocale(LC_CTYPE, ""); - std::cout << "Set locale LC_CTYPE to: " << locale << std::endl; - - std::fstream f("_testtmp.contexts.tsv", std::ios_base::out); - f << "Foo\t0\t0\t2\n" - "foo\t0\t0\t2\n" - "Bär\t1\t0\t1\n" - "Äü\t0\t0\t1\n" - "X\t0\t1\t1\n"; - - f.close(); - ContextFileParser p("_testtmp.contexts.tsv", - LocaleManager("en", "US", false)); - ContextFileParser::Line a; - ASSERT_TRUE(p.getLine(a)); - ASSERT_EQ("foo", a._word); - ASSERT_FALSE(a._isEntity); - ASSERT_EQ(0u, a._contextId.get()); - ASSERT_EQ(2u, a._score); - - ASSERT_TRUE(p.getLine(a)); - ASSERT_EQ("foo", a._word); - ASSERT_FALSE(a._isEntity); - ASSERT_EQ(0u, a._contextId.get()); - ASSERT_EQ(2u, a._score); - - ASSERT_TRUE(p.getLine(a)); - ASSERT_EQ("Bär", a._word); - ASSERT_TRUE(a._isEntity); - ASSERT_EQ(0u, a._contextId.get()); - ASSERT_EQ(1u, a._score); - - ASSERT_TRUE(p.getLine(a)); - ASSERT_EQ("äü", a._word); - ASSERT_FALSE(a._isEntity); - ASSERT_EQ(0u, a._contextId.get()); - ASSERT_EQ(1u, a._score); - - ASSERT_TRUE(p.getLine(a)); - ASSERT_EQ("x", a._word); - ASSERT_FALSE(a._isEntity); - ASSERT_EQ(1u, a._contextId.get()); - ASSERT_EQ(1u, a._score); - - ASSERT_FALSE(p.getLine(a)); - remove("_testtmp.contexts.tsv"); -}; diff --git a/test/WordsAndDocsFileLineCreator.h b/test/WordsAndDocsFileLineCreator.h new file mode 100644 index 0000000000..cb151216fd --- /dev/null +++ b/test/WordsAndDocsFileLineCreator.h @@ -0,0 +1,22 @@ +// Copyright 2024, University of Freiburg, +// Chair of Algorithms and Data Structures. 
+// Author: Felix Meisen (fesemeisen@outlook.de) + +#pragma once + +#include + +constexpr std::string_view inlineSeparator = "\t"; +constexpr std::string_view lineSeparator = "\n"; + +inline std::string createWordsFileLineAsString(std::string_view word, + bool isEntity, size_t contextId, + size_t score) { + return absl::StrCat(word, inlineSeparator, isEntity, inlineSeparator, + contextId, inlineSeparator, score, lineSeparator); +}; + +inline std::string createDocsFileLineAsString(size_t docId, + std::string_view docContent) { + return absl::StrCat(docId, inlineSeparator, docContent, lineSeparator); +}; diff --git a/test/WordsAndDocsFileParserTest.cpp b/test/WordsAndDocsFileParserTest.cpp new file mode 100644 index 0000000000..de7216ada7 --- /dev/null +++ b/test/WordsAndDocsFileParserTest.cpp @@ -0,0 +1,165 @@ +// Copyright 2015, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Björn Buchhold (buchhold@informatik.uni-freiburg.de) + +#include + +#include +#include + +#include "./WordsAndDocsFileLineCreator.h" +#include "parser/WordsAndDocsFileParser.h" + +// All lambdas and type aliases used in this file contained here +namespace { + +/// Type aliases + +// Word, isEntity, contextId, score +using WordLine = std::tuple; +using WordLineVec = std::vector; + +// docId, docContent +using DocLine = std::tuple; +using DocLineVec = std::vector; + +using StringVec = std::vector; + +/// Lambdas + +auto getLocaleManager = []() -> LocaleManager { + return LocaleManager("en", "US", false); +}; + +auto wordsFileLineToWordLine = + [](const WordsFileLine& wordsFileLine) -> WordLine { + return std::make_tuple(wordsFileLine.word_, wordsFileLine.isEntity_, + static_cast(wordsFileLine.contextId_.get()), + static_cast(wordsFileLine.score_)); +}; + +// Lambda that takes in a path to wordsFile to initialize the Parser and an +// expectedResult that is compared against the parsers outputs. 
+auto testWordsFileParser = [](const std::string& wordsFilePath, + const WordLineVec& expectedResult) { + size_t i = 0; + LocaleManager localeManager = getLocaleManager(); + for (auto wordsFileLine : WordsFileParser{wordsFilePath, localeManager}) { + ASSERT_TRUE(i < expectedResult.size()); + WordLine testLine = wordsFileLineToWordLine(wordsFileLine); + + // Not testing the whole tuples against each other to have a cleaner + // indication what exactly caused the assertion to fail + ASSERT_EQ(std::get<0>(testLine), std::get<0>(expectedResult.at(i))); + ASSERT_EQ(std::get<1>(testLine), std::get<1>(expectedResult.at(i))); + ASSERT_EQ(std::get<2>(testLine), std::get<2>(expectedResult.at(i))); + ASSERT_EQ(std::get<3>(testLine), std::get<3>(expectedResult.at(i))); + + ++i; + } + ASSERT_EQ(i, expectedResult.size()); +}; + +auto docsFileLineToDocLine = [](const DocsFileLine& docsFileLine) -> DocLine { + return std::make_tuple(static_cast(docsFileLine.docId_.get()), + docsFileLine.docContent_); +}; + +// Same as testWordsFileParser but for docsFile +auto testDocsFileParser = [](const std::string& docsFilePath, + const DocLineVec& expectedResult) { + size_t i = 0; + LocaleManager localeManager = getLocaleManager(); + for (auto docsFileLine : DocsFileParser{docsFilePath, localeManager}) { + ASSERT_TRUE(i < expectedResult.size()); + DocLine testLine = docsFileLineToDocLine(docsFileLine); + + // Not testing the whole tuples against each other to have a cleaner + // indication what exactly caused the assertion to fail + ASSERT_EQ(std::get<0>(testLine), std::get<0>(expectedResult.at(i))); + ASSERT_EQ(std::get<1>(testLine), std::get<1>(expectedResult.at(i))); + + ++i; + } +}; + +// Passing the testText as copy to make sure it stays alive during the usage of +// tokenizer +auto testTokenizeAndNormalizeText = [](std::string testText, + const StringVec& normalizedTextAsVec) { + size_t i = 0; + LocaleManager localeManager = getLocaleManager(); + for (auto normalizedWord : + 
tokenizeAndNormalizeText(testText, localeManager)) { + ASSERT_TRUE(i < normalizedTextAsVec.size()); + ASSERT_EQ(normalizedWord, normalizedTextAsVec.at(i)); + + ++i; + } + ASSERT_EQ(i, normalizedTextAsVec.size()); +}; + +} // namespace + +TEST(WordsAndDocsFileParserTest, wordsFileParserTest) { + char* locale = setlocale(LC_CTYPE, ""); + std::cout << "Set locale LC_CTYPE to: " << locale << std::endl; + + std::fstream f("_testtmp.contexts.tsv", std::ios_base::out); + f << createWordsFileLineAsString("Foo", false, 0, 2) + << createWordsFileLineAsString("foo", false, 0, 2) + << createWordsFileLineAsString("Bär", true, 0, 1) + << createWordsFileLineAsString("Äü", false, 0, 1) + << createWordsFileLineAsString("X", false, 1, 1); + f.close(); + + WordLineVec expected = {{"foo", false, 0, 2}, + {"foo", false, 0, 2}, + {"Bär", true, 0, 1}, + {"äü", false, 0, 1}, + {"x", false, 1, 1}}; + + testWordsFileParser("_testtmp.contexts.tsv", expected); + remove("_testtmp.contexts.tsv"); +}; + +TEST(WordsAndDocsFileParser, docsFileParserTest) { + char* locale = setlocale(LC_CTYPE, ""); + std::cout << "Set locale LC_CTYPE to: " << locale << std::endl; + + std::fstream f("_testtmp.documents.tsv", std::ios_base::out); + f << createDocsFileLineAsString(4, "This TeSt is OnlyCharcters") + << createDocsFileLineAsString(7, "Wh4t h4pp3ns t0 num83rs") + << createDocsFileLineAsString(8, "An( sp@ci*l ch.ar,:act=_er+s") + << createDocsFileLineAsString(190293, "Large docId"); + f.close(); + + DocLineVec expected = {{4, "This TeSt is OnlyCharcters"}, + {7, "Wh4t h4pp3ns t0 num83rs"}, + {8, "An( sp@ci*l ch.ar,:act=_er+s"}, + {190293, "Large docId"}}; + + testDocsFileParser("_testtmp.documents.tsv", expected); + remove("_testtmp.documents.tsv"); +} + +TEST(TokenizeAndNormalizeText, tokenizeAndNormalizeTextTest) { + char* locale = setlocale(LC_CTYPE, ""); + std::cout << "Set locale LC_CTYPE to: " << locale << std::endl; + + // Test 1 + testTokenizeAndNormalizeText("already normalized text", + 
{"already", "normalized", "text"}); + + // Test 2 + testTokenizeAndNormalizeText("TeXt WITH UpperCASe", + {"text", "with", "uppercase"}); + + // Test 3 + testTokenizeAndNormalizeText("41ph4num3r1c t3xt", {"41ph4num3r1c", "t3xt"}); + + // Test 4 + testTokenizeAndNormalizeText( + "test\twith\ndifferent,separators.here ,.\t", + {"test", "with", "different", "separators", "here"}); +} diff --git a/test/engine/TextIndexScanForWordTest.cpp b/test/engine/TextIndexScanForWordTest.cpp index eac3cb0d2f..cc9b685ec8 100644 --- a/test/engine/TextIndexScanForWordTest.cpp +++ b/test/engine/TextIndexScanForWordTest.cpp @@ -5,6 +5,7 @@ #include #include +#include "../WordsAndDocsFileLineCreator.h" #include "../printers/VariablePrinters.h" #include "../util/GTestHelpers.h" #include "../util/IdTableHelpers.h" @@ -26,45 +27,45 @@ std::string kg = ". . . ."; std::string wordsFileContent = - h::createWordsFileLine("astronomer", false, 1, 1) + - h::createWordsFileLine("", true, 1, 0) + - h::createWordsFileLine("scientist", false, 1, 1) + - h::createWordsFileLine("field", false, 1, 1) + - h::createWordsFileLine("astronomy", false, 1, 1) + - h::createWordsFileLine("astronomer", false, 2, 0) + - h::createWordsFileLine("", true, 2, 0) + - h::createWordsFileLine(":s:firstsentence", false, 2, 0) + - h::createWordsFileLine("scientist", false, 2, 0) + - h::createWordsFileLine("field", false, 2, 0) + - h::createWordsFileLine("astronomy", false, 2, 0) + - h::createWordsFileLine("astronomy", false, 3, 1) + - h::createWordsFileLine("concentrates", false, 3, 1) + - h::createWordsFileLine("studies", false, 3, 1) + - h::createWordsFileLine("specific", false, 3, 1) + - h::createWordsFileLine("question", false, 3, 1) + - h::createWordsFileLine("outside", false, 3, 1) + - h::createWordsFileLine("scope", false, 3, 1) + - h::createWordsFileLine("earth", false, 3, 1) + - h::createWordsFileLine("astronomy", false, 4, 1) + - h::createWordsFileLine("concentrates", false, 4, 1) + - 
h::createWordsFileLine("studies", false, 4, 1) + - h::createWordsFileLine("field", false, 4, 1) + - h::createWordsFileLine("outside", false, 4, 1) + - h::createWordsFileLine("scope", false, 4, 1) + - h::createWordsFileLine("earth", false, 4, 1) + - h::createWordsFileLine("tester", false, 5, 1) + - h::createWordsFileLine("rockets", false, 5, 1) + - h::createWordsFileLine("astronomer", false, 5, 1) + - h::createWordsFileLine("", true, 5, 0) + - h::createWordsFileLine("although", false, 5, 1) + - h::createWordsFileLine("astronomer", false, 6, 0) + - h::createWordsFileLine("", true, 6, 0) + - h::createWordsFileLine("although", false, 6, 0) + - h::createWordsFileLine("", true, 6, 0) + - h::createWordsFileLine("space", false, 6, 1) + - h::createWordsFileLine("", true, 7, 0) + - h::createWordsFileLine("space", false, 7, 0) + - h::createWordsFileLine("earth", false, 7, 1); + createWordsFileLineAsString("astronomer", false, 1, 1) + + createWordsFileLineAsString("", true, 1, 0) + + createWordsFileLineAsString("scientist", false, 1, 1) + + createWordsFileLineAsString("field", false, 1, 1) + + createWordsFileLineAsString("astronomy", false, 1, 1) + + createWordsFileLineAsString("astronomer", false, 2, 0) + + createWordsFileLineAsString("", true, 2, 0) + + createWordsFileLineAsString(":s:firstsentence", false, 2, 0) + + createWordsFileLineAsString("scientist", false, 2, 0) + + createWordsFileLineAsString("field", false, 2, 0) + + createWordsFileLineAsString("astronomy", false, 2, 0) + + createWordsFileLineAsString("astronomy", false, 3, 1) + + createWordsFileLineAsString("concentrates", false, 3, 1) + + createWordsFileLineAsString("studies", false, 3, 1) + + createWordsFileLineAsString("specific", false, 3, 1) + + createWordsFileLineAsString("question", false, 3, 1) + + createWordsFileLineAsString("outside", false, 3, 1) + + createWordsFileLineAsString("scope", false, 3, 1) + + createWordsFileLineAsString("earth", false, 3, 1) + + createWordsFileLineAsString("astronomy", false, 
4, 1) + + createWordsFileLineAsString("concentrates", false, 4, 1) + + createWordsFileLineAsString("studies", false, 4, 1) + + createWordsFileLineAsString("field", false, 4, 1) + + createWordsFileLineAsString("outside", false, 4, 1) + + createWordsFileLineAsString("scope", false, 4, 1) + + createWordsFileLineAsString("earth", false, 4, 1) + + createWordsFileLineAsString("tester", false, 5, 1) + + createWordsFileLineAsString("rockets", false, 5, 1) + + createWordsFileLineAsString("astronomer", false, 5, 1) + + createWordsFileLineAsString("", true, 5, 0) + + createWordsFileLineAsString("although", false, 5, 1) + + createWordsFileLineAsString("astronomer", false, 6, 0) + + createWordsFileLineAsString("", true, 6, 0) + + createWordsFileLineAsString("although", false, 6, 0) + + createWordsFileLineAsString("", true, 6, 0) + + createWordsFileLineAsString("space", false, 6, 1) + + createWordsFileLineAsString("", true, 7, 0) + + createWordsFileLineAsString("space", false, 7, 0) + + createWordsFileLineAsString("earth", false, 7, 1); std::string firstDocText = "An astronomer is a scientist in the field of " @@ -77,8 +78,8 @@ std::string secondDocText = "too although they might not be in space but on " "earth."; -std::string docsFileContent = h::createDocsFileLine(4, firstDocText) + - h::createDocsFileLine(7, secondDocText); +std::string docsFileContent = createDocsFileLineAsString(4, firstDocText) + + createDocsFileLineAsString(7, secondDocText); std::pair contentsOfWordsFileAndDocsFile = { wordsFileContent, docsFileContent}; diff --git a/test/engine/TextIndexScanTestHelpers.h b/test/engine/TextIndexScanTestHelpers.h index 83a72ddea4..6ba1b8c6de 100644 --- a/test/engine/TextIndexScanTestHelpers.h +++ b/test/engine/TextIndexScanTestHelpers.h @@ -66,18 +66,4 @@ inline string combineToString(const string& text, const string& word) { ss << "Text: " << text << ", Word: " << word << std::endl; return ss.str(); } - -inline std::string inlineSeparator = "\t"; -inline std::string 
lineSeparator = "\n"; - -inline string createWordsFileLine(std::string word, bool isEntity, - size_t contextId, size_t score) { - return word + inlineSeparator + (isEntity ? "1" : "0") + inlineSeparator + - std::to_string(contextId) + inlineSeparator + std::to_string(score) + - lineSeparator; -}; - -inline string createDocsFileLine(size_t docId, std::string docContent) { - return std::to_string(docId) + inlineSeparator + docContent + lineSeparator; -}; } // namespace textIndexScanTestHelpers From d7ec9be2e0405573456466b1489f77be43514ef3 Mon Sep 17 00:00:00 2001 From: Julian <14220769+Qup42@users.noreply.github.com> Date: Wed, 22 Jan 2025 10:24:58 +0100 Subject: [PATCH 2/2] Graph Store HTTP Protocol (GET, POST) back end (#1668) Implement a function `transformGraphStoreProtocol` that does the back end of transforming a SPARQL Graph Store HTTP Protocol request to it's equivalent SPARQL Query or Update. The integration will be a separate step. --- src/engine/CMakeLists.txt | 2 +- src/engine/GraphStoreProtocol.cpp | 101 ++++++++++++ src/engine/GraphStoreProtocol.h | 140 +++++++++++++++++ src/engine/Server.cpp | 42 ++--- src/engine/Server.h | 12 -- src/parser/TripleComponent.h | 4 + src/util/http/MediaTypes.cpp | 3 +- src/util/http/MediaTypes.h | 1 + src/util/http/UrlParser.cpp | 22 +++ src/util/http/UrlParser.h | 7 + test/CMakeLists.txt | 2 + test/GraphStoreProtocolTest.cpp | 247 ++++++++++++++++++++++++++++++ test/ServerTest.cpp | 95 ++++-------- test/TripleComponentTest.cpp | 18 +++ test/UrlParserTest.cpp | 23 +++ test/util/HttpRequestHelpers.h | 46 ++++++ 16 files changed, 651 insertions(+), 114 deletions(-) create mode 100644 src/engine/GraphStoreProtocol.cpp create mode 100644 src/engine/GraphStoreProtocol.h create mode 100644 test/GraphStoreProtocolTest.cpp create mode 100644 test/util/HttpRequestHelpers.h diff --git a/src/engine/CMakeLists.txt b/src/engine/CMakeLists.txt index be22a64d5d..e81c834303 100644 --- a/src/engine/CMakeLists.txt +++ 
b/src/engine/CMakeLists.txt @@ -14,5 +14,5 @@ add_library(engine CartesianProductJoin.cpp TextIndexScanForWord.cpp TextIndexScanForEntity.cpp TextLimit.cpp LazyGroupBy.cpp GroupByHashMapOptimization.cpp SpatialJoin.cpp CountConnectedSubgraphs.cpp SpatialJoinAlgorithms.cpp PathSearch.cpp ExecuteUpdate.cpp - Describe.cpp) + Describe.cpp GraphStoreProtocol.cpp) qlever_target_link_libraries(engine util index parser sparqlExpressions http SortPerformanceEstimator Boost::iostreams s2) diff --git a/src/engine/GraphStoreProtocol.cpp b/src/engine/GraphStoreProtocol.cpp new file mode 100644 index 0000000000..fc46ec6fa0 --- /dev/null +++ b/src/engine/GraphStoreProtocol.cpp @@ -0,0 +1,101 @@ +// Copyright 2024, University of Freiburg +// Chair of Algorithms and Data Structures +// Authors: Julian Mundhahs + +#include "engine/GraphStoreProtocol.h" + +#include "util/http/beast.h" + +// ____________________________________________________________________________ +GraphOrDefault GraphStoreProtocol::extractTargetGraph( + const ad_utility::url_parser::ParamValueMap& params) { + const std::optional graphIri = + ad_utility::url_parser::checkParameter(params, "graph", std::nullopt); + const bool isDefault = + ad_utility::url_parser::checkParameter(params, "default", "").has_value(); + if (graphIri.has_value() == isDefault) { + throw std::runtime_error( + "Exactly one of the query parameters default or graph must be set to " + "identify the graph for the graph store protocol request."); + } + if (graphIri.has_value()) { + return GraphRef::fromIrirefWithoutBrackets(graphIri.value()); + } else { + AD_CORRECTNESS_CHECK(isDefault); + return DEFAULT{}; + } +} + +// ____________________________________________________________________________ +void GraphStoreProtocol::throwUnsupportedMediatype( + const string_view& mediatype) { + throw UnsupportedMediatypeError(absl::StrCat( + "Mediatype \"", mediatype, + "\" is not supported for SPARQL Graph Store HTTP Protocol in QLever. 
" + "Supported: ", + toString(ad_utility::MediaType::turtle), ", ", + toString(ad_utility::MediaType::ntriples), ".")); +} + +// ____________________________________________________________________________ +void GraphStoreProtocol::throwUnsupportedHTTPMethod( + const std::string_view& method) { + throw std::runtime_error(absl::StrCat( + method, + " in the SPARQL Graph Store HTTP Protocol is not yet implemented " + "in QLever.")); +} + +// ____________________________________________________________________________ +std::vector GraphStoreProtocol::parseTriples( + const string& body, const ad_utility::MediaType contentType) { + using Re2Parser = RdfStringParser>; + switch (contentType) { + case ad_utility::MediaType::turtle: + case ad_utility::MediaType::ntriples: { + auto parser = Re2Parser(); + parser.setInputStream(body); + return parser.parseAndReturnAllTriples(); + } + default: { + throwUnsupportedMediatype(toString(contentType)); + } + } +} + +// ____________________________________________________________________________ +std::vector GraphStoreProtocol::convertTriples( + const GraphOrDefault& graph, std::vector triples) { + SparqlTripleSimpleWithGraph::Graph tripleGraph{std::monostate{}}; + if (std::holds_alternative(graph)) { + tripleGraph = Iri(std::get(graph).toStringRepresentation()); + } + auto transformTurtleTriple = [&tripleGraph](TurtleTriple&& triple) { + AD_CORRECTNESS_CHECK(triple.graphIri_.isId() && + triple.graphIri_.getId() == + qlever::specialIds().at(DEFAULT_GRAPH_IRI)); + + return SparqlTripleSimpleWithGraph(std::move(triple.subject_), + std::move(triple.predicate_), + std::move(triple.object_), tripleGraph); + }; + return ad_utility::transform(std::move(triples), transformTurtleTriple); +} + +// ____________________________________________________________________________ +ParsedQuery GraphStoreProtocol::transformGet(const GraphOrDefault& graph) { + ParsedQuery res; + res._clause = parsedQuery::ConstructClause( + {{Variable("?s"), 
Variable("?p"), Variable("?o")}}); + res._rootGraphPattern = {}; + parsedQuery::GraphPattern selectSPO; + selectSPO._graphPatterns.emplace_back(parsedQuery::BasicGraphPattern{ + {SparqlTriple(Variable("?s"), "?p", Variable("?o"))}}); + if (const auto* iri = + std::get_if(&graph)) { + res.datasetClauses_ = + parsedQuery::DatasetClauses::fromClauses({DatasetClause{*iri, false}}); + } + res._rootGraphPattern = std::move(selectSPO); + return res; +} diff --git a/src/engine/GraphStoreProtocol.h b/src/engine/GraphStoreProtocol.h new file mode 100644 index 0000000000..dee41d8fe3 --- /dev/null +++ b/src/engine/GraphStoreProtocol.h @@ -0,0 +1,140 @@ +// Copyright 2024, University of Freiburg +// Chair of Algorithms and Data Structures +// Authors: Julian Mundhahs + +#pragma once + +#include + +#include "parser/ParsedQuery.h" +#include "parser/RdfParser.h" +#include "util/http/HttpUtils.h" +#include "util/http/UrlParser.h" + +// The mediatype of a request could not be determined. +class UnknownMediatypeError : public std::runtime_error { + public: + explicit UnknownMediatypeError(std::string_view msg) + : std::runtime_error{std::string{msg}} {} +}; + +// The mediatype of a request is not supported. +class UnsupportedMediatypeError : public std::runtime_error { + public: + explicit UnsupportedMediatypeError(std::string_view msg) + : std::runtime_error{std::string{msg}} {} +}; + +// Transform SPARQL Graph Store Protocol requests to their equivalent +// ParsedQuery (SPARQL Query or Update). +class GraphStoreProtocol { + private: + // Extract the mediatype from a request. + static ad_utility::MediaType extractMediatype( + const ad_utility::httpUtils::HttpRequest auto& rawRequest) { + using namespace boost::beast::http; + + std::string_view contentTypeString; + if (rawRequest.find(field::content_type) != rawRequest.end()) { + contentTypeString = rawRequest.at(field::content_type); + } + if (contentTypeString.empty()) { + // If the mediatype is not given, return an error. 
+ // Note: The specs also allow to try to determine the media type from the + // content. + throw UnknownMediatypeError("Mediatype empty or not set."); + } + const auto mediatype = + ad_utility::getMediaTypeFromAcceptHeader(contentTypeString); + // A media type is set but not one of the supported ones as per the QLever + // MediaType code. + if (!mediatype.has_value()) { + throwUnsupportedMediatype(rawRequest.at(field::content_type)); + } + return mediatype.value(); + } + FRIEND_TEST(GraphStoreProtocolTest, extractMediatype); + + // Throws the error if a mediatype is not supported. + [[noreturn]] static void throwUnsupportedMediatype( + const std::string_view& mediatype); + + // Throws the error if an HTTP method is not supported. + [[noreturn]] static void throwUnsupportedHTTPMethod( + const std::string_view& method); + + // Parse the triples from the request body according to the content type. + static std::vector parseTriples( + const std::string& body, const ad_utility::MediaType contentType); + FRIEND_TEST(GraphStoreProtocolTest, parseTriples); + + // Transforms the triples from `TurtleTriple` to `SparqlTripleSimpleWithGraph` + // and sets the correct graph. + static std::vector convertTriples( + const GraphOrDefault& graph, std::vector triples); + FRIEND_TEST(GraphStoreProtocolTest, convertTriples); + + // Transform a SPARQL Graph Store Protocol POST to an equivalent ParsedQuery + // which is an SPARQL Update. 
+ static ParsedQuery transformPost( + const ad_utility::httpUtils::HttpRequest auto& rawRequest, + const GraphOrDefault& graph) { + auto triples = + parseTriples(rawRequest.body(), extractMediatype(rawRequest)); + auto convertedTriples = convertTriples(graph, std::move(triples)); + updateClause::GraphUpdate up{std::move(convertedTriples), {}}; + ParsedQuery res; + res._clause = parsedQuery::UpdateClause{std::move(up)}; + return res; + } + FRIEND_TEST(GraphStoreProtocolTest, transformPost); + + // Transform a SPARQL Graph Store Protocol GET to an equivalent ParsedQuery + // which is a SPARQL Query. + static ParsedQuery transformGet(const GraphOrDefault& graph); + FRIEND_TEST(GraphStoreProtocolTest, transformGet); + + public: + // Every Graph Store Protocol request has equivalent SPARQL Query or Update. + // Transform the Graph Store Protocol request into its equivalent Query or + // Update. + static ParsedQuery transformGraphStoreProtocol( + const ad_utility::httpUtils::HttpRequest auto& rawRequest) { + ad_utility::url_parser::ParsedUrl parsedUrl = + ad_utility::url_parser::parseRequestTarget(rawRequest.target()); + // We only support passing the target graph as a query parameter (`Indirect + // Graph Identification`). `Direct Graph Identification` (the URL is the + // graph) is not supported. See also + // https://www.w3.org/TR/2013/REC-sparql11-http-rdf-update-20130321/#graph-identification. 
+ GraphOrDefault graph = extractTargetGraph(parsedUrl.parameters_); + + using enum boost::beast::http::verb; + auto method = rawRequest.method(); + if (method == get) { + return transformGet(graph); + } else if (method == put) { + throwUnsupportedHTTPMethod("PUT"); + } else if (method == delete_) { + throwUnsupportedHTTPMethod("DELETE"); + } else if (method == post) { + return transformPost(rawRequest, graph); + } else if (method == head) { + throwUnsupportedHTTPMethod("HEAD"); + } else if (method == patch) { + throwUnsupportedHTTPMethod("PATCH"); + } else { + throw std::runtime_error( + absl::StrCat("Unsupported HTTP method \"", + std::string_view{rawRequest.method_string()}, + "\" for the SPARQL Graph Store HTTP Protocol.")); + } + } + + private: + // Extract the graph to be acted upon from the URL query parameters + // (`Indirect Graph Identification`). See + // https://www.w3.org/TR/2013/REC-sparql11-http-rdf-update-20130321/#indirect-graph-identification + static GraphOrDefault extractTargetGraph( + const ad_utility::url_parser::ParamValueMap& params); + FRIEND_TEST(GraphStoreProtocolTest, extractTargetGraph); +}; diff --git a/src/engine/Server.cpp b/src/engine/Server.cpp index 08fc6f9607..987c69fc7e 100644 --- a/src/engine/Server.cpp +++ b/src/engine/Server.cpp @@ -11,6 +11,7 @@ #include #include +#include "GraphStoreProtocol.h" #include "engine/ExecuteUpdate.h" #include "engine/ExportQueryExecutionTrees.h" #include "engine/QueryPlanner.h" @@ -347,8 +348,8 @@ Awaitable Server::process( // We always want to call `Server::checkParameter` with the same first // parameter. - auto checkParameter = - std::bind_front(&Server::checkParameter, std::cref(parameters)); + auto checkParameter = std::bind_front(&ad_utility::url_parser::checkParameter, + std::cref(parameters)); // Check the access token. 
If an access token is provided and the check fails, // throw an exception and do not process any part of the query (even if the @@ -537,9 +538,11 @@ Awaitable Server::process( std::pair Server::determineResultPinning( const ad_utility::url_parser::ParamValueMap& params) { const bool pinSubtrees = - checkParameter(params, "pinsubtrees", "true").has_value(); + ad_utility::url_parser::checkParameter(params, "pinsubtrees", "true") + .has_value(); const bool pinResult = - checkParameter(params, "pinresult", "true").has_value(); + ad_utility::url_parser::checkParameter(params, "pinresult", "true") + .has_value(); return {pinSubtrees, pinResult}; } @@ -729,17 +732,11 @@ Awaitable Server::sendStreamableResponse( } } -// ____________________________________________________________________________ -class NoSupportedMediatypeError : public std::runtime_error { - public: - explicit NoSupportedMediatypeError(std::string_view msg) - : std::runtime_error{std::string{msg}} {} -}; - // ____________________________________________________________________________ MediaType Server::determineMediaType( const ad_utility::url_parser::ParamValueMap& params, const ad_utility::httpUtils::HttpRequest auto& request) { + using namespace ad_utility::url_parser; // The following code block determines the media type to be used for the // result. The media type is either determined by the "Accept:" header of // the request or by the URL parameter "action=..." 
(for TSV and CSV export, @@ -997,7 +994,7 @@ Awaitable Server::processQueryOrUpdate( } catch (const QueryAlreadyInUseError& e) { responseStatus = http::status::conflict; exceptionErrorMsg = e.what(); - } catch (const NoSupportedMediatypeError& e) { + } catch (const UnknownMediatypeError& e) { responseStatus = http::status::bad_request; exceptionErrorMsg = e.what(); } catch (const ad_utility::CancellationException& e) { @@ -1118,24 +1115,3 @@ bool Server::checkAccessToken( return true; } } - -// _____________________________________________________________________________ -std::optional Server::checkParameter( - const ad_utility::url_parser::ParamValueMap& parameters, - std::string_view key, std::optional value) { - auto param = - ad_utility::url_parser::getParameterCheckAtMostOnce(parameters, key); - if (!param.has_value()) { - return std::nullopt; - } - std::string parameterValue = param.value(); - - // If value is given, but not equal to param value, return std::nullopt. If - // no value is given, set it to param value. - if (value == std::nullopt) { - value = parameterValue; - } else if (value != parameterValue) { - return std::nullopt; - } - return value; -} diff --git a/src/engine/Server.h b/src/engine/Server.h index 3ccc070cb7..399a5383d6 100644 --- a/src/engine/Server.h +++ b/src/engine/Server.h @@ -256,18 +256,6 @@ class Server { /// HTTP error response. bool checkAccessToken(std::optional accessToken) const; - /// Checks if a URL parameter exists in the request, and it matches the - /// expected `value`. If yes, return the value, otherwise return - /// `std::nullopt`. If `value` is `std::nullopt`, only check if the key - /// exists. We need this because we have parameters like "cmd=stats", where a - /// fixed combination of the key and value determines the kind of action, as - /// well as parameters like "index-decription=...", where the key determines - /// the kind of action. If the key is not found, always return `std::nullopt`. 
- static std::optional checkParameter( - const ad_utility::url_parser::ParamValueMap& parameters, - std::string_view key, std::optional value); - FRIEND_TEST(ServerTest, checkParameter); - /// Check if user-provided timeout is authorized with a valid access-token or /// lower than the server default. Return an empty optional and send a 403 /// Forbidden HTTP response if the change is not allowed. Return the new diff --git a/src/parser/TripleComponent.h b/src/parser/TripleComponent.h index fb874fc3c1..84abe54f7e 100644 --- a/src/parser/TripleComponent.h +++ b/src/parser/TripleComponent.h @@ -175,6 +175,10 @@ class TripleComponent { } [[nodiscard]] Variable& getVariable() { return std::get(_variant); } + bool isId() const { return std::holds_alternative(_variant); } + const Id& getId() const { return std::get(_variant); } + Id& getId() { return std::get(_variant); } + /// Convert to an RDF literal. `std::strings` will be emitted directly, /// `int64_t` is converted to a `xsd:integer` literal, and a `double` is /// converted to a `xsd:double`. diff --git a/src/util/http/MediaTypes.cpp b/src/util/http/MediaTypes.cpp index d91e7584df..b95d822562 100644 --- a/src/util/http/MediaTypes.cpp +++ b/src/util/http/MediaTypes.cpp @@ -18,7 +18,7 @@ using enum MediaType; // specified in the request. It's "application/sparql-results+json", as // required by the SPARQL standard. 
constexpr std::array SUPPORTED_MEDIA_TYPES{ - sparqlJson, sparqlXml, qleverJson, tsv, csv, turtle, octetStream}; + sparqlJson, sparqlXml, qleverJson, tsv, csv, turtle, ntriples, octetStream}; // _____________________________________________________________ const ad_utility::HashMap& getAllMediaTypes() { @@ -40,6 +40,7 @@ const ad_utility::HashMap& getAllMediaTypes() { add(sparqlXml, "application", "sparql-results+xml", {}); add(qleverJson, "application", "qlever-results+json", {}); add(turtle, "text", "turtle", {".ttl"}); + add(ntriples, "application", "n-triples", {".nt"}); add(octetStream, "application", "octet-stream", {}); return t; }(); diff --git a/src/util/http/MediaTypes.h b/src/util/http/MediaTypes.h index 0b2634b8ce..e03efd6842 100644 --- a/src/util/http/MediaTypes.h +++ b/src/util/http/MediaTypes.h @@ -28,6 +28,7 @@ enum class MediaType { tsv, csv, turtle, + ntriples, octetStream }; diff --git a/src/util/http/UrlParser.cpp b/src/util/http/UrlParser.cpp index 3149f97aec..1670fa937d 100644 --- a/src/util/http/UrlParser.cpp +++ b/src/util/http/UrlParser.cpp @@ -8,6 +8,7 @@ using namespace ad_utility::url_parser; +// _____________________________________________________________________________ std::optional ad_utility::url_parser::getParameterCheckAtMostOnce( const ParamValueMap& map, string_view key) { if (!map.contains(key)) { @@ -21,6 +22,27 @@ std::optional ad_utility::url_parser::getParameterCheckAtMostOnce( } return value.front(); } + +// _____________________________________________________________________________ +std::optional ad_utility::url_parser::checkParameter( + const ParamValueMap& parameters, std::string_view key, + std::optional value) { + const auto param = getParameterCheckAtMostOnce(parameters, key); + if (!param.has_value()) { + return std::nullopt; + } + std::string parameterValue = param.value(); + + // If no value is given, return the parameter's value. 
If value is given, but + // not equal to the parameter's value, return `std::nullopt`. + if (value == std::nullopt) { + value = parameterValue; + } else if (value != parameterValue) { + return std::nullopt; + } + return value; +} + // _____________________________________________________________________________ ParsedUrl ad_utility::url_parser::parseRequestTarget(std::string_view target) { auto urlResult = boost::urls::parse_origin_form(target); diff --git a/src/util/http/UrlParser.h b/src/util/http/UrlParser.h index 33ebc86b1d..47edd8404f 100644 --- a/src/util/http/UrlParser.h +++ b/src/util/http/UrlParser.h @@ -29,6 +29,13 @@ using ParamValueMap = ad_utility::HashMap>; std::optional getParameterCheckAtMostOnce(const ParamValueMap& map, string_view key); +// Checks if a parameter exists, and it matches the +// expected `value`. If yes, return the value, otherwise return +// `std::nullopt`. +std::optional checkParameter(const ParamValueMap& parameters, + std::string_view key, + std::optional value); + // A parsed URL. 
// - `path_` is the URL path // - `parameters_` is a map of the HTTP Query parameters diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index b9581312e8..5520ce2ff9 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -452,3 +452,5 @@ addLinkAndDiscoverTest(UrlParserTest) addLinkAndDiscoverTest(ServerTest engine) addLinkAndDiscoverTest(ExecuteUpdateTest engine) + +addLinkAndDiscoverTest(GraphStoreProtocolTest engine) diff --git a/test/GraphStoreProtocolTest.cpp b/test/GraphStoreProtocolTest.cpp new file mode 100644 index 0000000000..a7accb3ffd --- /dev/null +++ b/test/GraphStoreProtocolTest.cpp @@ -0,0 +1,247 @@ +// Copyright 2024, University of Freiburg +// Chair of Algorithms and Data Structures +// Authors: Julian Mundhahs + +#include +#include +#include + +#include "./util/GTestHelpers.h" +#include "./util/HttpRequestHelpers.h" +#include "./util/TripleComponentTestHelpers.h" +#include "SparqlAntlrParserTestHelpers.h" +#include "engine/GraphStoreProtocol.h" + +namespace m = matchers; +using namespace ad_utility::testing; + +using Var = Variable; +using TC = TripleComponent; + +// _____________________________________________________________________________________________ +TEST(GraphStoreProtocolTest, extractTargetGraph) { + // Equivalent to `/?default` + EXPECT_THAT(GraphStoreProtocol::extractTargetGraph({{"default", {""}}}), + DEFAULT{}); + // Equivalent to `/?graph=foo` + EXPECT_THAT(GraphStoreProtocol::extractTargetGraph({{"graph", {"foo"}}}), + iri("")); + // Equivalent to `/?graph=foo&graph=bar` + AD_EXPECT_THROW_WITH_MESSAGE( + GraphStoreProtocol::extractTargetGraph({{"graph", {"foo", "bar"}}}), + testing::HasSubstr( + "Parameter \"graph\" must be given exactly once. 
Is: 2")); + const std::string eitherDefaultOrGraphErrorMsg = + "Exactly one of the query parameters default or graph must be set to " + "identify the graph for the graph store protocol request."; + // Equivalent to `/` or `/?` + AD_EXPECT_THROW_WITH_MESSAGE( + GraphStoreProtocol::extractTargetGraph({}), + testing::HasSubstr(eitherDefaultOrGraphErrorMsg)); + // Equivalent to `/?unrelated=a&unrelated=b` + AD_EXPECT_THROW_WITH_MESSAGE( + GraphStoreProtocol::extractTargetGraph({{"unrelated", {"a", "b"}}}), + testing::HasSubstr(eitherDefaultOrGraphErrorMsg)); + // Equivalent to `/?default&graph=foo` + AD_EXPECT_THROW_WITH_MESSAGE( + GraphStoreProtocol::extractTargetGraph( + {{"default", {""}}, {"graph", {"foo"}}}), + testing::HasSubstr(eitherDefaultOrGraphErrorMsg)); +} + +// _____________________________________________________________________________________________ +TEST(GraphStoreProtocolTest, transformPost) { + auto expectTransformPost = + [](const ad_utility::httpUtils::HttpRequest auto& request, + const testing::Matcher& matcher, + ad_utility::source_location l = + ad_utility::source_location::current()) { + auto trace = generateLocationTrace(l); + const ad_utility::url_parser::ParsedUrl parsedUrl = + ad_utility::url_parser::parseRequestTarget(request.target()); + const GraphOrDefault graph = + GraphStoreProtocol::extractTargetGraph(parsedUrl.parameters_); + EXPECT_THAT(GraphStoreProtocol::transformPost(request, graph), matcher); + }; + + expectTransformPost( + makePostRequest("/?default", "text/turtle", " ."), + m::UpdateClause( + m::GraphUpdate( + {}, {{iri(""), iri(""), iri(""), std::monostate{}}}, + std::nullopt), + m::GraphPattern())); + expectTransformPost( + makePostRequest("/?default", "application/n-triples", " ."), + m::UpdateClause( + m::GraphUpdate( + {}, {{iri(""), iri(""), iri(""), std::monostate{}}}, + std::nullopt), + m::GraphPattern())); + expectTransformPost( + makePostRequest("/?graph=bar", "application/n-triples", " ."), + m::UpdateClause( + 
m::GraphUpdate({}, + {{iri(""), iri(""), iri(""), Iri("")}}, + std::nullopt), + m::GraphPattern())); + AD_EXPECT_THROW_WITH_MESSAGE( + GraphStoreProtocol::transformPost( + ad_utility::testing::makePostRequest( + "/?default", "application/sparql-results+xml", ""), + DEFAULT{}), + testing::HasSubstr( + "Mediatype \"application/sparql-results+xml\" is not supported for " + "SPARQL Graph Store HTTP Protocol in QLever.")); + AD_EXPECT_THROW_WITH_MESSAGE( + GraphStoreProtocol::transformPost( + ad_utility::testing::makePostRequest( + "/?default", "application/n-quads", " ."), + DEFAULT{}), + testing::HasSubstr("Not a single media type known to this parser was " + "detected in \"application/n-quads\".")); + AD_EXPECT_THROW_WITH_MESSAGE( + GraphStoreProtocol::transformPost( + ad_utility::testing::makePostRequest( + "/?default", "application/unknown", "fantasy"), + DEFAULT{}), + testing::HasSubstr("Not a single media type known to this parser was " + "detected in \"application/unknown\".")); +} + +// _____________________________________________________________________________________________ +TEST(GraphStoreProtocolTest, transformGet) { + auto expectTransformGet = + [](const ad_utility::httpUtils::HttpRequest auto& request, + const testing::Matcher& matcher, + ad_utility::source_location l = + ad_utility::source_location::current()) { + auto trace = generateLocationTrace(l); + const ad_utility::url_parser::ParsedUrl parsedUrl = + ad_utility::url_parser::parseRequestTarget(request.target()); + const GraphOrDefault graph = + GraphStoreProtocol::extractTargetGraph(parsedUrl.parameters_); + EXPECT_THAT(GraphStoreProtocol::transformGet(graph), matcher); + }; + expectTransformGet( + makeGetRequest("/?default"), + m::ConstructQuery({{Var{"?s"}, Var{"?p"}, Var{"?o"}}}, + m::GraphPattern(matchers::Triples({SparqlTriple( + TC(Var{"?s"}), "?p", TC(Var{"?o"}))})))); + expectTransformGet( + makeGetRequest("/?graph=foo"), + m::ConstructQuery({{Var{"?s"}, Var{"?p"}, Var{"?o"}}}, + 
m::GraphPattern(matchers::Triples({SparqlTriple( + TC(Var{"?s"}), "?p", TC(Var{"?o"}))})), + ScanSpecificationAsTripleComponent::Graphs{ + {TripleComponent(iri(""))}})); +} + +// _____________________________________________________________________________________________ +TEST(GraphStoreProtocolTest, transformGraphStoreProtocol) { + EXPECT_THAT(GraphStoreProtocol::transformGraphStoreProtocol( + ad_utility::testing::makeGetRequest("/?default")), + m::ConstructQuery({{Var{"?s"}, Var{"?p"}, Var{"?o"}}}, + m::GraphPattern(matchers::Triples({SparqlTriple( + TC(Var{"?s"}), "?p", TC(Var{"?o"}))})))); + EXPECT_THAT( + GraphStoreProtocol::transformGraphStoreProtocol( + ad_utility::testing::makePostRequest( + "/?default", "application/n-triples", " .")), + m::UpdateClause(m::GraphUpdate({}, + {{iri(""), iri(""), iri(""), + std::monostate{}}}, + std::nullopt), + m::GraphPattern())); + auto expectUnsupportedMethod = + [](const http::verb method, ad_utility::source_location l = + ad_utility::source_location::current()) { + auto trace = generateLocationTrace(l); + AD_EXPECT_THROW_WITH_MESSAGE( + GraphStoreProtocol::transformGraphStoreProtocol( + ad_utility::testing::makeRequest(method, "/?default")), + testing::HasSubstr( + absl::StrCat(std::string{boost::beast::http::to_string(method)}, + " in the SPARQL Graph Store HTTP Protocol"))); + }; + expectUnsupportedMethod(http::verb::put); + expectUnsupportedMethod(http::verb::delete_); + expectUnsupportedMethod(http::verb::head); + expectUnsupportedMethod(http::verb::patch); + AD_EXPECT_THROW_WITH_MESSAGE( + GraphStoreProtocol::transformGraphStoreProtocol( + ad_utility::testing::makeRequest(boost::beast::http::verb::connect, + "/?default")), + testing::HasSubstr("Unsupported HTTP method \"CONNECT\"")); +} + +// _____________________________________________________________________________________________ +TEST(GraphStoreProtocolTest, extractMediatype) { + using enum http::field; + auto makeRequest = + [](const ad_utility::HashMap& 
headers) { + return ad_utility::testing::makeRequest(http::verb::get, "/", headers); + }; + AD_EXPECT_THROW_WITH_MESSAGE( + GraphStoreProtocol::extractMediatype(makeRequest({})), + testing::HasSubstr("Mediatype empty or not set.")); + AD_EXPECT_THROW_WITH_MESSAGE( + GraphStoreProtocol::extractMediatype(makeRequest({{content_type, ""}})), + testing::HasSubstr("Mediatype empty or not set.")); + EXPECT_THAT(GraphStoreProtocol::extractMediatype( + makeRequest({{content_type, "text/csv"}})), + testing::Eq(ad_utility::MediaType::csv)); + AD_EXPECT_THROW_WITH_MESSAGE( + GraphStoreProtocol::extractMediatype( + makeRequest({{content_type, "text/plain"}})), + testing::HasSubstr("Mediatype \"text/plain\" is not supported for SPARQL " + "Graph Store HTTP Protocol in QLever.")); + EXPECT_THAT(GraphStoreProtocol::extractMediatype( + makeRequest({{content_type, "application/n-triples"}})), + testing::Eq(ad_utility::MediaType::ntriples)); +} + +// _____________________________________________________________________________________________ +TEST(GraphStoreProtocolTest, parseTriples) { + AD_EXPECT_THROW_WITH_MESSAGE( + GraphStoreProtocol::parseTriples(" ", + ad_utility::MediaType::json), + testing::HasSubstr( + "Mediatype \"application/json\" is not supported for SPARQL " + "Graph Store HTTP Protocol in QLever.")); + const auto expectedTriples = + std::vector{{{iri("")}, {iri("")}, {iri("")}}}; + EXPECT_THAT(GraphStoreProtocol::parseTriples(" .", + ad_utility::MediaType::ntriples), + testing::Eq(expectedTriples)); + EXPECT_THAT(GraphStoreProtocol::parseTriples(" .", + ad_utility::MediaType::turtle), + testing::Eq(expectedTriples)); + EXPECT_THAT( + GraphStoreProtocol::parseTriples("", ad_utility::MediaType::ntriples), + testing::Eq(std::vector{})); + AD_EXPECT_THROW_WITH_MESSAGE( + GraphStoreProtocol::parseTriples(" ", + ad_utility::MediaType::ntriples), + testing::HasSubstr(" Parse error at byte position 7")); +} + +// 
_____________________________________________________________________________________________ +TEST(GraphStoreProtocolTest, convertTriples) { + auto expectConvert = + [](const GraphOrDefault& graph, std::vector triples, + const std::vector& expectedTriples, + ad_utility::source_location l = + ad_utility::source_location::current()) { + auto trace = generateLocationTrace(l); + EXPECT_THAT( + GraphStoreProtocol::convertTriples(graph, std::move(triples)), + testing::Eq(expectedTriples)); + }; + expectConvert(DEFAULT{}, {}, {}); + expectConvert(iri(""), {}, {}); + expectConvert(DEFAULT{}, {{{iri("")}, {iri("")}, {iri("")}}}, + {SparqlTripleSimpleWithGraph{iri(""), iri(""), iri(""), + std::monostate{}}}); + expectConvert(iri(""), {}, {}); +} diff --git a/test/ServerTest.cpp b/test/ServerTest.cpp index 292d77f12b..72480c9434 100644 --- a/test/ServerTest.cpp +++ b/test/ServerTest.cpp @@ -9,11 +9,13 @@ #include #include "util/GTestHelpers.h" +#include "util/HttpRequestHelpers.h" #include "util/http/HttpUtils.h" #include "util/http/UrlParser.h" using namespace ad_utility::url_parser; using namespace ad_utility::url_parser::sparqlOperation; +using namespace ad_utility::testing; namespace { auto ParsedRequestIs = [](const std::string& path, @@ -27,27 +29,9 @@ auto ParsedRequestIs = [](const std::string& path, AD_FIELD(ad_utility::url_parser::ParsedRequest, operation_, testing::Eq(operation))); }; -auto MakeBasicRequest = [](http::verb method, const std::string& target) { - // version 11 stands for HTTP/1.1 - return http::request{method, target, 11}; -}; -auto MakeGetRequest = [](const std::string& target) { - return MakeBasicRequest(http::verb::get, target); -}; -auto MakePostRequest = [](const std::string& target, - const std::string& contentType, - const std::string& body) { - auto req = MakeBasicRequest(http::verb::post, target); - req.set(http::field::content_type, contentType); - req.body() = body; - req.prepare_payload(); - return req; -}; } // namespace 
TEST(ServerTest, parseHttpRequest) { - namespace http = boost::beast::http; - auto parse = [](const ad_utility::httpUtils::HttpRequest auto& request) { return Server::parseHttpRequest(request); }; @@ -55,49 +39,49 @@ TEST(ServerTest, parseHttpRequest) { "application/x-www-form-urlencoded;charset=UTF-8"; const std::string QUERY = "application/sparql-query"; const std::string UPDATE = "application/sparql-update"; - EXPECT_THAT(parse(MakeGetRequest("/")), ParsedRequestIs("/", {}, None{})); - EXPECT_THAT(parse(MakeGetRequest("/ping")), + EXPECT_THAT(parse(makeGetRequest("/")), ParsedRequestIs("/", {}, None{})); + EXPECT_THAT(parse(makeGetRequest("/ping")), ParsedRequestIs("/ping", {}, None{})); - EXPECT_THAT(parse(MakeGetRequest("/?cmd=stats")), + EXPECT_THAT(parse(makeGetRequest("/?cmd=stats")), ParsedRequestIs("/", {{"cmd", {"stats"}}}, None{})); - EXPECT_THAT(parse(MakeGetRequest( + EXPECT_THAT(parse(makeGetRequest( "/?query=SELECT+%2A%20WHERE%20%7B%7D&action=csv_export")), ParsedRequestIs("/", {{"action", {"csv_export"}}}, Query{"SELECT * WHERE {}"})); EXPECT_THAT( - parse(MakePostRequest("/", URLENCODED, + parse(makePostRequest("/", URLENCODED, "query=SELECT+%2A%20WHERE%20%7B%7D&send=100")), ParsedRequestIs("/", {{"send", {"100"}}}, Query{"SELECT * WHERE {}"})); AD_EXPECT_THROW_WITH_MESSAGE( - parse(MakePostRequest("/", URLENCODED, + parse(makePostRequest("/", URLENCODED, "ääär y=SELECT+%2A%20WHERE%20%7B%7D&send=100")), ::testing::HasSubstr("Invalid URL-encoded POST request")); AD_EXPECT_THROW_WITH_MESSAGE( - parse(MakeGetRequest("/?query=SELECT%20%2A%20WHERE%20%7B%7D&query=SELECT%" + parse(makeGetRequest("/?query=SELECT%20%2A%20WHERE%20%7B%7D&query=SELECT%" "20%3Ffoo%20WHERE%20%7B%7D")), ::testing::StrEq( "Parameter \"query\" must be given exactly once. 
Is: 2")); AD_EXPECT_THROW_WITH_MESSAGE( - parse(MakePostRequest("/", URLENCODED, + parse(makePostRequest("/", URLENCODED, "query=SELECT%20%2A%20WHERE%20%7B%7D&update=DELETE%" "20%7B%7D%20WHERE%20%7B%7D")), ::testing::HasSubstr( "Request must only contain one of \"query\" and \"update\".")); AD_EXPECT_THROW_WITH_MESSAGE( - parse(MakePostRequest("/", URLENCODED, + parse(makePostRequest("/", URLENCODED, "update=DELETE%20%7B%7D%20WHERE%20%7B%7D&update=" "DELETE%20%7B%7D%20WHERE%20%7B%7D")), ::testing::StrEq( "Parameter \"update\" must be given exactly once. Is: 2")); EXPECT_THAT( - parse(MakePostRequest("/", "application/x-www-form-urlencoded", + parse(makePostRequest("/", "application/x-www-form-urlencoded", "query=SELECT%20%2A%20WHERE%20%7B%7D&send=100")), ParsedRequestIs("/", {{"send", {"100"}}}, Query{"SELECT * WHERE {}"})); - EXPECT_THAT(parse(MakePostRequest("/", URLENCODED, + EXPECT_THAT(parse(makePostRequest("/", URLENCODED, "query=SELECT%20%2A%20WHERE%20%7B%7D")), ParsedRequestIs("/", {}, Query{"SELECT * WHERE {}"})); EXPECT_THAT( - parse(MakePostRequest( + parse(makePostRequest( "/", URLENCODED, "query=SELECT%20%2A%20WHERE%20%7B%7D&default-graph-uri=https%3A%2F%" "2Fw3.org%2Fdefault&named-graph-uri=https%3A%2F%2Fw3.org%2F1&named-" @@ -108,63 +92,40 @@ TEST(ServerTest, parseHttpRequest) { {"named-graph-uri", {"https://w3.org/1", "https://w3.org/2"}}}, Query{"SELECT * WHERE {}"})); AD_EXPECT_THROW_WITH_MESSAGE( - parse(MakePostRequest("/?send=100", URLENCODED, + parse(makePostRequest("/?send=100", URLENCODED, "query=SELECT%20%2A%20WHERE%20%7B%7D")), testing::StrEq("URL-encoded POST requests must not contain query " "parameters in the URL.")); - EXPECT_THAT(parse(MakePostRequest("/", URLENCODED, "cmd=clear-cache")), + EXPECT_THAT(parse(makePostRequest("/", URLENCODED, "cmd=clear-cache")), ParsedRequestIs("/", {{"cmd", {"clear-cache"}}}, None{})); - EXPECT_THAT(parse(MakePostRequest("/", QUERY, "SELECT * WHERE {}")), + EXPECT_THAT(parse(makePostRequest("/", 
QUERY, "SELECT * WHERE {}")), ParsedRequestIs("/", {}, Query{"SELECT * WHERE {}"})); EXPECT_THAT( - parse(MakePostRequest("/?send=100", QUERY, "SELECT * WHERE {}")), + parse(makePostRequest("/?send=100", QUERY, "SELECT * WHERE {}")), ParsedRequestIs("/", {{"send", {"100"}}}, Query{"SELECT * WHERE {}"})); AD_EXPECT_THROW_WITH_MESSAGE( - parse(MakeBasicRequest(http::verb::patch, "/")), + parse(makeRequest(http::verb::patch, "/")), testing::StrEq( "Request method \"PATCH\" not supported (has to be GET or POST)")); AD_EXPECT_THROW_WITH_MESSAGE( - parse(MakePostRequest("/", "invalid/content-type", "")), + parse(makePostRequest("/", "invalid/content-type", "")), testing::StrEq( "POST request with content type \"invalid/content-type\" not " "supported (must be \"application/x-www-form-urlencoded\", " "\"application/sparql-query\" or \"application/sparql-update\")")); AD_EXPECT_THROW_WITH_MESSAGE( - parse(MakeGetRequest("/?update=DELETE%20%2A%20WHERE%20%7B%7D")), + parse(makeGetRequest("/?update=DELETE%20%2A%20WHERE%20%7B%7D")), testing::StrEq("SPARQL Update is not allowed as GET request.")); - EXPECT_THAT(parse(MakePostRequest("/", UPDATE, "DELETE * WHERE {}")), + EXPECT_THAT(parse(makePostRequest("/", UPDATE, "DELETE * WHERE {}")), ParsedRequestIs("/", {}, Update{"DELETE * WHERE {}"})); - EXPECT_THAT(parse(MakePostRequest("/", URLENCODED, + EXPECT_THAT(parse(makePostRequest("/", URLENCODED, "update=DELETE%20%2A%20WHERE%20%7B%7D")), ParsedRequestIs("/", {}, Update{"DELETE * WHERE {}"})); - EXPECT_THAT(parse(MakePostRequest("/", URLENCODED, + EXPECT_THAT(parse(makePostRequest("/", URLENCODED, "update=DELETE+%2A+WHERE%20%7B%7D")), ParsedRequestIs("/", {}, Update{"DELETE * WHERE {}"})); } -TEST(ServerTest, checkParameter) { - const ParamValueMap exampleParams = {{"foo", {"bar"}}, - {"baz", {"qux", "quux"}}}; - - EXPECT_THAT(Server::checkParameter(exampleParams, "doesNotExist", ""), - testing::Eq(std::nullopt)); - EXPECT_THAT(Server::checkParameter(exampleParams, "foo", 
"baz"), - testing::Eq(std::nullopt)); - EXPECT_THAT(Server::checkParameter(exampleParams, "foo", "bar"), - testing::Optional(testing::StrEq("bar"))); - AD_EXPECT_THROW_WITH_MESSAGE( - Server::checkParameter(exampleParams, "baz", "qux"), - testing::StrEq("Parameter \"baz\" must be given exactly once. Is: 2")); - EXPECT_THAT(Server::checkParameter(exampleParams, "foo", std::nullopt), - testing::Optional(testing::StrEq("bar"))); - AD_EXPECT_THROW_WITH_MESSAGE( - Server::checkParameter(exampleParams, "baz", std::nullopt), - testing::StrEq("Parameter \"baz\" must be given exactly once. Is: 2")); - AD_EXPECT_THROW_WITH_MESSAGE( - Server::checkParameter(exampleParams, "baz", std::nullopt), - testing::StrEq("Parameter \"baz\" must be given exactly once. Is: 2")); -} - TEST(ServerTest, determineResultPinning) { EXPECT_THAT(Server::determineResultPinning( {{"pinsubtrees", {"true"}}, {"pinresult", {"true"}}}), @@ -226,9 +187,9 @@ TEST(ServerTest, determineMediaType) { TEST(ServerTest, getQueryId) { using namespace ad_utility::websocket; Server server{9999, 1, ad_utility::MemorySize::megabytes(1), "accessToken"}; - auto reqWithExplicitQueryId = MakeGetRequest("/"); + auto reqWithExplicitQueryId = makeGetRequest("/"); reqWithExplicitQueryId.set("Query-Id", "100"); - const auto req = MakeGetRequest("/"); + const auto req = makeGetRequest("/"); { // A request with a custom query id. auto queryId1 = server.getQueryId(reqWithExplicitQueryId, @@ -250,10 +211,10 @@ TEST(ServerTest, getQueryId) { TEST(ServerTest, createMessageSender) { Server server{9999, 1, ad_utility::MemorySize::megabytes(1), "accessToken"}; - auto reqWithExplicitQueryId = MakeGetRequest("/"); + auto reqWithExplicitQueryId = makeGetRequest("/"); std::string customQueryId = "100"; reqWithExplicitQueryId.set("Query-Id", customQueryId); - const auto req = MakeGetRequest("/"); + const auto req = makeGetRequest("/"); // The query hub is only valid once, the server has been started. 
AD_EXPECT_THROW_WITH_MESSAGE( server.createMessageSender(server.queryHub_, req, diff --git a/test/TripleComponentTest.cpp b/test/TripleComponentTest.cpp index 2cc5c74cb3..2c67b16823 100644 --- a/test/TripleComponentTest.cpp +++ b/test/TripleComponentTest.cpp @@ -69,6 +69,24 @@ TEST(TripleComponent, setAndGetVariable) { ASSERT_EQ(tc.getVariable(), Variable{"?x"}); } +TEST(TripleComponent, setAndGetId) { + Id id = Id::makeFromVocabIndex(VocabIndex::make(1)); + TripleComponent tc{id}; + ASSERT_TRUE(tc.isId()); + ASSERT_FALSE(tc.isVariable()); + ASSERT_FALSE(tc.isString()); + ASSERT_FALSE(tc.isDouble()); + ASSERT_FALSE(tc.isInt()); + ASSERT_FALSE(tc.isBool()); + ASSERT_FALSE(tc.isIri()); + ASSERT_FALSE(tc.isLiteral()); + ASSERT_FALSE(tc.isUndef()); + ASSERT_EQ(tc, id); + ASSERT_EQ(tc.getId(), id); + const TripleComponent tcConst = std::move(tc); + ASSERT_EQ(tcConst.getId(), id); +} + TEST(TripleComponent, assignmentOperator) { TripleComponent object; object = -12.435; diff --git a/test/UrlParserTest.cpp b/test/UrlParserTest.cpp index eb6aeebc4d..d0b09ab030 100644 --- a/test/UrlParserTest.cpp +++ b/test/UrlParserTest.cpp @@ -128,3 +128,26 @@ TEST(UrlParserTest, parseDatasetClauses) { {iri(""), true}, {iri(""), true}})); } + +TEST(UrlParserTest, checkParameter) { + const url_parser::ParamValueMap exampleParams = {{"foo", {"bar"}}, + {"baz", {"qux", "quux"}}}; + + EXPECT_THAT(url_parser::checkParameter(exampleParams, "doesNotExist", ""), + ::testing::Eq(std::nullopt)); + EXPECT_THAT(url_parser::checkParameter(exampleParams, "foo", "baz"), + ::testing::Eq(std::nullopt)); + EXPECT_THAT(url_parser::checkParameter(exampleParams, "foo", "bar"), + ::testing::Optional(::testing::StrEq("bar"))); + AD_EXPECT_THROW_WITH_MESSAGE( + url_parser::checkParameter(exampleParams, "baz", "qux"), + ::testing::StrEq("Parameter \"baz\" must be given exactly once. 
Is: 2")); + EXPECT_THAT(url_parser::checkParameter(exampleParams, "foo", std::nullopt), + ::testing::Optional(::testing::StrEq("bar"))); + AD_EXPECT_THROW_WITH_MESSAGE( + url_parser::checkParameter(exampleParams, "baz", std::nullopt), + ::testing::StrEq("Parameter \"baz\" must be given exactly once. Is: 2")); + AD_EXPECT_THROW_WITH_MESSAGE( + url_parser::checkParameter(exampleParams, "baz", std::nullopt), + ::testing::StrEq("Parameter \"baz\" must be given exactly once. Is: 2")); +} diff --git a/test/util/HttpRequestHelpers.h b/test/util/HttpRequestHelpers.h new file mode 100644 index 0000000000..4505e828db --- /dev/null +++ b/test/util/HttpRequestHelpers.h @@ -0,0 +1,46 @@ +// Copyright 2024, University of Freiburg +// Chair of Algorithms and Data Structures +// Authors: Julian Mundhahs + +#pragma once + +#include "util/HashMap.h" +#include "util/http/beast.h" + +namespace ad_utility::testing { + +namespace http = boost::beast::http; + +// Construct a boost::beast request with the HTTP method, target path, headers +// and body. +inline auto makeRequest( + const http::verb method = http::verb::get, const std::string& target = "/", + const ad_utility::HashMap& headers = {}, + const std::optional& body = std::nullopt) { + // version 11 stands for HTTP/1.1 + auto req = http::request{method, target, 11}; + for (const auto& [key, value] : headers) { + req.set(key, value); + } + if (body.has_value()) { + req.body() = body.value(); + req.prepare_payload(); + } + return req; +} + +// Constructs a boost::beast GET request with the target path. +inline auto makeGetRequest(const std::string& target) { + return makeRequest(http::verb::get, target); +} + +// Constructs a boost::beast POST request with the target path, body content +// type and body content. 
+inline auto makePostRequest(const std::string& target, + const std::string& contentType, + const std::string& body) { + return makeRequest(http::verb::post, target, + {{http::field::content_type, contentType}}, body); +} + +} // namespace ad_utility::testing