diff --git a/src/global/IndexTypes.h b/src/global/IndexTypes.h index 08ee960d00..4868e59694 100644 --- a/src/global/IndexTypes.h +++ b/src/global/IndexTypes.h @@ -16,3 +16,4 @@ using LocalVocabIndex = const LocalVocabEntry*; using TextRecordIndex = ad_utility::TypedIndex; using WordVocabIndex = ad_utility::TypedIndex; using BlankNodeIndex = ad_utility::TypedIndex; +using DocumentIndex = ad_utility::TypedIndex; diff --git a/src/index/IndexImpl.Text.cpp b/src/index/IndexImpl.Text.cpp index 76c0015974..d1e6a70777 100644 --- a/src/index/IndexImpl.Text.cpp +++ b/src/index/IndexImpl.Text.cpp @@ -17,39 +17,22 @@ #include "backports/algorithm.h" #include "engine/CallFixedSize.h" #include "index/FTSAlgorithms.h" -#include "parser/ContextFileParser.h" +#include "parser/WordsAndDocsFileParser.h" #include "util/Conversions.h" #include "util/Simple8bCode.h" -namespace { - -// Custom delimiter class for tokenization of literals using `absl::StrSplit`. -// The `Find` function returns the next delimiter in `text` after the given -// `pos` or an empty substring if there is no next delimiter. -struct LiteralsTokenizationDelimiter { - absl::string_view Find(absl::string_view text, size_t pos) { - auto isWordChar = [](char c) -> bool { return std::isalnum(c); }; - auto found = std::find_if_not(text.begin() + pos, text.end(), isWordChar); - if (found == text.end()) return text.substr(text.size()); - return {found, found + 1}; - } -}; - -} // namespace - // _____________________________________________________________________________ -cppcoro::generator IndexImpl::wordsInTextRecords( - const std::string& contextFile, bool addWordsFromLiterals) { +cppcoro::generator IndexImpl::wordsInTextRecords( + std::string contextFile, bool addWordsFromLiterals) const { auto localeManager = textVocab_.getLocaleManager(); // ROUND 1: If context file aka wordsfile is not empty, read words from there. // Remember the last context id for the (optional) second round. 
TextRecordIndex contextId = TextRecordIndex::make(0); if (!contextFile.empty()) { - ContextFileParser::Line line; - ContextFileParser p(contextFile, localeManager); + WordsFileParser p(contextFile, localeManager); ad_utility::HashSet items; - while (p.getLine(line)) { - contextId = line._contextId; + for (auto& line : p) { + contextId = line.contextId_; co_yield line; } if (contextId > TextRecordIndex::make(0)) { @@ -65,15 +48,13 @@ cppcoro::generator IndexImpl::wordsInTextRecords( if (!isLiteral(text)) { continue; } - ContextFileParser::Line entityLine{text, true, contextId, 1, true}; + WordsFileLine entityLine{text, true, contextId, 1, true}; co_yield entityLine; std::string_view textView = text; textView = textView.substr(0, textView.rfind('"')); textView.remove_prefix(1); - for (auto word : absl::StrSplit(textView, LiteralsTokenizationDelimiter{}, - absl::SkipEmpty{})) { - auto wordNormalized = localeManager.getLowercaseUtf8(word); - ContextFileParser::Line wordLine{wordNormalized, false, contextId, 1}; + for (auto word : tokenizeAndNormalizeText(textView, localeManager)) { + WordsFileLine wordLine{std::move(word), false, contextId, 1}; co_yield wordLine; } contextId = contextId.incremented(); @@ -81,6 +62,56 @@ cppcoro::generator IndexImpl::wordsInTextRecords( } } +// _____________________________________________________________________________ +void IndexImpl::processEntityCaseDuringInvertedListProcessing( + const WordsFileLine& line, + ad_utility::HashMap& entitiesInContext, size_t& nofLiterals, + size_t& entityNotFoundErrorMsgCount) const { + VocabIndex eid; + // TODO Currently only IRIs and strings from the vocabulary can + // be tagged entities in the text index (no doubles, ints, etc). + if (getVocab().getId(line.word_, &eid)) { + // Note that `entitiesInContext` is a HashMap, so the `Id`s don't have + // to be contiguous. 
+ entitiesInContext[Id::makeFromVocabIndex(eid)] += line.score_; + if (line.isLiteralEntity_) { + ++nofLiterals; + } + } else { + logEntityNotFound(line.word_, entityNotFoundErrorMsgCount); + } +} + +// _____________________________________________________________________________ +void IndexImpl::processWordCaseDuringInvertedListProcessing( + const WordsFileLine& line, + ad_utility::HashMap& wordsInContext) const { + // TODO Let the `textVocab_` return a `WordIndex` directly. + WordVocabIndex vid; + bool ret = textVocab_.getId(line.word_, &vid); + WordIndex wid = vid.get(); + if (!ret) { + LOG(ERROR) << "ERROR: word \"" << line.word_ << "\" " + << "not found in textVocab. Terminating\n"; + AD_FAIL(); + } + wordsInContext[wid] += line.score_; +} + +// _____________________________________________________________________________ +void IndexImpl::logEntityNotFound(const string& word, + size_t& entityNotFoundErrorMsgCount) const { + if (entityNotFoundErrorMsgCount < 20) { + LOG(WARN) << "Entity from text not in KB: " << word << '\n'; + if (++entityNotFoundErrorMsgCount == 20) { + LOG(WARN) << "There are more entities not in the KB..." 
+ << " suppressing further warnings...\n"; + } + } else { + entityNotFoundErrorMsgCount++; + } +} + // _____________________________________________________________________________ void IndexImpl::addTextFromContextFile(const string& contextFile, bool addWordsFromLiterals) { @@ -214,12 +245,12 @@ size_t IndexImpl::processWordsForVocabulary(string const& contextFile, for (auto line : wordsInTextRecords(contextFile, addWordsFromLiterals)) { ++numLines; // LOG(INFO) << "LINE: " - // << std::setw(50) << line._word << " " - // << line._isEntity << "\t" - // << line._contextId.get() << "\t" - // << line._score << std::endl; - if (!line._isEntity) { - distinctWords.insert(line._word); + // << std::setw(50) << line.word_ << " " + // << line.isEntity_ << "\t" + // << line.contextId_.get() << "\t" + // << line.score_ << std::endl; + if (!line.isEntity_) { + distinctWords.insert(line.word_); } } textVocab_.createFromSet(distinctWords, onDiskBase_ + ".text.vocabulary"); @@ -243,49 +274,21 @@ void IndexImpl::processWordsForInvertedLists(const string& contextFile, size_t nofLiterals = 0; for (auto line : wordsInTextRecords(contextFile, addWordsFromLiterals)) { - if (line._contextId != currentContext) { + if (line.contextId_ != currentContext) { ++nofContexts; addContextToVector(writer, currentContext, wordsInContext, entitiesInContext); - currentContext = line._contextId; + currentContext = line.contextId_; wordsInContext.clear(); entitiesInContext.clear(); } - if (line._isEntity) { + if (line.isEntity_) { ++nofEntityPostings; - // TODO Currently only IRIs and strings from the vocabulary can - // be tagged entities in the text index (no doubles, ints, etc). - VocabIndex eid; - if (getVocab().getId(line._word, &eid)) { - // Note that `entitiesInContext` is a HashMap, so the `Id`s don't have - // to be contiguous. 
- entitiesInContext[Id::makeFromVocabIndex(eid)] += line._score; - if (line._isLiteralEntity) { - ++nofLiterals; - } - } else { - if (entityNotFoundErrorMsgCount < 20) { - LOG(WARN) << "Entity from text not in KB: " << line._word << '\n'; - if (++entityNotFoundErrorMsgCount == 20) { - LOG(WARN) << "There are more entities not in the KB..." - << " suppressing further warnings...\n"; - } - } else { - entityNotFoundErrorMsgCount++; - } - } + processEntityCaseDuringInvertedListProcessing( + line, entitiesInContext, nofLiterals, entityNotFoundErrorMsgCount); } else { ++nofWordPostings; - // TODO Let the `textVocab_` return a `WordIndex` directly. - WordVocabIndex vid; - bool ret = textVocab_.getId(line._word, &vid); - WordIndex wid = vid.get(); - if (!ret) { - LOG(ERROR) << "ERROR: word \"" << line._word << "\" " - << "not found in textVocab. Terminating\n"; - AD_FAIL(); - } - wordsInContext[wid] += line._score; + processWordCaseDuringInvertedListProcessing(line, wordsInContext); } } if (entityNotFoundErrorMsgCount > 0) { diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h index d9ec19eb14..ac0003db87 100644 --- a/src/index/IndexImpl.h +++ b/src/index/IndexImpl.h @@ -29,9 +29,9 @@ #include "index/TextMetaData.h" #include "index/Vocabulary.h" #include "index/VocabularyMerger.h" -#include "parser/ContextFileParser.h" #include "parser/RdfParser.h" #include "parser/TripleComponent.h" +#include "parser/WordsAndDocsFileParser.h" #include "util/BufferedVector.h" #include "util/CancellationHandle.h" #include "util/File.h" @@ -521,8 +521,20 @@ class IndexImpl { // TODO: So far, this is limited to the internal vocabulary (still in the // testing phase, once it works, it should be easy to include the IRIs and // literals from the external vocabulary as well). 
- cppcoro::generator wordsInTextRecords( - const std::string& contextFile, bool addWordsFromLiterals); + cppcoro::generator wordsInTextRecords( + std::string contextFile, bool addWordsFromLiterals) const; + + void processEntityCaseDuringInvertedListProcessing( + const WordsFileLine& line, + ad_utility::HashMap& entitiesInContext, size_t& nofLiterals, + size_t& entityNotFoundErrorMsgCount) const; + + void processWordCaseDuringInvertedListProcessing( + const WordsFileLine& line, + ad_utility::HashMap& wordsInContext) const; + + void logEntityNotFound(const string& word, + size_t& entityNotFoundErrorMsgCount) const; size_t processWordsForVocabulary(const string& contextFile, bool addWordsFromLiterals); diff --git a/src/parser/CMakeLists.txt b/src/parser/CMakeLists.txt index be4b3db44c..6fa123a793 100644 --- a/src/parser/CMakeLists.txt +++ b/src/parser/CMakeLists.txt @@ -10,7 +10,7 @@ add_library(parser ParsedQuery.cpp RdfParser.cpp Tokenizer.cpp - ContextFileParser.cpp + WordsAndDocsFileParser.cpp TurtleTokenId.h ParallelBuffer.cpp SparqlParserHelpers.cpp diff --git a/src/parser/ContextFileParser.cpp b/src/parser/ContextFileParser.cpp deleted file mode 100644 index 523bde486b..0000000000 --- a/src/parser/ContextFileParser.cpp +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2015, University of Freiburg, -// Chair of Algorithms and Data Structures. 
-// Author: Björn Buchhold (buchhold@informatik.uni-freiburg.de) - -#include "./ContextFileParser.h" - -#include - -#include "../util/Exception.h" -#include "../util/StringUtils.h" - -// _____________________________________________________________________________ -ContextFileParser::ContextFileParser(const string& contextFile, - LocaleManager localeManager) - : _in(contextFile), _localeManager(std::move(localeManager)) {} - -// _____________________________________________________________________________ -ContextFileParser::~ContextFileParser() { _in.close(); } - -// _____________________________________________________________________________ -bool ContextFileParser::getLine(ContextFileParser::Line& line) { - string l; - if (std::getline(_in, l)) { - size_t i = l.find('\t'); - assert(i != string::npos); - size_t j = i + 2; - assert(j + 3 < l.size()); - size_t k = l.find('\t', j + 2); - assert(k != string::npos); - line._isEntity = (l[i + 1] == '1'); - line._word = - (line._isEntity ? l.substr(0, i) - : _localeManager.getLowercaseUtf8(l.substr(0, i))); - line._contextId = - TextRecordIndex::make(atol(l.substr(j + 1, k - j - 1).c_str())); - line._score = static_cast(atol(l.substr(k + 1).c_str())); -#ifndef NDEBUG - if (_lastCId > line._contextId) { - AD_THROW("ContextFile has to be sorted by context Id."); - } - _lastCId = line._contextId; -#endif - return true; - } - return false; -} diff --git a/src/parser/ContextFileParser.h b/src/parser/ContextFileParser.h deleted file mode 100644 index ba8d7bac9c..0000000000 --- a/src/parser/ContextFileParser.h +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2015, University of Freiburg, -// Chair of Algorithms and Data Structures. 
-// Author: Björn Buchhold (buchhold@informatik.uni-freiburg.de) - -#pragma once - -#include - -#include -#include - -#include "../global/Id.h" -#include "../index/StringSortComparator.h" - -using std::string; - -class ContextFileParser { - public: - struct Line { - string _word; - bool _isEntity; - TextRecordIndex _contextId; - Score _score; - bool _isLiteralEntity = false; - }; - - explicit ContextFileParser(const string& contextFile, - LocaleManager localeManager); - ~ContextFileParser(); - // Don't allow copy & assignment - explicit ContextFileParser(const ContextFileParser& other) = delete; - ContextFileParser& operator=(const ContextFileParser& other) = delete; - - // Get the next line from the file. - // Returns true if something was stored. - bool getLine(Line&); - - private: - std::ifstream _in; -#ifndef NDEBUG - // Only used for sanity checks in debug builds - TextRecordIndex _lastCId = TextRecordIndex::make(0); -#endif - LocaleManager _localeManager; -}; diff --git a/src/parser/WordsAndDocsFileParser.cpp b/src/parser/WordsAndDocsFileParser.cpp new file mode 100644 index 0000000000..e7d36974c6 --- /dev/null +++ b/src/parser/WordsAndDocsFileParser.cpp @@ -0,0 +1,61 @@ +// Copyright 2015, University of Freiburg, +// Chair of Algorithms and Data Structures. 
+// Author: Björn Buchhold (buchhold@informatik.uni-freiburg.de) +// Felix Meisen (fesemeisen@outlook.de) + +#include "parser/WordsAndDocsFileParser.h" + +#include + +#include "util/Exception.h" +#include "util/StringUtils.h" + +// _____________________________________________________________________________ +WordsAndDocsFileParser::WordsAndDocsFileParser( + const string& wordsOrDocsFile, const LocaleManager& localeManager) + : in_(wordsOrDocsFile), localeManager_(localeManager) {} + +// _____________________________________________________________________________ +ad_utility::InputRangeFromGet::Storage WordsFileParser::get() { + WordsFileLine line; + string l; + if (!std::getline(getInputStream(), l)) { + return std::nullopt; + } + std::string_view lineView(l); + size_t i = lineView.find('\t'); + assert(i != string::npos); + size_t j = i + 2; + assert(j + 3 < lineView.size()); + size_t k = lineView.find('\t', j + 2); + assert(k != string::npos); + line.isEntity_ = (lineView[i + 1] == '1'); + line.word_ = + (line.isEntity_ + ? 
lineView.substr(0, i) + : getLocaleManager().getLowercaseUtf8(lineView.substr(0, i))); + line.contextId_ = + TextRecordIndex::make(atol(lineView.substr(j + 1, k - j - 1).data())); + line.score_ = static_cast(atol(lineView.substr(k + 1).data())); +#ifndef NDEBUG + if (lastCId_ > line.contextId_) { + AD_THROW("ContextFile has to be sorted by context Id."); + } + lastCId_ = line.contextId_; +#endif + return line; +} + +// _____________________________________________________________________________ +ad_utility::InputRangeFromGet::Storage DocsFileParser::get() { + string l; + if (!std::getline(getInputStream(), l)) { + return std::nullopt; + } + DocsFileLine line; + size_t i = l.find('\t'); + assert(i != string::npos); + line.docId_ = DocumentIndex::make(atol(l.substr(0, i).c_str())); + line.docContent_ = l.substr(i + 1); + return line; +} diff --git a/src/parser/WordsAndDocsFileParser.h b/src/parser/WordsAndDocsFileParser.h new file mode 100644 index 0000000000..1fc80523ff --- /dev/null +++ b/src/parser/WordsAndDocsFileParser.h @@ -0,0 +1,192 @@ +// Copyright 2015, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Björn Buchhold (buchhold@informatik.uni-freiburg.de) +// Felix Meisen (fesemeisen@outlook.de) + +#pragma once + +#include +#include + +#include +#include + +#include "global/Id.h" +#include "index/StringSortComparator.h" +#include "util/Iterators.h" +#include "util/Views.h" + +using std::string; + +/** + * @brief Represents a line in the words file. + * + * This struct holds information about a word or entity as it appears in the + * words file. + * + * The Fields are ordered in the same way the values follow in a line. + * Short field overview: string word_, bool isEntity, TextRecordIndex contextId, + * Score score_, bool isLiteralEntity (not found in + * wordsfile) + * + * @details + * + * Fields: + * - string word_: The string of the word, if it is an entity it will be + * . 
+ * - bool isEntity_: True if the given word is an entity, false if it's a word. + * - TextRecordIndex contextId_: When creating the wordsfile docs from the + * docsfile get split into so called contexts. + * Those contexts overlap, meaning words and + * entities are covered multiple times. Each + * contextId corresponds to the next bigger or + * equal docId. + * - Score score_: Either 1 or 0 if isEntity is false. 0, 1, 100, 150 if + * isEntity is true. (this info is only constructed on the + * scientists.wordsfile.tsv) The score in the wordsfile is only + * relevant for the counting scoring metric. Because of the + * overlap of contexts the score is 1 if the word really has + * been seen for the first time and 0 if not. If a doc contains + * multiple mentions of a word there should be exactly as many + * wordsfile lines of that word with score 1 as there are + * mentions. The score for entities seems rather random and + * since no clear explanation of the creation of wordsfiles + * has been found yet they will stay rather random. + * - bool isLiteralEntity_: This does not directly stem from the wordsfile. + * When building the text index with literals, for + * every literal there will be WordsFileLines for all + * words in that literal. Additionally the whole + * literal itself will be added as word with isEntity + * being true. The need to count this comes only from + * a trick used in testing right now. To be specific + * the method getTextRecordFromResultTable + */ +struct WordsFileLine { + string word_; + bool isEntity_; + TextRecordIndex contextId_; + Score score_; + bool isLiteralEntity_ = false; +}; + +/** + * @brief Represents a line from the docsfile.tsv. + * + * This struct stores everything given in a line of the docsfile.tsv. + * + * The Fields are ordered in the same way the values follow in a line. 
+ * Short field overview: DocumentIndex docId_, string docContent_ + * + * @details + * + * Fields: + * - DocumentIndex docId_: The docId is needed to build inverted indices for + * scoring and building of the docsDB. It is also used + * to return actual texts when searching for a word. + * The word (and entity) search returns a table with + * TextRecordIndex as type of one column. Those get + * mapped to the next bigger or equal docId which is + * then used to extract the text from the docsDB. + * - string docContent_: The whole text given after the first tab of a line of + * docsfile. + */ +struct DocsFileLine { + DocumentIndex docId_; + string docContent_; +}; + +// Custom delimiter class for tokenization of literals using `absl::StrSplit`. +// The `Find` function returns the next delimiter in `text` after the given +// `pos` or an empty substring if there is no next delimiter. +struct LiteralsTokenizationDelimiter { + absl::string_view Find(absl::string_view text, size_t pos) const { + auto isWordChar = [](char c) -> bool { return std::isalnum(c); }; + auto found = std::find_if_not(text.begin() + pos, text.end(), isWordChar); + if (found == text.end()) return text.substr(text.size()); + return {found, found + 1}; + } +}; + +/** + * @brief A function that can be used to tokenize and normalize a given text. + * @warning Both params are const refs where the original objects have to be + * kept alive during the usage of the returned object. + * @param text The text to be tokenized and normalized. + * @param localeManager The localeManager to be used for normalization. 
+ * @details This function can be used in the following way: + * for (auto normalizedWord : tokenizeAndNormalizeText(text, localeManager)) { + * code; + * } + */ +inline auto tokenizeAndNormalizeText(std::string_view text, + const LocaleManager& localeManager) { + std::vector split{ + absl::StrSplit(text, LiteralsTokenizationDelimiter{}, absl::SkipEmpty{})}; + return ql::views::transform(ad_utility::OwningView{std::move(split)}, + [&localeManager](const auto& str) { + return localeManager.getLowercaseUtf8(str); + }); +} +/** + * @brief This class is the parent class of WordsFileParser and DocsFileParser + * + * @details It exists to reduce code duplication since the only difference + * between the child classes is the line type returned. + */ +class WordsAndDocsFileParser { + public: + explicit WordsAndDocsFileParser(const string& wordsOrDocsFile, + const LocaleManager& localeManager); + explicit WordsAndDocsFileParser(const WordsAndDocsFileParser& other) = delete; + WordsAndDocsFileParser& operator=(const WordsAndDocsFileParser& other) = + delete; + + protected: + std::ifstream& getInputStream() { return in_; } + const LocaleManager& getLocaleManager() const { return localeManager_; } + + private: + std::ifstream in_; + LocaleManager localeManager_; +}; + +/** + * @brief This class takes in the a pathToWordsFile and a localeManager. It then + * can be used to iterate the wordsFile while already normalizing the words + * using the localeManager. 
(If words are entities it doesn't normalize them) + * + * @details An object of this class can be iterated as follows: + * for (auto wordsFileLine : WordsFileParser{wordsFile, localeManager}) { + * code; + * } + * The type of the value returned when iterating is WordsFileLine + */ +class WordsFileParser : public WordsAndDocsFileParser, + public ad_utility::InputRangeFromGet { + public: + using WordsAndDocsFileParser::WordsAndDocsFileParser; + Storage get() override; + +#ifndef NDEBUG + private: + // Only used for sanity checks in debug builds + TextRecordIndex lastCId_ = TextRecordIndex::make(0); +#endif +}; + +/** + * @brief This class takes in the a pathToDocsFile and a localeManager. It then + * can be used to iterate over the docsFile to get the lines. + * + * @details An object of this class can be iterated as follows: + * for (auto docsFileLine : DocsFileParser{docsFile, localeManager}) { + * code; + * } + * The type of the value returned when iterating is DocsFileLine + */ +class DocsFileParser : public WordsAndDocsFileParser, + public ad_utility::InputRangeFromGet { + public: + using WordsAndDocsFileParser::WordsAndDocsFileParser; + Storage get() override; +}; diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index bd375f4826..b9581312e8 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -137,7 +137,7 @@ addLinkAndDiscoverTestSerial(FileTest) addLinkAndDiscoverTest(Simple8bTest) -addLinkAndDiscoverTest(ContextFileParserTest parser) +addLinkAndDiscoverTest(WordsAndDocsFileParserTest parser) addLinkAndDiscoverTest(IndexMetaDataTest index) diff --git a/test/ContextFileParserTest.cpp b/test/ContextFileParserTest.cpp deleted file mode 100644 index 2b27c0f34d..0000000000 --- a/test/ContextFileParserTest.cpp +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright 2015, University of Freiburg, -// Chair of Algorithms and Data Structures. 
-// Author: Björn Buchhold (buchhold@informatik.uni-freiburg.de) - -#include - -#include -#include - -#include "../src/parser/ContextFileParser.h" - -TEST(ContextFileParserTest, getLineTest) { - char* locale = setlocale(LC_CTYPE, ""); - std::cout << "Set locale LC_CTYPE to: " << locale << std::endl; - - std::fstream f("_testtmp.contexts.tsv", std::ios_base::out); - f << "Foo\t0\t0\t2\n" - "foo\t0\t0\t2\n" - "Bär\t1\t0\t1\n" - "Äü\t0\t0\t1\n" - "X\t0\t1\t1\n"; - - f.close(); - ContextFileParser p("_testtmp.contexts.tsv", - LocaleManager("en", "US", false)); - ContextFileParser::Line a; - ASSERT_TRUE(p.getLine(a)); - ASSERT_EQ("foo", a._word); - ASSERT_FALSE(a._isEntity); - ASSERT_EQ(0u, a._contextId.get()); - ASSERT_EQ(2u, a._score); - - ASSERT_TRUE(p.getLine(a)); - ASSERT_EQ("foo", a._word); - ASSERT_FALSE(a._isEntity); - ASSERT_EQ(0u, a._contextId.get()); - ASSERT_EQ(2u, a._score); - - ASSERT_TRUE(p.getLine(a)); - ASSERT_EQ("Bär", a._word); - ASSERT_TRUE(a._isEntity); - ASSERT_EQ(0u, a._contextId.get()); - ASSERT_EQ(1u, a._score); - - ASSERT_TRUE(p.getLine(a)); - ASSERT_EQ("äü", a._word); - ASSERT_FALSE(a._isEntity); - ASSERT_EQ(0u, a._contextId.get()); - ASSERT_EQ(1u, a._score); - - ASSERT_TRUE(p.getLine(a)); - ASSERT_EQ("x", a._word); - ASSERT_FALSE(a._isEntity); - ASSERT_EQ(1u, a._contextId.get()); - ASSERT_EQ(1u, a._score); - - ASSERT_FALSE(p.getLine(a)); - remove("_testtmp.contexts.tsv"); -}; diff --git a/test/WordsAndDocsFileLineCreator.h b/test/WordsAndDocsFileLineCreator.h new file mode 100644 index 0000000000..cb151216fd --- /dev/null +++ b/test/WordsAndDocsFileLineCreator.h @@ -0,0 +1,22 @@ +// Copyright 2024, University of Freiburg, +// Chair of Algorithms and Data Structures. 
+// Author: Felix Meisen (fesemeisen@outlook.de) + +#pragma once + +#include + +constexpr std::string_view inlineSeparator = "\t"; +constexpr std::string_view lineSeparator = "\n"; + +inline std::string createWordsFileLineAsString(std::string_view word, + bool isEntity, size_t contextId, + size_t score) { + return absl::StrCat(word, inlineSeparator, isEntity, inlineSeparator, + contextId, inlineSeparator, score, lineSeparator); +}; + +inline std::string createDocsFileLineAsString(size_t docId, + std::string_view docContent) { + return absl::StrCat(docId, inlineSeparator, docContent, lineSeparator); +}; diff --git a/test/WordsAndDocsFileParserTest.cpp b/test/WordsAndDocsFileParserTest.cpp new file mode 100644 index 0000000000..de7216ada7 --- /dev/null +++ b/test/WordsAndDocsFileParserTest.cpp @@ -0,0 +1,165 @@ +// Copyright 2015, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Björn Buchhold (buchhold@informatik.uni-freiburg.de) + +#include + +#include +#include + +#include "./WordsAndDocsFileLineCreator.h" +#include "parser/WordsAndDocsFileParser.h" + +// All lambdas and type aliases used in this file contained here +namespace { + +/// Type aliases + +// Word, isEntity, contextId, score +using WordLine = std::tuple; +using WordLineVec = std::vector; + +// docId, docContent +using DocLine = std::tuple; +using DocLineVec = std::vector; + +using StringVec = std::vector; + +/// Lambdas + +auto getLocaleManager = []() -> LocaleManager { + return LocaleManager("en", "US", false); +}; + +auto wordsFileLineToWordLine = + [](const WordsFileLine& wordsFileLine) -> WordLine { + return std::make_tuple(wordsFileLine.word_, wordsFileLine.isEntity_, + static_cast(wordsFileLine.contextId_.get()), + static_cast(wordsFileLine.score_)); +}; + +// Lambda that takes in a path to wordsFile to initialize the Parser and an +// expectedResult that is compared against the parsers outputs. 
+auto testWordsFileParser = [](const std::string& wordsFilePath, + const WordLineVec& expectedResult) { + size_t i = 0; + LocaleManager localeManager = getLocaleManager(); + for (auto wordsFileLine : WordsFileParser{wordsFilePath, localeManager}) { + ASSERT_TRUE(i < expectedResult.size()); + WordLine testLine = wordsFileLineToWordLine(wordsFileLine); + + // Not testing the whole tuples against each other to have a cleaner + // indication what exactly caused the assertion to fail + ASSERT_EQ(std::get<0>(testLine), std::get<0>(expectedResult.at(i))); + ASSERT_EQ(std::get<1>(testLine), std::get<1>(expectedResult.at(i))); + ASSERT_EQ(std::get<2>(testLine), std::get<2>(expectedResult.at(i))); + ASSERT_EQ(std::get<3>(testLine), std::get<3>(expectedResult.at(i))); + + ++i; + } + ASSERT_EQ(i, expectedResult.size()); +}; + +auto docsFileLineToDocLine = [](const DocsFileLine& docsFileLine) -> DocLine { + return std::make_tuple(static_cast(docsFileLine.docId_.get()), + docsFileLine.docContent_); +}; + +// Same as testWordsFileParser but for docsFile +auto testDocsFileParser = [](const std::string& docsFilePath, + const DocLineVec& expectedResult) { + size_t i = 0; + LocaleManager localeManager = getLocaleManager(); + for (auto docsFileLine : DocsFileParser{docsFilePath, localeManager}) { + ASSERT_TRUE(i < expectedResult.size()); + DocLine testLine = docsFileLineToDocLine(docsFileLine); + + // Not testing the whole tuples against each other to have a cleaner + // indication what exactly caused the assertion to fail + ASSERT_EQ(std::get<0>(testLine), std::get<0>(expectedResult.at(i))); + ASSERT_EQ(std::get<1>(testLine), std::get<1>(expectedResult.at(i))); + + ++i; + } +}; + +// Passing the testText as copy to make sure it stays alive during the usage of +// tokenizer +auto testTokenizeAndNormalizeText = [](std::string testText, + const StringVec& normalizedTextAsVec) { + size_t i = 0; + LocaleManager localeManager = getLocaleManager(); + for (auto normalizedWord : + 
tokenizeAndNormalizeText(testText, localeManager)) { + ASSERT_TRUE(i < normalizedTextAsVec.size()); + ASSERT_EQ(normalizedWord, normalizedTextAsVec.at(i)); + + ++i; + } + ASSERT_EQ(i, normalizedTextAsVec.size()); +}; + +} // namespace + +TEST(WordsAndDocsFileParserTest, wordsFileParserTest) { + char* locale = setlocale(LC_CTYPE, ""); + std::cout << "Set locale LC_CTYPE to: " << locale << std::endl; + + std::fstream f("_testtmp.contexts.tsv", std::ios_base::out); + f << createWordsFileLineAsString("Foo", false, 0, 2) + << createWordsFileLineAsString("foo", false, 0, 2) + << createWordsFileLineAsString("Bär", true, 0, 1) + << createWordsFileLineAsString("Äü", false, 0, 1) + << createWordsFileLineAsString("X", false, 1, 1); + f.close(); + + WordLineVec expected = {{"foo", false, 0, 2}, + {"foo", false, 0, 2}, + {"Bär", true, 0, 1}, + {"äü", false, 0, 1}, + {"x", false, 1, 1}}; + + testWordsFileParser("_testtmp.contexts.tsv", expected); + remove("_testtmp.contexts.tsv"); +}; + +TEST(WordsAndDocsFileParser, docsFileParserTest) { + char* locale = setlocale(LC_CTYPE, ""); + std::cout << "Set locale LC_CTYPE to: " << locale << std::endl; + + std::fstream f("_testtmp.documents.tsv", std::ios_base::out); + f << createDocsFileLineAsString(4, "This TeSt is OnlyCharcters") + << createDocsFileLineAsString(7, "Wh4t h4pp3ns t0 num83rs") + << createDocsFileLineAsString(8, "An( sp@ci*l ch.ar,:act=_er+s") + << createDocsFileLineAsString(190293, "Large docId"); + f.close(); + + DocLineVec expected = {{4, "This TeSt is OnlyCharcters"}, + {7, "Wh4t h4pp3ns t0 num83rs"}, + {8, "An( sp@ci*l ch.ar,:act=_er+s"}, + {190293, "Large docId"}}; + + testDocsFileParser("_testtmp.documents.tsv", expected); + remove("_testtmp.documents.tsv"); +} + +TEST(TokenizeAndNormalizeText, tokenizeAndNormalizeTextTest) { + char* locale = setlocale(LC_CTYPE, ""); + std::cout << "Set locale LC_CTYPE to: " << locale << std::endl; + + // Test 1 + testTokenizeAndNormalizeText("already normalized text", + 
{"already", "normalized", "text"}); + + // Test 2 + testTokenizeAndNormalizeText("TeXt WITH UpperCASe", + {"text", "with", "uppercase"}); + + // Test 3 + testTokenizeAndNormalizeText("41ph4num3r1c t3xt", {"41ph4num3r1c", "t3xt"}); + + // Test 4 + testTokenizeAndNormalizeText( + "test\twith\ndifferent,separators.here ,.\t", + {"test", "with", "different", "separators", "here"}); +} diff --git a/test/engine/TextIndexScanForWordTest.cpp b/test/engine/TextIndexScanForWordTest.cpp index eac3cb0d2f..cc9b685ec8 100644 --- a/test/engine/TextIndexScanForWordTest.cpp +++ b/test/engine/TextIndexScanForWordTest.cpp @@ -5,6 +5,7 @@ #include #include +#include "../WordsAndDocsFileLineCreator.h" #include "../printers/VariablePrinters.h" #include "../util/GTestHelpers.h" #include "../util/IdTableHelpers.h" @@ -26,45 +27,45 @@ std::string kg = ". . . ."; std::string wordsFileContent = - h::createWordsFileLine("astronomer", false, 1, 1) + - h::createWordsFileLine("", true, 1, 0) + - h::createWordsFileLine("scientist", false, 1, 1) + - h::createWordsFileLine("field", false, 1, 1) + - h::createWordsFileLine("astronomy", false, 1, 1) + - h::createWordsFileLine("astronomer", false, 2, 0) + - h::createWordsFileLine("", true, 2, 0) + - h::createWordsFileLine(":s:firstsentence", false, 2, 0) + - h::createWordsFileLine("scientist", false, 2, 0) + - h::createWordsFileLine("field", false, 2, 0) + - h::createWordsFileLine("astronomy", false, 2, 0) + - h::createWordsFileLine("astronomy", false, 3, 1) + - h::createWordsFileLine("concentrates", false, 3, 1) + - h::createWordsFileLine("studies", false, 3, 1) + - h::createWordsFileLine("specific", false, 3, 1) + - h::createWordsFileLine("question", false, 3, 1) + - h::createWordsFileLine("outside", false, 3, 1) + - h::createWordsFileLine("scope", false, 3, 1) + - h::createWordsFileLine("earth", false, 3, 1) + - h::createWordsFileLine("astronomy", false, 4, 1) + - h::createWordsFileLine("concentrates", false, 4, 1) + - 
h::createWordsFileLine("studies", false, 4, 1) + - h::createWordsFileLine("field", false, 4, 1) + - h::createWordsFileLine("outside", false, 4, 1) + - h::createWordsFileLine("scope", false, 4, 1) + - h::createWordsFileLine("earth", false, 4, 1) + - h::createWordsFileLine("tester", false, 5, 1) + - h::createWordsFileLine("rockets", false, 5, 1) + - h::createWordsFileLine("astronomer", false, 5, 1) + - h::createWordsFileLine("", true, 5, 0) + - h::createWordsFileLine("although", false, 5, 1) + - h::createWordsFileLine("astronomer", false, 6, 0) + - h::createWordsFileLine("", true, 6, 0) + - h::createWordsFileLine("although", false, 6, 0) + - h::createWordsFileLine("", true, 6, 0) + - h::createWordsFileLine("space", false, 6, 1) + - h::createWordsFileLine("", true, 7, 0) + - h::createWordsFileLine("space", false, 7, 0) + - h::createWordsFileLine("earth", false, 7, 1); + createWordsFileLineAsString("astronomer", false, 1, 1) + + createWordsFileLineAsString("", true, 1, 0) + + createWordsFileLineAsString("scientist", false, 1, 1) + + createWordsFileLineAsString("field", false, 1, 1) + + createWordsFileLineAsString("astronomy", false, 1, 1) + + createWordsFileLineAsString("astronomer", false, 2, 0) + + createWordsFileLineAsString("", true, 2, 0) + + createWordsFileLineAsString(":s:firstsentence", false, 2, 0) + + createWordsFileLineAsString("scientist", false, 2, 0) + + createWordsFileLineAsString("field", false, 2, 0) + + createWordsFileLineAsString("astronomy", false, 2, 0) + + createWordsFileLineAsString("astronomy", false, 3, 1) + + createWordsFileLineAsString("concentrates", false, 3, 1) + + createWordsFileLineAsString("studies", false, 3, 1) + + createWordsFileLineAsString("specific", false, 3, 1) + + createWordsFileLineAsString("question", false, 3, 1) + + createWordsFileLineAsString("outside", false, 3, 1) + + createWordsFileLineAsString("scope", false, 3, 1) + + createWordsFileLineAsString("earth", false, 3, 1) + + createWordsFileLineAsString("astronomy", false, 
4, 1) + + createWordsFileLineAsString("concentrates", false, 4, 1) + + createWordsFileLineAsString("studies", false, 4, 1) + + createWordsFileLineAsString("field", false, 4, 1) + + createWordsFileLineAsString("outside", false, 4, 1) + + createWordsFileLineAsString("scope", false, 4, 1) + + createWordsFileLineAsString("earth", false, 4, 1) + + createWordsFileLineAsString("tester", false, 5, 1) + + createWordsFileLineAsString("rockets", false, 5, 1) + + createWordsFileLineAsString("astronomer", false, 5, 1) + + createWordsFileLineAsString("", true, 5, 0) + + createWordsFileLineAsString("although", false, 5, 1) + + createWordsFileLineAsString("astronomer", false, 6, 0) + + createWordsFileLineAsString("", true, 6, 0) + + createWordsFileLineAsString("although", false, 6, 0) + + createWordsFileLineAsString("", true, 6, 0) + + createWordsFileLineAsString("space", false, 6, 1) + + createWordsFileLineAsString("", true, 7, 0) + + createWordsFileLineAsString("space", false, 7, 0) + + createWordsFileLineAsString("earth", false, 7, 1); std::string firstDocText = "An astronomer is a scientist in the field of " @@ -77,8 +78,8 @@ std::string secondDocText = "too although they might not be in space but on " "earth."; -std::string docsFileContent = h::createDocsFileLine(4, firstDocText) + - h::createDocsFileLine(7, secondDocText); +std::string docsFileContent = createDocsFileLineAsString(4, firstDocText) + + createDocsFileLineAsString(7, secondDocText); std::pair contentsOfWordsFileAndDocsFile = { wordsFileContent, docsFileContent}; diff --git a/test/engine/TextIndexScanTestHelpers.h b/test/engine/TextIndexScanTestHelpers.h index 83a72ddea4..6ba1b8c6de 100644 --- a/test/engine/TextIndexScanTestHelpers.h +++ b/test/engine/TextIndexScanTestHelpers.h @@ -66,18 +66,4 @@ inline string combineToString(const string& text, const string& word) { ss << "Text: " << text << ", Word: " << word << std::endl; return ss.str(); } - -inline std::string inlineSeparator = "\t"; -inline std::string 
lineSeparator = "\n"; - -inline string createWordsFileLine(std::string word, bool isEntity, - size_t contextId, size_t score) { - return word + inlineSeparator + (isEntity ? "1" : "0") + inlineSeparator + - std::to_string(contextId) + inlineSeparator + std::to_string(score) + - lineSeparator; -}; - -inline string createDocsFileLine(size_t docId, std::string docContent) { - return std::to_string(docId) + inlineSeparator + docContent + lineSeparator; -}; } // namespace textIndexScanTestHelpers