Better parsing for the words- and docsfile #1695

Open · wants to merge 2 commits into master
1 change: 1 addition & 0 deletions src/global/IndexTypes.h
@@ -16,3 +16,4 @@ using LocalVocabIndex = const LocalVocabEntry*;
using TextRecordIndex = ad_utility::TypedIndex<uint64_t, "TextRecordIndex">;
using WordVocabIndex = ad_utility::TypedIndex<uint64_t, "WordVocabIndex">;
using BlankNodeIndex = ad_utility::TypedIndex<uint64_t, "BlankNodeIndex">;
using DocumentIndex = ad_utility::TypedIndex<uint64_t, "DocumentIndex">;
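Note (not part of the diff): the new alias gives docsfile ids their own strong type. A minimal sketch of the effect, using only the `make`/`get` interface of `ad_utility::TypedIndex` that already appears elsewhere in this PR:

```cpp
#include <cstdint>

#include "global/IndexTypes.h"

// Hypothetical sketch: DocumentIndex and TextRecordIndex both wrap a uint64_t,
// but the distinct tag strings make them distinct types, so document ids and
// text-record ids cannot be assigned to each other by accident.
void documentIndexSketch() {
  DocumentIndex docId = DocumentIndex::make(42);
  TextRecordIndex contextId = TextRecordIndex::make(42);
  // contextId = docId;  // would not compile: different TypedIndex instantiations
  uint64_t raw = docId.get();  // explicit access to the underlying value
  (void)raw;
  (void)contextId;
}
```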
61 changes: 34 additions & 27 deletions src/index/IndexImpl.Text.cpp
@@ -17,7 +17,7 @@
#include "backports/algorithm.h"
#include "engine/CallFixedSize.h"
#include "index/FTSAlgorithms.h"
#include "parser/ContextFileParser.h"
#include "parser/WordsAndDocsFileParser.h"
#include "util/Conversions.h"
#include "util/Simple8bCode.h"

@@ -35,21 +35,30 @@
}
};

cppcoro::generator<std::string> tokenizeAndNormalizeTextLine(
std::string_view lineView, LocaleManager localeManager) {
// Currently it is not possible to use std::views or std::ranges with the
// splitter object returned by absl::StrSplit. Every solution I have seen
// will remove the lazy nature of StrSplit and views/ranges. (2024-12-28)
for (auto word : absl::StrSplit(lineView, LiteralsTokenizationDelimiter{},
absl::SkipEmpty{})) {
co_yield localeManager.getLowercaseUtf8(word);
}
}
} // namespace

// _____________________________________________________________________________
cppcoro::generator<ContextFileParser::Line> IndexImpl::wordsInTextRecords(
cppcoro::generator<WordsFileLine> IndexImpl::wordsInTextRecords(
const std::string& contextFile, bool addWordsFromLiterals) {
auto localeManager = textVocab_.getLocaleManager();
// ROUND 1: If context file aka wordsfile is not empty, read words from there.
// Remember the last context id for the (optional) second round.
TextRecordIndex contextId = TextRecordIndex::make(0);
if (!contextFile.empty()) {
ContextFileParser::Line line;
ContextFileParser p(contextFile, localeManager);
WordsFileParser p(contextFile, localeManager);
ad_utility::HashSet<string> items;
while (p.getLine(line)) {
contextId = line._contextId;
for (auto line : p) {
contextId = line.contextId_;
co_yield line;
}
if (contextId > TextRecordIndex::make(0)) {
Expand All @@ -65,15 +74,13 @@
if (!isLiteral(text)) {
continue;
}
ContextFileParser::Line entityLine{text, true, contextId, 1, true};
WordsFileLine entityLine{text, true, contextId, 1, true};
co_yield entityLine;
std::string_view textView = text;
textView = textView.substr(0, textView.rfind('"'));
textView.remove_prefix(1);
for (auto word : absl::StrSplit(textView, LiteralsTokenizationDelimiter{},
absl::SkipEmpty{})) {
auto wordNormalized = localeManager.getLowercaseUtf8(word);
ContextFileParser::Line wordLine{wordNormalized, false, contextId, 1};
for (auto word : tokenizeAndNormalizeTextLine(textView, localeManager)) {
WordsFileLine wordLine{word, false, contextId, 1};
co_yield wordLine;
}
contextId = contextId.incremented();
@@ -214,12 +221,12 @@
for (auto line : wordsInTextRecords(contextFile, addWordsFromLiterals)) {
++numLines;
// LOG(INFO) << "LINE: "
// << std::setw(50) << line._word << " "
// << line._isEntity << "\t"
// << line._contextId.get() << "\t"
// << line._score << std::endl;
if (!line._isEntity) {
distinctWords.insert(line._word);
// << std::setw(50) << line.word_ << " "
// << line.isEntity_ << "\t"
// << line.contextId_.get() << "\t"
// << line.score_ << std::endl;
if (!line.isEntity_) {
distinctWords.insert(line.word_);
}
}
textVocab_.createFromSet(distinctWords, onDiskBase_ + ".text.vocabulary");
@@ -243,29 +250,29 @@
size_t nofLiterals = 0;

for (auto line : wordsInTextRecords(contextFile, addWordsFromLiterals)) {
if (line._contextId != currentContext) {
if (line.contextId_ != currentContext) {
++nofContexts;
addContextToVector(writer, currentContext, wordsInContext,
entitiesInContext);
currentContext = line._contextId;
currentContext = line.contextId_;
wordsInContext.clear();
entitiesInContext.clear();
}
if (line._isEntity) {
if (line.isEntity_) {
++nofEntityPostings;
// TODO<joka921> Currently only IRIs and strings from the vocabulary can
// be tagged entities in the text index (no doubles, ints, etc).
VocabIndex eid;
if (getVocab().getId(line._word, &eid)) {
if (getVocab().getId(line.word_, &eid)) {
// Note that `entitiesInContext` is a HashMap, so the `Id`s don't have
// to be contiguous.
entitiesInContext[Id::makeFromVocabIndex(eid)] += line._score;
if (line._isLiteralEntity) {
entitiesInContext[Id::makeFromVocabIndex(eid)] += line.score_;
if (line.isLiteralEntity_) {
++nofLiterals;
}
} else {
if (entityNotFoundErrorMsgCount < 20) {
LOG(WARN) << "Entity from text not in KB: " << line._word << '\n';
LOG(WARN) << "Entity from text not in KB: " << line.word_ << '\n';
if (++entityNotFoundErrorMsgCount == 20) {
LOG(WARN) << "There are more entities not in the KB..."
<< " suppressing further warnings...\n";
@@ -278,14 +285,14 @@
++nofWordPostings;
// TODO<joka921> Let the `textVocab_` return a `WordIndex` directly.
WordVocabIndex vid;
bool ret = textVocab_.getId(line._word, &vid);
bool ret = textVocab_.getId(line.word_, &vid);
WordIndex wid = vid.get();
if (!ret) {
LOG(ERROR) << "ERROR: word \"" << line._word << "\" "
LOG(ERROR) << "ERROR: word \"" << line.word_ << "\" "
<< "not found in textVocab. Terminating\n";
AD_FAIL();
}
wordsInContext[wid] += line._score;
wordsInContext[wid] += line.score_;
}
}
if (entityNotFoundErrorMsgCount > 0) {
4 changes: 2 additions & 2 deletions src/index/IndexImpl.h
@@ -29,9 +29,9 @@
#include "index/TextMetaData.h"
#include "index/Vocabulary.h"
#include "index/VocabularyMerger.h"
#include "parser/ContextFileParser.h"
#include "parser/RdfParser.h"
#include "parser/TripleComponent.h"
#include "parser/WordsAndDocsFileParser.h"
#include "util/BufferedVector.h"
#include "util/CancellationHandle.h"
#include "util/File.h"
@@ -515,7 +515,7 @@ class IndexImpl {
// TODO: So far, this is limited to the internal vocabulary (still in the
// testing phase, once it works, it should be easy to include the IRIs and
// literals from the external vocabulary as well).
cppcoro::generator<ContextFileParser::Line> wordsInTextRecords(
cppcoro::generator<WordsFileLine> wordsInTextRecords(
const std::string& contextFile, bool addWordsFromLiterals);

size_t processWordsForVocabulary(const string& contextFile,
2 changes: 1 addition & 1 deletion src/parser/CMakeLists.txt
@@ -10,7 +10,7 @@ add_library(parser
ParsedQuery.cpp
RdfParser.cpp
Tokenizer.cpp
ContextFileParser.cpp
WordsAndDocsFileParser.cpp
TurtleTokenId.h
ParallelBuffer.cpp
SparqlParserHelpers.cpp
46 changes: 0 additions & 46 deletions src/parser/ContextFileParser.cpp

This file was deleted.

45 changes: 0 additions & 45 deletions src/parser/ContextFileParser.h

This file was deleted.

61 changes: 61 additions & 0 deletions src/parser/WordsAndDocsFileParser.cpp
@@ -0,0 +1,61 @@
// Copyright 2015, University of Freiburg,
// Chair of Algorithms and Data Structures.
// Author: Björn Buchhold ([email protected])

#include "parser/WordsAndDocsFileParser.h"

#include <cassert>

#include "../util/Exception.h"
#include "../util/StringUtils.h"

// _____________________________________________________________________________
WordsAndDocsFileParser::WordsAndDocsFileParser(const string& wordsOrDocsFile,
LocaleManager localeManager)
: in_(wordsOrDocsFile), localeManager_(std::move(localeManager)) {}

// _____________________________________________________________________________
WordsAndDocsFileParser::~WordsAndDocsFileParser() { in_.close(); }

// _____________________________________________________________________________
ad_utility::InputRangeFromGet<WordsFileLine>::Storage WordsFileParser::get() {
WordsFileLine line;
string l;
if (std::getline(in_, l)) {
size_t i = l.find('\t');
assert(i != string::npos);
size_t j = i + 2;
assert(j + 3 < l.size());
size_t k = l.find('\t', j + 2);
assert(k != string::npos);
line.isEntity_ = (l[i + 1] == '1');
line.word_ =
(line.isEntity_ ? l.substr(0, i)
: localeManager_.getLowercaseUtf8(l.substr(0, i)));
line.contextId_ =
TextRecordIndex::make(atol(l.substr(j + 1, k - j - 1).c_str()));
line.score_ = static_cast<Score>(atol(l.substr(k + 1).c_str()));
#ifndef NDEBUG
if (lastCId_ > line.contextId_) {
AD_THROW("ContextFile has to be sorted by context Id.");
}
lastCId_ = line.contextId_;
#endif
return line;
}
return std::nullopt;
}

// _____________________________________________________________________________
ad_utility::InputRangeFromGet<DocsFileLine>::Storage DocsFileParser::get() {
DocsFileLine line;
string l;
if (std::getline(in_, l)) {
size_t i = l.find('\t');
assert(i != string::npos);
line.docId_ = DocumentIndex::make(atol(l.substr(0, i).c_str()));
line.docContent_ = l.substr(i + 1);
return line;
}
return std::nullopt;
}
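For reference (not part of the diff): the line formats that the two `get()` implementations above expect, as implied by the parsing code. The concrete values are invented for illustration, and the wordsfile must be sorted by context id (see the `AD_THROW` above):

```
# wordsfile: word-or-entity <TAB> isEntity (0|1) <TAB> contextId <TAB> score
astronomer	0	1	1
<Johannes_Kepler>	1	1	1

# docsfile: docId <TAB> document text
17	Johannes Kepler was a German astronomer.
```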

63 changes: 63 additions & 0 deletions src/parser/WordsAndDocsFileParser.h
@@ -0,0 +1,63 @@
// Copyright 2015, University of Freiburg,
// Chair of Algorithms and Data Structures.
// Author: Björn Buchhold ([email protected])

#pragma once

#include <unicode/locid.h>

#include <fstream>
#include <string>

#include "global/Id.h"
#include "index/StringSortComparator.h"
#include "util/Iterators.h"

using std::string;

struct WordsFileLine {
string word_;
bool isEntity_;
TextRecordIndex contextId_;
Score score_;
bool isLiteralEntity_ = false;
};

struct DocsFileLine {
string docContent_;
DocumentIndex docId_;
};

class WordsAndDocsFileParser {
public:
explicit WordsAndDocsFileParser(const string& wordsOrDocsFile,
LocaleManager localeManager);
~WordsAndDocsFileParser();
explicit WordsAndDocsFileParser(const WordsAndDocsFileParser& other) = delete;
WordsAndDocsFileParser& operator=(const WordsAndDocsFileParser& other) =
delete;

protected:
std::ifstream in_;
LocaleManager localeManager_;
};

class WordsFileParser : public WordsAndDocsFileParser,
public ad_utility::InputRangeFromGet<WordsFileLine> {
public:
using WordsAndDocsFileParser::WordsAndDocsFileParser;
Storage get() override;

private:
#ifndef NDEBUG
// Only used for sanity checks in debug builds
TextRecordIndex lastCId_ = TextRecordIndex::make(0);
#endif
};

class DocsFileParser : public WordsAndDocsFileParser,
public ad_utility::InputRangeFromGet<DocsFileLine> {
public:
using WordsAndDocsFileParser::WordsAndDocsFileParser;
Storage get() override;
};
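For context, a minimal usage sketch (not part of the diff): both parsers are input ranges via `ad_utility::InputRangeFromGet`, so a plain range-`for` replaces the old `ContextFileParser::getLine` loop. The file names are placeholders, and the `LocaleManager` would normally come from the text vocabulary, as in `IndexImpl::wordsInTextRecords`:

```cpp
#include "parser/WordsAndDocsFileParser.h"

// Hypothetical sketch: iterate a wordsfile and a docsfile with the new parsers.
void exampleUsage(const LocaleManager& localeManager) {
  WordsFileParser wordsParser("example.wordsfile.tsv", localeManager);
  for (auto line : wordsParser) {
    // Fields: line.word_, line.isEntity_, line.contextId_, line.score_.
    (void)line;
  }

  DocsFileParser docsParser("example.docsfile.tsv", localeManager);
  for (auto line : docsParser) {
    // Fields: line.docId_, line.docContent_.
    (void)line;
  }
}
```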
2 changes: 1 addition & 1 deletion test/CMakeLists.txt
@@ -137,7 +137,7 @@ addLinkAndDiscoverTestSerial(FileTest)

addLinkAndDiscoverTest(Simple8bTest)

addLinkAndDiscoverTest(ContextFileParserTest parser)
addLinkAndDiscoverTest(WordsAndDocsFileParserTest parser)

addLinkAndDiscoverTest(IndexMetaDataTest index)
