Better parsing for the words- and docsfile #1695

Open · wants to merge 2 commits into master
1 change: 1 addition & 0 deletions src/global/IndexTypes.h
@@ -16,3 +16,4 @@ using LocalVocabIndex = const LocalVocabEntry*;
using TextRecordIndex = ad_utility::TypedIndex<uint64_t, "TextRecordIndex">;
using WordVocabIndex = ad_utility::TypedIndex<uint64_t, "WordVocabIndex">;
using BlankNodeIndex = ad_utility::TypedIndex<uint64_t, "BlankNodeIndex">;
using DocumentIndex = ad_utility::TypedIndex<uint64_t, "DocumentIndex">;
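Note (not part of the diff): the new alias gives docsfile ids their own strong type. A minimal sketch of the effect, using only the `make`/`get` interface of `ad_utility::TypedIndex` that already appears elsewhere in this PR:

```cpp
#include <cstdint>

#include "global/IndexTypes.h"

// Hypothetical sketch: DocumentIndex and TextRecordIndex both wrap a uint64_t,
// but the distinct tag strings make them distinct types, so document ids and
// text-record ids cannot be assigned to each other by accident.
void documentIndexSketch() {
  DocumentIndex docId = DocumentIndex::make(42);
  TextRecordIndex contextId = TextRecordIndex::make(42);
  // contextId = docId;  // would not compile: different TypedIndex instantiations
  uint64_t raw = docId.get();  // explicit access to the underlying value
  (void)raw;
  (void)contextId;
}
```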
61 changes: 34 additions & 27 deletions src/index/IndexImpl.Text.cpp
@@ -17,7 +17,7 @@
#include "backports/algorithm.h"
#include "engine/CallFixedSize.h"
#include "index/FTSAlgorithms.h"
#include "parser/ContextFileParser.h"
#include "parser/WordsAndDocsFileParser.h"
#include "util/Conversions.h"
#include "util/Simple8bCode.h"

@@ -35,21 +35,30 @@
}
};

cppcoro::generator<std::string> tokenizeAndNormalizeTextLine(
std::string_view lineView, LocaleManager localeManager) {
// Currently it is not possible to use std::views or std::ranges with the
// splitter object returned by absl::StrSplit. Every solution I have seen
// will remove the lazy nature of StrSplit and views/ranges. (2024-12-28)
for (auto word : absl::StrSplit(lineView, LiteralsTokenizationDelimiter{},
absl::SkipEmpty{})) {
co_yield localeManager.getLowercaseUtf8(word);
}
}
} // namespace

// _____________________________________________________________________________
cppcoro::generator<ContextFileParser::Line> IndexImpl::wordsInTextRecords(
cppcoro::generator<WordsFileLine> IndexImpl::wordsInTextRecords(
const std::string& contextFile, bool addWordsFromLiterals) {
auto localeManager = textVocab_.getLocaleManager();
// ROUND 1: If context file aka wordsfile is not empty, read words from there.
// Remember the last context id for the (optional) second round.
TextRecordIndex contextId = TextRecordIndex::make(0);
if (!contextFile.empty()) {
ContextFileParser::Line line;
ContextFileParser p(contextFile, localeManager);
WordsFileParser p(contextFile, localeManager);
ad_utility::HashSet<string> items;
while (p.getLine(line)) {
contextId = line._contextId;
for (auto line : p) {
contextId = line.contextId_;
co_yield line;
}
if (contextId > TextRecordIndex::make(0)) {
Expand All @@ -65,15 +74,13 @@
if (!isLiteral(text)) {
continue;
}
ContextFileParser::Line entityLine{text, true, contextId, 1, true};
WordsFileLine entityLine{text, true, contextId, 1, true};
co_yield entityLine;
std::string_view textView = text;
textView = textView.substr(0, textView.rfind('"'));
textView.remove_prefix(1);
for (auto word : absl::StrSplit(textView, LiteralsTokenizationDelimiter{},
absl::SkipEmpty{})) {
auto wordNormalized = localeManager.getLowercaseUtf8(word);
ContextFileParser::Line wordLine{wordNormalized, false, contextId, 1};
for (auto word : tokenizeAndNormalizeTextLine(textView, localeManager)) {
WordsFileLine wordLine{word, false, contextId, 1};
co_yield wordLine;
}
contextId = contextId.incremented();
@@ -214,12 +221,12 @@
for (auto line : wordsInTextRecords(contextFile, addWordsFromLiterals)) {
++numLines;
// LOG(INFO) << "LINE: "
// << std::setw(50) << line._word << " "
// << line._isEntity << "\t"
// << line._contextId.get() << "\t"
// << line._score << std::endl;
if (!line._isEntity) {
distinctWords.insert(line._word);
// << std::setw(50) << line.word_ << " "
// << line.isEntity_ << "\t"
// << line.contextId_.get() << "\t"
// << line.score_ << std::endl;
if (!line.isEntity_) {
distinctWords.insert(line.word_);
}
}
textVocab_.createFromSet(distinctWords, onDiskBase_ + ".text.vocabulary");
@@ -243,29 +250,29 @@
size_t nofLiterals = 0;

for (auto line : wordsInTextRecords(contextFile, addWordsFromLiterals)) {
if (line._contextId != currentContext) {
if (line.contextId_ != currentContext) {
++nofContexts;
addContextToVector(writer, currentContext, wordsInContext,
entitiesInContext);
currentContext = line._contextId;
currentContext = line.contextId_;
wordsInContext.clear();
entitiesInContext.clear();
}
if (line._isEntity) {
if (line.isEntity_) {
++nofEntityPostings;
// TODO<joka921> Currently only IRIs and strings from the vocabulary can
// be tagged entities in the text index (no doubles, ints, etc).
VocabIndex eid;
if (getVocab().getId(line._word, &eid)) {
if (getVocab().getId(line.word_, &eid)) {
// Note that `entitiesInContext` is a HashMap, so the `Id`s don't have
// to be contiguous.
entitiesInContext[Id::makeFromVocabIndex(eid)] += line._score;
if (line._isLiteralEntity) {
entitiesInContext[Id::makeFromVocabIndex(eid)] += line.score_;
if (line.isLiteralEntity_) {
++nofLiterals;
}
} else {
if (entityNotFoundErrorMsgCount < 20) {
LOG(WARN) << "Entity from text not in KB: " << line._word << '\n';
LOG(WARN) << "Entity from text not in KB: " << line.word_ << '\n';
if (++entityNotFoundErrorMsgCount == 20) {
LOG(WARN) << "There are more entities not in the KB..."
<< " suppressing further warnings...\n";
@@ -278,14 +285,14 @@
++nofWordPostings;
// TODO<joka921> Let the `textVocab_` return a `WordIndex` directly.
WordVocabIndex vid;
bool ret = textVocab_.getId(line._word, &vid);
bool ret = textVocab_.getId(line.word_, &vid);
WordIndex wid = vid.get();
if (!ret) {
LOG(ERROR) << "ERROR: word \"" << line._word << "\" "
LOG(ERROR) << "ERROR: word \"" << line.word_ << "\" "
<< "not found in textVocab. Terminating\n";
AD_FAIL();
}
wordsInContext[wid] += line._score;
wordsInContext[wid] += line.score_;
}
}
if (entityNotFoundErrorMsgCount > 0) {
4 changes: 2 additions & 2 deletions src/index/IndexImpl.h
@@ -29,9 +29,9 @@
#include "index/TextMetaData.h"
#include "index/Vocabulary.h"
#include "index/VocabularyMerger.h"
#include "parser/ContextFileParser.h"
#include "parser/RdfParser.h"
#include "parser/TripleComponent.h"
#include "parser/WordsAndDocsFileParser.h"
#include "util/BufferedVector.h"
#include "util/CancellationHandle.h"
#include "util/File.h"
@@ -515,7 +515,7 @@ class IndexImpl {
// TODO: So far, this is limited to the internal vocabulary (still in the
// testing phase, once it works, it should be easy to include the IRIs and
// literals from the external vocabulary as well).
cppcoro::generator<ContextFileParser::Line> wordsInTextRecords(
cppcoro::generator<WordsFileLine> wordsInTextRecords(
const std::string& contextFile, bool addWordsFromLiterals);

size_t processWordsForVocabulary(const string& contextFile,
2 changes: 1 addition & 1 deletion src/parser/CMakeLists.txt
@@ -10,7 +10,7 @@ add_library(parser
ParsedQuery.cpp
RdfParser.cpp
Tokenizer.cpp
ContextFileParser.cpp
WordsAndDocsFileParser.cpp
TurtleTokenId.h
ParallelBuffer.cpp
SparqlParserHelpers.cpp
46 changes: 0 additions & 46 deletions src/parser/ContextFileParser.cpp

This file was deleted.

45 changes: 0 additions & 45 deletions src/parser/ContextFileParser.h

This file was deleted.

61 changes: 61 additions & 0 deletions src/parser/WordsAndDocsFileParser.cpp
@@ -0,0 +1,61 @@
// Copyright 2015, University of Freiburg,
// Chair of Algorithms and Data Structures.
// Author: Björn Buchhold ([email protected])

#include "parser/WordsAndDocsFileParser.h"

#include <cassert>

#include "../util/Exception.h"
#include "../util/StringUtils.h"

// _____________________________________________________________________________
WordsAndDocsFileParser::WordsAndDocsFileParser(const string& wordsOrDocsFile,
LocaleManager localeManager)
: in_(wordsOrDocsFile), localeManager_(std::move(localeManager)) {}

// _____________________________________________________________________________
WordsAndDocsFileParser::~WordsAndDocsFileParser() { in_.close(); }

// _____________________________________________________________________________
ad_utility::InputRangeFromGet<WordsFileLine>::Storage WordsFileParser::get() {
WordsFileLine line;
string l;
if (std::getline(in_, l)) {
size_t i = l.find('\t');
assert(i != string::npos);
size_t j = i + 2;
assert(j + 3 < l.size());
size_t k = l.find('\t', j + 2);
assert(k != string::npos);
line.isEntity_ = (l[i + 1] == '1');
line.word_ =
(line.isEntity_ ? l.substr(0, i)
: localeManager_.getLowercaseUtf8(l.substr(0, i)));
line.contextId_ =
TextRecordIndex::make(atol(l.substr(j + 1, k - j - 1).c_str()));
line.score_ = static_cast<Score>(atol(l.substr(k + 1).c_str()));
#ifndef NDEBUG
if (lastCId_ > line.contextId_) {
AD_THROW("ContextFile has to be sorted by context Id.");
}
lastCId_ = line.contextId_;
#endif
return line;
}
return std::nullopt;
}

// _____________________________________________________________________________
ad_utility::InputRangeFromGet<DocsFileLine>::Storage DocsFileParser::get() {
DocsFileLine line;
string l;
if (std::getline(in_, l)) {
size_t i = l.find('\t');
assert(i != string::npos);
line.docId_ = DocumentIndex::make(atol(l.substr(0, i).c_str()));
line.docContent_ = l.substr(i + 1);
return line;
}
return std::nullopt;
}
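For reference (not part of the diff): the line formats that the two `get()` implementations above expect, as implied by the parsing code. The concrete values are invented for illustration, and the wordsfile must be sorted by context id (see the `AD_THROW` above):

```
# wordsfile: word-or-entity <TAB> isEntity (0|1) <TAB> contextId <TAB> score
astronomer	0	1	1
<Johannes_Kepler>	1	1	1

# docsfile: docId <TAB> document text
17	Johannes Kepler was a German astronomer.
```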

63 changes: 63 additions & 0 deletions src/parser/WordsAndDocsFileParser.h
@@ -0,0 +1,63 @@
// Copyright 2015, University of Freiburg,
// Chair of Algorithms and Data Structures.
// Author: Björn Buchhold ([email protected])

#pragma once

#include <unicode/locid.h>

#include <fstream>
#include <string>

#include "global/Id.h"
#include "index/StringSortComparator.h"
#include "util/Iterators.h"

using std::string;

struct WordsFileLine {
string word_;
bool isEntity_;
TextRecordIndex contextId_;
Score score_;
bool isLiteralEntity_ = false;
};

struct DocsFileLine {
string docContent_;
DocumentIndex docId_;
};

class WordsAndDocsFileParser {
public:
explicit WordsAndDocsFileParser(const string& wordsOrDocsFile,
LocaleManager localeManager);
~WordsAndDocsFileParser();
explicit WordsAndDocsFileParser(const WordsAndDocsFileParser& other) = delete;
WordsAndDocsFileParser& operator=(const WordsAndDocsFileParser& other) =
delete;

protected:
std::ifstream in_;
LocaleManager localeManager_;
};

class WordsFileParser : public WordsAndDocsFileParser,
public ad_utility::InputRangeFromGet<WordsFileLine> {
public:
using WordsAndDocsFileParser::WordsAndDocsFileParser;
Storage get() override;

private:
#ifndef NDEBUG
// Only used for sanity checks in debug builds
TextRecordIndex lastCId_ = TextRecordIndex::make(0);
#endif
};

class DocsFileParser : public WordsAndDocsFileParser,
public ad_utility::InputRangeFromGet<DocsFileLine> {
public:
using WordsAndDocsFileParser::WordsAndDocsFileParser;
Storage get() override;
};
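For context, a minimal usage sketch (not part of the diff): both parsers are input ranges via `ad_utility::InputRangeFromGet`, so a plain range-`for` replaces the old `ContextFileParser::getLine` loop. The file names are placeholders, and the `LocaleManager` would normally come from the text vocabulary, as in `IndexImpl::wordsInTextRecords`:

```cpp
#include "parser/WordsAndDocsFileParser.h"

// Hypothetical sketch: iterate a wordsfile and a docsfile with the new parsers.
void exampleUsage(const LocaleManager& localeManager) {
  WordsFileParser wordsParser("example.wordsfile.tsv", localeManager);
  for (auto line : wordsParser) {
    // Fields: line.word_, line.isEntity_, line.contextId_, line.score_.
    (void)line;
  }

  DocsFileParser docsParser("example.docsfile.tsv", localeManager);
  for (auto line : docsParser) {
    // Fields: line.docId_, line.docContent_.
    (void)line;
  }
}
```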
2 changes: 1 addition & 1 deletion test/CMakeLists.txt
@@ -137,7 +137,7 @@ addLinkAndDiscoverTestSerial(FileTest)

addLinkAndDiscoverTest(Simple8bTest)

addLinkAndDiscoverTest(ContextFileParserTest parser)
addLinkAndDiscoverTest(WordsAndDocsFileParserTest parser)

addLinkAndDiscoverTest(IndexMetaDataTest index)
