Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add triples of ql:has-pattern predicate to PSO and POS #1226

Merged
merged 11 commits into from
Jan 17, 2024
35 changes: 34 additions & 1 deletion src/engine/idTable/CompressedExternalIdTable.h
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,10 @@ class CompressedExternalIdTableBase {
CompressedExternalIdTableWriter writer_;
std::future<void> compressAndWriteFuture_;

// Stores whether this table has not yet been iterated over. The value is
// initially `true` and becomes `false` once the table has been iterated over.
std::atomic<bool> isFirstIteration_ = true;

[[no_unique_address]] BlockTransformation blockTransformation_{};

public:
Expand Down Expand Up @@ -364,6 +368,7 @@ class CompressedExternalIdTableBase {
}
writer_.clear();
numBlocksPushed_ = 0;
isFirstIteration_ = true;
}

protected:
Expand Down Expand Up @@ -401,6 +406,9 @@ class CompressedExternalIdTableBase {
// until the pushing is actually finished, and return `true`. Using this
// function allows for an efficient usage of this class for very small inputs.
bool transformAndPushLastBlock() {
if (!isFirstIteration_) {
return numBlocksPushed_ != 0;
}
// If we have pushed at least one (complete) block, then the last future
// from pushing a block is still in flight. If we have never pushed a block,
// then also the future cannot be valid.
Expand Down Expand Up @@ -549,6 +557,9 @@ class CompressedExternalIdTableSorter
// output phase.
int numBufferedOutputBlocks_ = 4;

// See the `moveResultOnMerge()` getter function for documentation.
bool moveResultOnMerge_ = true;

public:
// Constructor.
CompressedExternalIdTableSorter(
Expand Down Expand Up @@ -579,6 +590,18 @@ class CompressedExternalIdTableSorter
// within this class.
using Base::push;

// Access the `moveResultOnMerge_` setting (the default is `true`). A mutable
// reference is returned, so the setting is changed via assignment, e.g.
// `sorter.moveResultOnMerge() = false;`.
// If set to `false` then the sorted result can be extracted multiple times.
// If set to `true` then the result is moved out and unusable after the first
// merge. In that case an exception will be thrown at the start of the second
// merge.
// Note: This mechanism gives a performance advantage for very small inputs
// that can be completely sorted in RAM. In that case we can avoid a copy of
// the sorted result.
// This function may only be called while `isFirstIteration_` is still `true`,
// otherwise the contract check below fails.
// NOTE(review): The check only guards the time of the call; a caller that
// keeps the returned reference could still modify the setting later.
bool& moveResultOnMerge() {
AD_CONTRACT_CHECK(this->isFirstIteration_);
return moveResultOnMerge_;
}

// Transition from the input phase, where `push()` can be called, to the
// output phase and return a generator that yields the sorted elements one by
// one. Either this function or the following function must be called exactly
Expand All @@ -594,6 +617,8 @@ class CompressedExternalIdTableSorter
requires(N == NumStaticCols || N == 0)
cppcoro::generator<IdTableStatic<N>> getSortedBlocks(
std::optional<size_t> blocksize = std::nullopt) {
// If we move the result out, there must only be a single merge phase.
AD_CONTRACT_CHECK(this->isFirstIteration_ || !this->moveResultOnMerge_);
mergeIsActive_.store(true);
// Explanation for the second argument: One block is buffered by this
// generator, one block is buffered inside the `sortedBlocks` generator, so
Expand All @@ -604,6 +629,7 @@ class CompressedExternalIdTableSorter
std::max(1, numBufferedOutputBlocks_ - 2))) {
co_yield block;
}
this->isFirstIteration_ = false;
mergeIsActive_.store(false);
}

Expand Down Expand Up @@ -637,8 +663,15 @@ class CompressedExternalIdTableSorter
auto& block = this->currentBlock_;
const auto blocksizeOutput = blocksize.value_or(block.numRows());
if (block.numRows() <= blocksizeOutput) {
co_yield std::move(this->currentBlock_).template toStatic<N>();
if (this->moveResultOnMerge_) {
co_yield std::move(this->currentBlock_).template toStatic<N>();
} else {
auto blockAsStatic = IdTableStatic<N>(
this->currentBlock_.clone().template toStatic<N>());
co_yield blockAsStatic;
}
} else {
// TODO<C++23> Use `std::views::chunk`.
for (size_t i = 0; i < block.numRows(); i += blocksizeOutput) {
size_t upper = std::min(i + blocksizeOutput, block.numRows());
auto curBlock = IdTableStatic<NumStaticCols>(
Expand Down
14 changes: 12 additions & 2 deletions src/engine/idTable/IdTable.h
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,8 @@ class IdTable {
static constexpr bool columnsAreAllocatable =
std::is_constructible_v<ColumnStorage, size_t, Allocator>;

using value_type = T;
// The type of a single entry in a row.
using single_value_type = T;
// Because of the column-major layout, the `row_type` (a value type that
// stores the values of a single row) and the `row_reference` (a type that
// refers to a specific row of a specific `IdTable`) are different. They are
Expand All @@ -135,6 +136,11 @@ class IdTable {
using row_reference = RowReference<IdTable, ad_utility::IsConst::False>;
using const_row_reference = RowReference<IdTable, ad_utility::IsConst::True>;

// This alias is required to make the `IdTable` class work with advanced GTest
// features, because GTest uses `Container::value_type` directly instead of
// using `std::iterator_traits`.
using value_type = row_type;

private:
// Assign shorter aliases for some types that are important for the correct
// handling of the proxy reference, but that are not visible to the outside.
Expand Down Expand Up @@ -526,14 +532,18 @@ class IdTable {
// numColumns()` implies that the function applies a permutation to the table.
// For example `setColumnSubset({1, 2, 0})` rotates the columns of a table
// with three columns left by one element.
void setColumnSubset(std::span<const ColumnIndex> subset) requires isDynamic {
void setColumnSubset(std::span<const ColumnIndex> subset) {
// First check that the `subset` is indeed a subset of the column
// indices.
std::vector<ColumnIndex> check{subset.begin(), subset.end()};
std::ranges::sort(check);
AD_CONTRACT_CHECK(std::unique(check.begin(), check.end()) == check.end());
AD_CONTRACT_CHECK(!subset.empty() && subset.back() < numColumns());

// If the number of columns is statically fixed, then only a permutation of
// the columns and not a real subset is allowed.
AD_CONTRACT_CHECK(isDynamic || subset.size() == NumColumns);

Data newData;
newData.reserve(subset.size());
std::ranges::for_each(subset, [this, &newData](ColumnIndex colIdx) {
Expand Down
10 changes: 9 additions & 1 deletion src/engine/idTable/IdTableRow.h
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,14 @@ class Row {
friend void swap(Row& a, Row& b) { std::swap(a.data_, b.data_); }

bool operator==(const Row& other) const = default;

// Convert this `Row` to a `std::array` with `numStaticColumns` entries (makes
// a copy). The conversion is `explicit` and only available for rows with a
// static number of columns (`numStaticColumns != 0`).
explicit operator std::array<T, numStaticColumns>() const
requires(numStaticColumns != 0) {
std::array<T, numStaticColumns> result;
// Copy the entries of this row into the array (presumably exactly
// `numStaticColumns` many for a static row).
std::ranges::copy(*this, result.begin());
return result;
}
};

// The following two classes store a reference to a row in the underlying
Expand Down Expand Up @@ -120,7 +128,7 @@ class RowReferenceImpl {
public:
static constexpr bool isConst = isConstTag == ad_utility::IsConst::True;
using TablePtr = std::conditional_t<isConst, const Table*, Table*>;
using T = typename Table::value_type;
using T = typename Table::single_value_type;
static constexpr int numStaticColumns = Table::numStaticColumns;

// Grant the `IdTable` class access to the internal details.
Expand Down
10 changes: 10 additions & 0 deletions src/global/SpecialIds.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,16 @@ static const inline ad_utility::HashMap<std::string, Id> specialIds = []() {
AD_CORRECTNESS_CHECK(uniqueIds.size() == result.size());
return result;
}();

// Return the half-open range `[lowerBound, upperBound)` for the special Ids.
// This range can be used to filter them out in cases where we want to ignore
// triples that were added by QLever for internal reasons.
// The lower bound is the Id with the bit pattern `1`, the upper bound is
// `Id::makeFromBool(false)`. The `static_assert`s check at compile time that
// `Datatype::Undefined` has the value `0` and that the upper bound has the bit
// pattern `1 << Id::numDataBits`. Presumably the datatype is stored in the
// bits above the data bits, so that the range covers exactly the nonzero Ids
// with datatype `Undefined` (TODO confirm against `Id.h`).
static constexpr std::pair<Id, Id> getBoundsForSpecialIds() {
  constexpr auto upperBound = Id::makeFromBool(false);
  static_assert(static_cast<int>(Datatype::Undefined) == 0);
  // Use `1ULL` (at least 64 bits wide) instead of `1UL`: `unsigned long` has
  // only 32 bits on some platforms, where a shift by 32 or more bits would not
  // be a valid constant expression.
  static_assert(upperBound.getBits() == 1ULL << Id::numDataBits);
  return {Id::fromBits(1), upperBound};
}
} // namespace qlever

#endif // QLEVER_SPECIALIDS_H
22 changes: 21 additions & 1 deletion src/index/IndexImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,9 @@ std::unique_ptr<ExternalSorter<SortByPSO, 5>> IndexImpl::buildOspWithPatterns(
auto isQleverInternalId) {
auto&& [hasPatternPredicateSortedByPSO, secondSorter] =
sortersFromPatternCreator;
// We need the patterns twice: once for the additional column, and once for
// the additional permutation.
hasPatternPredicateSortedByPSO->moveResultOnMerge() = false;
// The column with index 1 always is `has-predicate` and is not needed here.
// Note that the order of the columns during index building is alwasy `SPO`,
// but the sorting might be different (PSO in this case).
Expand Down Expand Up @@ -259,6 +262,19 @@ std::unique_ptr<ExternalSorter<SortByPSO, 5>> IndexImpl::buildOspWithPatterns(
makeSorterPtr<ThirdPermutation, NumColumnsIndexBuilding + 2>("third");
createSecondPermutationPair(NumColumnsIndexBuilding + 2, isQleverInternalId,
std::move(blockGenerator), *thirdSorter);
// Add the `ql:has-pattern` predicate to the sorter such that it will become
// part of the PSO and POS permutation.
LOG(INFO) << "Adding " << hasPatternPredicateSortedByPSO->size()
<< " additional triples to the POS and PSO permutation for the "
"`ql:has-pattern` predicate ..."
<< std::endl;
auto noPattern = Id::makeFromInt(NO_PATTERN);
static_assert(NumColumnsIndexBuilding == 3);
for (const auto& row : hasPatternPredicateSortedByPSO->sortedView()) {
// The repetition of the pattern index (`row[2]`) for the fourth column is
// useful for generic unit testing, but not needed otherwise.
thirdSorter->push(std::array{row[0], row[1], row[2], row[2], noPattern});
}
return thirdSorter;
}
// _____________________________________________________________________________
Expand All @@ -282,7 +298,10 @@ void IndexImpl::createFromFile(const string& filename) {
writeConfiguration();

auto isQleverInternalId = [&indexBuilderData](const auto& id) {
return indexBuilderData.vocabularyMetaData_.isQleverInternalId(id);
// The special internal IDs like `ql:has-pattern` (see `SpecialIds.h`)
// have the datatype `UNDEFINED`.
return indexBuilderData.vocabularyMetaData_.isQleverInternalId(id) ||
id.getDatatype() == Datatype::Undefined;
};

// For the first permutation, perform a unique.
Expand Down Expand Up @@ -754,6 +773,7 @@ void IndexImpl::createFromOnDiskIndex(const string& onDiskBase) {
totalVocabularySize_ = vocab_.size() + vocab_.getExternalVocab().size();
LOG(DEBUG) << "Number of words in internal and external vocabulary: "
<< totalVocabularySize_ << std::endl;

pso_.loadFromDisk(onDiskBase_);
pos_.loadFromDisk(onDiskBase_);

Expand Down
41 changes: 25 additions & 16 deletions src/index/IndexImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

#include <engine/ResultTable.h>
#include <global/Pattern.h>
#include <global/SpecialIds.h>
#include <index/CompressedRelation.h>
#include <index/ConstantsIndexBuilding.h>
#include <index/DocsDB.h>
Expand Down Expand Up @@ -668,35 +669,43 @@ class IndexImpl {
// index scan) and `GroupBy.cpp`.
auto getIgnoredIdRanges(const Permutation::Enum permutation) const {
std::vector<std::pair<Id, Id>> ignoredRanges;
ignoredRanges.emplace_back(qlever::getBoundsForSpecialIds());

auto literalRange = getVocab().prefix_range("\"");
auto taggedPredicatesRange = getVocab().prefix_range("@");
auto internalEntitiesRange =
getVocab().prefix_range(INTERNAL_ENTITIES_URI_PREFIX);
ignoredRanges.emplace_back(
Id::makeFromVocabIndex(internalEntitiesRange.first),
Id::makeFromVocabIndex(internalEntitiesRange.second));

auto pushIgnoredRange = [&ignoredRanges](const auto& range) {
ignoredRanges.emplace_back(Id::makeFromVocabIndex(range.first),
Id::makeFromVocabIndex(range.second));
};
pushIgnoredRange(internalEntitiesRange);
using enum Permutation::Enum;
if (permutation == SPO || permutation == SOP) {
ignoredRanges.push_back({Id::makeFromVocabIndex(literalRange.first),
Id::makeFromVocabIndex(literalRange.second)});
pushIgnoredRange(literalRange);
} else if (permutation == PSO || permutation == POS) {
ignoredRanges.push_back(
{Id::makeFromVocabIndex(taggedPredicatesRange.first),
Id::makeFromVocabIndex(taggedPredicatesRange.second)});
pushIgnoredRange(taggedPredicatesRange);
}

auto isIllegalPredicateId = [=](Id predicateId) {
// A lambda that checks whether the `predicateId` is an internal ID like
// `ql:has-pattern` or `@en@rdfs:label`.
auto isInternalPredicateId = [internalEntitiesRange,
taggedPredicatesRange](Id predicateId) {
if (predicateId.getDatatype() == Datatype::Undefined) {
return true;
}
AD_CORRECTNESS_CHECK(predicateId.getDatatype() == Datatype::VocabIndex);
auto idx = predicateId.getVocabIndex();
return (idx >= internalEntitiesRange.first &&
idx < internalEntitiesRange.second) ||
(idx >= taggedPredicatesRange.first &&
idx < taggedPredicatesRange.second);
auto isInRange = [idx](const auto& range) {
return range.first <= idx && idx < range.second;
};
return (isInRange(internalEntitiesRange) ||
isInRange(taggedPredicatesRange));
};

auto isTripleIgnored = [permutation,
isIllegalPredicateId](const auto& triple) {
isInternalPredicateId](const auto& triple) {
// TODO<joka921, everybody in the future>:
// A lot of code (especially for statistical queries in `GroupBy.cpp` and
// the pattern trick) relies on this function being a noop for the `PSO`
Expand All @@ -707,9 +716,9 @@ class IndexImpl {
// be thoroughly reviewed.
if (permutation == SPO || permutation == OPS) {
// Predicates are always entities from the vocabulary.
return isIllegalPredicateId(triple[1]);
return isInternalPredicateId(triple[1]);
} else if (permutation == SOP || permutation == OSP) {
return isIllegalPredicateId(triple[2]);
return isInternalPredicateId(triple[2]);
}
return false;
};
Expand Down
3 changes: 3 additions & 0 deletions src/parser/TripleComponent.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include "engine/LocalVocab.h"
#include "global/Constants.h"
#include "global/Id.h"
#include "global/SpecialIds.h"
#include "parser/RdfEscaping.h"
#include "parser/data/Variable.h"
#include "util/Date.h"
Expand Down Expand Up @@ -232,6 +233,8 @@ class TripleComponent {
isString() ? getString() : getLiteral().rawContent();
if (vocabulary.getId(content, &idx)) {
return Id::makeFromVocabIndex(idx);
} else if (qlever::specialIds.contains(content)) {
return qlever::specialIds.at(content);
} else {
return std::nullopt;
}
Expand Down
22 changes: 22 additions & 0 deletions test/IdTableTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -974,6 +974,28 @@ TEST(IdTable, setColumnSubset) {
ASSERT_ANY_THROW(t.setColumnSubset(std::vector<ColumnIndex>{1, 2}));
}

// Test `setColumnSubset` for a table with a statically fixed number of columns
// (3 in this case): permutations of the columns are applied, everything else
// is expected to throw.
TEST(IdTableStatic, setColumnSubset) {
using IntTable = columnBasedIdTable::IdTable<int, 3>;
IntTable t;
// Three rows, so the columns are {0, 1, 2}, {10, 11, 12}, and {20, 21, 22}.
t.push_back({0, 10, 20});
t.push_back({1, 11, 21});
t.push_back({2, 12, 22});
// Apply the permutation {2, 0, 1}: The new column `i` is the old column
// `subset[i]`.
t.setColumnSubset(std::array{ColumnIndex(2), ColumnIndex(0), ColumnIndex(1)});
// The dimensions of the table are unchanged.
ASSERT_EQ(3, t.numColumns());
ASSERT_EQ(3, t.numRows());
// The old columns 2, 0, 1 are now at the positions 0, 1, 2.
ASSERT_THAT(t.getColumn(0), ::testing::ElementsAre(20, 21, 22));
ASSERT_THAT(t.getColumn(1), ::testing::ElementsAre(0, 1, 2));
ASSERT_THAT(t.getColumn(2), ::testing::ElementsAre(10, 11, 12));

// Duplicate columns are not allowed.
ASSERT_ANY_THROW(t.setColumnSubset(std::vector<ColumnIndex>{0, 0, 1}));
// A column index is out of range.
ASSERT_ANY_THROW(t.setColumnSubset(std::vector<ColumnIndex>{1, 2, 3}));

// For static tables, we need a permutation, a real subset is not allowed.
ASSERT_ANY_THROW(t.setColumnSubset(std::vector<ColumnIndex>{1, 2}));
}

TEST(IdTable, cornerCases) {
using Dynamic = columnBasedIdTable::IdTable<int, 0>;
{
Expand Down
Loading