Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: FastADC implementation #470

Open
wants to merge 54 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
54 commits
Select commit Hold shift + click to select a range
1ee69e3
Implement Operator class
ol-imorozko Feb 25, 2024
817ab07
Implement tests for Operator class
ol-imorozko Feb 29, 2024
973c70e
Implement ColumnOperand class
ol-imorozko Feb 28, 2024
8046aa9
Implement BaseProvider class
ol-imorozko Mar 15, 2024
7ee4acb
Implement PredicateProvider class
ol-imorozko Feb 26, 2024
cb204e4
Implement Predicate class
ol-imorozko Feb 28, 2024
b4b929c
Add test that checks that PredicateProvider works
ol-imorozko Mar 1, 2024
276137a
Create temporary PredicateBuilder class
ol-imorozko Jun 4, 2024
0695e58
Replace int with int64_t in Predicate class
ol-imorozko Sep 29, 2024
e014ea5
Initial commit that adds dc folder and placeholder for dc.h
ol-imorozko Feb 25, 2024
d7bec72
Implement method to get value from TypedColumnData
ol-imorozko Sep 24, 2024
f6f2c66
Implement IndexProvider class
ol-imorozko Mar 2, 2024
72b720d
Implement functions to get similarities metrics between two columns
ol-imorozko Mar 6, 2024
6de1e00
Implement PredicateBuilder class
ol-imorozko Mar 7, 2024
10be729
Implement tests for predicate space building
ol-imorozko Mar 8, 2024
5fb84b6
Implement Pli and PliShardBuilder
ol-imorozko Mar 16, 2024
f91e8d8
Implement CommonClueSetBuilder
ol-imorozko Sep 21, 2024
b7daeac
Implement SingleClueSetBuilder
ol-imorozko Sep 21, 2024
7706750
Implement CrossClueSetBuilder
ol-imorozko Sep 21, 2024
1984614
Add test that checks static fields of CommonClueSetBuilder
ol-imorozko May 1, 2024
7cd11f8
Implement ClueSetBuilder
ol-imorozko Sep 21, 2024
13c4f05
Add test that checks ClueSet building
ol-imorozko Sep 24, 2024
ee5a55b
FIXME: Add an ability to force kString type on TypedColumnData instea…
ol-imorozko Sep 24, 2024
b4698a8
Add initial EvidenceSetBuilder class that builds cardinality mask
ol-imorozko Sep 29, 2024
d652305
Add test that verifies CardinalityMask
ol-imorozko Sep 29, 2024
cf74991
Implement Evidence
ol-imorozko Sep 29, 2024
4344fad
Implement EvidenceSet
ol-imorozko Sep 29, 2024
ff7b99f
Implement EvidenceSetBuilder
ol-imorozko Sep 29, 2024
ced0db0
Add test to verify evidence set
ol-imorozko Sep 29, 2024
3c791fc
Fix wrong creating of inverted predicate, operands were swapped
ol-imorozko Oct 3, 2024
10a9f9e
Add type alias for bitset holding predicates
ol-imorozko Oct 1, 2024
29a2a56
Implement PredicateOrganizer class
ol-imorozko Oct 1, 2024
4acb27f
Add test that validates predicate organizer
ol-imorozko Oct 1, 2024
9384560
Implement DCCandidateTrie class
ol-imorozko Oct 1, 2024
1d3020f
Implement PredicateSet class
ol-imorozko Mar 2, 2024
8262756
Implement DenialConstraint class
ol-imorozko Oct 2, 2024
bbcdde7
Return reference from GetImplications Predicate method
ol-imorozko Oct 2, 2024
a34467a
Implement Closure class
ol-imorozko Oct 2, 2024
c713f26
Implement NTreeSearch class
ol-imorozko Oct 2, 2024
fadeadf
Implement DenialConstraintSet
ol-imorozko Oct 2, 2024
72bc916
Implement ApproximateEvidenceInverter class
ol-imorozko Oct 2, 2024
426d115
Implement test for approximate denial constraints
ol-imorozko Oct 2, 2024
f701159
Change namespace model to namespace algos::fastadc for FastADC files
ol-imorozko Oct 3, 2024
16c2c16
Split FastADC files into subfolders
ol-imorozko Oct 3, 2024
3db0fcc
Correct includes paths after renaming and moving FastADC files
ol-imorozko Oct 3, 2024
684ec02
Refactor providers* structures
ol-imorozko Oct 3, 2024
6cc5b6e
Adjust unittests after providers refactoring
ol-imorozko Oct 4, 2024
ca98a4a
Extract predicate packs and correction map building to a separate class
ol-imorozko Oct 5, 2024
fcafce1
Move cardinality mask building from Evidence set to a new structure
ol-imorozko Oct 5, 2024
67b5540
Remove unused clue field from Evidence class
ol-imorozko Oct 5, 2024
39fa0ac
Remove unused N field from SearchNode class
ol-imorozko Oct 5, 2024
fccf6ae
Optimize AccumulateClues by hashing clue with zero value
ol-imorozko Oct 5, 2024
3b2b34f
Increase performance of AccumulateClues by preallocating and utilizin…
ol-imorozko Oct 5, 2024
3cc3060
Optimize clues by moving allocations out of Build* methods
ol-imorozko Oct 5, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 55 additions & 0 deletions src/core/algorithms/dc/FastADC/misc/misc.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#pragma once

#include "model/table/typed_column_data.h"

namespace algos::fastadc {

/**
 * Always-false trait whose value formally depends on T.
 *
 * Used as `static_assert(DependentFalse<T>::value, ...)` inside a template so that the
 * assertion only fires when the offending branch is actually instantiated.
 *
 * Deliberately NOT wrapped in an unnamed namespace: this is a header, and an unnamed
 * namespace would give every translation unit its own distinct DependentFalse. Inline
 * templates in this header that name it would then refer to a different entity in each
 * translation unit. A class template may be defined in several translation units as-is.
 */
template <typename T>
struct DependentFalse : std::false_type {};

/**
 * Reads the cell at `row` of `column` as a value of type `T`.
 *
 * A cell that is not null/empty is returned through the column's model::Type.
 * A null/empty cell yields a per-type placeholder, following the reference Java
 * implementation of FastADC:
 * https://github.com/RangerShaw/FastADC/blob/master/src/main/java/de/metanome/algorithms/dcfinder/input/Column.java#L71
 *
 *   std::string -> ""
 *   int64_t     -> std::numeric_limits<int64_t>::min()   (Java: Long.MIN_VALUE)
 *   double      -> std::numeric_limits<double>::lowest() (Java: Double.MIN_VALUE)
 *
 * NOTE(review): Java's Double.MIN_VALUE is the smallest *positive* double, whereas
 * lowest() is the most negative finite double — confirm which one is intended.
 *
 * The placeholders are function-local statics because the function returns a reference.
 * Instantiating the null/empty path with any other `T` is rejected at compile time.
 *
 * @param column column to read from
 * @param row    row index inside `column`
 * @return reference to the stored value, or to the placeholder for `T`
 */
template <typename T>
[[nodiscard]] inline T const& GetValue(model::TypedColumnData const& column, size_t row) {
    model::Type const& column_type = column.GetType();

    if (column.IsNullOrEmpty(row)) {
        if constexpr (std::is_same_v<T, std::string>) {
            static std::string const kEmptyPlaceholder = "";
            return kEmptyPlaceholder;
        } else if constexpr (std::is_same_v<T, int64_t>) {
            static int64_t const kIntPlaceholder = std::numeric_limits<int64_t>::min();
            return kIntPlaceholder;
        } else if constexpr (std::is_same_v<T, double>) {
            static double const kDoublePlaceholder = std::numeric_limits<double>::lowest();
            return kDoublePlaceholder;
        } else {
            static_assert(DependentFalse<T>::value,
                          "FastADC algorithm supports only int64_t, string, or double as column types. "
                          "This function should not be called with other types.");
        }
    }

    return column_type.GetValue<T>(column.GetValue(row));
}

} // namespace algos::fastadc
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
#include "typed_column_data_value_differences.h"

#include <easylogging++.h>

#include "misc.h"

namespace algos::fastadc {

/**
 * Shared-value ratio of two columns whose cells are read as `T`.
 *
 * Builds a value -> occurrence-count histogram for each column. Then, for every distinct
 * value of `c1`, adds min(count in c1, count in c2) to the shared total and
 * max(count in c1, count in c2) to the overall total. Values that occur only in `c2`
 * do not contribute.
 *
 * @return shared total / overall total, or 0.0 when the overall total is zero
 */
template <typename T>
static double GetSharedPercentageTyped(model::TypedColumnData const& c1,
                                       model::TypedColumnData const& c2) {
    using Histogram = std::unordered_map<T, size_t>;

    auto const count_values = [](model::TypedColumnData const& column) {
        size_t const num_rows = column.GetNumRows();
        Histogram histogram;
        histogram.reserve(num_rows);
        for (size_t row = 0; row != num_rows; ++row) {
            ++histogram[GetValue<T>(column, row)];
        }
        return histogram;
    };

    Histogram const lhs_counts = count_values(c1);
    Histogram const rhs_counts = count_values(c2);

    size_t overlap = 0;
    size_t span = 0;

    for (auto const& [value, lhs_count] : lhs_counts) {
        auto const match = rhs_counts.find(value);
        size_t const rhs_count = match != rhs_counts.end() ? match->second : 0;

        overlap += std::min(lhs_count, rhs_count);
        span += std::max(lhs_count, rhs_count);
    }

    if (span == 0) {
        return 0.0;
    }
    return static_cast<double>(overlap) / span;
}

/**
 * Arithmetic mean of all cells of `column`, each read as `T` and converted to double.
 *
 * Every row takes part in the sum, so null/empty cells contribute whatever placeholder
 * GetValue<T> yields for them.
 *
 * @return the mean, or 0.0 for a column without rows
 */
template <typename T>
static double CalculateAverageTyped(model::TypedColumnData const& column) {
    size_t const num_rows = column.GetNumRows();
    if (num_rows == 0) {
        return 0.0;
    }

    double total = 0.0;
    for (size_t row = 0; row < num_rows; ++row) {
        total += static_cast<double>(GetValue<T>(column, row));
    }

    return total / num_rows;
}

// Dispatches the shared-percentage computation on the type of `c1`.
// Two operands backed by the same Column are treated as fully shared (1.0).
// Any type other than int, double or string is logged and reported as -1.
double GetSharedPercentage(model::TypedColumnData const& c1, model::TypedColumnData const& c2) {
    bool const same_column = c1.GetColumn() == c2.GetColumn();
    if (same_column) {
        return 1.;
    }

    switch (c1.GetTypeId()) {
        case model::TypeId::kInt:
            return GetSharedPercentageTyped<int64_t>(c1, c2);
        case model::TypeId::kDouble:
            return GetSharedPercentageTyped<double>(c1, c2);
        case model::TypeId::kString:
            return GetSharedPercentageTyped<std::string>(c1, c2);
        default:
            break;
    }

    // Only unsupported types fall out of the switch.
    LOG(DEBUG) << "Column " << c1.GetColumn()->ToString() << " with type "
               << c1.GetType().ToString()
               << " is not supported for shared percentage calculation";
    return -1;
}

// Ratio of the smaller column average to the larger one, dispatched on the type of `c1`.
// Two operands backed by the same Column yield 1.0; a non-numeric `c1` is logged and
// reported as -1.
// NOTE(review): when the larger average is 0.0 the final division is non-finite under
// IEEE-754 arithmetic (NaN for 0/0, -inf for a negative numerator) — confirm that
// callers tolerate this.
double GetAverageRatio(model::TypedColumnData const& c1, model::TypedColumnData const& c2) {
    if (c1.GetColumn() == c2.GetColumn()) {
        return 1.;
    }

    double lhs_average = 0.0;
    double rhs_average = 0.0;

    switch (c1.GetTypeId()) {
        case model::TypeId::kInt:
            lhs_average = CalculateAverageTyped<int64_t>(c1);
            rhs_average = CalculateAverageTyped<int64_t>(c2);
            break;
        case model::TypeId::kDouble:
            lhs_average = CalculateAverageTyped<double>(c1);
            rhs_average = CalculateAverageTyped<double>(c2);
            break;
        default:
            LOG(DEBUG) << "Column type " << c1.GetType().ToString() << " is not numeric";
            return -1;
    }

    double const smaller = std::min(lhs_average, rhs_average);
    double const larger = std::max(lhs_average, rhs_average);
    return smaller / larger;
}

} // namespace algos::fastadc
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#pragma once

#include "model/table/typed_column_data.h"

namespace algos::fastadc {

/**
 * Computes the shared percentage of values between two columns of the same type.
 *
 * For int, double and string columns the function counts how often each distinct value
 * occurs in both columns and returns the ratio of shared occurrences to total occurrences.
 * If both arguments refer to the same underlying column, 1 is returned without any
 * counting.
 *
 * Assumes that the types of values in both columns are the same; only the type of `c1`
 * is inspected.
 *
 * @return A double representing the shared percentage of values between the two columns
 * if the type is int, double, or string. Returns -1 (i.e. the columns are treated as
 * having no similarities whatsoever) if the column type is not supported for comparison.
 */
double GetSharedPercentage(model::TypedColumnData const& c1, model::TypedColumnData const& c2);

/**
 * Calculates the ratio of the smaller average to the larger average of two columns.
 * If both arguments refer to the same underlying column, 1 is returned without computing
 * any averages.
 *
 * Assumes that the types of values in both columns are numeric and the same; only the
 * type of `c1` is inspected.
 *
 * @return A double representing the ratio of the smaller average to the larger average
 * of the two columns if the type is int or double. Returns -1 (i.e. the columns are
 * treated as having no similarities whatsoever) if the column type is not numeric.
 */
double GetAverageRatio(model::TypedColumnData const& c1, model::TypedColumnData const& c2);

}  // namespace algos::fastadc
9 changes: 9 additions & 0 deletions src/core/algorithms/dc/FastADC/model/column_operand.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#include "column_operand.h"

namespace algos::fastadc {

// Free-function hash for ColumnOperand; simply delegates to the std::hash specialization.
size_t hash_value(ColumnOperand const& col_op) noexcept {
    std::hash<ColumnOperand> const hasher{};
    return hasher(col_op);
}

} // namespace algos::fastadc
73 changes: 73 additions & 0 deletions src/core/algorithms/dc/FastADC/model/column_operand.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
#pragma once

#include "table/column.h"

namespace algos::fastadc {

/**
 * @brief One side of a FastADC predicate: a column taken from a specific tuple.
 *
 * FastADC works with Denial Constraints (DCs), which compare pairs of rows of a dataset.
 * For example, the Functional Dependency A -> B corresponds to the DC
 * `forall t, s in r, not (t.A = s.A and t.B != s.B)`: no two rows may agree on column "A"
 * while disagreeing on column "B".
 *
 * A predicate such as `t.A == s.A` is made of three parts: the left column operand
 * ("t.A"), the comparison operator ("=") and the right column operand ("s.A").
 * `ColumnOperand` models the operand part only.
 *
 * Which tuple the operand belongs to is stored in the boolean `tuple_`:
 * `true` stands for the first tuple (t), `false` for the second tuple (s).
 */
class ColumnOperand {
public:
    ColumnOperand(Column const* column, bool tuple) : column_(column), tuple_(tuple) {}

    // Operands are equal when they point at the same Column object and share the tuple flag.
    bool operator==(ColumnOperand const& rhs) const {
        bool const same_column = column_ == rhs.column_;
        bool const same_tuple = tuple_ == rhs.tuple_;
        return same_column && same_tuple;
    }

    Column const* GetColumn() const {
        return column_;
    }

    bool GetTuple() const {
        return tuple_;
    }

    // "TS" stands for the tuple pair (t, s): same column, opposite tuple.
    ColumnOperand GetInvTS() const {
        bool const other_tuple = !tuple_;
        return ColumnOperand(column_, other_tuple);
    }

    // Renders the operand as "t.<column name>" or "s.<column name>".
    std::string ToString() const {
        char const* const tuple_prefix = tuple_ ? "t." : "s.";
        return tuple_prefix + column_->GetName();
    }

private:
    Column const* column_;
    bool tuple_;
};

// Free-function hash for ColumnOperand. Its definition in column_operand.cpp forwards to
// std::hash<ColumnOperand>.
// NOTE(review): the snake_case name (hence the lint suppression below) looks like the
// Boost.Hash `hash_value` customization point found through ADL — confirm.
// NOLINTBEGIN(readability-identifier-naming)
size_t hash_value(ColumnOperand const& k) noexcept;
// NOLINTEND(readability-identifier-naming)

} // namespace algos::fastadc

namespace std {
// Hashes a ColumnOperand by combining the index of its column with its tuple flag,
// in that order. The column pointer is dereferenced, so it must not be null.
template <>
struct hash<algos::fastadc::ColumnOperand> {
    size_t operator()(algos::fastadc::ColumnOperand const& k) const noexcept {
        auto const column_index = k.GetColumn()->GetIndex();
        bool const tuple_flag = k.GetTuple();

        size_t combined = 0;
        boost::hash_combine(combined, column_index);
        boost::hash_combine(combined, tuple_flag);
        return combined;
    }
};

}  // namespace std
64 changes: 64 additions & 0 deletions src/core/algorithms/dc/FastADC/model/denial_constraint.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#pragma once

#include <sstream>
#include <string>

#include "predicate_set.h"

namespace algos::fastadc {

/**
 * A denial constraint, stored as the set of predicates it is made of.
 *
 * The textual form produced by ToString() is a negation sign followed by the predicates
 * joined with " ∧ " inside braces.
 */
class DenialConstraint {
public:
    explicit DenialConstraint(PredicateSet const& predicateSet) : predicate_set_(predicateSet) {}

    // Builds the predicate set from a bitset plus the provider passed alongside it.
    // NOTE(review): the assert runs only after predicate_set_ has already been constructed
    // from the pointer — confirm PredicateSet tolerates a null provider until then.
    DenialConstraint(boost::dynamic_bitset<> const& predicates,
                     PredicateIndexProvider* predicate_index_provider)
        : predicate_set_(predicates, predicate_index_provider) {
        assert(predicate_index_provider);
    }

    // Returns the constraint built from the (t, s)-inverted predicate set.
    DenialConstraint GetInvT1T2DC(PredicateProvider* predicate_provider) const {
        auto const& inverted = predicate_set_.GetInvTS(predicate_provider);
        return DenialConstraint(inverted);
    }

    PredicateSet const& GetPredicateSet() const {
        return predicate_set_;
    }

    size_t GetPredicateCount() const {
        return predicate_set_.Size();
    }

    std::string ToString() const {
        std::ostringstream out;
        out << "\u00AC" << "{ ";

        // Empty before the first predicate, the conjunction sign before every later one.
        char const* separator = "";
        for (PredicatePtr predicate : predicate_set_) {
            out << separator << predicate->ToString();
            separator = " ∧ ";
        }

        out << " }";
        return out.str();
    }

    bool operator==(DenialConstraint const& other) const = default;

private:
    PredicateSet predicate_set_;
};

} // namespace algos::fastadc

namespace std {
// A DenialConstraint hashes to whatever its predicate set reports from Hash().
template <>
struct hash<algos::fastadc::DenialConstraint> {
    size_t operator()(algos::fastadc::DenialConstraint const& k) const noexcept {
        auto const& predicates = k.GetPredicateSet();
        return predicates.Hash();
    }
};
}  // namespace std
29 changes: 29 additions & 0 deletions src/core/algorithms/dc/FastADC/model/evidence.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#pragma once

#include <cstddef>
#include <cstdint>
#include <vector>

#include "../util/common_clue_set_builder.h"
#include "predicate.h"

namespace algos::fastadc {

/**
 * A predicate bitset together with the number of times it was observed.
 *
 * `count`    - multiplicity passed in by the caller.
 * `evidence` - the resulting predicate bitset.
 */
struct Evidence {
    int64_t count;
    PredicateBitset evidence;

    /**
     * Derives the evidence bitset from a clue.
     *
     * Starts from `cardinalityMask` and, for every bit position `pos` that is set in
     * `satisfied`, XORs `correctionMap[pos]` into the result.
     *
     * @param satisfied       clue whose set bits select the corrections to apply
     * @param count           multiplicity stored in `count`
     * @param cardinalityMask initial value of the evidence bitset
     * @param correctionMap   per-bit corrections, indexed by bit position of the clue;
     *                        must hold an entry for every set bit of `satisfied`
     */
    Evidence(Clue satisfied, int64_t count, PredicateBitset const& cardinalityMask,
             std::vector<PredicateBitset> const& correctionMap)
        : count(count), evidence(cardinalityMask) {
        // `satisfied` is already a by-value copy, so it is consumed directly: the lowest
        // bit is inspected and shifted out until no set bits remain.
        for (size_t pos = 0; satisfied.any(); satisfied >>= 1, ++pos) {
            if (satisfied.test(0)) {
                evidence ^= correctionMap[pos];
            }
        }
    }

    /** Uses the bits of `bitSet` as the evidence without applying any correction. */
    Evidence(Clue bitSet, int64_t count) : count(count), evidence(bitSet) {}
};

}  // namespace algos::fastadc
Loading
Loading