From f532f66b0ea143ea74da1a334992b1ab4d5fcc52 Mon Sep 17 00:00:00 2001
From: shalasheg
Date: Thu, 26 Sep 2024 01:53:54 +0300
Subject: [PATCH] Implement pdep, tau, mu_plus, rho measures and mining

---
Review notes (text between "---" and the first diff header is not part of the commit):
- Line structure, blank context lines, indentation and everything that stood between
  angle brackets (template arguments, system include names) were lost in extraction and
  are reconstructed here; check them against the tree before applying.
- The "index" blob ids below were not recomputed after the hunks were amended.

 src/core/algorithms/fd/tane/enums.h         |   2 +-
 src/core/algorithms/fd/tane/pfdtane.cpp     |   1 +
 src/core/algorithms/fd/tane/pfdtane.h       |   1 +
 src/core/algorithms/fd/tane/tane.cpp        | 140 +++++++++++++++++++-
 src/core/algorithms/fd/tane/tane.h          |  18 +++
 src/core/algorithms/fd/tane/tane_common.cpp |   3 +-
 src/core/algorithms/fd/tane/tane_common.h   |   1 +
 7 files changed, 161 insertions(+), 5 deletions(-)

diff --git a/src/core/algorithms/fd/tane/enums.h b/src/core/algorithms/fd/tane/enums.h
index 30f6f9c569..bb36cda518 100644
--- a/src/core/algorithms/fd/tane/enums.h
+++ b/src/core/algorithms/fd/tane/enums.h
@@ -3,5 +3,5 @@
 #include <enum.h>
 
 namespace algos {
-BETTER_ENUM(ErrorMeasure, char, per_tuple = 0, per_value)
+BETTER_ENUM(ErrorMeasure, char, g1 = 0, per_tuple, per_value, pdep, tau, mu_plus, rho)
 }
diff --git a/src/core/algorithms/fd/tane/pfdtane.cpp b/src/core/algorithms/fd/tane/pfdtane.cpp
index 69a3d96dd8..86fd91feff 100644
--- a/src/core/algorithms/fd/tane/pfdtane.cpp
+++ b/src/core/algorithms/fd/tane/pfdtane.cpp
@@ -27,6 +27,7 @@ config::ErrorType PFDTane::CalculateZeroAryFdError(ColumnData const* rhs) {
 }
 
 config::ErrorType PFDTane::CalculateFdError(model::PositionListIndex const* lhs_pli,
+                                            model::PositionListIndex const* /* rhs_pli */,
                                             model::PositionListIndex const* joint_pli) {
     return CalculatePFDError(lhs_pli, joint_pli, error_measure_);
 }
diff --git a/src/core/algorithms/fd/tane/pfdtane.h b/src/core/algorithms/fd/tane/pfdtane.h
index 89b791c76a..47f1bf6719 100644
--- a/src/core/algorithms/fd/tane/pfdtane.h
+++ b/src/core/algorithms/fd/tane/pfdtane.h
@@ -16,6 +16,7 @@ class PFDTane : public tane::TaneCommon {
     void MakeExecuteOptsAvailableFDInternal() final;
     config::ErrorType CalculateZeroAryFdError(ColumnData const* rhs) override;
     config::ErrorType CalculateFdError(model::PositionListIndex const* lhs_pli,
+                                       model::PositionListIndex const* rhs_pli,
                                        model::PositionListIndex const* joint_pli) override;
 
 public:
diff --git a/src/core/algorithms/fd/tane/tane.cpp b/src/core/algorithms/fd/tane/tane.cpp
index 0d98b8ba51..f5fe0ebcf4 100644
--- a/src/core/algorithms/fd/tane/tane.cpp
+++ b/src/core/algorithms/fd/tane/tane.cpp
@@ -1,27 +1,161 @@
 #include "tane.h"
 
+#include <algorithm>
+#include <cstddef>
+#include <vector>
+
 #include "config/error/option.h"
+#include "config/error_measure/option.h"
+#include "enums.h"
 #include "fd/pli_based_fd_algorithm.h"
 #include "model/table/column_data.h"
 
 namespace algos {
+using Cluster = model::PositionListIndex::Cluster;
 
+namespace {
+
+// Number of distinct values described by `pli`: one per stored cluster plus one per row that
+// belongs to no stored cluster (such a row is counted as a value of its own).
+std::size_t CountDistinctValues(model::PositionListIndex const* pli) {
+    auto const& index = pli->GetIndex();
+    std::size_t clustered_rows = 0;
+    for (Cluster const& cluster : index) clustered_rows += cluster.size();
+    return index.size() + (pli->GetRelationSize() - clustered_rows);
+}
+
+}  // namespace
+
 Tane::Tane(std::optional<ColumnLayoutRelationDataManager> relation_manager)
-    : tane::TaneCommon(relation_manager) {}
+    : tane::TaneCommon(relation_manager) {
+    RegisterOption(config::kErrorMeasureOpt(&error_measure_));
+}
 
 void Tane::MakeExecuteOptsAvailableFDInternal() {
-    MakeOptionsAvailable({config::kErrorOpt.GetName()});
+    MakeOptionsAvailable({config::kErrorOpt.GetName(), config::kErrorMeasureOpt.GetName()});
 }
 
+// Error of the FD with an empty left-hand side. NOTE(review): every measure other than g1
+// reports the error 1 here -- confirm that this is the intended zero-ary behaviour.
 config::ErrorType Tane::CalculateZeroAryFdError(ColumnData const* rhs) {
+    if (error_measure_ == +ErrorMeasure::g1) return CalculateZeroAryG1(rhs);
+    return 1;
+}
+
+// Error of lhs -> rhs under the configured measure. pdep, tau, mu_plus and rho are computed
+// as dependency strengths, so the error reported for them is the complement to 1.
+config::ErrorType Tane::CalculateFdError(model::PositionListIndex const* lhs_pli,
+                                         model::PositionListIndex const* rhs_pli,
+                                         model::PositionListIndex const* joint_pli) {
+    switch (error_measure_) {
+        case +ErrorMeasure::pdep:
+            return 1 - CalculatePdepMeasure(lhs_pli, joint_pli);
+        case +ErrorMeasure::tau:
+            return 1 - CalculateTauMeasure(lhs_pli, rhs_pli, joint_pli);
+        case +ErrorMeasure::mu_plus:
+            return 1 - CalculateMuPlusMeasure(lhs_pli, rhs_pli, joint_pli);
+        case +ErrorMeasure::rho:
+            return 1 - CalculateRhoMeasure(lhs_pli, joint_pli);
+        default:
+            // g1 itself, and every measure that has no case above, is scored with g1
+            return CalculateG1Error(lhs_pli, joint_pli);
+    }
+}
+
+config::ErrorType Tane::CalculateZeroAryG1(ColumnData const* rhs) {
     return 1 - rhs->GetPositionListIndex()->GetNepAsLong() /
                        static_cast<double>(relation_.get()->GetNumTuplePairs());
 }
 
-config::ErrorType Tane::CalculateFdError(model::PositionListIndex const* lhs_pli,
+config::ErrorType Tane::CalculateG1Error(model::PositionListIndex const* lhs_pli,
                                          model::PositionListIndex const* joint_pli) {
     return (lhs_pli->GetNepAsLong() - joint_pli->GetNepAsLong()) /
            static_cast<double>(relation_.get()->GetNumTuplePairs());
 }
 
+// pdep(X) = sum over the distinct values x of X of (|x| / N)^2, N being the relation size.
+config::ErrorType Tane::PdepSelf(model::PositionListIndex const* x_pli) {
+    auto const relation_size = static_cast<config::ErrorType>(x_pli->GetRelationSize());
+    config::ErrorType sum = 0;
+    std::size_t clustered_rows = 0;
+    for (Cluster const& x_cluster : x_pli->GetIndex()) {
+        // squared in floating point: an integer product could wrap where size_t is narrow
+        auto const cluster_size = static_cast<config::ErrorType>(x_cluster.size());
+        clustered_rows += x_cluster.size();
+        sum += cluster_size * cluster_size;
+    }
+    // every row outside the stored clusters is a value that occurs exactly once
+    sum += static_cast<config::ErrorType>(x_pli->GetRelationSize() - clustered_rows);
+    return sum / (relation_size * relation_size);
+}
+
+// pdep(X -> A) = (1 / N) * sum over the value pairs (x, a) of |xa|^2 / |x|.
+config::ErrorType Tane::CalculatePdepMeasure(model::PositionListIndex const* x_pli,
+                                             model::PositionListIndex const* xa_pli) {
+    // Frequency of every X value, indexed by its probing-table id. As in the lookup this
+    // replaces, id 0 is taken to mark rows outside the stored clusters and ids 1, 2, ... to
+    // follow the order of the clusters.
+    auto const& x_index = x_pli->GetIndex();
+    std::vector<config::ErrorType> x_frequencies;
+    x_frequencies.reserve(x_index.size() + 1);
+    x_frequencies.push_back(1);
+    for (Cluster const& x_cluster : x_index) {
+        x_frequencies.push_back(static_cast<config::ErrorType>(x_cluster.size()));
+    }
+
+    auto const x_prob = x_pli->CalculateAndGetProbingTable();
+    auto const x_frequency_of_row = [&x_prob, &x_frequencies](std::size_t row) {
+        return x_frequencies.at(static_cast<std::size_t>(x_prob->at(row)));
+    };
+
+    config::ErrorType sum = 0;
+    for (Cluster const& xa_cluster : xa_pli->GetIndex()) {
+        auto const cluster_size = static_cast<config::ErrorType>(xa_cluster.size());
+        auto const first_row = static_cast<std::size_t>(xa_cluster.front());
+        sum += cluster_size * cluster_size / x_frequency_of_row(first_row);
+    }
+
+    // rows that are unique w.r.t. XA are in no stored cluster; each contributes 1 / |x|
+    auto const xa_prob = xa_pli->CalculateAndGetProbingTable();
+    for (std::size_t row = 0; row < xa_prob->size(); ++row) {
+        if (xa_prob->at(row) == 0) sum += 1 / x_frequency_of_row(row);
+    }
+    return sum / static_cast<config::ErrorType>(x_pli->GetRelationSize());
+}
+
+// tau(X -> A) = (pdep(X -> A) - pdep(A)) / (1 - pdep(A)).
+config::ErrorType Tane::CalculateTauMeasure(model::PositionListIndex const* x_pli,
+                                            model::PositionListIndex const* a_pli,
+                                            model::PositionListIndex const* xa_pli) {
+    config::ErrorType const pdep_a = PdepSelf(a_pli);
+    if (pdep_a == 1) return 1;  // the denominator below would be zero
+    return (CalculatePdepMeasure(x_pli, xa_pli) - pdep_a) / (1 - pdep_a);
+}
+
+// mu+(X -> A) = max(0, 1 - (1 - pdep(X -> A)) / (1 - pdep(A)) * (N - 1) / (N - K)), where N
+// is the relation size and K the number of distinct X values.
+config::ErrorType Tane::CalculateMuPlusMeasure(model::PositionListIndex const* x_pli,
+                                               model::PositionListIndex const* a_pli,
+                                               model::PositionListIndex const* xa_pli) {
+    config::ErrorType const pdep_a = PdepSelf(a_pli);
+    if (pdep_a == 1) return 1;  // 1 - pdep(A) would be zero
+
+    std::size_t const relation_size = x_pli->GetRelationSize();
+    std::size_t const distinct_x = CountDistinctValues(x_pli);
+    if (distinct_x == relation_size) return 1;  // every X value is unique: N - K would be zero
+
+    config::ErrorType const pdep_xa = CalculatePdepMeasure(x_pli, xa_pli);
+    config::ErrorType const mu =
+            1 - (1 - pdep_xa) / (1 - pdep_a) * static_cast<config::ErrorType>(relation_size - 1) /
+                        static_cast<config::ErrorType>(relation_size - distinct_x);
+    return std::max<config::ErrorType>(0, mu);
+}
+
+// rho(X -> A) = (number of distinct X values) / (number of distinct XA values).
+config::ErrorType Tane::CalculateRhoMeasure(model::PositionListIndex const* x_pli,
+                                            model::PositionListIndex const* xa_pli) {
+    return static_cast<config::ErrorType>(CountDistinctValues(x_pli)) /
+           static_cast<config::ErrorType>(CountDistinctValues(xa_pli));
+}
+
 }  // namespace algos
diff --git a/src/core/algorithms/fd/tane/tane.h b/src/core/algorithms/fd/tane/tane.h
index aba25c7660..d43f5384dd 100644
--- a/src/core/algorithms/fd/tane/tane.h
+++ b/src/core/algorithms/fd/tane/tane.h
@@ -1,6 +1,7 @@
 #pragma once
 
 #include "config/error/type.h"
+#include "enums.h"
 #include "model/table/column_data.h"
 #include "model/table/position_list_index.h"
 #include "tane_common.h"
@@ -9,13 +10,30 @@ namespace algos {
 
 class Tane : public tane::TaneCommon {
 private:
+    ErrorMeasure error_measure_ = +ErrorMeasure::g1;
     void MakeExecuteOptsAvailableFDInternal() override final;
     config::ErrorType CalculateZeroAryFdError(ColumnData const* rhs) override;
     config::ErrorType CalculateFdError(model::PositionListIndex const* lhs_pli,
+                                       model::PositionListIndex const* rhs_pli,
                                        model::PositionListIndex const* joint_pli) override;
 
 public:
     Tane(std::optional<ColumnLayoutRelationDataManager> relation_manager = std::nullopt);
+    config::ErrorType CalculateZeroAryG1(ColumnData const* rhs);
+    config::ErrorType CalculateG1Error(model::PositionListIndex const* lhs_pli,
+                                       model::PositionListIndex const* joint_pli);
+
+    static config::ErrorType PdepSelf(model::PositionListIndex const* x_pli);
+    static config::ErrorType CalculatePdepMeasure(model::PositionListIndex const* x_pli,
+                                                  model::PositionListIndex const* xa_pli);
+    static config::ErrorType CalculateTauMeasure(model::PositionListIndex const* x_pli,
+                                                 model::PositionListIndex const* a_pli,
+                                                 model::PositionListIndex const* xa_pli);
+    static config::ErrorType CalculateMuPlusMeasure(model::PositionListIndex const* x_pli,
+                                                    model::PositionListIndex const* a_pli,
+                                                    model::PositionListIndex const* xa_pli);
+    static config::ErrorType CalculateRhoMeasure(model::PositionListIndex const* x_pli,
+                                                 model::PositionListIndex const* xa_pli);
 };
 
 }  // namespace algos
diff --git a/src/core/algorithms/fd/tane/tane_common.cpp b/src/core/algorithms/fd/tane/tane_common.cpp
index 0b9a0be474..497b1402c6 100644
--- a/src/core/algorithms/fd/tane/tane_common.cpp
+++ b/src/core/algorithms/fd/tane/tane_common.cpp
@@ -114,9 +114,10 @@ void TaneCommon::ComputeDependencies(model::LatticeLevel* level) {
                 continue;
             }
             auto x_pli = x_vertex->GetPositionListIndex();
+            auto a_pli = relation_->GetColumnData(a_index).GetPositionListIndex();
 
             // Check X -> A
-            config::ErrorType error = CalculateFdError(x_pli, xa_pli);
+            config::ErrorType error = CalculateFdError(x_pli, a_pli, xa_pli);
             if (error <= max_fd_error_) {
                 Column const* rhs = schema->GetColumns()[a_index].get();
 
diff --git a/src/core/algorithms/fd/tane/tane_common.h b/src/core/algorithms/fd/tane/tane_common.h
index c896acb044..11aaa1d854 100644
--- a/src/core/algorithms/fd/tane/tane_common.h
+++ b/src/core/algorithms/fd/tane/tane_common.h
@@ -22,6 +22,7 @@ class TaneCommon : public PliBasedFDAlgorithm {
     unsigned long long ExecuteInternal() final;
     virtual config::ErrorType CalculateZeroAryFdError(ColumnData const* rhs) = 0;
     virtual config::ErrorType CalculateFdError(model::PositionListIndex const* lhs_pli,
+                                               model::PositionListIndex const* rhs_pli,
                                                model::PositionListIndex const* joint_pli) = 0;
     static double CalculateUccError(model::PositionListIndex const* pli,
                                     ColumnLayoutRelationData const* relation_data);