From 9b7fad7281acec22b9b6c74a30df227ef586e0ee Mon Sep 17 00:00:00 2001 From: DaniilGoncharov Date: Mon, 22 Apr 2024 17:56:08 +0300 Subject: [PATCH] Add script for testing examples Add script for testing examples (without gfd). Bindings now return sorted list of fds (only for algos inherited from FDAlgorithm). Update README to the current output --- README.md | 26 +-- README_PYPI.md | 18 +-- examples/afd_multiple_error_thresholds.py | 8 +- examples/comparison_pfd_vs_afd.py | 7 +- examples/dedupe.py | 1 + examples/mining_set_od_2.py | 2 +- examples/testing/inputs/dedupe_input.txt | 13 ++ .../afd_multiple_error_thresholds_output.txt | 4 + .../outputs/algebraic_constraints_output.txt | 19 +++ .../outputs/anomaly_detection_output.txt | 44 +++++ .../outputs/comparison_pfd_vs_afd_output.txt | 7 + .../testing/outputs/data_stats_output.txt | 152 ++++++++++++++++++ examples/testing/outputs/dedupe_output.txt | 92 +++++++++++ .../testing/outputs/mine_typos_output.txt | 44 +++++ .../testing/outputs/mining_afd_output.txt | 4 + .../testing/outputs/mining_cfd_output.txt | 124 ++++++++++++++ examples/testing/outputs/mining_fd_output.txt | 8 + .../testing/outputs/mining_ind_output.txt | 34 ++++ .../testing/outputs/mining_list_od_output.txt | 49 ++++++ .../testing/outputs/mining_pfd_output.txt | 3 + .../outputs/mining_set_od_1_output.txt | 108 +++++++++++++ .../outputs/mining_set_od_2_output.txt | 134 +++++++++++++++ .../testing/outputs/verifying_aucc_output.txt | 64 ++++++++ .../outputs/verifying_fd_afd_output.txt | 90 +++++++++++ .../testing/outputs/verifying_mfd_output.txt | 1 + .../testing/outputs/verifying_ucc_output.txt | 18 +++ examples/testing/test_examples.sh | 40 +++++ src/core/algorithms/fd/fd_algorithm.cpp | 14 ++ src/core/algorithms/fd/fd_algorithm.h | 2 + src/core/model/table/vertical.cpp | 8 - src/core/model/table/vertical.h | 4 +- src/python_bindings/fd/bind_fd.cpp | 3 +- 32 files changed, 1104 insertions(+), 41 deletions(-) create mode 100644 
examples/testing/inputs/dedupe_input.txt create mode 100644 examples/testing/outputs/afd_multiple_error_thresholds_output.txt create mode 100644 examples/testing/outputs/algebraic_constraints_output.txt create mode 100644 examples/testing/outputs/anomaly_detection_output.txt create mode 100644 examples/testing/outputs/comparison_pfd_vs_afd_output.txt create mode 100644 examples/testing/outputs/data_stats_output.txt create mode 100644 examples/testing/outputs/dedupe_output.txt create mode 100644 examples/testing/outputs/mine_typos_output.txt create mode 100644 examples/testing/outputs/mining_afd_output.txt create mode 100644 examples/testing/outputs/mining_cfd_output.txt create mode 100644 examples/testing/outputs/mining_fd_output.txt create mode 100644 examples/testing/outputs/mining_ind_output.txt create mode 100644 examples/testing/outputs/mining_list_od_output.txt create mode 100644 examples/testing/outputs/mining_pfd_output.txt create mode 100644 examples/testing/outputs/mining_set_od_1_output.txt create mode 100644 examples/testing/outputs/mining_set_od_2_output.txt create mode 100644 examples/testing/outputs/verifying_aucc_output.txt create mode 100644 examples/testing/outputs/verifying_fd_afd_output.txt create mode 100644 examples/testing/outputs/verifying_mfd_output.txt create mode 100644 examples/testing/outputs/verifying_ucc_output.txt create mode 100644 examples/testing/test_examples.sh diff --git a/README.md b/README.md index 568826d51..c96d825bf 100644 --- a/README.md +++ b/README.md @@ -54,13 +54,13 @@ python3 cli.py --task=fd --table=../examples/datasets/university_fd.csv , True ``` ```text -[Course Classroom] -> Professor -[Classroom Semester] -> Professor -[Classroom Semester] -> Course [Professor] -> Course -[Professor Semester] -> Classroom +[Course Classroom] -> Professor [Course Semester] -> Classroom [Course Semester] -> Professor +[Classroom Semester] -> Course +[Classroom Semester] -> Professor +[Professor Semester] -> Classroom ``` 2) 
Discover all approximate functional dependencies with error less than or equal to 0.1 in a table represented by a .csv file that uses a comma as the separator and has a header row. In this example the default AFD discovery algorithm (Pyro) is used. @@ -114,13 +114,13 @@ for fd in result: ``` ```text FDs: -[Course Classroom] -> Professor -[Classroom Semester] -> Professor -[Classroom Semester] -> Course [Professor] -> Course -[Professor Semester] -> Classroom +[Course Classroom] -> Professor [Course Semester] -> Classroom [Course Semester] -> Professor +[Classroom Semester] -> Course +[Classroom Semester] -> Professor +[Professor Semester] -> Classroom ``` 2) Discover all approximate functional dependencies with error less than or equal to 0.1 in a table represented by a .csv file that uses a comma as the separator and has a header row. In this example the AFD discovery algorithm Pyro is used. @@ -141,8 +141,8 @@ for fd in result: ``` ```text AFDs: -[Id] -> Price [Id] -> ProductName +[Id] -> Price [ProductName] -> Price ``` @@ -178,16 +178,16 @@ MFD holds >>> pyro.load_data(table=df) >>> pyro.execute(error=0.0) >>> print(f'[{", ".join(map(str, pyro.get_fds()))}]') -[[0 1 2] -> 4, [0 2 3] -> 4, [0 1 3] -> 4, [1 2 3] -> 4] +[[0 1 2] -> 4, [0 1 3] -> 4, [0 2 3] -> 4, [1 2 3] -> 4] >>> pyro.execute(error=0.1) >>> print(f'[{", ".join(map(str, pyro.get_fds()))}]') -[[2] -> 0, [2] -> 3, [2] -> 1, [0] -> 2, [3] -> 0, [0] -> 3, [0] -> 1, [1] -> 3, [1] -> 0, [3] -> 2, [3] -> 1, [1] -> 2, [2] -> 4, [3] -> 4, [0] -> 4, [1] -> 4] +[[0] -> 1, [0] -> 2, [0] -> 3, [0] -> 4, [1] -> 0, [1] -> 2, [1] -> 3, [1] -> 4, [2] -> 0, [2] -> 1, [2] -> 3, [2] -> 4, [3] -> 0, [3] -> 1, [3] -> 2, [3] -> 4] >>> pyro.execute(error=0.2) >>> print(f'[{", ".join(map(str, pyro.get_fds()))}]') -[[2] -> 0, [0] -> 2, [3] -> 2, [1] -> 2, [2] -> 4, [3] -> 4, [0] -> 4, [1] -> 4, [3] -> 0, [1] -> 0, [2] -> 3, [2] -> 1, [0] -> 3, [0] -> 1, [1] -> 3, [3] -> 1] +[[0] -> 1, [0] -> 2, [0] -> 3, [0] -> 4, [1] -> 0, 
[1] -> 2, [1] -> 3, [1] -> 4, [2] -> 0, [2] -> 1, [2] -> 3, [2] -> 4, [3] -> 0, [3] -> 1, [3] -> 2, [3] -> 4] >>> pyro.execute(error=0.3) >>> print(f'[{", ".join(map(str, pyro.get_fds()))}]') -[[2] -> 1, [0] -> 2, [2] -> 0, [2] -> 3, [0] -> 1, [3] -> 2, [3] -> 1, [1] -> 2, [3] -> 0, [0] -> 3, [4] -> 1, [1] -> 0, [1] -> 3, [4] -> 2, [4] -> 3, [2] -> 4, [3] -> 4, [0] -> 4, [1] -> 4] +[[0] -> 1, [0] -> 2, [0] -> 3, [0] -> 4, [1] -> 0, [1] -> 2, [1] -> 3, [1] -> 4, [2] -> 0, [2] -> 1, [2] -> 3, [2] -> 4, [3] -> 0, [3] -> 1, [3] -> 2, [3] -> 4, [4] -> 1, [4] -> 2, [4] -> 3] ``` ## Web interface diff --git a/README_PYPI.md b/README_PYPI.md index 96c3454f2..e7b7f7b92 100644 --- a/README_PYPI.md +++ b/README_PYPI.md @@ -73,13 +73,13 @@ for fd in result: ```text FDs: -[Course Classroom] -> Professor -[Classroom Semester] -> Professor -[Classroom Semester] -> Course [Professor] -> Course -[Professor Semester] -> Classroom +[Course Classroom] -> Professor [Course Semester] -> Classroom [Course Semester] -> Professor +[Classroom Semester] -> Course +[Classroom Semester] -> Professor +[Professor Semester] -> Classroom ``` 2) Discover all approximate functional dependencies with error less than or equal to 0.1 in a table represented by a @@ -103,8 +103,8 @@ for fd in result: ```text AFDs: -[Id] -> Price [Id] -> ProductName +[Id] -> Price [ProductName] -> Price ``` @@ -145,16 +145,16 @@ MFD holds >>> pyro.load_data(table=df) >>> pyro.execute(error=0.0) >>> print(f'[{", ".join(map(str, pyro.get_fds()))}]') -[[0 1 2] -> 4, [0 2 3] -> 4, [0 1 3] -> 4, [1 2 3] -> 4] +[[0 1 2] -> 4, [0 1 3] -> 4, [0 2 3] -> 4, [1 2 3] -> 4] >>> pyro.execute(error=0.1) >>> print(f'[{", ".join(map(str, pyro.get_fds()))}]') -[[2] -> 0, [2] -> 3, [2] -> 1, [0] -> 2, [3] -> 0, [0] -> 3, [0] -> 1, [1] -> 3, [1] -> 0, [3] -> 2, [3] -> 1, [1] -> 2, [2] -> 4, [3] -> 4, [0] -> 4, [1] -> 4] +[[0] -> 1, [0] -> 2, [0] -> 3, [0] -> 4, [1] -> 0, [1] -> 2, [1] -> 3, [1] -> 4, [2] -> 0, [2] -> 1, [2] -> 3, [2] -> 4, 
[3] -> 0, [3] -> 1, [3] -> 2, [3] -> 4] >>> pyro.execute(error=0.2) >>> print(f'[{", ".join(map(str, pyro.get_fds()))}]') -[[2] -> 0, [0] -> 2, [3] -> 2, [1] -> 2, [2] -> 4, [3] -> 4, [0] -> 4, [1] -> 4, [3] -> 0, [1] -> 0, [2] -> 3, [2] -> 1, [0] -> 3, [0] -> 1, [1] -> 3, [3] -> 1] +[[0] -> 1, [0] -> 2, [0] -> 3, [0] -> 4, [1] -> 0, [1] -> 2, [1] -> 3, [1] -> 4, [2] -> 0, [2] -> 1, [2] -> 3, [2] -> 4, [3] -> 0, [3] -> 1, [3] -> 2, [3] -> 4] >>> pyro.execute(error=0.3) >>> print(f'[{", ".join(map(str, pyro.get_fds()))}]') -[[2] -> 1, [0] -> 2, [2] -> 0, [2] -> 3, [0] -> 1, [3] -> 2, [3] -> 1, [1] -> 2, [3] -> 0, [0] -> 3, [4] -> 1, [1] -> 0, [1] -> 3, [4] -> 2, [4] -> 3, [2] -> 4, [3] -> 4, [0] -> 4, [1] -> 4] +[[0] -> 1, [0] -> 2, [0] -> 3, [0] -> 4, [1] -> 0, [1] -> 2, [1] -> 3, [1] -> 4, [2] -> 0, [2] -> 1, [2] -> 3, [2] -> 4, [3] -> 0, [3] -> 1, [3] -> 2, [3] -> 4, [4] -> 1, [4] -> 2, [4] -> 3] ``` More examples can be found in the [Desbordante repository](https://github.com/Desbordante/desbordante-core/tree/main/examples) on GitHub. 
diff --git a/examples/afd_multiple_error_thresholds.py b/examples/afd_multiple_error_thresholds.py index c6d6edcf6..c02520434 100644 --- a/examples/afd_multiple_error_thresholds.py +++ b/examples/afd_multiple_error_thresholds.py @@ -5,13 +5,13 @@ pyro.load_data(table=df) pyro.execute(error=0.0) print(f'[{", ".join(map(str, pyro.get_fds()))}]') -# [[0 1 2] -> 4, [0 2 3] -> 4, [0 1 3] -> 4, [1 2 3] -> 4] +# [[0 1 2] -> 4, [0 1 3] -> 4, [0 2 3] -> 4, [1 2 3] -> 4] pyro.execute(error=0.1) print(f'[{", ".join(map(str, pyro.get_fds()))}]') -# [[2] -> 0, [2] -> 3, [2] -> 1, [0] -> 2, [3] -> 0, [0] -> 3, [0] -> 1, [1] -> 3, [1] -> 0, [3] -> 2, [3] -> 1, [1] -> 2, [2] -> 4, [3] -> 4, [0] -> 4, [1] -> 4] +# [[0] -> 1, [0] -> 2, [0] -> 3, [0] -> 4, [1] -> 0, [1] -> 2, [1] -> 3, [1] -> 4, [2] -> 0, [2] -> 1, [2] -> 3, [2] -> 4, [3] -> 0, [3] -> 1, [3] -> 2, [3] -> 4] pyro.execute(error=0.2) print(f'[{", ".join(map(str, pyro.get_fds()))}]') -# [[2] -> 0, [0] -> 2, [3] -> 2, [1] -> 2, [2] -> 4, [3] -> 4, [0] -> 4, [1] -> 4, [3] -> 0, [1] -> 0, [2] -> 3, [2] -> 1, [0] -> 3, [0] -> 1, [1] -> 3, [3] -> 1] +# [[0] -> 1, [0] -> 2, [0] -> 3, [0] -> 4, [1] -> 0, [1] -> 2, [1] -> 3, [1] -> 4, [2] -> 0, [2] -> 1, [2] -> 3, [2] -> 4, [3] -> 0, [3] -> 1, [3] -> 2, [3] -> 4] pyro.execute(error=0.3) print(f'[{", ".join(map(str, pyro.get_fds()))}]') -# [[2] -> 1, [0] -> 2, [2] -> 0, [2] -> 3, [0] -> 1, [3] -> 2, [3] -> 1, [1] -> 2, [3] -> 0, [0] -> 3, [4] -> 1, [1] -> 0, [1] -> 3, [4] -> 2, [4] -> 3, [2] -> 4, [3] -> 4, [0] -> 4, [1] -> 4] +# [[0] -> 1, [0] -> 2, [0] -> 3, [0] -> 4, [1] -> 0, [1] -> 2, [1] -> 3, [1] -> 4, [2] -> 0, [2] -> 1, [2] -> 3, [2] -> 4, [3] -> 0, [3] -> 1, [3] -> 2, [3] -> 4, [4] -> 1, [4] -> 2, [4] -> 3] diff --git a/examples/comparison_pfd_vs_afd.py b/examples/comparison_pfd_vs_afd.py index 07cfb7657..fa0ba820d 100644 --- a/examples/comparison_pfd_vs_afd.py +++ b/examples/comparison_pfd_vs_afd.py @@ -1,4 +1,5 @@ import desbordante +from ordered_set import OrderedSet 
TABLE = 'examples/datasets/glitchy_sensor.csv' ERROR = 0.18 @@ -6,7 +7,7 @@ def stringify(fds): - return set(map(str, fds)) + return OrderedSet(map(str, fds)) def get_afds(): @@ -23,8 +24,8 @@ def get_pfds(): return algo.get_fds() -pfds = set(get_pfds()) -afds = set(get_afds()) +pfds = OrderedSet(get_pfds()) +afds = OrderedSet(get_afds()) print("pFDs \ AFDs =", stringify(pfds - afds)) print("AFDs \ pFDs =", stringify(afds - pfds)) diff --git a/examples/dedupe.py b/examples/dedupe.py index fab947f34..129e0ff7b 100644 --- a/examples/dedupe.py +++ b/examples/dedupe.py @@ -93,6 +93,7 @@ def merge_handler(df: pandas.DataFrame, new_rows, remaining_rows, used_rows): for col_name, values in zip(df.columns, zip(*df.iloc[list(used_rows)].itertuples(index=False))): distinct_values = list(set(values)) + distinct_values.sort() index = 0 if len(distinct_values) == 1 else choose_index(col_name, distinct_values) new_row.append(distinct_values[index]) remaining_rows -= used_rows diff --git a/examples/mining_set_od_2.py b/examples/mining_set_od_2.py index 83fb76edd..78b422b7f 100644 --- a/examples/mining_set_od_2.py +++ b/examples/mining_set_od_2.py @@ -94,7 +94,7 @@ def print_simple_ods_with_comments(simple_ods, table): print('be traced.') percent_values = list(table['percent']) - percent_classes = set([f'class [{i}] with {percent_values.count(i)} element{"" if percent_values.count(i) == 1 else "s"}' + percent_classes = list([f'class [{i}] with {percent_values.count(i)} element{"" if percent_values.count(i) == 1 else "s"}' for i in percent_values]) print() diff --git a/examples/testing/inputs/dedupe_input.txt b/examples/testing/inputs/dedupe_input.txt new file mode 100644 index 000000000..9c59a9d78 --- /dev/null +++ b/examples/testing/inputs/dedupe_input.txt @@ -0,0 +1,13 @@ +0 +1 2 3 4 5 6 +4 +merge 7 8 9 +0 +0 +1 +keepall +keepall +merge 52 53 +0 +1 +keepall diff --git a/examples/testing/outputs/afd_multiple_error_thresholds_output.txt 
b/examples/testing/outputs/afd_multiple_error_thresholds_output.txt new file mode 100644 index 000000000..5332e9613 --- /dev/null +++ b/examples/testing/outputs/afd_multiple_error_thresholds_output.txt @@ -0,0 +1,4 @@ +[[0 1 2] -> 4, [0 1 3] -> 4, [0 2 3] -> 4, [1 2 3] -> 4] +[[0] -> 1, [0] -> 2, [0] -> 3, [0] -> 4, [1] -> 0, [1] -> 2, [1] -> 3, [1] -> 4, [2] -> 0, [2] -> 1, [2] -> 3, [2] -> 4, [3] -> 0, [3] -> 1, [3] -> 2, [3] -> 4] +[[0] -> 1, [0] -> 2, [0] -> 3, [0] -> 4, [1] -> 0, [1] -> 2, [1] -> 3, [1] -> 4, [2] -> 0, [2] -> 1, [2] -> 3, [2] -> 4, [3] -> 0, [3] -> 1, [3] -> 2, [3] -> 4] +[[0] -> 1, [0] -> 2, [0] -> 3, [0] -> 4, [1] -> 0, [1] -> 2, [1] -> 3, [1] -> 4, [2] -> 0, [2] -> 1, [2] -> 3, [2] -> 4, [3] -> 0, [3] -> 1, [3] -> 2, [3] -> 4, [4] -> 1, [4] -> 2, [4] -> 3] diff --git a/examples/testing/outputs/algebraic_constraints_output.txt b/examples/testing/outputs/algebraic_constraints_output.txt new file mode 100644 index 000000000..845674ac0 --- /dev/null +++ b/examples/testing/outputs/algebraic_constraints_output.txt @@ -0,0 +1,19 @@ +Discovered ranges for (Delivery date - Dispatch date) are: +[(2.0, 7.0), (15.0, 22.0)] + +Rows in which the result of the chosen operation (-) is outside of discovered ranges: +id: 7 +Dispatch date: 1 +Delivery date: 30 +Difference: 29 + +id: 26 +Dispatch date: 7 +Delivery date: 18 +Difference: 11 + +id: 30 +Dispatch date: 11 +Delivery date: 22 +Difference: 11 + diff --git a/examples/testing/outputs/anomaly_detection_output.txt b/examples/testing/outputs/anomaly_detection_output.txt new file mode 100644 index 000000000..5a1e92be0 --- /dev/null +++ b/examples/testing/outputs/anomaly_detection_output.txt @@ -0,0 +1,44 @@ +FDs found for dataset 1: +[item_id] -> item_weight +[item_weight] -> item_id +[record_id] -> cargo_id +[record_id] -> item_id +[record_id] -> item_weight +[record_id] -> timestamp +[timestamp] -> cargo_id +[timestamp] -> item_id +[timestamp] -> item_weight +[timestamp] -> record_id +FDs found for 
dataset 2: +[item_id] -> item_weight +[item_weight] -> item_id +[record_id] -> cargo_id +[record_id] -> item_id +[record_id] -> item_weight +[record_id] -> timestamp +[timestamp] -> cargo_id +[timestamp] -> item_id +[timestamp] -> item_weight +[timestamp] -> record_id +FDs found for dataset 3: +[item_weight] -> item_id +[record_id] -> cargo_id +[record_id] -> item_id +[record_id] -> item_weight +[record_id] -> timestamp +[timestamp] -> cargo_id +[timestamp] -> item_id +[timestamp] -> item_weight +[timestamp] -> record_id +AFDs found for dataset 3: +[item_id cargo_id] -> item_weight +[item_weight] -> item_id +[record_id] -> cargo_id +[record_id] -> item_id +[record_id] -> item_weight +[record_id] -> timestamp +[timestamp] -> cargo_id +[timestamp] -> item_id +[timestamp] -> item_weight +[timestamp] -> record_id +MFD holds. diff --git a/examples/testing/outputs/comparison_pfd_vs_afd_output.txt b/examples/testing/outputs/comparison_pfd_vs_afd_output.txt new file mode 100644 index 000000000..9447c66c6 --- /dev/null +++ b/examples/testing/outputs/comparison_pfd_vs_afd_output.txt @@ -0,0 +1,7 @@ +pFDs \ AFDs = OrderedSet(['[DeviceId] -> Data']) +AFDs \ pFDs = OrderedSet() +AFDs ∩ pFDs = OrderedSet(['[Id] -> DeviceId', '[Id] -> Data', '[Data] -> Id', '[Data] -> DeviceId']) +1 - PerValue([DeviceId] -> Data) = 0.1714285714 +e([DeviceId] -> Data) = 0.23076923076923078 +In case of PerValue error measure, violations on data from the single "glitchy" +sensor device among many do not prevent dependecy from being found diff --git a/examples/testing/outputs/data_stats_output.txt b/examples/testing/outputs/data_stats_output.txt new file mode 100644 index 000000000..956b33b78 --- /dev/null +++ b/examples/testing/outputs/data_stats_output.txt @@ -0,0 +1,152 @@ +Columns with null: [] +Columns with all unique values: [0, 1] +Number of columns: 6 + +Column num: 0 +Min: 0008f14d-e2a7-4582-bf5e-89ce32b55606 +Max: fff1cd7a-04f9-486c-97de-d5d2c6ddb3cb +Distinct: 945 + +Column num: 1 +Min: 
Anthony Campbell +Max: William Taylor +Distinct: 945 + +Column num: 2 +Min: Addyson Aaliyah +Max: Shena Desiree +Distinct: 6 + +Column num: 3 +Min: MonsterWorq +Max: Yogatacular +Distinct: 5 + +Column num: 4 +Avg: 932.258201058201 +Sum of squares: 894298474 +Median: 945.0 +Min: 465 +Max: 2036 +Distinct: 28 +Corrected std: 278.07204551856535 + +Column num: 5 +Min: Client Solution Analyst +Max: Workshop Technician +Distinct: 15 + +Column num = 0 +num_chars = 34020 +num_uppercase_chars = 0 +type = String +isCategorical = 0 +num_lowercase_chars = 11108 +count = 945 +quantile50 = 81aabb56-808c-48a1-b2a3-5d3f2e1a752f +num_digit_chars = 19132 +distinct = 945 +avg_chars = 36.000000 +min = 0008f14d-e2a7-4582-bf5e-89ce32b55606 +quantile25 = 4307ef5b-2e00-4316-b04c-debff4edc5c4 +max = fff1cd7a-04f9-486c-97de-d5d2c6ddb3cb +quantile75 = c8539dda-ec0e-4c67-a2f4-2d201bb82171 +num_non_letter_chars = 22912 +vocab = -0123456789abcdef + +Column num = 1 +num_chars = 12261 +num_uppercase_chars = 1890 +type = String +isCategorical = 0 +num_lowercase_chars = 9426 +count = 945 +quantile50 = Kenneth King +num_digit_chars = 0 +distinct = 945 +avg_chars = 12.974603 +min = Anthony Campbell +quantile25 = Donna White +max = William Taylor +quantile75 = Patricia Gonzalez +num_non_letter_chars = 945 +vocab = ABCDEGHJKLMNPRSTWYabcdefghiklmnoprstuvwyz + +Column num = 2 +num_chars = 11843 +num_uppercase_chars = 1890 +type = String +isCategorical = 1 +num_lowercase_chars = 9008 +count = 945 +quantile50 = Galen Calla +num_digit_chars = 0 +distinct = 6 +avg_chars = 12.532275 +min = Addyson Aaliyah +quantile25 = Carrie Silvia +max = Shena Desiree +quantile75 = Paul Jeffry +num_non_letter_chars = 945 +vocab = ACDGJPSadefhilnorsuvy + +Column num = 3 +num_chars = 10452 +num_uppercase_chars = 1300 +type = String +isCategorical = 1 +num_lowercase_chars = 9152 +count = 945 +quantile50 = Talkspiration +num_digit_chars = 0 +distinct = 5 +avg_chars = 11.060317 +min = MonsterWorq +quantile25 = SpeakerAce +max = 
Yogatacular +quantile75 = Verbalthon +num_non_letter_chars = 0 +vocab = AMSTVWYabceghiklnopqrstu + +Column num = 4 +median_ad = 100.000000 +geometric_mean = 893.289725 +median = 945.000000 +sum_of_squares = 894298474 +num_negatives = 0 +quantile75 = 1020 +type = Int +mean_ad = 186.978103 +isCategorical = 0 +kurtosis = 2.859101 +count = 945 +quantile50 = 945 +num_zeros = 0 +avg = 932.258201 +distinct = 28 +STD = 278.072046 +skewness = 1.132442 +min = 465 +quantile25 = 800 +max = 2036 +sum = 880984 + +Column num = 5 +num_chars = 17603 +num_uppercase_chars = 2226 +type = String +isCategorical = 0 +num_lowercase_chars = 14152 +count = 945 +quantile50 = Physiotherapist +num_digit_chars = 0 +distinct = 15 +avg_chars = 18.627513 +min = Client Solution Analyst +quantile25 = JavaScript Developer +max = Workshop Technician +quantile75 = Service Technician +num_non_letter_chars = 1225 +vocab = -ACDEFJLMOPRSTWacdeghijklmnoprstuvy + + diff --git a/examples/testing/outputs/dedupe_output.txt b/examples/testing/outputs/dedupe_output.txt new file mode 100644 index 000000000..9c6eef4dc --- /dev/null +++ b/examples/testing/outputs/dedupe_output.txt @@ -0,0 +1,92 @@ +Deduplication parameters: +ALGORITHM='Pyro' +ERROR=0.00100 +DATASET_PATH='examples/datasets/duplicates.csv' +SEPARATOR=',' +INITIAL_WINDOW_SIZE=4 + +Dataset sample: + id name address city email phone country +0 5996 Kaede Sue 66 Pirus Kaede.Sue4422@virtex.rum 39 EU +1 36 Licia Wolf 35 Pilington Licia.Wolf1260@cmail.com 35 CM +2 17 Steve Doe 16 Syndye Steve.Doe272@muli.ry 16 GZ +3 62 Lisa Tarski 61 Syndye Lisa.Tarski3782@virtex.rum 61 JU +4 6 Mary Tarski 5 Lumdum Mary.Tarski30@ferser.edu 5 PR +.. ... ... ... ... ... ... ... 
+73 15 Ivan Dawn 14 Syndye Ivan.Dawn210@atomlema.ocg 14 FC +74 5993 Lisa Honjo 63 Roit Lisa.Honjo4032@virtex.rum 63 AI +75 59 Lisa Sue 58 Muxicu Lisa.Sue3422@cmail.com 58 AI +76 21 Steve Shiramine 20 Pilington Steve.Shiramine420@ferser.edu 20 GZ +77 44 Maxine Wolf 43 Muxicu Maxine.Wolf1892@atomlema.ocg 43 PR + +[78 rows x 7 columns] +Original records: 78 + +AFD info: +0: id -> ( name address city email phone country ) +2: address -> ( name ) +4: email -> ( name address phone country ) +5: phone -> ( name ) +LHS column index: RHS columns: +1: name +2: address +3: city +4: email +5: phone +6: country +RHS columns to use (indices): Equal columns to consider duplicates: id name address city email phone country +5 27 Björn Sue 26 Roit Björn.Sue702@cmail.com 26 CM +6 30 Björn Tarski 29 Lumdum Björn.Tarski870@ferser.edu 29 PR +7 5957 Björn Wolf 27 Björn.Wolf756@virtex.rum 27 AI +8 28 Björn Wolf 27 Björn.Wolf756@virtex.rum 27 AI +9 11886 Björn Wolf 28 Kustruma Björn.Wolf756@virtex.rum 27 AI +Command: Column: id. Which value to use? +0: 11886 +1: 28 +2: 5957 +index: Column: address. Which value to use? +0: 27 +1: 28 +index: Column: city. Which value to use? +0: +1: Kustruma +index: id name address city email phone country +5 27 Björn Sue 26 Roit Björn.Sue702@cmail.com 26 CM +6 30 Björn Tarski 29 Lumdum Björn.Tarski870@ferser.edu 29 PR +Command: id name address city email phone country +42 63 Lisa Dawn 62 Pilington Lisa.Dawn3906@atomlema.ocg 62 EU +43 57 Lisa Doe 56 Roit Lisa.Doe3192@virtex.rum 56 AI +44 64 Lisa Honjo 63 Pirus Lisa.Honjo4032@virtex.rum 63 AI +45 5993 Lisa Honjo 63 Roit Lisa.Honjo4032@virtex.rum 63 AI +Command: id name address city email phone country +50 60 Lisa Wolf 59 Syndye Lisa.Wolf3540@cmail.com 59 FC +51 7 Mary Dawn 6 Syndye Mary.Dawn42@atomlema.ocg 6 PR +52 5930 Mary Doe Lumdum Mary.Doe-5926@ferser.edu 0 +53 11859 Mary Doe Lumdum Mary.Doe-5926@ferser.edu 0 EU +54 1 Mary Doe Lumdum Mary.Doe0@muli.ry 4 EU +Command: Column: id. Which value to use? 
+0: 11859 +1: 5930 +index: Column: country. Which value to use? +0: +1: EU +index: id name address city email phone country +50 60 Lisa Wolf 59 Syndye Lisa.Wolf3540@cmail.com 59 FC +51 7 Mary Dawn 6 Syndye Mary.Dawn42@atomlema.ocg 6 PR +54 1 Mary Doe Lumdum Mary.Doe0@muli.ry 4 EU +Command: +Resulting records: 75. Duplicates found: 3 + id name address city email phone country +0 31 Björn Dawn 30 Muxicu Björn.Dawn930@atomlema.ocg 30 JU +1 25 Björn Doe 24 Pilington Björn.Doe600@muli.ry 24 FC +2 32 Björn Honjo 31 Kustruma Björn.Honjo992@virtex.rum 31 RI +3 29 Björn Shiramine 28 Syndye Björn.Shiramine812@virtex.rum 28 EU +4 26 Björn Smith 25 Pilington Björn.Smith650@virtex.rum 25 RI +.. .. ... ... ... ... ... ... +70 21 Steve Shiramine 20 Pilington Steve.Shiramine420@ferser.edu 20 GZ +71 20 Steve Wolf 19 Pilington Steve.Wolf380@muli.ry 19 RI +72 22 Steve Tarski 21 Pilington Steve.Tarski462@atomlema.ocg 21 PR +73 19 Steve Sue 18 Syndye Steve.Sue342@virtex.rum 18 AI +74 18 Steve Smith 17 Lumdum Steve.Smith306@cmail.com 17 EU + +[75 rows x 7 columns] diff --git a/examples/testing/outputs/mine_typos_output.txt b/examples/testing/outputs/mine_typos_output.txt new file mode 100644 index 000000000..ef25e0c11 --- /dev/null +++ b/examples/testing/outputs/mine_typos_output.txt @@ -0,0 +1,44 @@ +Starting typo discovery scenario with parameters: +RADIUS=3 +RATIO=0.1 +ERROR=0.005 +DATASET_PATH='examples/datasets/Workshop.csv' +EXACT_ALGORITHM='HyFD' +APPROXIMATE_ALGORITHM='Pyro' +HEADER=0 +SEPARATOR=',' + +Dataset sample: + id worker_name supervisor_surname workshop salary job_post +0 404f50cb-caf0-4974-97f9-9463434537e1 Jennifer Moore Galen Calla Yogatacular 980 Client Solution Analyst +1 b5e38281-9c09-49bf-91f5-c55397df4d43 Edward Lee Carrie Silvia MonsterWorq 905 Front-End Loader Operator +2 972b299d-2f27-4d6d-81d2-8effbc543bf1 Brian Lee Shena Desiree Talkspiration 700 Farm Assistant +3 3241fb48-5a15-4638-bd68-d915834a3f89 Kenneth Turner Paul Jeffry Verbalthon 980 Client Solution 
Analyst +4 9cbb9026-f157-4a01-aace-a42b05ab2a28 Betty Campbell Addyson Aaliyah SpeakerAce 800 Physiotherapist +.. ... ... ... ... ... ... +940 9cd700bc-b3d9-439d-afe9-945c2a20bc37 Richard Lopez Galen Calla Yogatacular 845 Senior Financial Planner +941 cc199ff4-453a-4ae5-9fbd-b45d72fa952a Helen Rodriguez Carrie Silvia MonsterWorq 465 Electrician +942 de650347-880a-42a2-88c9-4329f26fb912 Karen White Carrie Silvia MonsterWorq 510 JavaScript Developer +943 ae604e24-e040-4d50-b685-5b4897ab9ae9 Charles Smith Shena Desiree Talkspiration 975 Store Manager +944 d5cb954a-e942-47ae-9b62-b57f7a84c2db Jeff King Carrie Silvia MonsterWorq 465 Electrician + +[945 rows x 6 columns] + +Searching for almost holding FDs... + +Found! Almost holding FDs: +[supervisor_surname salary] -> job_post +[supervisor_surname job_post] -> salary +[workshop] -> supervisor_surname +[workshop salary] -> job_post +[workshop job_post] -> salary + +Selecting FD with index 2: + rows count workshop supervisor_surname + 198 Yogatacular Galen Calla + 1 Yogatacular Galen Calella + +Typo candidates and context: + id worker_name supervisor_surname workshop salary job_post +0 404f50cb-caf0-4974-97f9-9463434537e1 Jennifer Moore Galen Calla Yogatacular 980 Client Solution Analyst +7 ddba9118-ec89-472d-9f3f-bebd919f0e3a William Robinson Galen Calella Yogatacular 975 Store Manager diff --git a/examples/testing/outputs/mining_afd_output.txt b/examples/testing/outputs/mining_afd_output.txt new file mode 100644 index 000000000..191b21eaa --- /dev/null +++ b/examples/testing/outputs/mining_afd_output.txt @@ -0,0 +1,4 @@ +AFDs: +[Id] -> ProductName +[Id] -> Price +[ProductName] -> Price diff --git a/examples/testing/outputs/mining_cfd_output.txt b/examples/testing/outputs/mining_cfd_output.txt new file mode 100644 index 000000000..042a99843 --- /dev/null +++ b/examples/testing/outputs/mining_cfd_output.txt @@ -0,0 +1,124 @@ +options: +MINIMUM SUPPORT = 8 , MINIMUM CONFIDENCE = 0.7 , MAXIMUM LHS COUNT = 3 +displaying the 
first five (or fewer) discovered CFDs: + +CFD: +{(3, False)} -> (4, True) : + + Outlook Temperature Humidity Windy Play + 0 sunny hot high False False + 1 sunny hot high True False + 2 overcast hot high False True + 3 rain mild high False True + 4 rain cool normal False True + 5 rain cool normal True False + 6 overcast cool normal True True + 7 sunny mild high False False + 8 sunny cool normal False True + 9 rain mild normal False True + 10 sunny mild normal True True + 11 overcast mild high True True + 12 overcast hot normal False True + 13 rain mild high True False +lhs count: 1 +support: 8 +confidence: 6 / 8 = 0.7500 + + + +CFD: +{(2, _)} -> (4, _) : + + Outlook Temperature Humidity Windy Play + 0 sunny hot high False False + 1 sunny hot high True False + 2 overcast hot high False True + 3 rain mild high False True + 4 rain cool normal False True + 5 rain cool normal True False + 6 overcast cool normal True True + 7 sunny mild high False False + 8 sunny cool normal False True + 9 rain mild normal False True + 10 sunny mild normal True True + 11 overcast mild high True True + 12 overcast hot normal False True + 13 rain mild high True False +lhs count: 1 +support: 14 +confidence: 10 / 14 = 0.7143 + + + +CFD: +{(4, _)} -> (2, _) : + + Outlook Temperature Humidity Windy Play + 0 sunny hot high False False + 1 sunny hot high True False + 2 overcast hot high False True + 3 rain mild high False True + 4 rain cool normal False True + 5 rain cool normal True False + 6 overcast cool normal True True + 7 sunny mild high False False + 8 sunny cool normal False True + 9 rain mild normal False True + 10 sunny mild normal True True + 11 overcast mild high True True + 12 overcast hot normal False True + 13 rain mild high True False +lhs count: 1 +support: 14 +confidence: 10 / 14 = 0.7143 + + + +CFD: +{(3, _),(2, _)} -> (4, _) : + + Outlook Temperature Humidity Windy Play + 0 sunny hot high False False + 1 sunny hot high True False + 2 overcast hot high False True + 3 rain mild 
high False True + 4 rain cool normal False True + 5 rain cool normal True False + 6 overcast cool normal True True + 7 sunny mild high False False + 8 sunny cool normal False True + 9 rain mild normal False True + 10 sunny mild normal True True + 11 overcast mild high True True + 12 overcast hot normal False True + 13 rain mild high True False +lhs count: 2 +support: 14 +confidence: 10 / 14 = 0.7143 + + + +CFD: +{(2, _),(3, False)} -> (4, _) : + + Outlook Temperature Humidity Windy Play + 0 sunny hot high False False + 1 sunny hot high True False + 2 overcast hot high False True + 3 rain mild high False True + 4 rain cool normal False True + 5 rain cool normal True False + 6 overcast cool normal True True + 7 sunny mild high False False + 8 sunny cool normal False True + 9 rain mild normal False True + 10 sunny mild normal True True + 11 overcast mild high True True + 12 overcast hot normal False True + 13 rain mild high True False +lhs count: 2 +support: 8 +confidence: 6 / 8 = 0.7500 + + + diff --git a/examples/testing/outputs/mining_fd_output.txt b/examples/testing/outputs/mining_fd_output.txt new file mode 100644 index 000000000..a0abfc82f --- /dev/null +++ b/examples/testing/outputs/mining_fd_output.txt @@ -0,0 +1,8 @@ +FDs: +[Professor] -> Course +[Course Classroom] -> Professor +[Course Semester] -> Classroom +[Course Semester] -> Professor +[Classroom Semester] -> Course +[Classroom Semester] -> Professor +[Professor Semester] -> Classroom diff --git a/examples/testing/outputs/mining_ind_output.txt b/examples/testing/outputs/mining_ind_output.txt new file mode 100644 index 000000000..86f2687d1 --- /dev/null +++ b/examples/testing/outputs/mining_ind_output.txt @@ -0,0 +1,34 @@ +Found inclusion dependencies (-> means "is included in"): + +(course.csv, [Department name]) -> (department.csv, [Department name]) +(instructor.csv, [Department name]) -> (department.csv, [Department name]) +(student.csv, [Department name]) -> (department.csv, [Department name]) 
+(teaches.csv, [Instructor ID]) -> (instructor.csv, [ID]) +(teaches.csv, [Course ID]) -> (course.csv, [Course ID]) + +Tables for first IND: +course.csv: + +Course ID Title Department name +--------------------------------------------------------------------- +IT-1 Computer Science Institute of Information Technology +MM-3 Algebra Mathematics and Mechanics Faculty +H-1 History Institute of History +FL-2 English Faculty of Foreign Languages +IT-2 Programming Institute of Information Technology +S-5 Philosophy Faculty of Sociology +P-2 Physics Faculty of Physics +C-8 Chemistry Institute of Chemistry + +department.csv: + +Department name Building +----------------------------------------------------------- +Institute of Information Technology 5 Academic av. +Mathematics and Mechanics Faculty 3 Academic av. +Institute of History 29A University st. +Faculty of Foreign Languages 10 Science sq. +Faculty of Sociology 29C University st. +Faculty of Physics 10 Academic av. +Institute of Chemistry 11 Academic av. +Graduate School of Managemment 49 Science sq. diff --git a/examples/testing/outputs/mining_list_od_output.txt b/examples/testing/outputs/mining_list_od_output.txt new file mode 100644 index 000000000..7177b7767 --- /dev/null +++ b/examples/testing/outputs/mining_list_od_output.txt @@ -0,0 +1,49 @@ + ++----+----------+-----------------+--------+ +| | weight | shipping cost | days | +|----+----------+-----------------+--------| +| 0 | 5 | 14 | 2 | +| 1 | 10 | 22 | 6 | +| 2 | 3 | 10 | 4 | +| 3 | 10 | 25 | 7 | +| 4 | 5 | 14 | 2 | +| 5 | 20 | 40 | 8 | ++----+----------+-----------------+--------+ + +Resulting dependencies for this table are: +['weight', 'days'] -> ['shipping cost'] +['shipping cost'] -> ['weight', 'days'] +['weight'] -> ['shipping cost'] + +Depenency [weight] -> [shipping cost] means that ordering table by weight +will also order table by shipping cost automatically. 
Let's order by weight: + ++----+----------+-----------------+--------+ +| | weight | shipping cost | days | +|----+----------+-----------------+--------| +| 2 | 3 | 10 | 4 | +| 0 | 5 | 14 | 2 | +| 4 | 5 | 14 | 2 | +| 1 | 10 | 22 | 6 | +| 3 | 10 | 25 | 7 | +| 5 | 20 | 40 | 8 | ++----+----------+-----------------+--------+ + +We can see that shipping cost is sorted too. And dependency seems reasonable: +the more the package weights, the more expensive it will be to send it. + +Order dependencies are called lexicographical, because ordering for multiple +columns is lexicographical. For example [shipping cost] -> [weight, days] implies +that ordering by shipping cost will also lexicographically order [weight, days]: + ++----+----------+-----------------+--------+ +| | weight | shipping cost | days | +|----+----------+-----------------+--------| +| 2 | 3 | 10 | 4 | +| 0 | 5 | 14 | 2 | +| 4 | 5 | 14 | 2 | +| 1 | 10 | 22 | 6 | +| 3 | 10 | 25 | 7 | +| 5 | 20 | 40 | 8 | ++----+----------+-----------------+--------+ + diff --git a/examples/testing/outputs/mining_pfd_output.txt b/examples/testing/outputs/mining_pfd_output.txt new file mode 100644 index 000000000..1e91b04be --- /dev/null +++ b/examples/testing/outputs/mining_pfd_output.txt @@ -0,0 +1,3 @@ +per_value pFDs: +[Y] -> X +per_tuple pFDs: diff --git a/examples/testing/outputs/mining_set_od_1_output.txt b/examples/testing/outputs/mining_set_od_1_output.txt new file mode 100644 index 000000000..983052ce7 --- /dev/null +++ b/examples/testing/outputs/mining_set_od_1_output.txt @@ -0,0 +1,108 @@ ++----+--------+------------------+--------------+ +| | year | employee_grade | avg_salary | +|----+--------+------------------+--------------| +| 0 | 2020 | 24 | 1000 | +| 1 | 2020 | 40 | 7000 | +| 2 | 2020 | 32 | 5000 | +| 3 | 2020 | 29 | 3000 | +| 4 | 2020 | 49 | 10000 | +| 5 | 2021 | 50 | 15000 | +| 6 | 2021 | 25 | 1500 | +| 7 | 2021 | 30 | 6000 | ++----+--------+------------------+--------------+ + +Attribute symbols: +year 
-- 1 +employee_grade -- 2 +avg_salary -- 3 + +descending ods: 0 + +ascending ods: 2 +{1} : 2<= ~ 3<= +{1} : 3<= ~ 2<= + +Dependency "{1} : 2<= ~ 3<=" means that ordering the table +inside each equivalence class from "year" by attribute "avg_salary" +automatically entails ordering by attribute "employee_grade". + +We have 2 equivalence classes in "year": [2020] and [2021]. +Let's split the table into two tables based on these classes. + +Part 1: this part of table corresponds to class [2020] ++----+--------+------------------+--------------+ +| | year | employee_grade | avg_salary | +|----+--------+------------------+--------------| +| 0 | 2020 | 24 | 1000 | +| 1 | 2020 | 40 | 7000 | +| 2 | 2020 | 32 | 5000 | +| 3 | 2020 | 29 | 3000 | +| 4 | 2020 | 49 | 10000 | ++----+--------+------------------+--------------+ + +Let's sort it by attribute "avg_salary". + +Sorted part 1: ++----+--------+------------------+--------------+ +| | year | employee_grade | avg_salary | +|----+--------+------------------+--------------| +| 0 | 2020 | 24 | 1000 | +| 3 | 2020 | 29 | 3000 | +| 2 | 2020 | 32 | 5000 | +| 1 | 2020 | 40 | 7000 | +| 4 | 2020 | 49 | 10000 | ++----+--------+------------------+--------------+ + +We can see that this sort entails automatic ordering by +attribute "employee_grade". + +Part 2: this part of table corresponds to class [2021] ++----+--------+------------------+--------------+ +| | year | employee_grade | avg_salary | +|----+--------+------------------+--------------| +| 5 | 2021 | 50 | 15000 | +| 6 | 2021 | 25 | 1500 | +| 7 | 2021 | 30 | 6000 | ++----+--------+------------------+--------------+ + +Let's sort it by attribute "avg_salary". 
+ +Sorted part 2: ++----+--------+------------------+--------------+ +| | year | employee_grade | avg_salary | +|----+--------+------------------+--------------| +| 6 | 2021 | 25 | 1500 | +| 7 | 2021 | 30 | 6000 | +| 5 | 2021 | 50 | 15000 | ++----+--------+------------------+--------------+ + +We can see that this sort entails automatic ordering by +attribute "employee_grade" too. + +Dependency "{1} : 3<= ~ 2<=" is similar to the first and means that +ordering the table inside each equivalence class from "year" by +attribute "employee_grade" automatically entails ordering by +attribute "avg_salary". This can be seen in the tables above. + +In other words, these dependencies indicate that the ordering of +average salary entails an automatic ordering of the employee grade +and vice versa. + +simple ods: 4 +{2} : [] -> 1<= +{3} : [] -> 1<= +{3} : [] -> 2<= +{2} : [] -> 3<= + +These dependencies mean that inside each equivalence class from +an attribute from their context the constancy of the attribute +from the right side of the dependency can be traced. + +For example, let's look at "{2} : [] -> 1<=". The context of this +dependency is attribute "employee_grade". We have 8 equivalence classes +in "employee_grade": [24], [40], [32], [29], [49], [50], [25], [30]. +Since all the elements of attribute "employee_grade" are different, +each of these classes contains only one element, so constancy within +each class occurs automatically. + +To better understand such dependencies, refer to the second example. 
diff --git a/examples/testing/outputs/mining_set_od_2_output.txt b/examples/testing/outputs/mining_set_od_2_output.txt new file mode 100644 index 000000000..ac9f174a9 --- /dev/null +++ b/examples/testing/outputs/mining_set_od_2_output.txt @@ -0,0 +1,134 @@ ++----+--------+------------+-----------+ +| | year | position | percent | +|----+--------+------------+-----------| +| 0 | 2020 | director | 10% | +| 1 | 2020 | other | 50% | +| 2 | 2020 | manager | 40% | +| 3 | 2021 | manager | 35% | +| 4 | 2021 | other | 55% | +| 5 | 2021 | director | 10% | ++----+--------+------------+-----------+ + +Attribute symbols: +year -- 1 +position -- 2 +percent -- 3 + +descending ods: 0 + +ascending ods: 2 +{} : 3<= ~ 2<= +{} : 2<= ~ 3<= + +Dependency "{} : 3<= ~ 2<=" means that ordering the table by attribute +"percent" automatically entails ordering by attribute "position". +Moreover, this is observed regardless of other attributes, since the +dependency context is empty. + +Let's sort it by attribute "percent". + +Sorted table: ++----+--------+------------+-----------+ +| | year | position | percent | +|----+--------+------------+-----------| +| 0 | 2020 | director | 10% | +| 5 | 2021 | director | 10% | +| 3 | 2021 | manager | 35% | +| 2 | 2020 | manager | 40% | +| 1 | 2020 | other | 50% | +| 4 | 2021 | other | 55% | ++----+--------+------------+-----------+ + +We can see that this sort entails automatic ordering by attribute +"position". + +Dependency "{} : 2<= ~ 3<=" is similar to the first and means that +ordering the table by attribute "position" automatically entails +ordering by attribute "percent". This can be seen in the table above. + +In other words, these dependencies indicate that the ordering of +percents entails an automatic ordering of the positions and vice +versa. + +simple ods: 2 +{3} : [] -> 2<= +{1,2} : [] -> 3<= + +Dependency "{3} : [] -> 2<=" means that inside each equivalence +class from "percent" the constancy of the attribute "position" can +be traced. 
+ +We have 5 equivalence classes in "percent": +class [10%] with 2 elements +class [50%] with 1 element +class [40%] with 1 element +class [35%] with 1 element +class [55%] with 1 element +class [10%] with 2 elements + +This table shows the constancy of values from attribute "position" +within each equivalence class from "percent". For clarity, lines +containing different equivalence classes are colored differently. + ++--------+------------+-----------+ +| year | position | percent | +|--------+------------+-----------| +| 2020 | director | 10% | +| 2020 | other | 50% | +| 2020 | manager | 40% | +| 2021 | manager | 35% | +| 2021 | other | 55% | +| 2021 | director | 10% | ++--------+------------+-----------+ + +Dependency "{1,2} : [] -> 3<=" contains 2 attributes ("year" and +"position") in its context and means the following: in the context +of one year and one position the constancy of percents is observed. +That is, in those tuples in which the year and position are the same, +the same percent value is observed. + +The following table shows these observations. + ++--------+------------+-----------+ +| year | position | percent | +|--------+------------+-----------| +| 2020 | director | 10% | +| 2020 | other | 50% | +| 2020 | manager | 40% | +| 2021 | manager | 35% | +| 2021 | other | 55% | +| 2021 | director | 10% | ++--------+------------+-----------+ + +Consider the following two tables. In the first, dependency +"{1,2} : [] -> 3<=" continues to exist. But in the second one no +longer exists, since it is violated in third tuple, where the pair +(2020, director) corresponds to 20%. 
+ +Dependency "{1,2} : [] -> 3<=" continues to exist: ++--------+------------+-----------+ +| year | position | percent | +|--------+------------+-----------| +| 2020 | director | 10% | +| 2020 | director | 10% | +| 2020 | director | 10% | +| 2020 | other | 50% | +| 2020 | manager | 40% | +| 2021 | manager | 35% | +| 2021 | other | 55% | +| 2021 | director | 10% | ++--------+------------+-----------+ + +Dependency "{1,2} : [] -> 3<=" no longer exists: ++--------+------------+-----------+ +| year | position | percent | +|--------+------------+-----------| +| 2020 | director | 10% | +| 2020 | director | 10% | +| 2020 | director | 20% | +| 2020 | other | 50% | +| 2020 | manager | 40% | +| 2021 | manager | 35% | +| 2021 | other | 55% | +| 2021 | director | 10% | ++--------+------------+-----------+ diff --git a/examples/testing/outputs/verifying_aucc_output.txt b/examples/testing/outputs/verifying_aucc_output.txt new file mode 100644 index 000000000..2ba32f13d --- /dev/null +++ b/examples/testing/outputs/verifying_aucc_output.txt @@ -0,0 +1,64 @@ +Dataset AUCC_example.csv: + ID name card_num card_active +0 1 Alex 665 True +1 2 Liam 667 True +2 3 Ezra 553 True +3 4 Alex 665 False +4 5 Kian 667 False +5 6 Otis 111 True +-------------------------------------------------------------------------------- +Checking whether (ID) UCC holds +-------------------------------------------------------------------------------- + +UCC holds, showing stats for AUCC is useless + +-------------------------------------------------------------------------------- +Checking whether (name) UCC holds +It should not hold, there are 2 persons, named Alex +-------------------------------------------------------------------------------- + +UCC does not hold +But AUCC with error = 0.0667 holds + +Also: +Total number of rows violating UCC: 2 +Number of clusters violating UCC: 1 +Clusters violating UCC: +found 1 clusters violating UCC: + +First violating cluster: + ID name card_num card_active +0 1 
Alex 665 True +3 4 Alex 665 False + +-------------------------------------------------------------------------------- +Checking whether (card_num) UCC holds +It should not hold, there are 2 identical card numbers +-------------------------------------------------------------------------------- + +UCC does not hold +But AUCC with error = 0.1333 holds + +Also: +Total number of rows violating UCC: 4 +Number of clusters violating UCC: 2 +Clusters violating UCC: +found 2 clusters violating UCC: + +First violating cluster: + ID name card_num card_active +0 1 Alex 665 True +3 4 Alex 665 False +Second violating cluster: + ID name card_num card_active +1 2 Liam 667 True +4 5 Kian 667 False + +-------------------------------------------------------------------------------- +Checking whether (card_num, card_active) UCC holds +It should hold, cards with identical numbers are not active simultaneously +-------------------------------------------------------------------------------- + +UCC holds, showing stats for AUCC is useless + +-------------------------------------------------------------------------------- diff --git a/examples/testing/outputs/verifying_fd_afd_output.txt b/examples/testing/outputs/verifying_fd_afd_output.txt new file mode 100644 index 000000000..9c233efcb --- /dev/null +++ b/examples/testing/outputs/verifying_fd_afd_output.txt @@ -0,0 +1,90 @@ +First, let's look at the duplicates_short.csv table and try to verify the functional dependency in it. + + id name ... phone country +0 26 Björn Smith ... 25 RI +1 11859 Mary Doe ... 0 EU +2 1 Mary Doe ... 4 EU +3 56 Emily Honjo ... 55 GZ +4 30 Björn Tarski ... 29 PR +5 17788 Mary Doe ... 0 EU +6 5930 Mary Doe ... 0 EU +7 58 Lisa Smith ... 57 CM +8 29 Björn Shiramine ... 28 EU +9 28 Björn Wolf ... 27 AI +10 60 Lisa Wolf ... 59 FC +11 11886 Björn Wolf ... 27 AI +12 5970 Maxine Doe ... 40 CM +13 46 Maxine Tarski ... 45 EU +14 5957 Björn Wolf ... 
27 AI + +[15 rows x 7 columns] + +Checking whether [id] -> [name] FD holds + FD holds +Checking whether [name] -> [credit_score] FD holds + FD does not hold +Number of clusters violating FD: 2 + #1 cluster: +1: Mary Doe -> 0.0 +2: Mary Doe -> 0.0 +5: Mary Doe -> 0.0 +6: Mary Doe -> nan +Most frequent rhs value proportion: 0.75 +Num distinct rhs values: 2 + + #2 cluster: +9: Björn Wolf -> 27.0 +11: Björn Wolf -> 28.0 +14: Björn Wolf -> 27.0 +Most frequent rhs value proportion: 0.6666666666666666 +Num distinct rhs values: 2 + +We learned that in this case the specified FD does not hold and there are two clusters of rows that contain values that prevent our FD from holding. A cluster (with respect to a fixed FD) is a collection of rows that share the same left-hand side part but differ on the right-hand side one. +Let's take a closer look at them. + +In the first cluster, three values are "0" and a single one is "nan". This suggests that this single entry with the "nan" value is a result of a mistake by someone who is not familiar with the table population policy. Therefore, it should probably be changed to "0". + +Now let's take a look at the second cluster. There are two entries: "27" and "28". In this case, it is probably a typo, since buttons 7 and 8 are located close to each other on the keyboard. + +Having analyzed these clusters, we can conclude that our FD does not hold due to typos in the data. Therefore, by eliminating them, we can get this FD to hold (and make our dataset error-free). 
+ +-------------------------------------------------------------------------------- +Now let's look at the DnD.csv to consider the AFD + + Creature Strength HaveMagic +0 Ogre 9 False +1 Ogre 6 False +2 Elf 6 True +3 Elf 6 True +4 Elf 1 True +5 Dwarf 9 False +6 Dwarf 6 False + +Checking whether [Creature] -> [Strength] AFD holds (error threshold = 0.5) + AFD with this error threshold holds +Checking whether [Creature] -> [Strength] AFD holds (error threshold = 0.1) + AFD with this error threshold does not hold +But the same AFD with error threshold = 0.19047619047619047 holds + +Similarly to the FD verification primitive, the AFD one can provide a user with clusters: + +Number of clusters violating FD: 3 + #1 cluster: +2: Elf -> 6 +3: Elf -> 6 +4: Elf -> 1 +Most frequent rhs value proportion: 0.6666666666666666 +Num distinct rhs values: 2 + + #2 cluster: +0: Ogre -> 9 +1: Ogre -> 6 +Most frequent rhs value proportion: 0.5 +Num distinct rhs values: 2 + + #3 cluster: +5: Dwarf -> 9 +6: Dwarf -> 6 +Most frequent rhs value proportion: 0.5 +Num distinct rhs values: 2 + diff --git a/examples/testing/outputs/verifying_mfd_output.txt b/examples/testing/outputs/verifying_mfd_output.txt new file mode 100644 index 000000000..b7fa244a5 --- /dev/null +++ b/examples/testing/outputs/verifying_mfd_output.txt @@ -0,0 +1 @@ +MFD holds diff --git a/examples/testing/outputs/verifying_ucc_output.txt b/examples/testing/outputs/verifying_ucc_output.txt new file mode 100644 index 000000000..25d5ef93b --- /dev/null +++ b/examples/testing/outputs/verifying_ucc_output.txt @@ -0,0 +1,18 @@ +Checking whether (First Name) UCC holds +UCC does not hold +Total number of rows violating UCC: 2 +Number of clusters violating UCC: 1 +Clusters violating UCC: +[4, 5] + +Checking whether (First Name, Last Name) UCC holds +UCC holds + +Checking whether (Born Town, Born Country) UCC holds +UCC does not hold +Total number of rows violating UCC: 5 +Number of clusters violating UCC: 2 +Clusters violating UCC: 
+[2, 3, 4] +[6, 7] + diff --git a/examples/testing/test_examples.sh b/examples/testing/test_examples.sh new file mode 100644 index 000000000..817c0298f --- /dev/null +++ b/examples/testing/test_examples.sh @@ -0,0 +1,40 @@ +#!/bin/bash +echo "Testing afd_multiple_error_thresholds" && python3 examples/afd_multiple_error_thresholds.py | diff - examples/testing/outputs/afd_multiple_error_thresholds_output.txt + +echo "Testing algebraic_constraints" && python3 examples/algebraic_constraints.py | diff - examples/testing/outputs/algebraic_constraints_output.txt + +echo "Testing anomaly_detection" && python3 examples/anomaly_detection.py | diff - examples/testing/outputs/anomaly_detection_output.txt + +echo "Testing comparison_pfd_vs_afd" && python3 examples/comparison_pfd_vs_afd.py | diff - examples/testing/outputs/comparison_pfd_vs_afd_output.txt + +echo "Testing data_stats" && python3 examples/data_stats.py | diff - examples/testing/outputs/data_stats_output.txt + +#command sed "s,\x1B\[[0-9;]*[a-zA-Z],,g" removes ANSI color codes from output +echo "Testing mine_typos" && python3 examples/mine_typos.py | sed "s,\x1B\[[0-9;]*[a-zA-Z],,g" | diff - examples/testing/outputs/mine_typos_output.txt + +echo "Testing mining_afd" && python3 examples/mining_afd.py | diff - examples/testing/outputs/mining_afd_output.txt + +echo "Testing mining_fd" && python3 examples/mining_fd.py | diff - examples/testing/outputs/mining_fd_output.txt + +echo "Testing mining_ind" && python3 examples/mining_ind.py | diff --color=never - examples/testing/outputs/mining_ind_output.txt + +echo "Testing mining_list_od" && python3 examples/mining_list_od.py | diff - examples/testing/outputs/mining_list_od_output.txt + +echo "Testing mining_pfd" && python3 examples/mining_pfd.py | diff - examples/testing/outputs/mining_pfd_output.txt + +echo "Testing mining_set_od_1" && python3 examples/mining_set_od_1.py | diff - examples/testing/outputs/mining_set_od_1_output.txt + +echo "Testing mining_set_od_2" && 
python3 examples/mining_set_od_2.py | sed "s,\x1B\[[0-9;]*[a-zA-Z],,g" | diff --color=never - examples/testing/outputs/mining_set_od_2_output.txt
+
+echo "Testing verifying_aucc" && python3 examples/verifying_aucc.py | diff - examples/testing/outputs/verifying_aucc_output.txt
+
+echo "Testing verifying_fd_afd" && python3 examples/verifying_fd_afd.py | sed "s,\x1B\[[0-9;]*[a-zA-Z],,g" | diff --color=never - examples/testing/outputs/verifying_fd_afd_output.txt
+
+echo "Testing verifying_mfd" && python3 examples/verifying_mfd.py | diff - examples/testing/outputs/verifying_mfd_output.txt
+
+echo "Testing verifying_ucc" && python3 examples/verifying_ucc.py | diff - examples/testing/outputs/verifying_ucc_output.txt
+
+#!!!there are warnings in the current dedupe.py version
+echo "Testing dedupe" && python3 -W ignore examples/dedupe.py < examples/testing/inputs/dedupe_input.txt | diff - examples/testing/outputs/dedupe_output.txt
+
+echo "Testing mining_cfd" && python3 examples/mining_cfd.py | sed "s,\x1B\[[0-9;]*[a-zA-Z],,g" | diff - examples/testing/outputs/mining_cfd_output.txt
diff --git a/src/core/algorithms/fd/fd_algorithm.cpp b/src/core/algorithms/fd/fd_algorithm.cpp
index 420dbd4f0..2827253fd 100644
--- a/src/core/algorithms/fd/fd_algorithm.cpp
+++ b/src/core/algorithms/fd/fd_algorithm.cpp
@@ -27,6 +27,20 @@ void FDAlgorithm::ResetState() {
     ResetStateFd();
 }
 
+std::list<FD>& FDAlgorithm::SortedFdList() {
+    fd_collection_.AsList().sort([](const FD& l_fd, const FD& r_fd) {
+        if (l_fd.GetLhs().GetArity() != r_fd.GetLhs().GetArity()) {
+            return l_fd.GetLhs().GetArity() < r_fd.GetLhs().GetArity();
+        }
+        if (l_fd.GetLhs() != r_fd.GetLhs()) {
+            return l_fd.GetLhs() < r_fd.GetLhs();
+        }
+        return l_fd.GetRhsIndex() < r_fd.GetRhsIndex();
+    });
+
+    return fd_collection_.AsList();
+}
+
 std::string FDAlgorithm::GetJsonFDs() const {
     return FDsToJson(FdList());
 }
diff --git a/src/core/algorithms/fd/fd_algorithm.h b/src/core/algorithms/fd/fd_algorithm.h
index f80d17ff3..965505c72
100644
--- a/src/core/algorithms/fd/fd_algorithm.h
+++ b/src/core/algorithms/fd/fd_algorithm.h
@@ -66,6 +66,8 @@ class FDAlgorithm : public Algorithm {
         return fd_collection_.AsList();
     }
 
+    std::list<FD>& SortedFdList();
+
     /* возвращает набор ФЗ в виде JSON-а. По сути, это просто представление фиксированного формата
      * для сравнения результатов разных алгоритмов. JSON - на всякий случай, если потом, например,
      * понадобится загрузить список в питон и как-нибудь его поанализировать
diff --git a/src/core/model/table/vertical.cpp b/src/core/model/table/vertical.cpp
index 5817a0aba..697097071 100644
--- a/src/core/model/table/vertical.cpp
+++ b/src/core/model/table/vertical.cpp
@@ -143,11 +143,3 @@ std::vector<Vertical> Vertical::GetParents() const {
     }
     return parents;
 }
-
-bool Vertical::operator<(Vertical const& rhs) const {
-    assert(*schema_ == *rhs.schema_);
-    if (this->column_indices_ == rhs.column_indices_) return false;
-
-    boost::dynamic_bitset<> const& lr_xor = (this->column_indices_ ^ rhs.column_indices_);
-    return rhs.column_indices_.test(lr_xor.find_first());
-}
diff --git a/src/core/model/table/vertical.h b/src/core/model/table/vertical.h
index 672622b50..99cf675b2 100644
--- a/src/core/model/table/vertical.h
+++ b/src/core/model/table/vertical.h
@@ -42,7 +42,9 @@ class Vertical {
      * it treats bitsets little endian during comparison and this is not
      * suitable for this case, check out operator< for Columns.
*/ - bool operator<(Vertical const& rhs) const; + bool operator<(Vertical const& rhs) const { + return column_indices_ < rhs.column_indices_; + } bool operator==(Vertical const& other) const { return column_indices_ == other.column_indices_; diff --git a/src/python_bindings/fd/bind_fd.cpp b/src/python_bindings/fd/bind_fd.cpp index 609f7eb8d..78fa4b0cf 100644 --- a/src/python_bindings/fd/bind_fd.cpp +++ b/src/python_bindings/fd/bind_fd.cpp @@ -56,8 +56,7 @@ void BindFd(py::module_& main_module) { static constexpr auto kPFDTaneName = "PFDTane"; auto fd_algos_module = BindPrimitive(fd_module, py::overload_cast<>(&FDAlgorithm::FdList, py::const_), - "FdAlgorithm", "get_fds", + PFDTane>(fd_module, &FDAlgorithm::SortedFdList, "FdAlgorithm", "get_fds", {"HyFD", "Aid", "Depminer", "DFD", "FastFDs", "FDep", "FdMine", "FUN", kPyroName, kTaneName, kPFDTaneName});