Desbordante · DaniilGoncharov · Apr 22, 2024
diff --git a/README.md b/README.md
@@ -54,13 +54,13 @@ python3 cli.py --task=fd --table=../examples/datasets/university_fd.csv , True
 ```
 
 ```text
-[Course Classroom] -> Professor
-[Classroom Semester] -> Professor
-[Classroom Semester] -> Course
 [Professor] -> Course
-[Professor Semester] -> Classroom
+[Course Classroom] -> Professor
 [Course Semester] -> Classroom
 [Course Semester] -> Professor
+[Classroom Semester] -> Course
+[Classroom Semester] -> Professor
+[Professor Semester] -> Classroom
 ```
 
 2) Discover all approximate functional dependencies with error less than or equal to 0.1 in a table represented by a .csv file that uses a comma as the separator and has a header row. In this example the default AFD discovery algorithm (Pyro) is used.
@@ -114,13 +114,13 @@ for fd in result:
 ```
 ```text
 FDs:
-[Course Classroom] -> Professor
-[Classroom Semester] -> Professor
-[Classroom Semester] -> Course
 [Professor] -> Course
-[Professor Semester] -> Classroom
+[Course Classroom] -> Professor
 [Course Semester] -> Classroom
 [Course Semester] -> Professor
+[Classroom Semester] -> Course
+[Classroom Semester] -> Professor
+[Professor Semester] -> Classroom
 ```
 
 2) Discover all approximate functional dependencies with error less than or equal to 0.1 in a table represented by a .csv file that uses a comma as the separator and has a header row. In this example the AFD discovery algorithm Pyro is used.
@@ -141,8 +141,8 @@ for fd in result:
 ```
 ```text
 AFDs:
-[Id] -> Price
 [Id] -> ProductName
+[Id] -> Price
 [ProductName] -> Price
 ```
 
@@ -178,16 +178,16 @@ MFD holds
 >>> pyro.load_data(table=df)
 >>> pyro.execute(error=0.0)
 >>> print(f'[{", ".join(map(str, pyro.get_fds()))}]')
-[[0 1 2] -> 4, [0 2 3] -> 4, [0 1 3] -> 4, [1 2 3] -> 4]
+[[0 1 2] -> 4, [0 1 3] -> 4, [0 2 3] -> 4, [1 2 3] -> 4]
 >>> pyro.execute(error=0.1)
 >>> print(f'[{", ".join(map(str, pyro.get_fds()))}]')
-[[2] -> 0, [2] -> 3, [2] -> 1, [0] -> 2, [3] -> 0, [0] -> 3, [0] -> 1, [1] -> 3, [1] -> 0, [3] -> 2, [3] -> 1, [1] -> 2, [2] -> 4, [3] -> 4, [0] -> 4, [1] -> 4]
+[[0] -> 1, [0] -> 2, [0] -> 3, [0] -> 4, [1] -> 0, [1] -> 2, [1] -> 3, [1] -> 4, [2] -> 0, [2] -> 1, [2] -> 3, [2] -> 4, [3] -> 0, [3] -> 1, [3] -> 2, [3] -> 4]
 >>> pyro.execute(error=0.2)
 >>> print(f'[{", ".join(map(str, pyro.get_fds()))}]')
-[[2] -> 0, [0] -> 2, [3] -> 2, [1] -> 2, [2] -> 4, [3] -> 4, [0] -> 4, [1] -> 4, [3] -> 0, [1] -> 0, [2] -> 3, [2] -> 1, [0] -> 3, [0] -> 1, [1] -> 3, [3] -> 1]
+[[0] -> 1, [0] -> 2, [0] -> 3, [0] -> 4, [1] -> 0, [1] -> 2, [1] -> 3, [1] -> 4, [2] -> 0, [2] -> 1, [2] -> 3, [2] -> 4, [3] -> 0, [3] -> 1, [3] -> 2, [3] -> 4]
 >>> pyro.execute(error=0.3)
 >>> print(f'[{", ".join(map(str, pyro.get_fds()))}]')
-[[2] -> 1, [0] -> 2, [2] -> 0, [2] -> 3, [0] -> 1, [3] -> 2, [3] -> 1, [1] -> 2, [3] -> 0, [0] -> 3, [4] -> 1, [1] -> 0, [1] -> 3, [4] -> 2, [4] -> 3, [2] -> 4, [3] -> 4, [0] -> 4, [1] -> 4]
+[[0] -> 1, [0] -> 2, [0] -> 3, [0] -> 4, [1] -> 0, [1] -> 2, [1] -> 3, [1] -> 4, [2] -> 0, [2] -> 1, [2] -> 3, [2] -> 4, [3] -> 0, [3] -> 1, [3] -> 2, [3] -> 4, [4] -> 1, [4] -> 2, [4] -> 3]
 ```
 
 ## Web interface

diff --git a/README_PYPI.md b/README_PYPI.md
@@ -73,13 +73,13 @@ for fd in result:
 
 ```text
 FDs:
-[Course Classroom] -> Professor
-[Classroom Semester] -> Professor
-[Classroom Semester] -> Course
 [Professor] -> Course
-[Professor Semester] -> Classroom
+[Course Classroom] -> Professor
 [Course Semester] -> Classroom
 [Course Semester] -> Professor
+[Classroom Semester] -> Course
+[Classroom Semester] -> Professor
+[Professor Semester] -> Classroom
 ```
 
 2) Discover all approximate functional dependencies with error less than or equal to 0.1 in a table represented by a
@@ -103,8 +103,8 @@ for fd in result:
 
 ```text
 AFDs:
-[Id] -> Price
 [Id] -> ProductName
+[Id] -> Price
 [ProductName] -> Price
 ```
 
@@ -145,16 +145,16 @@ MFD holds
 >>> pyro.load_data(table=df)
 >>> pyro.execute(error=0.0)
 >>> print(f'[{", ".join(map(str, pyro.get_fds()))}]')
-[[0 1 2] -> 4, [0 2 3] -> 4, [0 1 3] -> 4, [1 2 3] -> 4]
+[[0 1 2] -> 4, [0 1 3] -> 4, [0 2 3] -> 4, [1 2 3] -> 4]
 >>> pyro.execute(error=0.1)
 >>> print(f'[{", ".join(map(str, pyro.get_fds()))}]')
-[[2] -> 0, [2] -> 3, [2] -> 1, [0] -> 2, [3] -> 0, [0] -> 3, [0] -> 1, [1] -> 3, [1] -> 0, [3] -> 2, [3] -> 1, [1] -> 2, [2] -> 4, [3] -> 4, [0] -> 4, [1] -> 4]
+[[0] -> 1, [0] -> 2, [0] -> 3, [0] -> 4, [1] -> 0, [1] -> 2, [1] -> 3, [1] -> 4, [2] -> 0, [2] -> 1, [2] -> 3, [2] -> 4, [3] -> 0, [3] -> 1, [3] -> 2, [3] -> 4]
 >>> pyro.execute(error=0.2)
 >>> print(f'[{", ".join(map(str, pyro.get_fds()))}]')
-[[2] -> 0, [0] -> 2, [3] -> 2, [1] -> 2, [2] -> 4, [3] -> 4, [0] -> 4, [1] -> 4, [3] -> 0, [1] -> 0, [2] -> 3, [2] -> 1, [0] -> 3, [0] -> 1, [1] -> 3, [3] -> 1]
+[[0] -> 1, [0] -> 2, [0] -> 3, [0] -> 4, [1] -> 0, [1] -> 2, [1] -> 3, [1] -> 4, [2] -> 0, [2] -> 1, [2] -> 3, [2] -> 4, [3] -> 0, [3] -> 1, [3] -> 2, [3] -> 4]
 >>> pyro.execute(error=0.3)
 >>> print(f'[{", ".join(map(str, pyro.get_fds()))}]')
-[[2] -> 1, [0] -> 2, [2] -> 0, [2] -> 3, [0] -> 1, [3] -> 2, [3] -> 1, [1] -> 2, [3] -> 0, [0] -> 3, [4] -> 1, [1] -> 0, [1] -> 3, [4] -> 2, [4] -> 3, [2] -> 4, [3] -> 4, [0] -> 4, [1] -> 4]
+[[0] -> 1, [0] -> 2, [0] -> 3, [0] -> 4, [1] -> 0, [1] -> 2, [1] -> 3, [1] -> 4, [2] -> 0, [2] -> 1, [2] -> 3, [2] -> 4, [3] -> 0, [3] -> 1, [3] -> 2, [3] -> 4, [4] -> 1, [4] -> 2, [4] -> 3]
 ```
 
 More examples can be found in the [Desbordante repository](https://github.com/Desbordante/desbordante-core/tree/main/examples) on GitHub.

diff --git a/examples/afd_multiple_error_thresholds.py b/examples/afd_multiple_error_thresholds.py
@@ -5,13 +5,13 @@
 pyro.load_data(table=df)
 pyro.execute(error=0.0)
 print(f'[{", ".join(map(str, pyro.get_fds()))}]')
-# [[0 1 2] -> 4, [0 2 3] -> 4, [0 1 3] -> 4, [1 2 3] -> 4]
+# [[0 1 2] -> 4, [0 1 3] -> 4, [0 2 3] -> 4, [1 2 3] -> 4]
 pyro.execute(error=0.1)
 print(f'[{", ".join(map(str, pyro.get_fds()))}]')
-# [[2] -> 0, [2] -> 3, [2] -> 1, [0] -> 2, [3] -> 0, [0] -> 3, [0] -> 1, [1] -> 3, [1] -> 0, [3] -> 2, [3] -> 1, [1] -> 2, [2] -> 4, [3] -> 4, [0] -> 4, [1] -> 4]
+# [[0] -> 1, [0] -> 2, [0] -> 3, [0] -> 4, [1] -> 0, [1] -> 2, [1] -> 3, [1] -> 4, [2] -> 0, [2] -> 1, [2] -> 3, [2] -> 4, [3] -> 0, [3] -> 1, [3] -> 2, [3] -> 4]
 pyro.execute(error=0.2)
 print(f'[{", ".join(map(str, pyro.get_fds()))}]')
-# [[2] -> 0, [0] -> 2, [3] -> 2, [1] -> 2, [2] -> 4, [3] -> 4, [0] -> 4, [1] -> 4, [3] -> 0, [1] -> 0, [2] -> 3, [2] -> 1, [0] -> 3, [0] -> 1, [1] -> 3, [3] -> 1]
+# [[0] -> 1, [0] -> 2, [0] -> 3, [0] -> 4, [1] -> 0, [1] -> 2, [1] -> 3, [1] -> 4, [2] -> 0, [2] -> 1, [2] -> 3, [2] -> 4, [3] -> 0, [3] -> 1, [3] -> 2, [3] -> 4]
 pyro.execute(error=0.3)
 print(f'[{", ".join(map(str, pyro.get_fds()))}]')
-# [[2] -> 1, [0] -> 2, [2] -> 0, [2] -> 3, [0] -> 1, [3] -> 2, [3] -> 1, [1] -> 2, [3] -> 0, [0] -> 3, [4] -> 1, [1] -> 0, [1] -> 3, [4] -> 2, [4] -> 3, [2] -> 4, [3] -> 4, [0] -> 4, [1] -> 4]
+# [[0] -> 1, [0] -> 2, [0] -> 3, [0] -> 4, [1] -> 0, [1] -> 2, [1] -> 3, [1] -> 4, [2] -> 0, [2] -> 1, [2] -> 3, [2] -> 4, [3] -> 0, [3] -> 1, [3] -> 2, [3] -> 4, [4] -> 1, [4] -> 2, [4] -> 3]
diff --git a/examples/comparison_pfd_vs_afd.py b/examples/comparison_pfd_vs_afd.py
@@ -1,12 +1,13 @@
 import desbordante
+from ordered_set import OrderedSet
 
 TABLE = 'examples/datasets/glitchy_sensor.csv'
 ERROR = 0.18
 ERROR_MEASURE = 'per_value' # per_tuple or per_value
 
 
 def stringify(fds):
-    return set(map(str, fds))
+    return OrderedSet(map(str, fds))
 
 
 def get_afds():
@@ -23,8 +24,8 @@ def get_pfds():
     return algo.get_fds()
 
 
-pfds = set(get_pfds())
-afds = set(get_afds())
+pfds = OrderedSet(get_pfds())
+afds = OrderedSet(get_afds())
 
 print("pFDs \ AFDs =", stringify(pfds - afds))
 print("AFDs \ pFDs =", stringify(afds - pfds))

diff --git a/examples/dedupe.py b/examples/dedupe.py
@@ -93,6 +93,7 @@ def merge_handler(df: pandas.DataFrame, new_rows, remaining_rows, used_rows):
     for col_name, values in zip(df.columns,
                                 zip(*df.iloc[list(used_rows)].itertuples(index=False))):
         distinct_values = list(set(values))
+        distinct_values.sort()
         index = 0 if len(distinct_values) == 1 else choose_index(col_name, distinct_values)
         new_row.append(distinct_values[index])
     remaining_rows -= used_rows

diff --git a/examples/mining_set_od_2.py b/examples/mining_set_od_2.py
@@ -94,7 +94,7 @@ def print_simple_ods_with_comments(simple_ods, table):
     print('be traced.')
 
     percent_values = list(table['percent'])
-    percent_classes = set([f'class [{i}] with {percent_values.count(i)} element{"" if percent_values.count(i) == 1 else "s"}'
+    percent_classes = list([f'class [{i}] with {percent_values.count(i)} element{"" if percent_values.count(i) == 1 else "s"}'
                            for i in percent_values])
 
     print()

diff --git a/examples/testing/inputs/dedupe_input.txt b/examples/testing/inputs/dedupe_input.txt
@@ -0,0 +1,13 @@
+0
+1 2 3 4 5 6
+4
+merge 7 8 9
+0
+0
+1
+keepall
+keepall
+merge 52 53
+0
+1
+keepall
diff --git a/examples/testing/outputs/afd_multiple_error_thresholds_output.txt b/examples/testing/outputs/afd_multiple_error_thresholds_output.txt
@@ -0,0 +1,4 @@
+[[0 1 2] -> 4, [0 1 3] -> 4, [0 2 3] -> 4, [1 2 3] -> 4]
+[[0] -> 1, [0] -> 2, [0] -> 3, [0] -> 4, [1] -> 0, [1] -> 2, [1] -> 3, [1] -> 4, [2] -> 0, [2] -> 1, [2] -> 3, [2] -> 4, [3] -> 0, [3] -> 1, [3] -> 2, [3] -> 4]
+[[0] -> 1, [0] -> 2, [0] -> 3, [0] -> 4, [1] -> 0, [1] -> 2, [1] -> 3, [1] -> 4, [2] -> 0, [2] -> 1, [2] -> 3, [2] -> 4, [3] -> 0, [3] -> 1, [3] -> 2, [3] -> 4]
+[[0] -> 1, [0] -> 2, [0] -> 3, [0] -> 4, [1] -> 0, [1] -> 2, [1] -> 3, [1] -> 4, [2] -> 0, [2] -> 1, [2] -> 3, [2] -> 4, [3] -> 0, [3] -> 1, [3] -> 2, [3] -> 4, [4] -> 1, [4] -> 2, [4] -> 3]
diff --git a/examples/testing/outputs/algebraic_constraints_output.txt b/examples/testing/outputs/algebraic_constraints_output.txt
@@ -0,0 +1,19 @@
+Discovered ranges for (Delivery date - Dispatch date) are:
+[(2.0, 7.0), (15.0, 22.0)]
+
+Rows in which the result of the chosen operation (-) is outside of discovered ranges:
+id: 7
+Dispatch date: 1
+Delivery date: 30
+Difference: 29
+
+id: 26
+Dispatch date: 7
+Delivery date: 18
+Difference: 11
+
+id: 30
+Dispatch date: 11
+Delivery date: 22
+Difference: 11
+
diff --git a/examples/testing/outputs/anomaly_detection_output.txt b/examples/testing/outputs/anomaly_detection_output.txt
@@ -0,0 +1,44 @@
+FDs found for dataset 1:
+[item_id] -> item_weight
+[item_weight] -> item_id
+[record_id] -> cargo_id
+[record_id] -> item_id
+[record_id] -> item_weight
+[record_id] -> timestamp
+[timestamp] -> cargo_id
+[timestamp] -> item_id
+[timestamp] -> item_weight
+[timestamp] -> record_id
+FDs found for dataset 2:
+[item_id] -> item_weight
+[item_weight] -> item_id
+[record_id] -> cargo_id
+[record_id] -> item_id
+[record_id] -> item_weight
+[record_id] -> timestamp
+[timestamp] -> cargo_id
+[timestamp] -> item_id
+[timestamp] -> item_weight
+[timestamp] -> record_id
+FDs found for dataset 3:
+[item_weight] -> item_id
+[record_id] -> cargo_id
+[record_id] -> item_id
+[record_id] -> item_weight
+[record_id] -> timestamp
+[timestamp] -> cargo_id
+[timestamp] -> item_id
+[timestamp] -> item_weight
+[timestamp] -> record_id
+AFDs found for dataset 3:
+[item_id cargo_id] -> item_weight
+[item_weight] -> item_id
+[record_id] -> cargo_id
+[record_id] -> item_id
+[record_id] -> item_weight
+[record_id] -> timestamp
+[timestamp] -> cargo_id
+[timestamp] -> item_id
+[timestamp] -> item_weight
+[timestamp] -> record_id
+MFD holds.
diff --git a/examples/testing/outputs/comparison_pfd_vs_afd_output.txt b/examples/testing/outputs/comparison_pfd_vs_afd_output.txt
@@ -0,0 +1,7 @@
+pFDs \ AFDs = OrderedSet(['[DeviceId] -> Data'])
+AFDs \ pFDs = OrderedSet()
+AFDs ∩ pFDs = OrderedSet(['[Id] -> DeviceId', '[Id] -> Data', '[Data] -> Id', '[Data] -> DeviceId'])
+1 - PerValue([DeviceId] -> Data) = 0.1714285714
+e([DeviceId] -> Data) = 0.23076923076923078
+In case of PerValue error measure, violations on data from the single "glitchy"
+sensor device among many do not prevent dependecy from being found