From e8550ea64523cc562cbce8c9b99481659212c340 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= <radoslaw.wasko@enso.org>
Date: Thu, 4 Mar 2021 18:23:43 +0100
Subject: [PATCH] Fix edge cases with name disambiguation on join

---
 .../std-lib/Database/src/Data/Table.enso      | 56 +++++++++++++++----
 distribution/std-lib/Test/src/Test.enso       | 13 ++---
 test/Database_Tests/src/Codegen_Spec.enso     | 14 +++++
 test/Database_Tests/src/Database_Spec.enso    |  3 +-
 4 files changed, 64 insertions(+), 22 deletions(-)

diff --git a/distribution/std-lib/Database/src/Data/Table.enso b/distribution/std-lib/Database/src/Data/Table.enso
index 50acaf7995089..d99320fcbdfd5 100644
--- a/distribution/std-lib/Database/src/Data/Table.enso
+++ b/distribution/std-lib/Database/src/Data/Table.enso
@@ -28,7 +28,6 @@ type Table
          - format_terminal: whether ANSI-terminal formatting should be used
     display : Integer -> Boolean -> Text
     display show_rows=10 format_terminal=False =
-        # TODO [RW] should I display the row with numbers if no index is set as in dataframes??
         df = this.reset_index.to_dataframe max_rows=show_rows
         indices_count = this.context.meta_index.length
         all_rows_count = this.row_count
@@ -275,6 +274,8 @@ type Table
     join : Table -> Nothing | Text | Column | Vector (Text | Column) -> Boolean -> Text -> Text -> Table
     join other on=Nothing drop_unmatched=False left_suffix='_left' right_suffix='_right' = Panic.recover <|
         Panic.rethrow (Helpers.ensure_name_is_sane left_suffix && Helpers.ensure_name_is_sane right_suffix)
+        if left_suffix == right_suffix then
+            Panic.throw <| Illegal_State_Error "left_suffix must be different from right_suffix"
         kind = if drop_unmatched then IR.Join_Inner else IR.Join_Left
         my_index : Vector Internal_Column
         my_index = case on of
@@ -312,18 +313,14 @@ type Table
                 new_limit = Nothing
                 new_ctx = IR.Context new_from [] [] [] new_index new_limit
 
-                # TODO [RW, MK] what if we have name conflict because both tables contain 'A', but then also one of them contains 'A'+suffix ??
-                left_names = Map.from_vector (this.internal_columns.map (p -> [p.name, True]))
-                right_names = Map.from_vector (other.internal_columns.map (p -> [p.name, True]))
-                rename suffix other_names this_name =
-                    if other_names.get_or_else this_name False then this_name+suffix else this_name
-                rename_left = rename left_suffix right_names
-                rename_right = rename right_suffix left_names
+                new_names = here.combine_names (this.internal_columns.map .name) (other.internal_columns.map .name) left_suffix right_suffix
+                left_names = new_names.first
+                right_names = new_names.second
 
-                new_left_columns = this.internal_columns.map p->
-                    Internal_Column (rename_left p.name) p.sql_type (IR.Column left_alias p.name)
-                new_right_columns = other.internal_columns.map p->
-                    Internal_Column (rename_right p.name) p.sql_type (IR.Column right_alias p.name)
+                new_left_columns = this.internal_columns.zip left_names p-> new_name->
+                    Internal_Column new_name p.sql_type (IR.Column left_alias p.name)
+                new_right_columns = other.internal_columns.zip right_names p-> new_name->
+                    Internal_Column new_name p.sql_type (IR.Column right_alias p.name)
 
                 new_columns = new_left_columns + new_right_columns
 
@@ -564,3 +561,38 @@ display_dataframe df indices_count all_rows_count format_terminal =
         missing_rows_count = all_rows_count - display_rows
         missing = '\n\u2026 and ' + missing_rows_count.to_text + ' hidden rows.'
         table + missing
+
+## PRIVATE
+
+   Creates a list of non-colliding names by merging the two lists and
+   appending suffixes if necessary.
+
+   If even after appending the suffixes it is impossible to have unique names,
+   it throws a panic. It returns two vectors, one for each input. It assumes
+   that the names within each argument itself are unique.
+combine_names left_names right_names left_suffix right_suffix =
+    make_count_map names =
+        map = names.fold Map.empty acc-> name->
+            count = acc.get_or_else name 0 + 1
+            acc.insert name count
+        name-> map.get_or_else name 0
+    original_names_count = make_count_map left_names+right_names
+    add_suffix_if_necessary suffix name = case original_names_count name > 1 of
+        True -> [name, name+suffix]
+        False -> [name, name]
+    left_pairs = left_names.map <| add_suffix_if_necessary left_suffix
+    right_pairs = right_names.map <| add_suffix_if_necessary right_suffix
+
+    new_names_count = make_count_map (left_pairs+right_pairs . map .second)
+    catch_ambiguity pairs = pairs.each pair->
+        original_name = pair.first
+        new_name = pair.second
+        case new_name!=original_name && (new_names_count new_name > 1) of
+            True ->
+                Panic.throw <| Illegal_State_Error "Duplicate column "+original_name+" was about to be renamed to "+new_name+" to disambiguate column names, but a column with name "+new_name+" already exists too. Please rename the columns before joining to avoid ambiguity."
+            False -> Nothing
+    catch_ambiguity left_pairs
+    catch_ambiguity right_pairs
+    new_left_names = left_pairs.map .second
+    new_right_names = right_pairs.map .second
+    [new_left_names, new_right_names]
diff --git a/distribution/std-lib/Test/src/Test.enso b/distribution/std-lib/Test/src/Test.enso
index e8cc515e482a8..8608c8a7e1b26 100644
--- a/distribution/std-lib/Test/src/Test.enso
+++ b/distribution/std-lib/Test/src/Test.enso
@@ -19,7 +19,7 @@ Spec.is_fail = this.behaviors.any .is_fail
 Suite.is_fail = this.specs.any .is_fail
 
 ## PRIVATE
-type Finished_With_Error err
+type Finished_With_Error err stack_trace
 
 ## PRIVATE
 type Matched_On_Error err
@@ -163,20 +163,17 @@ specify label ~behavior pending=Nothing =
 
 ## PRIVATE
 run_spec ~behavior =
-    ## FIXME revert these lines
     recovery = Panic.recover <|
-        behavior.catch err-> Panic.throw (Finished_With_Error err)
+        result = behavior
+        result.catch err-> Panic.throw (Finished_With_Error err result.get_stack_trace_text)
         Nothing
-#    recovery =
-#        behavior.map_error Finished_With_Error
     maybeExc = case recovery of
         _ -> Success
     result = maybeExc.catch ex->
         case ex of
             Failure _ -> ex
-            Finished_With_Error x ->
-                # TODO [RW] display at least original error location
-                Failure ("An unexpected error was returned: " + x.to_text)
+            Finished_With_Error err stack_trace_text ->
+                Failure ("An unexpected error was returned: " + err.to_text + '\n' + stack_trace_text)
             _ -> Failure ("An unexpected panic was thrown: " + ex.to_text + '\n' + maybeExc.get_stack_trace_text)
     result
 
diff --git a/test/Database_Tests/src/Codegen_Spec.enso b/test/Database_Tests/src/Codegen_Spec.enso
index 11bc1f5213d24..088c7bfbbcd6e 100644
--- a/test/Database_Tests/src/Codegen_Spec.enso
+++ b/test/Database_Tests/src/Codegen_Spec.enso
@@ -103,6 +103,20 @@ spec =
             r1 = t1.join (t3.set_index 'E') on='A'
             r1.to_sql.prepare . should_equal ['SELECT "T1"."A" AS "A_left", "T1"."B" AS "B", "T1"."C" AS "C", "T3"."A" AS "A_right", "T3"."F" AS "F" FROM (SELECT "T1"."A" AS "A", "T1"."B" AS "B", "T1"."C" AS "C" FROM "T1" AS "T1") AS "T1" LEFT JOIN (SELECT "T3"."E" AS "E", "T3"."A" AS "A", "T3"."F" AS "F" FROM "T3" AS "T3") AS "T3" ON ("T1"."A" = "T3"."E")', []]
 
+        Test.specify "should avoid duplicates when disambiguating column names" <|
+            connection =
+                table1 = ["T1", [["X", int], ["A", int], ["A_left", int]]]
+                table2 = ["T2", [["X", int], ["A", int], ["B", int]]]
+                tables = Map.from_vector [table1, table2]
+                Fake_Test_Connection.make Dialect.sqlite tables
+            t1 = connection.access_table "T1"
+            t2 = connection.access_table "T2"
+            (t1.set_index "X").join (t2.set_index "X") . should_fail_with Illegal_State_Error
+
+        Test.specify "should ensure that name suffixes are distinct" <|
+            err = (t1.set_index 'A').join (t2.set_index 'D') left_suffix='foo' right_suffix='foo'
+            err . should_fail_with Illegal_State_Error
+
         Test.specify "should correctly handle self-joins" <|
             r1 = t1.join (t1.set_index 'A') on='B'
             r1.to_sql.prepare . should_equal ['SELECT "T1_left"."A" AS "A", "T1_left"."B" AS "B_left", "T1_left"."C" AS "C_left", "T1_right"."B" AS "B_right", "T1_right"."C" AS "C_right" FROM (SELECT "T1"."A" AS "A", "T1"."B" AS "B", "T1"."C" AS "C" FROM "T1" AS "T1") AS "T1_left" LEFT JOIN (SELECT "T1"."A" AS "A", "T1"."B" AS "B", "T1"."C" AS "C" FROM "T1" AS "T1") AS "T1_right" ON ("T1_left"."B" = "T1_right"."A")', []]
diff --git a/test/Database_Tests/src/Database_Spec.enso b/test/Database_Tests/src/Database_Spec.enso
index ea171b84164b7..2d9345e045bcd 100644
--- a/test/Database_Tests/src/Database_Spec.enso
+++ b/test/Database_Tests/src/Database_Spec.enso
@@ -303,7 +303,6 @@ spec connection =
             df.insert [4, 4, 5.2, True, "baz"]
             df.insert [1, 5, 1.6, False, "spam"]
 
-            ord = [0, 3, 2, 4, 1]
             ints = [1, 2, 3, 4, 5]
             reals = [1.3, 4.6, 3.2, 5.2, 1.6]
             bools = [False, False, True, True, False]
@@ -353,6 +352,6 @@ sqlite_spec =
     file.delete
 
 postgres_spec =
-    # TODO use env vars to read tmp DB config
+    # TODO [RW] use env vars to read tmp DB config
     IO.println "PostgreSQL test database is not configured, skipping."
     Nothing