From e8550ea64523cc562cbce8c9b99481659212c340 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Thu, 4 Mar 2021 18:23:43 +0100 Subject: [PATCH] Fix edge cases with name disambiguation on join --- .../std-lib/Database/src/Data/Table.enso | 56 +++++++++++++++---- distribution/std-lib/Test/src/Test.enso | 13 ++--- test/Database_Tests/src/Codegen_Spec.enso | 14 +++++ test/Database_Tests/src/Database_Spec.enso | 3 +- 4 files changed, 64 insertions(+), 22 deletions(-) diff --git a/distribution/std-lib/Database/src/Data/Table.enso b/distribution/std-lib/Database/src/Data/Table.enso index 50acaf7995089..d99320fcbdfd5 100644 --- a/distribution/std-lib/Database/src/Data/Table.enso +++ b/distribution/std-lib/Database/src/Data/Table.enso @@ -28,7 +28,6 @@ type Table - format_terminal: whether ANSI-terminal formatting should be used display : Integer -> Boolean -> Text display show_rows=10 format_terminal=False = - # TODO [RW] should I display the row with numbers if no index is set as in dataframes?? df = this.reset_index.to_dataframe max_rows=show_rows indices_count = this.context.meta_index.length all_rows_count = this.row_count @@ -275,6 +274,8 @@ type Table join : Table -> Nothing | Text | Column | Vector (Text | Column) -> Boolean -> Text -> Text -> Table join other on=Nothing drop_unmatched=False left_suffix='_left' right_suffix='_right' = Panic.recover <| Panic.rethrow (Helpers.ensure_name_is_sane left_suffix && Helpers.ensure_name_is_sane right_suffix) + if left_suffix == right_suffix then + Panic.throw <| Illegal_State_Error "left_suffix must be different from right_suffix" kind = if drop_unmatched then IR.Join_Inner else IR.Join_Left my_index : Vector Internal_Column my_index = case on of @@ -312,18 +313,14 @@ type Table new_limit = Nothing new_ctx = IR.Context new_from [] [] [] new_index new_limit - # TODO [RW, MK] what if we have name conflict because both tables contain 'A', but then also one of them contains 'A'+suffix ?? - left_names = Map.from_vector (this.internal_columns.map (p -> [p.name, True])) - right_names = Map.from_vector (other.internal_columns.map (p -> [p.name, True])) - rename suffix other_names this_name = - if other_names.get_or_else this_name False then this_name+suffix else this_name - rename_left = rename left_suffix right_names - rename_right = rename right_suffix left_names + new_names = here.combine_names (this.internal_columns.map .name) (other.internal_columns.map .name) left_suffix right_suffix + left_names = new_names.first + right_names = new_names.second - new_left_columns = this.internal_columns.map p-> - Internal_Column (rename_left p.name) p.sql_type (IR.Column left_alias p.name) - new_right_columns = other.internal_columns.map p-> - Internal_Column (rename_right p.name) p.sql_type (IR.Column right_alias p.name) + new_left_columns = this.internal_columns.zip left_names p-> new_name-> + Internal_Column new_name p.sql_type (IR.Column left_alias p.name) + new_right_columns = other.internal_columns.zip right_names p-> new_name-> + Internal_Column new_name p.sql_type (IR.Column right_alias p.name) new_columns = new_left_columns + new_right_columns @@ -564,3 +561,38 @@ display_dataframe df indices_count all_rows_count format_terminal = missing_rows_count = all_rows_count - display_rows missing = '\n\u2026 and ' + missing_rows_count.to_text + ' hidden rows.' table + missing + +## PRIVATE + + Creates a list of non-colliding names by merging the two lists and + appending suffixes if necessary. + + If even after appending the suffixes it is impossible to have unique names, + it throws a panic. It returns two vectors, one for each input. It assumes + that the names within each argument itself are unique. +combine_names left_names right_names left_suffix right_suffix = + make_count_map names = + map = names.fold Map.empty acc-> name-> + count = acc.get_or_else name 0 + 1 + acc.insert name count + name-> map.get_or_else name 0 + original_names_count = make_count_map left_names+right_names + add_suffix_if_necessary suffix name = case original_names_count name > 1 of + True -> [name, name+suffix] + False -> [name, name] + left_pairs = left_names.map <| add_suffix_if_necessary left_suffix + right_pairs = right_names.map <| add_suffix_if_necessary right_suffix + + new_names_count = make_count_map (left_pairs+right_pairs . map .second) + catch_ambiguity pairs = pairs.each pair-> + original_name = pair.first + new_name = pair.second + case new_name!=original_name && (new_names_count new_name > 1) of + True -> + Panic.throw <| Illegal_State_Error "Duplicate column "+original_name+" was about to be renamed to "+new_name+" to disambiguate column names, but a column with name "+new_name+" already exists too. Please rename the columns before joining to avoid ambiguity." + False -> Nothing + catch_ambiguity left_pairs + catch_ambiguity right_pairs + new_left_names = left_pairs.map .second + new_right_names = right_pairs.map .second + [new_left_names, new_right_names] diff --git a/distribution/std-lib/Test/src/Test.enso b/distribution/std-lib/Test/src/Test.enso index e8cc515e482a8..8608c8a7e1b26 100644 --- a/distribution/std-lib/Test/src/Test.enso +++ b/distribution/std-lib/Test/src/Test.enso @@ -19,7 +19,7 @@ Spec.is_fail = this.behaviors.any .is_fail Suite.is_fail = this.specs.any .is_fail ## PRIVATE -type Finished_With_Error err +type Finished_With_Error err stack_trace ## PRIVATE type Matched_On_Error err @@ -163,20 +163,17 @@ specify label ~behavior pending=Nothing = ## PRIVATE run_spec ~behavior = - ## FIXME revert these lines recovery = Panic.recover <| - behavior.catch err-> Panic.throw (Finished_With_Error err) + result = behavior + result.catch err-> Panic.throw (Finished_With_Error err result.get_stack_trace_text) Nothing -# recovery = -# behavior.map_error Finished_With_Error maybeExc = case recovery of _ -> Success result = maybeExc.catch ex-> case ex of Failure _ -> ex - Finished_With_Error x -> - # TODO [RW] display at least original error location - Failure ("An unexpected error was returned: " + x.to_text) + Finished_With_Error err stack_trace_text -> + Failure ("An unexpected error was returned: " + err.to_text + '\n' + stack_trace_text) _ -> Failure ("An unexpected panic was thrown: " + ex.to_text + '\n' + maybeExc.get_stack_trace_text) result diff --git a/test/Database_Tests/src/Codegen_Spec.enso b/test/Database_Tests/src/Codegen_Spec.enso index 11bc1f5213d24..088c7bfbbcd6e 100644 --- a/test/Database_Tests/src/Codegen_Spec.enso +++ b/test/Database_Tests/src/Codegen_Spec.enso @@ -103,6 +103,20 @@ spec = r1 = t1.join (t3.set_index 'E') on='A' r1.to_sql.prepare . should_equal ['SELECT "T1"."A" AS "A_left", "T1"."B" AS "B", "T1"."C" AS "C", "T3"."A" AS "A_right", "T3"."F" AS "F" FROM (SELECT "T1"."A" AS "A", "T1"."B" AS "B", "T1"."C" AS "C" FROM "T1" AS "T1") AS "T1" LEFT JOIN (SELECT "T3"."E" AS "E", "T3"."A" AS "A", "T3"."F" AS "F" FROM "T3" AS "T3") AS "T3" ON ("T1"."A" = "T3"."E")', []] + Test.specify "should avoid duplicates when disambiguating column names" <| + connection = + table1 = ["T1", [["X", int], ["A", int], ["A_left", int]]] + table2 = ["T2", [["X", int], ["A", int], ["B", int]]] + tables = Map.from_vector [table1, table2] + Fake_Test_Connection.make Dialect.sqlite tables + t1 = connection.access_table "T1" + t2 = connection.access_table "T2" + (t1.set_index "X").join (t2.set_index "X") . should_fail_with Illegal_State_Error + + Test.specify "should ensure that name suffixes are distinct" <| + err = (t1.set_index 'A').join (t2.set_index 'D') left_suffix='foo' right_suffix='foo' + err . should_fail_with Illegal_State_Error + Test.specify "should correctly handle self-joins" <| r1 = t1.join (t1.set_index 'A') on='B' r1.to_sql.prepare . should_equal ['SELECT "T1_left"."A" AS "A", "T1_left"."B" AS "B_left", "T1_left"."C" AS "C_left", "T1_right"."B" AS "B_right", "T1_right"."C" AS "C_right" FROM (SELECT "T1"."A" AS "A", "T1"."B" AS "B", "T1"."C" AS "C" FROM "T1" AS "T1") AS "T1_left" LEFT JOIN (SELECT "T1"."A" AS "A", "T1"."B" AS "B", "T1"."C" AS "C" FROM "T1" AS "T1") AS "T1_right" ON ("T1_left"."B" = "T1_right"."A")', []] diff --git a/test/Database_Tests/src/Database_Spec.enso b/test/Database_Tests/src/Database_Spec.enso index ea171b84164b7..2d9345e045bcd 100644 --- a/test/Database_Tests/src/Database_Spec.enso +++ b/test/Database_Tests/src/Database_Spec.enso @@ -303,7 +303,6 @@ spec connection = df.insert [4, 4, 5.2, True, "baz"] df.insert [1, 5, 1.6, False, "spam"] - ord = [0, 3, 2, 4, 1] ints = [1, 2, 3, 4, 5] reals = [1.3, 4.6, 3.2, 5.2, 1.6] bools = [False, False, True, True, False] @@ -353,6 +352,6 @@ sqlite_spec = file.delete postgres_spec = - # TODO use env vars to read tmp DB config + # TODO [RW] use env vars to read tmp DB config IO.println "PostgreSQL test database is not configured, skipping." Nothing