Skip to content

Commit

Permalink
Fix edge cases with name disambiguation on join
Browse files Browse the repository at this point in the history
  • Loading branch information
radeusgd committed Mar 5, 2021
1 parent a9d25ff commit e8550ea
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 22 deletions.
56 changes: 44 additions & 12 deletions distribution/std-lib/Database/src/Data/Table.enso
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ type Table
- format_terminal: whether ANSI-terminal formatting should be used
display : Integer -> Boolean -> Text
display show_rows=10 format_terminal=False =
# TODO [RW] should I display the row with numbers if no index is set as in dataframes??
df = this.reset_index.to_dataframe max_rows=show_rows
indices_count = this.context.meta_index.length
all_rows_count = this.row_count
Expand Down Expand Up @@ -275,6 +274,8 @@ type Table
join : Table -> Nothing | Text | Column | Vector (Text | Column) -> Boolean -> Text -> Text -> Table
join other on=Nothing drop_unmatched=False left_suffix='_left' right_suffix='_right' = Panic.recover <|
Panic.rethrow (Helpers.ensure_name_is_sane left_suffix && Helpers.ensure_name_is_sane right_suffix)
if left_suffix == right_suffix then
Panic.throw <| Illegal_State_Error "left_suffix must be different from right_suffix"
kind = if drop_unmatched then IR.Join_Inner else IR.Join_Left
my_index : Vector Internal_Column
my_index = case on of
Expand Down Expand Up @@ -312,18 +313,14 @@ type Table
new_limit = Nothing
new_ctx = IR.Context new_from [] [] [] new_index new_limit

# TODO [RW, MK] what if we have name conflict because both tables contain 'A', but then also one of them contains 'A'+suffix ??
left_names = Map.from_vector (this.internal_columns.map (p -> [p.name, True]))
right_names = Map.from_vector (other.internal_columns.map (p -> [p.name, True]))
rename suffix other_names this_name =
if other_names.get_or_else this_name False then this_name+suffix else this_name
rename_left = rename left_suffix right_names
rename_right = rename right_suffix left_names
new_names = here.combine_names (this.internal_columns.map .name) (other.internal_columns.map .name) left_suffix right_suffix
left_names = new_names.first
right_names = new_names.second

new_left_columns = this.internal_columns.map p->
Internal_Column (rename_left p.name) p.sql_type (IR.Column left_alias p.name)
new_right_columns = other.internal_columns.map p->
Internal_Column (rename_right p.name) p.sql_type (IR.Column right_alias p.name)
new_left_columns = this.internal_columns.zip left_names p-> new_name->
Internal_Column new_name p.sql_type (IR.Column left_alias p.name)
new_right_columns = other.internal_columns.zip right_names p-> new_name->
Internal_Column new_name p.sql_type (IR.Column right_alias p.name)

new_columns = new_left_columns + new_right_columns

Expand Down Expand Up @@ -564,3 +561,38 @@ display_dataframe df indices_count all_rows_count format_terminal =
missing_rows_count = all_rows_count - display_rows
missing = '\n\u2026 and ' + missing_rows_count.to_text + ' hidden rows.'
table + missing

## PRIVATE

Creates a list of non-colliding names by merging the two lists and
appending suffixes if necessary.

If even after appending the suffixes it is impossible to have unique names,
it throws a panic. It returns two vectors, one for each input. It assumes
that the names within each argument itself are unique.
combine_names left_names right_names left_suffix right_suffix =
make_count_map names =
map = names.fold Map.empty acc-> name->
count = acc.get_or_else name 0 + 1
acc.insert name count
name-> map.get_or_else name 0
original_names_count = make_count_map left_names+right_names
add_suffix_if_necessary suffix name = case original_names_count name > 1 of
True -> [name, name+suffix]
False -> [name, name]
left_pairs = left_names.map <| add_suffix_if_necessary left_suffix
right_pairs = right_names.map <| add_suffix_if_necessary right_suffix

new_names_count = make_count_map (left_pairs+right_pairs . map .second)
catch_ambiguity pairs = pairs.each pair->
original_name = pair.first
new_name = pair.second
case new_name!=original_name && (new_names_count new_name > 1) of
True ->
Panic.throw <| Illegal_State_Error "Duplicate column "+original_name+" was about to be renamed to "+new_name+" to disambiguate column names, but a column with name "+new_name+" already exists too. Please rename the columns before joining to avoid ambiguity."
False -> Nothing
catch_ambiguity left_pairs
catch_ambiguity right_pairs
new_left_names = left_pairs.map .second
new_right_names = right_pairs.map .second
[new_left_names, new_right_names]
13 changes: 5 additions & 8 deletions distribution/std-lib/Test/src/Test.enso
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ Spec.is_fail = this.behaviors.any .is_fail
Suite.is_fail = this.specs.any .is_fail

## PRIVATE
type Finished_With_Error err
type Finished_With_Error err stack_trace

## PRIVATE
type Matched_On_Error err
Expand Down Expand Up @@ -163,20 +163,17 @@ specify label ~behavior pending=Nothing =

## PRIVATE
run_spec ~behavior =
## FIXME revert these lines
recovery = Panic.recover <|
behavior.catch err-> Panic.throw (Finished_With_Error err)
result = behavior
result.catch err-> Panic.throw (Finished_With_Error err result.get_stack_trace_text)
Nothing
# recovery =
# behavior.map_error Finished_With_Error
maybeExc = case recovery of
_ -> Success
result = maybeExc.catch ex->
case ex of
Failure _ -> ex
Finished_With_Error x ->
# TODO [RW] display at least original error location
Failure ("An unexpected error was returned: " + x.to_text)
Finished_With_Error err stack_trace_text ->
Failure ("An unexpected error was returned: " + err.to_text + '\n' + stack_trace_text)
_ -> Failure ("An unexpected panic was thrown: " + ex.to_text + '\n' + maybeExc.get_stack_trace_text)
result

Expand Down
14 changes: 14 additions & 0 deletions test/Database_Tests/src/Codegen_Spec.enso
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,20 @@ spec =
r1 = t1.join (t3.set_index 'E') on='A'
r1.to_sql.prepare . should_equal ['SELECT "T1"."A" AS "A_left", "T1"."B" AS "B", "T1"."C" AS "C", "T3"."A" AS "A_right", "T3"."F" AS "F" FROM (SELECT "T1"."A" AS "A", "T1"."B" AS "B", "T1"."C" AS "C" FROM "T1" AS "T1") AS "T1" LEFT JOIN (SELECT "T3"."E" AS "E", "T3"."A" AS "A", "T3"."F" AS "F" FROM "T3" AS "T3") AS "T3" ON ("T1"."A" = "T3"."E")', []]

Test.specify "should avoid duplicates when disambiguating column names" <|
connection =
table1 = ["T1", [["X", int], ["A", int], ["A_left", int]]]
table2 = ["T2", [["X", int], ["A", int], ["B", int]]]
tables = Map.from_vector [table1, table2]
Fake_Test_Connection.make Dialect.sqlite tables
t1 = connection.access_table "T1"
t2 = connection.access_table "T2"
(t1.set_index "X").join (t2.set_index "X") . should_fail_with Illegal_State_Error

Test.specify "should ensure that name suffixes are distinct" <|
err = (t1.set_index 'A').join (t2.set_index 'D') left_suffix='foo' right_suffix='foo'
err . should_fail_with Illegal_State_Error

Test.specify "should correctly handle self-joins" <|
r1 = t1.join (t1.set_index 'A') on='B'
r1.to_sql.prepare . should_equal ['SELECT "T1_left"."A" AS "A", "T1_left"."B" AS "B_left", "T1_left"."C" AS "C_left", "T1_right"."B" AS "B_right", "T1_right"."C" AS "C_right" FROM (SELECT "T1"."A" AS "A", "T1"."B" AS "B", "T1"."C" AS "C" FROM "T1" AS "T1") AS "T1_left" LEFT JOIN (SELECT "T1"."A" AS "A", "T1"."B" AS "B", "T1"."C" AS "C" FROM "T1" AS "T1") AS "T1_right" ON ("T1_left"."B" = "T1_right"."A")', []]
Expand Down
3 changes: 1 addition & 2 deletions test/Database_Tests/src/Database_Spec.enso
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,6 @@ spec connection =
df.insert [4, 4, 5.2, True, "baz"]
df.insert [1, 5, 1.6, False, "spam"]

ord = [0, 3, 2, 4, 1]
ints = [1, 2, 3, 4, 5]
reals = [1.3, 4.6, 3.2, 5.2, 1.6]
bools = [False, False, True, True, False]
Expand Down Expand Up @@ -353,6 +352,6 @@ sqlite_spec =
file.delete

postgres_spec =
# TODO use env vars to read tmp DB config
# TODO [RW] use env vars to read tmp DB config
IO.println "PostgreSQL test database is not configured, skipping."
Nothing

0 comments on commit e8550ea

Please sign in to comment.