Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix a Bug in the Database Join Implementation #1614

Merged
merged 8 commits into from
Mar 25, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions distribution/std-lib/Standard/src/Database/Data/Internal/IR.enso
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,36 @@ type Context
set_limit new_limit =
Context this.from_spec this.where_filters this.orders this.groups this.meta_index new_limit

## PRIVATE

'Lifts' this context into a subquery, so that the original context (with all filters etc.) is
encapsulated within the subquery and all external references passed as the second argument,
refer directly to that subquery.

It takes a list of lists of columns that should be included in that subquery (this can for
example the list of regular columns, the list of indices etc.)
It assumes that columns on these lists all have unique names.

It returns a new context and the lists transformed in such a way that each column corresponds
to one from the original list but it is valid in the new context.

This is useful as a preprocessing step between combining queries, for example in a join.
# as_subquery : Text -> Vector (Vector Internal_Column) -> [IR.Sub_Query, Vector (Vector Internal_Column)]
as_subquery alias column_lists =
rewrite_internal_column : Internal_Column -> Internal_Column
rewrite_internal_column column =
Internal_Column column.name column.sql_type (IR.Column alias column.name)

new_columns = column_lists.map columns->
columns.map rewrite_internal_column

encapsulated_columns = column_lists.flat_map columns->
columns.map column-> [column.name, column.expression]
new_from = IR.Sub_Query encapsulated_columns this alias

[new_from, new_columns]


## PRIVATE

Used as part of the context, specifies the sources of the query.
Expand Down
171 changes: 115 additions & 56 deletions distribution/std-lib/Standard/src/Database/Data/Table.enso
Original file line number Diff line number Diff line change
Expand Up @@ -156,9 +156,9 @@ type Table
## UNSTABLE

Sets the index of this table, using the column with the provided name.
set_index : Text | Column | Vector (Text | Column) -> Table
set_index : Text | Column | Vector Text -> Table
set_index index = Panic.recover <|
new_index = (Helpers.unify_vector_singleton index).map (this.resolve >> .as_internal)
new_index = (Helpers.unify_vector_singleton index).map (this.at >> .as_internal)
new_ctx = this.context.set_index new_index
new_cols = this.internal_columns.filter col->
turned_into_index = new_index.exists i-> i.name == col.name
Expand Down Expand Up @@ -286,59 +286,79 @@ type Table
when there's a name conflict with a column of `other`.
- right_suffix: a suffix that should be added to the columns of `other`
when there's a name conflict with a column of `this`.
join : Table -> Nothing | Text | Column | Vector (Text | Column) -> Boolean -> Text -> Text -> Table
join other on=Nothing drop_unmatched=False left_suffix='_left' right_suffix='_right' = Panic.recover <|
Panic.rethrow (Helpers.ensure_name_is_sane left_suffix && Helpers.ensure_name_is_sane right_suffix)
if left_suffix == right_suffix then
Panic.throw <| Illegal_State_Error "left_suffix must be different from right_suffix"
kind = if drop_unmatched then IR.Join_Inner else IR.Join_Left
my_index : Vector Internal_Column
my_index = case on of
Nothing -> this.context.meta_index
_ ->
(Helpers.unify_vector_singleton on).map (this.resolve >> .as_internal)
other_index = other.context.meta_index
case my_index.length == other_index.length of
False -> Panic.throw <| Illegal_State_Error "Cannot join with multi-indexes of different lengths."
True ->
# TODO [RW] we may be able to avoid creating subqueries if there are no groups, orders or wheres,
# so it may be worth optimizing that here (#1515)
new_table_name = this.name + "_" + other.name
aliases = case this.name == other.name of
True -> [this.name+left_suffix, other.name+right_suffix]
False -> [this.name, other.name]
left_alias = aliases.first
right_alias = aliases.second

left_subquery_cols = this.internal_columns_with_index.map c-> [c.name, c.expression]
right_subquery_cols = other.internal_columns_with_index.map c-> [c.name, c.expression]
left_query = IR.Sub_Query left_subquery_cols this.context left_alias
right_query = IR.Sub_Query right_subquery_cols other.context right_alias

left_renamed_index = my_index.map <|
IR.lift_expression_map (IR.substitute_origin this.name left_alias)
right_renamed_index = other_index.map <|
IR.lift_expression_map (IR.substitute_origin other.name right_alias)
on_exprs = left_renamed_index.zip right_renamed_index l-> r->
IR.Operation "=" [l.expression, r.expression]

new_index = left_renamed_index
new_from = IR.Join kind left_query right_query on_exprs
new_limit = Nothing
new_ctx = IR.Context new_from [] [] [] new_index new_limit

new_names = here.combine_names (this.internal_columns.map .name) (other.internal_columns.map .name) left_suffix right_suffix
left_names = new_names.first
right_names = new_names.second

new_left_columns = this.internal_columns.zip left_names p-> new_name->
Internal_Column new_name p.sql_type (IR.Column left_alias p.name)
new_right_columns = other.internal_columns.zip right_names p-> new_name->
Internal_Column new_name p.sql_type (IR.Column right_alias p.name)

new_columns = new_left_columns + new_right_columns

Table new_table_name this.connection new_columns new_ctx
join : Table | Column -> Nothing | Text | Column | Vector (Text | Column) -> Boolean -> Text -> Text -> Table
join other on=Nothing drop_unmatched=False left_suffix='_left' right_suffix='_right' = case other of
Column _ _ _ _ _ -> this.join other.to_table on drop_unmatched left_suffix right_suffix
Table _ _ _ _ -> Panic.recover <|
Panic.rethrow (Helpers.ensure_name_is_sane left_suffix && Helpers.ensure_name_is_sane right_suffix)
if left_suffix == right_suffix then
Panic.throw <| Illegal_State_Error "left_suffix must be different from right_suffix"
kind = if drop_unmatched then IR.Join_Inner else IR.Join_Left

# Prepare the left and right pairs of indices along which the join will be performed.
left_join_index : Vector Internal_Column
left_join_index = case on of
Nothing -> this.context.meta_index
_ ->
(Helpers.unify_vector_singleton on).map (this.resolve >> .as_internal)
right_join_index = other.context.meta_index
if left_join_index.length != right_join_index.length then
Panic.throw <| Illegal_State_Error "Cannot join with multi-indexes of different lengths."

# TODO [RW] We may be able to avoid creating subqueries if there are no groups, orders or wheres,
# so it may be worth optimizing that here (#1515).
new_table_name = this.name + "_" + other.name
aliases = case this.name == other.name of
True -> [this.name+left_suffix, other.name+right_suffix]
False -> [this.name, other.name]
left_alias = aliases.first
right_alias = aliases.second

# Ensure that the join indices (which are not directly visible to the user, but must be materialized in the sub-query)
# get a fresh set of names, so that they do not collide with other parts of the query.
left_used_names = this.internal_columns_with_index.map .name
left_join_index_fresh = here.freshen_columns left_used_names left_join_index

# Create subqueries that encapsulate the original queries and provide needed columns.
# We only include the meta_index from the left table, because only this one will be kept in the result.
# The generated new sets of columns refer to the encapsulated expressions within the subquery and are
# valid in contexts whose from_spec is this subquery directly or it is a join containing this subquery.
# TODO [RW] Not all of these included columns are actually usable from the external context, so
# in the future we may consider pruning some of them as additional optimization and simplification of the query.
left_config = this.context.as_subquery left_alias [this.internal_columns, this.context.meta_index, left_join_index_fresh]
right_config = other.context.as_subquery right_alias [other.internal_columns, right_join_index]

left_subquery = left_config.first
left_new_columns = left_config.second.at 0
left_new_meta_index = left_config.second.at 1
left_new_join_index = left_config.second.at 2

right_subquery = right_config.first
right_new_columns = right_config.second.at 0
right_new_join_index = right_config.second.at 1

# Generate new names for all columns (including the indices) that will be retained in the created Table.
left_names_before = (left_new_meta_index + left_new_columns).map .name
right_names_before = right_new_columns.map .name
new_names = here.combine_names left_names_before right_names_before left_suffix right_suffix
left_indices_count = left_new_meta_index.length
left_new_meta_index_names = new_names.first.take_start left_indices_count
left_new_columns_names = new_names.first.drop_start left_indices_count
right_new_columns_names = new_names.second

# Rename columns to the newly allocated names
new_index = here.rename_columns left_new_meta_index left_new_meta_index_names
left_renamed_columns = here.rename_columns left_new_columns left_new_columns_names
right_renamed_columns = here.rename_columns right_new_columns right_new_columns_names
new_columns = left_renamed_columns + right_renamed_columns

on_exprs = left_new_join_index.zip right_new_join_index l-> r->
IR.Operation "=" [l.expression, r.expression]
new_from = IR.Join kind left_subquery right_subquery on_exprs
new_limit = Nothing
new_ctx = IR.Context new_from [] [] [] new_index new_limit

Table new_table_name this.connection new_columns new_ctx

## UNSTABLE

Expand Down Expand Up @@ -483,6 +503,7 @@ type Table
internal_columns_with_index =
this.context.meta_index + this.internal_columns


## PRIVATE

Inserts a new row to the table. It actually modifies the underlying table
Expand Down Expand Up @@ -552,7 +573,10 @@ type Aggregate_Table
ungrouped : Table
ungrouped =
new_ctx = this.context.set_groups []
Table this.name this.connection this.internal_columns new_ctx
new_cols = this.internal_columns.filter col->
turned_into_index = this.context.meta_index.exists i-> i.name == col.name
turned_into_index.not
Table this.name this.connection new_cols new_ctx

type Integrity_Error
## UNSTABLE
Expand Down Expand Up @@ -641,3 +665,38 @@ combine_names left_names right_names left_suffix right_suffix =
new_left_names = left_pairs.map .second
new_right_names = right_pairs.map .second
[new_left_names, new_right_names]

## PRIVATE

Transforms `preferred_names` names in such a way to not collide with `used_names`. If a name
from `preferred_names` does not collide with others, it is kept as is, otherwise numerical
suffixes are added.
fresh_names : Vector Text -> Vector Text -> Vector Text
fresh_names used_names preferred_names =
freshen currently_used name ix =
new_name = if ix == 0 then name else name+"_"+ix.to_text
case currently_used.contains new_name of
False -> new_name
True -> freshen currently_used name ix+1
res = preferred_names . fold [used_names, []] acc-> name->
used = acc.first
new_name = freshen used name 0
[used_names + [new_name], acc.second + [new_name]]
res.second

## PRIVATE

Transforms the vector of columns, changing names of each column to the corresponding name from the second vector.
rename_columns : Vector Internal_Column -> Vector Text -> Vector Internal_Column
rename_columns columns new_names =
columns.zip new_names col-> name->
col.rename name

## PRIVATE

Ensures that the provided columns do not clash with the vector of names provided as first argument.
Original column names are kept if possible, but if they would clash, the columns are renamed.
freshen_columns : Vector Text -> Vector Column -> Vector Column
freshen_columns used_names columns =
fresh_names = here.fresh_names used_names (columns.map .name)
here.rename_columns columns fresh_names
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,14 @@ prepare_visualization x max_rows = Helpers.recover_errors <| case x of
Database_Column.Aggregate_Column _ _ _ _ _ ->
here.prepare_visualization x.ungrouped.to_table max_rows

# TODO [RW] Should we truncate Vectors?
# We also visualize Vectors and arrays
Vector.Vector _ ->
truncated = x.take_start max_rows
Json.from_pairs [["json", truncated], ["all_rows_count", x.length]] . to_text
Array ->
here.prepare_visualization (Vector.Vector x) max_rows

# Anything else will be visualized with the JSON or matrix visualization
_ ->
Json.from_pairs [["json", x]] . to_text
Expand Down
27 changes: 23 additions & 4 deletions test/Database_Tests/src/Codegen_Spec.enso
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ from Standard.Base import all

import Database_Tests.Helpers.Fake_Test_Connection
import Standard.Database.Data.Dialect
import Standard.Database.Data.Table as Table_Module
import Standard.Test

from Standard.Database import all
Expand Down Expand Up @@ -96,15 +97,19 @@ spec =
t3 = test_connection.access_table "T3"
Test.specify "should allow joining tables index-on-index" <|
r1 = t1.set_index 'A' . join (t2.set_index 'D')
r1.to_sql.prepare . should_equal ['SELECT "T1"."B" AS "B", "T1"."C" AS "C", "T2"."E" AS "E", "T2"."F" AS "F" FROM (SELECT "T1"."A" AS "A", "T1"."B" AS "B", "T1"."C" AS "C" FROM "T1" AS "T1") AS "T1" LEFT JOIN (SELECT "T2"."D" AS "D", "T2"."E" AS "E", "T2"."F" AS "F" FROM "T2" AS "T2") AS "T2" ON ("T1"."A" = "T2"."D")', []]
r1.to_sql.prepare . should_equal ['SELECT "T1"."B" AS "B", "T1"."C" AS "C", "T2"."E" AS "E", "T2"."F" AS "F" FROM (SELECT "T1"."B" AS "B", "T1"."C" AS "C", "T1"."A" AS "A", "T1"."A" AS "A_1" FROM "T1" AS "T1") AS "T1" LEFT JOIN (SELECT "T2"."E" AS "E", "T2"."F" AS "F", "T2"."D" AS "D" FROM "T2" AS "T2") AS "T2" ON ("T1"."A_1" = "T2"."D")', []]

Test.specify "should allow joining tables column-on-index" <|
r1 = t1.join (t2.set_index 'D') on='B' drop_unmatched=True
r1.to_sql.prepare . should_equal ['SELECT "T1"."A" AS "A", "T1"."B" AS "B", "T1"."C" AS "C", "T2"."E" AS "E", "T2"."F" AS "F" FROM (SELECT "T1"."A" AS "A", "T1"."B" AS "B", "T1"."C" AS "C" FROM "T1" AS "T1") AS "T1" INNER JOIN (SELECT "T2"."D" AS "D", "T2"."E" AS "E", "T2"."F" AS "F" FROM "T2" AS "T2") AS "T2" ON ("T1"."B" = "T2"."D")', []]
r1.to_sql.prepare . should_equal ['SELECT "T1"."A" AS "A", "T1"."B" AS "B", "T1"."C" AS "C", "T2"."E" AS "E", "T2"."F" AS "F" FROM (SELECT "T1"."A" AS "A", "T1"."B" AS "B", "T1"."C" AS "C", "T1"."B" AS "B_1" FROM "T1" AS "T1") AS "T1" INNER JOIN (SELECT "T2"."E" AS "E", "T2"."F" AS "F", "T2"."D" AS "D" FROM "T2" AS "T2") AS "T2" ON ("T1"."B_1" = "T2"."D")', []]

Test.specify "should append suffixes to disambiguate column names" <|
r1 = t1.join (t3.set_index 'E') on='A'
r1.to_sql.prepare . should_equal ['SELECT "T1"."A" AS "A_left", "T1"."B" AS "B", "T1"."C" AS "C", "T3"."A" AS "A_right", "T3"."F" AS "F" FROM (SELECT "T1"."A" AS "A", "T1"."B" AS "B", "T1"."C" AS "C" FROM "T1" AS "T1") AS "T1" LEFT JOIN (SELECT "T3"."E" AS "E", "T3"."A" AS "A", "T3"."F" AS "F" FROM "T3" AS "T3") AS "T3" ON ("T1"."A" = "T3"."E")', []]
r1.to_sql.prepare . should_equal ['SELECT "T1"."A" AS "A_left", "T1"."B" AS "B", "T1"."C" AS "C", "T3"."A" AS "A_right", "T3"."F" AS "F" FROM (SELECT "T1"."A" AS "A", "T1"."B" AS "B", "T1"."C" AS "C", "T1"."A" AS "A_1" FROM "T1" AS "T1") AS "T1" LEFT JOIN (SELECT "T3"."A" AS "A", "T3"."F" AS "F", "T3"."E" AS "E" FROM "T3" AS "T3") AS "T3" ON ("T1"."A_1" = "T3"."E")', []]

r2 = (t3.set_index 'E').join (t2.set_index 'F')
r2.index.name . should_equal "E_left"
r2.columns . map .name . should_equal ["A", "F", "D", "E_right"]

Test.specify "should avoid duplicates when disambiguating column names" <|
connection =
Expand All @@ -122,7 +127,7 @@ spec =

Test.specify "should correctly handle self-joins" <|
r1 = t1.join (t1.set_index 'A') on='B'
r1.to_sql.prepare . should_equal ['SELECT "T1_left"."A" AS "A", "T1_left"."B" AS "B_left", "T1_left"."C" AS "C_left", "T1_right"."B" AS "B_right", "T1_right"."C" AS "C_right" FROM (SELECT "T1"."A" AS "A", "T1"."B" AS "B", "T1"."C" AS "C" FROM "T1" AS "T1") AS "T1_left" LEFT JOIN (SELECT "T1"."A" AS "A", "T1"."B" AS "B", "T1"."C" AS "C" FROM "T1" AS "T1") AS "T1_right" ON ("T1_left"."B" = "T1_right"."A")', []]
r1.to_sql.prepare . should_equal ['SELECT "T1_left"."A" AS "A", "T1_left"."B" AS "B_left", "T1_left"."C" AS "C_left", "T1_right"."B" AS "B_right", "T1_right"."C" AS "C_right" FROM (SELECT "T1"."A" AS "A", "T1"."B" AS "B", "T1"."C" AS "C", "T1"."B" AS "B_1" FROM "T1" AS "T1") AS "T1_left" LEFT JOIN (SELECT "T1"."B" AS "B", "T1"."C" AS "C", "T1"."A" AS "A" FROM "T1" AS "T1") AS "T1_right" ON ("T1_left"."B_1" = "T1_right"."A")', []]

Test.group "[Codegen] Handling Missing Values" <|
Test.specify "fill_missing should allow to replace missing values in a column with a constant" <|
Expand Down Expand Up @@ -177,3 +182,17 @@ spec =
Test.specify 'should return dataflow error when passed a non-existent column' <|
r = t1.sort by='foobar'
r.should_fail_with No_Such_Column_Error

Test.group "Helpers" <|
Test.specify "combine_names should combine lists of names" <|
v1 = ["A", "B"]
v2 = ["A", "C"]
combined = Table_Module.combine_names v1 v2 "_1" "_2"
combined.first . should_equal ["A_1", "B"]
combined.second . should_equal ["A_2", "C"]

Test.expect_panic_with (Table_Module.combine_names ["A", "A_1"] ["A"] "_1" "_2") Illegal_State_Error
Test.specify "fresh_names should provide fresh names" <|
used_names = ["A", "A_1"]
preferred_names = ["A", "A", "B"]
Table_Module.fresh_names used_names preferred_names . should_equal ["A_2", "A_3", "B"]
Loading