Skip to content

Commit

Permalink
Improvements to Table.union (#9968)
Browse files Browse the repository at this point in the history
- Closes #9952
  • Loading branch information
radeusgd authored May 22, 2024
1 parent 517299b commit 1e0649f
Show file tree
Hide file tree
Showing 17 changed files with 774 additions and 517 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -666,6 +666,7 @@
- [Added `Text.cleanse` `Column.Text_Cleanse` and `Table.Text_Cleanse`][9879]
- [Added ability to save an existing Postgres connection as a Data Link in Enso
Cloud.][9957]
- [Improved `Table.union`.][9968]

[debug-shortcuts]:
https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
Expand Down Expand Up @@ -978,6 +979,7 @@
[9873]: https://github.com/enso-org/enso/pull/9873
[9879]: https://github.com/enso-org/enso/pull/9879
[9957]: https://github.com/enso-org/enso/pull/9957
[9968]: https://github.com/enso-org/enso/pull/9968

#### Enso Compiler

Expand Down
187 changes: 95 additions & 92 deletions distribution/lib/Standard/Database/0.0.0-dev/src/DB_Table.enso

Large diffs are not rendered by default.

11 changes: 9 additions & 2 deletions distribution/lib/Standard/Database/0.0.0-dev/src/Errors.enso
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,18 @@ type SQL_Error
Convert the SQL error to a textual representation.
to_text : Text
to_text self =
query = if self.related_query.is_nothing.not then " [Query was: " + self.related_query.to_display_text + "]" else ""
query = if self.related_query.is_nothing then "" else
query_text = self.related_query.to_text
## Our generated queries tend to be very long, so to keep them readable
we do not shorten them too aggressively. We still impose an upper limit to avoid unbounded error message size.
max_length = 1000
shortened_query_text = if query_text.length <= max_length then query_text else
query_text.take (Index_Sub_Range.First (max_length.div 2)) + " (...) " + query_text.take (Index_Sub_Range.Last (max_length.div 2))
" [Query was: " + shortened_query_text + "]"
message = self.java_exception.getMessage
max_length = 300
short_message = if message.length < max_length then message else
message.take (Index_Sub_Range.First max_length/2) + " (...) " + message.take (Index_Sub_Range.Last max_length/2)
message.take (Index_Sub_Range.First (max_length.div 2)) + " (...) " + message.take (Index_Sub_Range.Last (max_length.div 2))
"There was an SQL error: " + short_message + "." + query

## PRIVATE
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ type Postgres_Dialect

## PRIVATE
make_cast : Internal_Column -> SQL_Type -> (SQL_Expression -> SQL_Type_Reference) -> Internal_Column
make_cast self column target_type infer_result_type_from_database_callback =
make_cast self (column : Internal_Column) (target_type : SQL_Type) (infer_result_type_from_database_callback : SQL_Expression -> SQL_Type_Reference) =
mapping = self.get_type_mapping
source_type = mapping.sql_type_to_value_type column.sql_type_reference.get
target_value_type = mapping.sql_type_to_value_type target_type
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ type SQLite_Dialect

## PRIVATE
make_cast : Internal_Column -> SQL_Type -> (SQL_Expression -> SQL_Type_Reference) -> Internal_Column
make_cast self column target_type infer_result_type_from_database_callback =
make_cast self (column : Internal_Column) (target_type : SQL_Type) (infer_result_type_from_database_callback : SQL_Expression -> SQL_Type_Reference) =
_ = [infer_result_type_from_database_callback]
mapping = self.get_type_mapping
target_value_type = mapping.sql_type_to_value_type target_type
Expand Down
35 changes: 35 additions & 0 deletions distribution/lib/Standard/Table/0.0.0-dev/src/Columns_To_Keep.enso
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
from Standard.Base import Vector, Text
from Standard.Base.Metadata import make_single_choice, Widget

## Specifies which columns to keep in a union operation.
type Columns_To_Keep
## All columns are kept.

If a column is present only in some of the tables, it is padded with
`Nothing` for tables where it is missing.
In_Any

## Only columns that are present in all tables are kept.

If there are columns that are only present in some of the tables,
a problem is reported.
In_All

## Specific list of column names to keep.

If a table does not have a column that is specified in the list, it is
padded with `Nothing` and a problem is reported.

Arguments:
- column_names: The names of the columns to keep.
In_List (column_names : Vector Text)

## PRIVATE
Same as `In_Any`, but it will warn about columns that are not present in
all tables.
In_Any_Warn_On_Missing

## PRIVATE
The default widget for `Columns_To_Keep`.
It does not display the internal `In_Any_Warn_On_Missing` variant, since
that variant is only meant to be used as the default value.

The offered choices are built from the constructor names `In_Any`,
`In_All` and `In_List`, each paired with the same name prefixed with `..`.
default_widget -> Widget =
make_single_choice <|
["In_Any", "In_All", "In_List"].map c-> [c, ".."+c]
54 changes: 46 additions & 8 deletions distribution/lib/Standard/Table/0.0.0-dev/src/Errors.enso
Original file line number Diff line number Diff line change
Expand Up @@ -494,26 +494,42 @@ type Column_Type_Mismatch

type No_Common_Type
## PRIVATE
An error indicating that no common type could be found.
An error indicating that no common type could be found, and the operation
could not be performed.

Arguments:
- types: The types for which unification was attempted.
- related_column_name: The name of the resulting column that was being
unified, if applicable.
Error (types : Vector Value_Type) (related_column_name : Nothing|Text)

## PRIVATE
A warning indicating that no common type could be found, so the operation
had to fall back to converting all values to text.
Warning_Convert_To_Text (types : Vector Value_Type) (related_column_name:Text)

## PRIVATE

Create a human-readable version of the error.
to_display_text : Text
to_display_text self =
types = self.types.map .to_display_text . join ", "
prefix = "No common type was found for types: "+types
infix = case self.related_column_name of
column_name : Text -> " when unifying column ["+column_name+"]."
_ -> "."
suffix = " If you want to allow mixed types, please cast one of the columns to `Mixed` beforehand."
prefix + infix + suffix
location = case self.related_column_name of
column_name : Text -> " when unifying column ["+column_name+"]"
_ -> ""
suffix_type = case self of
No_Common_Type.Error _ _ -> "."
No_Common_Type.Warning_Convert_To_Text _ _ -> ", so the values were converted to text."
suffix_mixed = " If you want to have mixed types instead, please cast one of the columns to `Mixed` beforehand."
prefix + location + suffix_type + suffix_mixed

## PRIVATE
to_text self -> Text =
ctor = case self of
No_Common_Type.Error _ _ -> "Error"
No_Common_Type.Warning_Convert_To_Text _ _ -> "Warning_Convert_To_Text"
"No_Common_Type."+ctor+" "+self.types.to_text+" "+self.related_column_name.to_text

type Unmatched_Columns
## PRIVATE
Expand Down Expand Up @@ -637,9 +653,11 @@ type Conversion_Failure

type Loss_Of_Integer_Precision
## PRIVATE
Indicates that an automatic conversion of an integer column to a decimal
Indicates that an automatic conversion of an Integer column to a Float
column is losing precision because some of the large integers cannot be
exactly represented by the `double` type.
exactly represented by the floating-point type.

Currently, this error is only reported in-memory.
Warning (affected_rows_count : Integer) (example_value : Integer) (example_value_converted : Float)

## PRIVATE
Expand Down Expand Up @@ -834,3 +852,23 @@ type Nothing_Value_In_Filter_Condition
to_display_text : Text
to_display_text self =
"Using `Nothing` as an argument to a `"+self.filter_condition.to_text+"` cannot match anything."

## Indicates that different Date_Time (with or without timezone) or Date types
are mixed in the result, causing implicit coercions.

This is a warning, because using the `00:00` time and default time-zone may
not always be the expected choice, so the user should be aware of this.
type Mixing_Date_Time_Types
## PRIVATE
Date and Date_Time values were mixed, and the Date values were converted
to Date_Time by adding a time of 00:00 in the default time-zone.

Arguments:
- related_column_name: The name of the column in which the mixing
occurred, or `Nothing` if not applicable.
Date_To_Date_Time (related_column_name : Text | Nothing)

## PRIVATE
Date_Time values with and without timezone were mixed, and a default
timezone was assumed where it was missing.

Arguments:
- related_column_name: The name of the column in which the mixing
occurred, or `Nothing` if not applicable.
Implicit_Time_Zone (related_column_name : Text | Nothing)

## PRIVATE

Create a human-readable version of the warning.

The column location is included in the message only if
`related_column_name` is not `Nothing`.
to_display_text self -> Text =
location = if self.related_column_name.is_nothing then "" else " (in column ["+self.related_column_name+"])"
case self of
Mixing_Date_Time_Types.Date_To_Date_Time _ ->
"Mixing Date and Date_Time values"+location+": the Date values have been automatically converted to Date_Time by adding a time of 00:00 in the default time-zone."
Mixing_Date_Time_Types.Implicit_Time_Zone _ ->
"Mixing Date_Time values with and without timezone"+location+". A default timezone has been assumed where it was missing."
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ from Standard.Base.Data.Filter_Condition import sql_like_to_regex
from Standard.Base.Metadata.Choice import Option
from Standard.Base.Metadata.Widget import Multiple_Choice, Single_Choice

import project.Columns_To_Keep.Columns_To_Keep
import project.Excel.Excel_Range.Excel_Range
import project.Headers.Headers
import project.Internal.Excel_Reader
Expand Down Expand Up @@ -331,15 +332,15 @@ type Excel_Workbook
tables = sheet_names.map on_problems=on_problems address-> self.read address headers on_problems=on_problems
case return of
Return_As.Table_Of_Tables -> Table.new [["Sheet Name", sheet_names], ["Table", tables]]
Return_As.Merged_Table match ->
Return_As.Merged_Table columns_to_keep match ->
first_tbl = tables.find t-> t != Nothing
if first_tbl == Nothing then Error.throw (Illegal_Argument.Error "No valid sheets found.") else
unique = first_tbl.column_naming_helper.create_unique_name_strategy
tables.each tbl-> if tbl != Nothing then unique.mark_used tbl.column_names
new_column_name = unique.make_unique "Sheet Name"

with_names = tables.zip sheet_names tbl->name-> if tbl == Nothing then Nothing else tbl.set name new_column_name . reorder_columns [new_column_name]
result = Table.from_union (with_names.filter Filter_Condition.Not_Nothing) match keep_unmatched_columns=True
result = Table.from_union (with_names.filter Filter_Condition.Not_Nothing) columns_to_keep=columns_to_keep match_columns=match

problem_builder = Problem_Builder.new
problem_builder.report_unique_name_strategy unique
Expand All @@ -359,4 +360,4 @@ type Return_As
Table_Of_Tables

## All sheets are merged into a single table. A union operation is performed.
Merged_Table match:Match_Columns=Match_Columns.By_Name
Merged_Table (columns_to_keep : Columns_To_Keep = Columns_To_Keep.In_Any) (match : Match_Columns = Match_Columns.By_Name)
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import project.Expression.Expression
import project.Internal.Column_Naming_Helper.Column_Naming_Helper
import project.Internal.Problem_Builder.Problem_Builder
import project.Internal.Value_Type_Helpers
import project.Match_Columns.Column_Set
import project.Position.Position
import project.Set_Mode.Set_Mode
import project.Sort_Column.Sort_Column
Expand Down Expand Up @@ -530,30 +531,38 @@ is_column obj =
## PRIVATE
A helper method that resolves what should be the result type of a particular
column set based on the union settings.
unify_result_type_for_union column_set all_tables allow_type_widening problem_builder =
unify_result_type_for_union (column_set : Column_Set) (all_tables : Vector) (problem_builder : Problem_Builder) -> Union_Result_Type =
columns = column_set.resolve_columns all_tables
case allow_type_widening of
True ->
types = columns.filter Filter_Condition.Not_Nothing . map .value_type
common_type = Value_Type_Helpers.find_common_type types strict=True
if common_type.is_nothing then
problem_builder.report_other_warning (No_Common_Type.Error types related_column_name=column_set.name)
common_type
False ->
is_not_nothing c = case c of
Nothing -> False
_ -> True
first_column = columns.find is_not_nothing
first_type = first_column.value_type
if first_type == Value_Type.Mixed then Value_Type.Mixed else
first_wrong_column = columns.find if_missing=Nothing col->
is_not_nothing col && col.value_type != first_type
case first_wrong_column of
Nothing -> first_type
_ ->
got_type = first_wrong_column.value_type
problem_builder.report_other_warning (Column_Type_Mismatch.Error column_set.name first_type got_type)
Nothing
. filter Filter_Condition.Not_Nothing
types = columns.map .value_type

if types.is_empty then Union_Result_Type.No_Types_To_Unify else
## First we check if we can find a generic common type.
This includes widening numeric column sizes, or converting Integer to Float.
common_type = Value_Type_Helpers.find_common_type types strict=True
if common_type.is_nothing.not then Union_Result_Type.Common_Type common_type else
## Union has less strict requirements than other operations relying on `find_common_type`,
so if the common type was not found, we still check some fallbacks.
common_numeric_boolean = Value_Type_Helpers.find_common_numeric_boolean_type types
if common_numeric_boolean.is_nothing.not then Union_Result_Type.Common_Type common_numeric_boolean else
common_date_type = Value_Type_Helpers.find_common_date_types types column_set.name problem_builder
if common_date_type.is_nothing.not then Union_Result_Type.Common_Type common_date_type else
# Lastly, we fall back to text, reporting a warning.
problem_builder.report_other_warning (No_Common_Type.Warning_Convert_To_Text types column_set.name)
Union_Result_Type.Fallback_To_Text

## PRIVATE
Describes the outcome of resolving the result type of a column set in a
union operation, as returned by `unify_result_type_for_union`.
type Union_Result_Type
## PRIVATE
A common type was found for the columns being unified.

Arguments:
- value_type: The common type that was found.
Common_Type (value_type : Value_Type)

## PRIVATE
No common type could be found, so the result falls back to text.
`unify_result_type_for_union` reports a
`No_Common_Type.Warning_Convert_To_Text` warning when returning this case.
Fallback_To_Text

## PRIVATE
This case is returned if the requested column was missing from _all_ tables,
so there were no types to unify. An all-null column should be created.
No_Types_To_Unify

## PRIVATE
Replace a set of columns in the table with a new set of columns. The old
Expand Down
Loading

0 comments on commit 1e0649f

Please sign in to comment.