Skip to content

Commit

Permalink
Adjust {Table|Column}.parse to use Value_Type (#6213)
Browse files Browse the repository at this point in the history
Closes #5660
  • Loading branch information
radeusgd authored and MichaelMauderer committed Apr 12, 2023
1 parent 9f2dc03 commit 11049ff
Show file tree
Hide file tree
Showing 13 changed files with 223 additions and 170 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -902,7 +902,7 @@ type Column

## Parsing values is not supported in database columns.
@type Widget_Helpers.parse_type_selector
parse : (Auto|Integer|Decimal|Date|Date_Time|Time_Of_Day|Boolean) -> Text | Data_Formatter -> Problem_Behavior -> Column
parse : Value_Type | Auto -> Text | Data_Formatter -> Problem_Behavior -> Column
parse self type=Auto format=Data_Formatter.Value on_problems=Report_Warning =
_ = [type, format, on_problems]
Error.throw <| Unsupported_Database_Operation.Error "`Column.parse` is not implemented yet for the Database backends."
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1382,8 +1382,8 @@ type Table

## Parsing values is not supported in database tables, the table has to be
loaded into memory first with `read`.
parse_values : Text | Integer | Column_Selector | Vector (Text | Integer | Column_Selector) -> (Auto|Integer|Decimal|Date|Date_Time|Time_Of_Day|Boolean) -> Text | Data_Formatter -> Boolean -> Problem_Behavior -> Table
parse_values columns=(self.columns . filter (c-> c.value_type.is_text) . map .name) type=Auto format=Data_Formatter.Value error_on_missing_columns=True on_problems=Report_Warning =
parse : Text | Integer | Column_Selector | Vector (Text | Integer | Column_Selector) -> Value_Type | Auto -> Text | Data_Formatter -> Boolean -> Problem_Behavior -> Table
parse columns=(self.columns . filter (c-> c.value_type.is_text) . map .name) type=Auto format=Data_Formatter.Value error_on_missing_columns=True on_problems=Report_Warning =
## Avoid unused arguments warning. We cannot rename arguments to `_`,
because we need to keep the API consistent with the in-memory table.
_ = [columns, type, format, error_on_missing_columns, on_problems]
Expand Down
10 changes: 5 additions & 5 deletions distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ import project.Internal.Widget_Helpers

from project.Data.Table import print_table
from project.Data.Type.Value_Type import Value_Type, Auto
from project.Data.Type.Value_Type_Helpers import ensure_valid_parse_target
from project.Errors import No_Index_Set_Error, Floating_Point_Equality, Invalid_Value_Type

polyglot java import org.enso.table.data.column.operation.map.MapOperationProblemBuilder
Expand Down Expand Up @@ -1029,15 +1028,16 @@ type Column

example_contains = Examples.text_column_1.parse Boolean 'Yes|No'
@type Widget_Helpers.parse_type_selector
parse : (Auto|Integer|Decimal|Date|Date_Time|Time_Of_Day|Boolean) -> Text | Data_Formatter -> Problem_Behavior -> Column
parse : Value_Type | Auto -> Text | Data_Formatter -> Problem_Behavior -> Column
parse self type=Auto format=Data_Formatter.Value on_problems=Report_Warning =
Value_Type.expect_text self.value_type related_column=self.name <| ensure_valid_parse_target type <|
Value_Type.expect_text self.value_type related_column=self.name <|
formatter = case format of
_ : Text ->
Data_Formatter.Value.with_format type format
_ -> format
_ : Data_Formatter -> format
_ -> Error.throw (Illegal_Argument.Error "Invalid format type. Expected Text or Data_Formatter.")

parser = if type == Auto then formatter.make_auto_parser else formatter.make_datatype_parser type
parser = formatter.make_value_type_parser type
storage = self.java_column.getStorage
new_storage_and_problems = parser.parseColumn self.name storage

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ from Standard.Base import all
import Standard.Base.Errors.Illegal_Argument.Illegal_Argument

import project.Internal.Parse_Values_Helper
from project.Data.Type.Value_Type import Value_Type, Auto
from project.Data.Type.Value_Type import Value_Type, Auto, Bits

polyglot java import org.enso.table.parsing.IntegerParser
polyglot java import org.enso.table.parsing.DecimalParser
Expand Down Expand Up @@ -67,16 +67,15 @@ type Data_Formatter

Arguments:
- text: Text value to parse.
- datatype: Text value to parse.
- datatype: The expected Enso type to parse the value into. If set to
`Auto`, the type will be inferred automatically.
- on_problems: Specifies the behavior when a problem occurs.
By default, a warning is issued, but the operation proceeds.
If set to `Report_Error`, the operation fails with a dataflow error.
If set to `Ignore`, the operation proceeds without errors or warnings.
parse : Text -> (Auto|Integer|Number|Date|Date_Time|Time_Of_Day|Boolean) -> Problem_Behavior -> Any
parse self text datatype=Auto on_problems=Problem_Behavior.Report_Warning =
parser = case datatype of
Auto -> self.make_auto_parser
_ -> self.make_datatype_parser datatype
parser = self.make_datatype_parser datatype
result = parser.parseIndependentValue text
problems = Vector.from_polyglot_array result.problems . map (Parse_Values_Helper.translate_parsing_problem datatype)
on_problems.attach_problems_after result.value problems
Expand Down Expand Up @@ -145,23 +144,25 @@ type Data_Formatter
It is mostly a convenience function to easily specify a datatype format.

Arguments:
- type: The datatype for which to change the format. The format can be
changed only for Date_Time, Date, Time_Of_Day and Boolean types.
- type: The value type for which to change the format. The format can be
changed only for `Date_Time`, `Date`, `Time` and `Boolean` value types.
- format: The new format string to set. For dates, it is the usual date
format notation, and for booleans it should be two values that
represent true and false, separated by a `|`.
with_format : (Auto|Integer|Number|Date|Date_Time|Time_Of_Day|Boolean) -> Text -> Data_Formatter
with_format : Value_Type | Auto -> Text -> Data_Formatter
with_format self type format = case type of
Auto -> Error.throw (Illegal_Argument.Error "Cannot specify a `format` with type `Auto`.")
Integer -> Error.throw (Illegal_Argument.Error "Cannot specify a `format` with type `Integer`.")
Decimal -> Error.throw (Illegal_Argument.Error "Cannot specify a `format` with type `Decimal`.")
Date -> self.with_datetime_formats date_formats=[format]
Date_Time -> self.with_datetime_formats datetime_formats=[format]
Time_Of_Day -> self.with_datetime_formats time_formats=[format]
Boolean ->
Value_Type.Date -> self.with_datetime_formats date_formats=[format]
Value_Type.Time -> self.with_datetime_formats time_formats=[format]
Value_Type.Date_Time _ ->
self.with_datetime_formats datetime_formats=[format]
Value_Type.Boolean ->
formats = format.split "|"
if formats.length != 2 then Error.throw (Illegal_Argument.Error "The `format` for Booleans must be a string with two values separated by `|`, for example: 'Yes|No'.") else
self.with_boolean_values true_values=[formats.at 0] false_values=[formats.at 1]
Auto ->
Error.throw (Illegal_Argument.Error "Cannot specify a `format` with type `Auto`.")
_ : Value_Type ->
Error.throw (Illegal_Argument.Error "Cannot specify a `format` for type `"+type.to_text+"`.")

## PRIVATE
Clone the instance with some properties overridden.
Expand Down Expand Up @@ -216,7 +217,26 @@ type Data_Formatter
Date -> self.make_date_parser
Date_Time -> self.make_date_time_parser
Time_Of_Day -> self.make_time_of_day_parser
_ -> Error.throw (Illegal_Argument.Error "Unsupported datatype: "+datatype.to_text)
Auto -> self.make_auto_parser
_ ->
type_name = case datatype.to_text of
text : Text -> text
_ -> Meta.meta datatype . to_text
Error.throw (Illegal_Argument.Error "Unsupported datatype: "+type_name)

## PRIVATE
make_value_type_parser self value_type = case value_type of
# Selects the parser for the requested parse target. Each supported
# `Value_Type` (or `Auto`) maps to one `make_*_parser` call on this formatter;
# every other value ends in the catch-all branch, which throws
# `Illegal_Argument`.
# TODO once we implement #5159 we will need to add checks for bounds here and support 16/32-bit ints
Value_Type.Integer Bits.Bits_64 -> self.make_integer_parser
# TODO once we implement #6109 we can support 32-bit floats
Value_Type.Float Bits.Bits_64 -> self.make_decimal_parser
Value_Type.Boolean -> self.make_boolean_parser
Value_Type.Date -> self.make_date_parser
# Only `Date_Time True` is matched; `Date_Time False` falls through to the
# catch-all branch below.
# NOTE(review): the `True` argument is presumably the "with timezone" flag of
# `Value_Type.Date_Time` - confirm against the `Value_Type` definition.
Value_Type.Date_Time True -> self.make_date_time_parser
Value_Type.Time -> self.make_time_of_day_parser
Auto -> self.make_auto_parser
_ ->
# Anything not listed above (including `Integer`/`Float` with a bit width
# other than `Bits_64`) is rejected, with the value type named in the message.
Error.throw (Illegal_Argument.Error "Unsupported value type: "+value_type.to_display_text)

## PRIVATE
get_specific_type_parsers self =
Expand Down
19 changes: 9 additions & 10 deletions distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ import project.Data.Expression.Expression_Error
import project.Delimited.Delimited_Format.Delimited_Format

from project.Data.Type.Value_Type import Value_Type, Auto
from project.Data.Type.Value_Type_Helpers import ensure_valid_parse_target
from project.Internal.Rows_View import Rows_View
from project.Errors import all

Expand Down Expand Up @@ -796,31 +795,31 @@ type Table
> Example
Parse the first and last columns containing Yes/No values as booleans.

table.parse_values columns=[0, -1] type=Boolean format="Yes|No"
table.parse columns=[0, -1] type=Boolean format="Yes|No"

> Example
Parse dates in a column in the format `yyyy-MM-dd` (the default format).

table.parse_values "birthday" Date
table.parse "birthday" Date

> Example
Parse dates in a column in the format `dd/MM/yyyy`.

table.parse_values "birthday" Date 'dd/MM/yyyy'
table.parse "birthday" Date 'dd/MM/yyyy'

> Example
Parse all columns inferring their types, using `,` as the decimal point for numbers.

table.parse_values format=(Data_Formatter.Value.with_number_formatting decimal_point=',')
parse_values : Text | Integer | Column_Selector | Vector (Text | Integer | Column_Selector) -> (Auto|Integer|Decimal|Date|Date_Time|Time_Of_Day|Boolean) -> Text | Data_Formatter -> Boolean -> Problem_Behavior -> Table
parse_values self columns=(self.columns . filter (c-> c.value_type.is_text) . map .name) type=Auto format=Data_Formatter.Value error_on_missing_columns=True on_problems=Report_Warning = ensure_valid_parse_target type <|
table.parse format=(Data_Formatter.Value.with_number_formatting decimal_point=',')
parse : Text | Integer | Column_Selector | Vector (Text | Integer | Column_Selector) -> Value_Type | Auto -> Text | Data_Formatter -> Boolean -> Problem_Behavior -> Table
parse self columns=(self.columns . filter (c-> c.value_type.is_text) . map .name) type=Auto format=Data_Formatter.Value error_on_missing_columns=True on_problems=Report_Warning =
formatter = case format of
_ : Text ->
Data_Formatter.Value.with_format type format
_ -> format
_ : Data_Formatter -> format
_ -> Error.throw (Illegal_Argument.Error "Invalid format type. Expected Text or Data_Formatter.")

parser = if type == Auto then formatter.make_auto_parser else
formatter.make_datatype_parser type
parser = formatter.make_value_type_parser type

select_problem_builder = Problem_Builder.new error_on_missing_columns=error_on_missing_columns
selected_columns = self.columns_helper.select_columns_helper columns reorder=True select_problem_builder
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,11 +75,3 @@ find_common_type types strict =
# Double check if Mixed was really allowed to come out.
if types.contains Value_Type.Mixed then Value_Type.Mixed else
Nothing

## PRIVATE
Checks if the given type is a valid target type for parsing.

This will be replaced once we change parse to rely on `Value_Type` instead.
ensure_valid_parse_target type ~action =
expected_types = [Auto, Integer, Decimal, Date, Date_Time, Time_Of_Day, Boolean]
if expected_types.contains type . not then Error.throw (Illegal_Argument.Error "Unsupported target type "+type.to_text+".") else action
9 changes: 5 additions & 4 deletions distribution/lib/Standard/Table/0.0.0-dev/src/Errors.enso
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from Standard.Base import all

import Standard.Table.Data.Expression.Expression_Error
import Standard.Table.Data.Type.Value_Type.Value_Type

polyglot java import org.enso.table.error.ColumnCountMismatchException
polyglot java import org.enso.table.error.ColumnNameMismatchException
Expand Down Expand Up @@ -243,18 +244,18 @@ type Invalid_Location
Arguments:
- column: the column in which the problematic cells appeared, if applicable.
It may be empty if the value is parsed outside of a context of a column.
- datatype: The expected datatype.
- value_type: The expected value type.
- cells: Contents of the cells that did not match the expected datatype
format.
type Invalid_Format
## PRIVATE
Error column:(Text|Nothing) (datatype:(Integer|Number|Date|Time|Time_Of_Day|Boolean)) (cells:[Text])
Error column:(Text|Nothing) (value_type:Value_Type|Integer|Number|Date|Time|Time_Of_Day|Boolean) (cells:[Text])

## PRIVATE
   Pretty print the invalid format error.

   The message reports how many cells failed to match the expected value
   type. The column name is included only when the error is associated with
   a column; `column` is `Nothing` when a value was parsed outside of the
   context of a column.
to_display_text : Text
to_display_text self =
    # The cell count is an `Integer`, so it has to be rendered to `Text`
    # before it can be concatenated with the rest of the message.
    cell_count = self.cells.length.to_text
    # `column` is declared as `Text|Nothing`; concatenating `Nothing` would
    # fail, so the "in column" clause is dropped when no column is known.
    column_part = if self.column.is_nothing then "" else " in column "+self.column
    cell_count+" cells"+column_part+" had invalid format for type "+self.value_type.to_text+"."

## Indicates that some values contained leading zeros even though these were not allowed.

Expand All @@ -270,7 +271,7 @@ type Leading_Zeros
## PRIVATE
Pretty print the leading zeros error.
to_display_text : Text
to_display_text self = "Leading zeros in column "+self.column+" with datatype "+self.datatype.to_text+"."
to_display_text self = "Leading zeros in column "+self.column+" with datatype "+self.value_type.to_text+"."

## Indicates that an empty file was encountered, so no data could be loaded.
type Empty_File_Error
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@ polyglot java import org.enso.table.parsing.problems.LeadingZeros
## PRIVATE
Translates a parse related problem additionally enriching it with expected
datatype information that is not originally present on the Java side.
translate_parsing_problem expected_datatype problem = case problem of
translate_parsing_problem expected_value_type problem = case problem of
java_problem : InvalidFormat ->
Invalid_Format.Error java_problem.column expected_datatype (Vector.from_polyglot_array java_problem.cells)
Invalid_Format.Error java_problem.column expected_value_type (Vector.from_polyglot_array java_problem.cells)
java_problem : LeadingZeros ->
Leading_Zeros.Error java_problem.column expected_datatype (Vector.from_polyglot_array java_problem.cells)
Leading_Zeros.Error java_problem.column expected_value_type (Vector.from_polyglot_array java_problem.cells)
_ ->
Panic.throw (Illegal_State.Error "Reported an unknown problem type: "+problem.to_text)
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,10 @@ make_column_name_selector table display=Display.Always =
Selector for type argument on `Column.parse`.
parse_type_selector : Single_Choice
parse_type_selector =
choice = ['Auto', 'Integer', 'Decimal', 'Date', 'Date_Time', 'Time_Of_Day', 'Boolean']
Single_Choice display=Display.Always values=(choice.map n->(Option n))
choice = ['Auto', 'Value_Type.Integer', 'Value_Type.Float', 'Value_Type.Date', 'Value_Type.Date_Time', 'Value_Type.Time', 'Value_Type.Boolean']
names = ['Auto', 'Integer', 'Float', 'Date', 'Date_Time', 'Time', 'Boolean']
options = names.zip choice . map pair-> Option pair.first pair.second
Single_Choice display=Display.Always values=options

## PRIVATE
Selector for type argument on `Column.parse`.
Expand Down
6 changes: 5 additions & 1 deletion distribution/lib/Standard/Test/0.0.0-dev/src/Problems.enso
Original file line number Diff line number Diff line change
Expand Up @@ -97,11 +97,14 @@ expect_warning expected_warning result =
## UNSTABLE
Checks if the provided value has a specific warning attached and if there are
no other warnings.

As a utility, it also returns the found warning.

Arguments:
- expected_warning: The expected warning. It can either be a warning type or
a concrete value.
- result: The value to check.
expect_only_warning : Any -> Any -> Nothing
expect_only_warning : Any -> Any -> Any
expect_only_warning expected_warning result =
warnings = get_attached_warnings result
is_expected x =
Expand All @@ -114,6 +117,7 @@ expect_only_warning expected_warning result =
if invalid.not_empty then
loc = Meta.get_source_location 3
Test.fail "Expected the result to contain only the warning: "+found.to_text+", but it also contained: "+invalid.to_text+' (at '+loc+').'
found


## UNSTABLE
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,11 @@ public Object parseSingleValue(String text, ProblemAggregator problemAggregator)

@Override
public WithProblems<Storage<?>> parseColumn(String columnName, Storage<String> sourceStorage) {
// If there are now rows, the Auto parser would guess some random type (the first one that is
// checked). Instead,
// we just return the empty column unchanged.
if (sourceStorage.size() == 0) {
// If there are no values, the Auto parser would guess some random type (the first one that is
// checked). Instead, we just return the empty column unchanged.
boolean hasNoValues =
(sourceStorage.size() == 0) || (sourceStorage.countMissing() == sourceStorage.size());
if (hasNoValues) {
return fallbackParser.parseColumn(columnName, sourceStorage);
}

Expand Down
21 changes: 21 additions & 0 deletions test/Table_Tests/src/Formatting/Data_Formatter_Spec.enso
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,27 @@ spec =
And newlines toO!
formatter.parse complex_text . should_equal complex_text

# Parsing a text that does not match the requested datatype is expected to
# yield `Nothing` with exactly one attached `Invalid_Format` warning.
Test.specify "should report Invalid_Format errors" <|
formatter = Data_Formatter.Value
# Helper: checks that the result is `Nothing`, then hands it to
# `Problems.expect_only_warning`. Its result is whatever that call returns;
# below it is used as the warning itself (`w1`).
expect_warning r =
r.should_equal Nothing
Problems.expect_only_warning Invalid_Format r

# For the first case the contents of the warning are inspected too: it should
# carry the requested datatype and no column, since the value is parsed
# directly through the formatter rather than through a column.
r1 = formatter.parse "Text" datatype=Decimal
w1 = expect_warning r1
w1.value_type . should_equal Decimal
w1.column . should_equal Nothing

# The remaining datatypes are only checked for the `Nothing` result and the
# single `Invalid_Format` warning.
expect_warning <| formatter.parse "Text" datatype=Integer
expect_warning <| formatter.parse "Text" datatype=Boolean
expect_warning <| formatter.parse "Text" datatype=Date
expect_warning <| formatter.parse "Text" datatype=Date_Time
expect_warning <| formatter.parse "Text" datatype=Time_Of_Day

# Requesting a datatype that the formatter has no parser for must fail with
# `Illegal_Argument` instead of producing a value.
# NOTE(review): presumably `List` reaches the catch-all "Unsupported datatype"
# branch of `make_datatype_parser` - confirm against its full branch list.
Test.specify "should not allow unexpected types" <|
formatter = Data_Formatter.Value
formatter.parse "Text" datatype=List . should_fail_with Illegal_Argument

Test.group "DataFormatter.format" <|
Test.specify "should handle Nothing" <|
Data_Formatter.Value.format Nothing . should_equal Nothing
Expand Down
Loading

0 comments on commit 11049ff

Please sign in to comment.