From ff7700ebb1f36bd78b762652632cad59585511d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Fri, 20 May 2022 17:08:36 +0200 Subject: [PATCH] Automatic inference of value types when parsing table columns (#3462) Implements https://www.pivotaltracker.com/story/show/182199966 --- CHANGELOG.md | 2 + .../Database/0.0.0-dev/src/Data/Table.enso | 4 +- .../0.0.0-dev/src/Data/Data_Formatter.enso | 66 +++++++++++++++++ .../Table/0.0.0-dev/src/Data/Table.enso | 29 +------- .../builder/string/StringStorageBuilder.java | 4 +- .../enso/table/parsing/BaseTimeParser.java | 13 ++-- .../org/enso/table/parsing/BooleanParser.java | 13 ++-- .../enso/table/parsing/DatatypeParser.java | 59 +-------------- .../org/enso/table/parsing/DecimalParser.java | 13 ++-- .../enso/table/parsing/IdentityParser.java | 27 +++++++ .../parsing/IncrementalDatatypeParser.java | 58 +++++++++++++++ .../org/enso/table/parsing/IntegerParser.java | 13 ++-- .../table/parsing/TypeInferringParser.java | 52 ++++++++++++++ .../parsing/WhitespaceStrippingParser.java | 19 +++-- .../InvalidFormatProblemAggregator.java | 28 -------- .../problems/NumericProblemAggregator.java | 27 ------- .../parsing/problems/ProblemAggregator.java | 49 ++++++++++++- test/Table_Tests/src/Parse_Values_Spec.enso | 71 ++++++++++++++++--- 18 files changed, 347 insertions(+), 200 deletions(-) create mode 100644 std-bits/table/src/main/java/org/enso/table/parsing/IdentityParser.java create mode 100644 std-bits/table/src/main/java/org/enso/table/parsing/IncrementalDatatypeParser.java create mode 100644 std-bits/table/src/main/java/org/enso/table/parsing/TypeInferringParser.java delete mode 100644 std-bits/table/src/main/java/org/enso/table/parsing/problems/InvalidFormatProblemAggregator.java delete mode 100644 std-bits/table/src/main/java/org/enso/table/parsing/problems/NumericProblemAggregator.java diff --git a/CHANGELOG.md b/CHANGELOG.md index 762a0d931bf5..686c918f21d4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -124,6 +124,7 @@ specified type.][3455] - [Promote with, take, finalize to be methods of Managed_Resource instance][3460] +- [Implemented automatic type detection for `Table.parse_values`.][3462] [debug-shortcuts]: https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug @@ -192,6 +193,7 @@ [3457]: https://github.com/enso-org/enso/pull/3457 [3455]: https://github.com/enso-org/enso/pull/3455 [3460]: https://github.com/enso-org/enso/pull/3460 +[3462]: https://github.com/enso-org/enso/pull/3462 #### Enso Compiler diff --git a/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Table.enso b/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Table.enso index de997dfe8afd..0c928955caab 100644 --- a/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Table.enso +++ b/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Table.enso @@ -670,10 +670,10 @@ type Table ## Parsing values is not supported in database tables, the table has to be materialized first with `to_dataframe`. parse_values : Data_Formatter -> (Nothing | [Column_Type_Selection]) -> Problem_Behavior -> Table - parse_values parser=Data_Formatter column_types=Nothing on_problems=Report_Warning = + parse_values value_formatter=Data_Formatter column_types=Nothing on_problems=Report_Warning = ## Avoid unused arguments warning. We cannot rename arguments to `_`, because we need to keep the API consistent with the in-memory table. - _ = [parser, column_types, on_problems] + _ = [value_formatter, column_types, on_problems] msg = "Parsing values is not supported in database tables, the table has to be materialized first with `to_dataframe`." Error.throw (Unsupported_Database_Operation_Error msg) diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Data_Formatter.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Data_Formatter.enso index e0ae5e320308..86866ab3fbde 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Data_Formatter.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Data_Formatter.enso @@ -1,5 +1,19 @@ from Standard.Base import all +from Standard.Base.Data.Time.Date as Date_Module import Date +from Standard.Base.Data.Time as Time_Module import Time +from Standard.Base.Data.Time.Time_Of_Day as Time_Of_Day_Module import Time_Of_Day + +polyglot java import org.enso.table.parsing.IntegerParser +polyglot java import org.enso.table.parsing.DecimalParser +polyglot java import org.enso.table.parsing.BooleanParser +polyglot java import org.enso.table.parsing.DateParser +polyglot java import org.enso.table.parsing.TimeParser +polyglot java import org.enso.table.parsing.DateTimeParser +polyglot java import org.enso.table.parsing.WhitespaceStrippingParser +polyglot java import org.enso.table.parsing.IdentityParser +polyglot java import org.enso.table.parsing.TypeInferringParser + ## Specifies options for reading text data in a table to more specific types and serializing them back. @@ -27,3 +41,55 @@ from Standard.Base import all - true_values: Values representing True. - false_values: Values representing False. type Data_Formatter trim_values:Boolean=True allow_leading_zeros:Boolean=False decimal_point:Text='.' thousand_separator:Text='' datetime_formats:[Text]=["yyyy-MM-dd HH:mm:ss"] date_formats:[Text]=["yyyy-MM-dd"] time_formats:[Text]=["HH:mm:ss"] locale:Locale=Locale.default true_values:[Text]=["True","true","TRUE"] false_values:[Text]=["False","false","FALSE"] + +## PRIVATE +Data_Formatter.get_thousand_separator = if this.thousand_separator.is_empty then Nothing else this.thousand_separator + +## PRIVATE +Data_Formatter.wrap_base_parser base_parser = + if this.trim_values.not then base_parser else + WhitespaceStrippingParser.new base_parser + +## PRIVATE +Data_Formatter.make_integer_parser = this.wrap_base_parser <| + IntegerParser.new this.get_thousand_separator this.allow_leading_zeros + +## PRIVATE +Data_Formatter.make_decimal_parser = this.wrap_base_parser <| + DecimalParser.new this.decimal_point this.get_thousand_separator this.allow_leading_zeros + +## PRIVATE +Data_Formatter.make_boolean_parser = this.wrap_base_parser <| + BooleanParser.new this.true_values.to_array this.false_values.to_array + +## PRIVATE +Data_Formatter.make_date_parser = this.wrap_base_parser <| + DateParser.new this.date_formats.to_array this.locale.java_locale + +## PRIVATE +Data_Formatter.make_identity_parser = this.wrap_base_parser IdentityParser.new + +## PRIVATE +Data_Formatter.make_datetime_parser = this.wrap_base_parser <| + DateTimeParser.new this.datetime_formats.to_array this.locale.java_locale + +## PRIVATE +Data_Formatter.make_time_parser = this.wrap_base_parser <| + TimeParser.new this.time_formats.to_array this.locale.java_locale + +## PRIVATE +Data_Formatter.make_datatype_parser datatype = case datatype of + Integer -> this.make_integer_parser + Decimal -> this.make_decimal_parser + Boolean -> this.make_boolean_parser + _ -> + if datatype == Date then this.make_date_parser else + if datatype == Time then this.make_datetime_parser else + if datatype == Time_Of_Day then this.make_time_parser else + Error.throw (Illegal_Argument_Error "Unsupported datatype: "+datatype.to_text) + +## PRIVATE +Data_Formatter.make_auto_parser = + parsers = [this.make_integer_parser, this.make_decimal_parser, this.make_datetime_parser, this.make_date_parser, this.make_time_parser, this.make_boolean_parser] + fallback_parser = this.make_identity_parser + TypeInferringParser.new parsers.to_array fallback_parser diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso index db9f859f2292..1284bbd94ed0 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso @@ -5,8 +5,6 @@ import Standard.Table.Data.Column import Standard.Table.Io.Csv import Standard.Visualization from Standard.Base.Data.Time.Date as Date_Module import Date -from Standard.Base.Data.Time as Time_Module import Time -from Standard.Base.Data.Time.Time_Of_Day as Time_Of_Day_Module import Time_Of_Day import Standard.Table.Io.Spreadsheet_Write_Mode import Standard.Table.Io.Format import Standard.Table.Internal.Table_Helpers @@ -31,14 +29,6 @@ polyglot java import org.enso.table.operations.OrderBuilder polyglot java import org.enso.table.format.csv.Writer as Csv_Writer polyglot java import org.enso.table.format.xlsx.Writer as Spreadsheet_Writer -polyglot java import org.enso.table.parsing.IntegerParser -polyglot java import org.enso.table.parsing.DecimalParser -polyglot java import org.enso.table.parsing.BooleanParser -polyglot java import org.enso.table.parsing.DateParser -polyglot java import org.enso.table.parsing.TimeParser -polyglot java import org.enso.table.parsing.DateTimeParser -polyglot java import org.enso.table.parsing.WhitespaceStrippingParser - ## Creates a new table from a vector of `[name, items]` pairs. Arguments: @@ -553,7 +543,7 @@ type Table a leading 0). However, settings in the `Data_Formatter` can control this. parse_values : Data_Formatter -> (Nothing | [Column_Type_Selection]) -> Problem_Behavior -> Table - parse_values parser=Data_Formatter column_types=Nothing on_problems=Report_Warning = + parse_values value_formatter=Data_Formatter column_types=Nothing on_problems=Report_Warning = columns = this.columns problem_builder = Vector.new_builder @@ -595,22 +585,9 @@ type Table new_columns = columns.zip expected_types column-> expected_type-> case expected_type of Nothing -> column - Auto -> Error.unimplemented "Automatic datatype inference is not implemented yet." _ -> - parse_options = parser - thousand_separator = if parse_options.thousand_separator.is_empty then Nothing else parse_options.thousand_separator - base_parser = case expected_type of - Integer -> IntegerParser.new thousand_separator parse_options.allow_leading_zeros - Decimal -> DecimalParser.new parse_options.decimal_point thousand_separator parse_options.allow_leading_zeros - Boolean -> BooleanParser.new parse_options.true_values.to_array parse_options.false_values.to_array - _ -> - if expected_type == Date then DateParser.new parse_options.date_formats.to_array parse_options.locale.java_locale else - if expected_type == Time then DateTimeParser.new parse_options.datetime_formats.to_array parse_options.locale.java_locale else - if expected_type == Time_Of_Day then TimeParser.new parse_options.time_formats.to_array parse_options.locale.java_locale else - Error.throw (Illegal_Argument_Error "Unsupported target datatype: "+expected_type.to_text) - parser = case parse_options.trim_values of - False -> base_parser - True -> WhitespaceStrippingParser.new base_parser + parser = if expected_type == Auto then value_formatter.make_auto_parser else + value_formatter.make_datatype_parser expected_type storage = column.java_column.getStorage new_storage_and_problems = parser.parseColumn storage new_storage = new_storage_and_problems.value diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/builder/string/StringStorageBuilder.java b/std-bits/table/src/main/java/org/enso/table/data/column/builder/string/StringStorageBuilder.java index f98faa9c0f80..27c3c82a9032 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/builder/string/StringStorageBuilder.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/builder/string/StringStorageBuilder.java @@ -29,7 +29,7 @@ public StringStorageBuilder() { /** @inheritDoc */ @Override - public StorageBuilder parseAndAppend(String value) { + public StringStorageBuilder parseAndAppend(String value) { ensureAppendable(); data[size++] = value; return this; @@ -45,7 +45,7 @@ private void ensureAppendable() { /** @inheritDoc */ @Override - public Storage seal() { + public StringStorage seal() { return new StringStorage(data, size); } } diff --git a/std-bits/table/src/main/java/org/enso/table/parsing/BaseTimeParser.java b/std-bits/table/src/main/java/org/enso/table/parsing/BaseTimeParser.java index abf0a5baaefe..0caeba50def8 100644 --- a/std-bits/table/src/main/java/org/enso/table/parsing/BaseTimeParser.java +++ b/std-bits/table/src/main/java/org/enso/table/parsing/BaseTimeParser.java @@ -5,9 +5,9 @@ import java.util.Locale; import org.enso.table.data.column.builder.object.Builder; import org.enso.table.data.column.builder.object.ObjectBuilder; -import org.enso.table.parsing.problems.InvalidFormatProblemAggregator; +import org.enso.table.parsing.problems.ProblemAggregator; -public abstract class BaseTimeParser extends DatatypeParser { +public abstract class BaseTimeParser extends IncrementalDatatypeParser { protected interface ParseStrategy { Object parse(String text, DateTimeFormatter formatter) throws DateTimeParseException; } @@ -25,7 +25,7 @@ protected BaseTimeParser(String[] formats, Locale locale, ParseStrategy parseStr } @Override - public Object parseSingleValue(String text, InvalidFormatProblemAggregator problemAggregator) { + protected Object parseSingleValue(String text, ProblemAggregator problemAggregator) { for (var formatter : formatters) { try { return parseStrategy.parse(text, formatter); @@ -38,14 +38,9 @@ public Object parseSingleValue(String text, InvalidFormatProblemAggregator probl } @Override - public Builder makeBuilderWithCapacity(long capacity) { + protected Builder makeBuilderWithCapacity(long capacity) { // Once datetime gets first-class support in our dataframes, a more specific builder type should // be used. return new ObjectBuilder((int) capacity); } - - @Override - public InvalidFormatProblemAggregator makeProblemAggregator() { - return new InvalidFormatProblemAggregator(); - } } diff --git a/std-bits/table/src/main/java/org/enso/table/parsing/BooleanParser.java b/std-bits/table/src/main/java/org/enso/table/parsing/BooleanParser.java index 8bcd9052c62f..75bf818494b1 100644 --- a/std-bits/table/src/main/java/org/enso/table/parsing/BooleanParser.java +++ b/std-bits/table/src/main/java/org/enso/table/parsing/BooleanParser.java @@ -2,10 +2,10 @@ import org.enso.table.data.column.builder.object.BoolBuilder; import org.enso.table.data.column.builder.object.Builder; -import org.enso.table.parsing.problems.InvalidFormatProblemAggregator; +import org.enso.table.parsing.problems.ProblemAggregator; import org.graalvm.collections.EconomicSet; -public class BooleanParser extends DatatypeParser { +public class BooleanParser extends IncrementalDatatypeParser { private final EconomicSet trueValues; private final EconomicSet falseValues; @@ -22,7 +22,7 @@ public BooleanParser(String[] trueValues, String[] falseValues) { } @Override - public Object parseSingleValue(String text, InvalidFormatProblemAggregator problemAggregator) { + protected Object parseSingleValue(String text, ProblemAggregator problemAggregator) { // TODO we may want to use equality checks taking Unicode Normalization into account, to be // revised in: https://www.pivotaltracker.com/story/show/182166382 if (trueValues.contains(text)) return true; @@ -33,12 +33,7 @@ public Object parseSingleValue(String text, InvalidFormatProblemAggregator probl } @Override - public Builder makeBuilderWithCapacity(long capacity) { + protected Builder makeBuilderWithCapacity(long capacity) { return new BoolBuilder((int) capacity); } - - @Override - public InvalidFormatProblemAggregator makeProblemAggregator() { - return new InvalidFormatProblemAggregator(); - } } diff --git a/std-bits/table/src/main/java/org/enso/table/parsing/DatatypeParser.java b/std-bits/table/src/main/java/org/enso/table/parsing/DatatypeParser.java index 8bd127a1609f..c45508ccdc67 100644 --- a/std-bits/table/src/main/java/org/enso/table/parsing/DatatypeParser.java +++ b/std-bits/table/src/main/java/org/enso/table/parsing/DatatypeParser.java @@ -1,67 +1,14 @@ package org.enso.table.parsing; -import org.enso.table.data.column.builder.object.Builder; import org.enso.table.data.column.storage.Storage; import org.enso.table.data.column.storage.StringStorage; -import org.enso.table.parsing.problems.ProblemAggregator; import org.enso.table.read.WithProblems; -/** - * A base type for a datatype parsing strategy. - * - *

It specifies the strategy for parsing text cells into some target type, reporting issues and - * building the resulting table column. - * - * @param the specific problem aggregator type; the type is refined to be able to handle - * various strategies for aggregating problems, depending on the particular datatype - */ -public abstract class DatatypeParser { - - /** - * Parses a single cell. - * - * @param text the text contents to parse, it will never be null in the default implementation - - * null values are just passed as-is without any parsing attempts by default - * @param problemAggregator an instance of the problem aggregator, used for reporting parsing - * problems - * @return the parsed value or null if the value could not be parsed or could be parsed but should - * be treated as missing value - */ - public abstract Object parseSingleValue(String text, PA problemAggregator); - - /** - * Creates a new column builder expecting the specific datatype, with a specified capacity. - * - *

The {@code parseColumn} method will use {@code appendNoGrow} function, so the initial - * capacity should be set properly so that the builder can hold all expected elements. - * - *

The type returned from {@code parseSingleValue} should be consistent with the types that the - * builder returned here expects - it should never return a value that cannot be accepted by the - * builder. - */ - public abstract Builder makeBuilderWithCapacity(long capacity); - - /** Creates a new instance of the specific problem aggregator type. */ - public abstract PA makeProblemAggregator(); - +/** A base type for a parser capable of parsing a column of text values into some other type. */ +public interface DatatypeParser { /** * Parses a column of texts (represented as a {@code StringStorage}) and returns a new storage, * containing the parsed elements. */ - public WithProblems parseColumn(StringStorage sourceStorage) { - Builder builder = makeBuilderWithCapacity(sourceStorage.size()); - PA aggregator = makeProblemAggregator(); - - for (int i = 0; i < sourceStorage.size(); ++i) { - String cell = sourceStorage.getItem(i); - if (cell != null) { - Object parsed = parseSingleValue(cell, aggregator); - builder.appendNoGrow(parsed); - } else { - builder.appendNoGrow(null); - } - } - - return new WithProblems<>(builder.seal(), aggregator.getAggregatedProblems()); - } + WithProblems parseColumn(StringStorage sourceStorage); } diff --git a/std-bits/table/src/main/java/org/enso/table/parsing/DecimalParser.java b/std-bits/table/src/main/java/org/enso/table/parsing/DecimalParser.java index 0e5f285f3560..38bd979ab8f3 100644 --- a/std-bits/table/src/main/java/org/enso/table/parsing/DecimalParser.java +++ b/std-bits/table/src/main/java/org/enso/table/parsing/DecimalParser.java @@ -4,9 +4,9 @@ import java.text.ParsePosition; import org.enso.table.data.column.builder.object.Builder; import org.enso.table.data.column.builder.object.NumericBuilder; -import org.enso.table.parsing.problems.NumericProblemAggregator; +import org.enso.table.parsing.problems.ProblemAggregator; -public class DecimalParser extends DatatypeParser { +public class DecimalParser extends IncrementalDatatypeParser { private final String thousandsSeparator; private final char decimalPoint; private final DecimalFormat decimalFormat; @@ -38,7 +38,7 @@ public DecimalParser( } @Override - public Object parseSingleValue(String text, NumericProblemAggregator problemAggregator) { + protected Object parseSingleValue(String text, ProblemAggregator problemAggregator) { if (thousandsSeparator != null && (text.startsWith(thousandsSeparator) || text.endsWith(thousandsSeparator))) { problemAggregator.reportInvalidFormat(text); @@ -84,12 +84,7 @@ private boolean hasLeadingZeros(String s) { } @Override - public Builder makeBuilderWithCapacity(long capacity) { + protected Builder makeBuilderWithCapacity(long capacity) { return NumericBuilder.createDoubleBuilder((int) capacity); } - - @Override - public NumericProblemAggregator makeProblemAggregator() { - return new NumericProblemAggregator(); - } } diff --git a/std-bits/table/src/main/java/org/enso/table/parsing/IdentityParser.java b/std-bits/table/src/main/java/org/enso/table/parsing/IdentityParser.java new file mode 100644 index 000000000000..87f41aeb9197 --- /dev/null +++ b/std-bits/table/src/main/java/org/enso/table/parsing/IdentityParser.java @@ -0,0 +1,27 @@ +package org.enso.table.parsing; + +import java.util.List; +import org.enso.table.data.column.builder.object.StringBuilder; +import org.enso.table.data.column.storage.Storage; +import org.enso.table.data.column.storage.StringStorage; +import org.enso.table.parsing.problems.ProblemAggregator; +import org.enso.table.read.WithProblems; + +/** A parser that just returns its input. Useful as a fallback. */ +public class IdentityParser extends IncrementalDatatypeParser { + + @Override + public Object parseSingleValue(String text, ProblemAggregator problemAggregator) { + return text; + } + + @Override + public StringBuilder makeBuilderWithCapacity(long capacity) { + return new StringBuilder((int) capacity); + } + + @Override + public WithProblems parseColumn(StringStorage sourceStorage) { + return new WithProblems<>(sourceStorage, List.of()); + } +} diff --git a/std-bits/table/src/main/java/org/enso/table/parsing/IncrementalDatatypeParser.java b/std-bits/table/src/main/java/org/enso/table/parsing/IncrementalDatatypeParser.java new file mode 100644 index 000000000000..0bd42b9231e1 --- /dev/null +++ b/std-bits/table/src/main/java/org/enso/table/parsing/IncrementalDatatypeParser.java @@ -0,0 +1,58 @@ +package org.enso.table.parsing; + +import org.enso.table.data.column.builder.object.Builder; +import org.enso.table.data.column.storage.Storage; +import org.enso.table.data.column.storage.StringStorage; +import org.enso.table.parsing.problems.ProblemAggregator; +import org.enso.table.read.WithProblems; + +/** + * A base type for a datatype parsing strategy which relies on a method parsing a single value. + * + *

It specifies the strategy for parsing text cells into some target type, reporting issues and + * building the resulting table column. + */ +public abstract class IncrementalDatatypeParser implements DatatypeParser { + + /** + * Parses a single cell. + * + * @param text the text contents to parse, it will never be null in the default implementation - + * null values are just passed as-is without any parsing attempts by default + * @param problemAggregator an instance of the problem aggregator, used for reporting parsing + * problems + * @return the parsed value or null if the value could not be parsed or could be parsed but should + * be treated as missing value + */ + protected abstract Object parseSingleValue(String text, ProblemAggregator problemAggregator); + + /** + * Creates a new column builder expecting the specific datatype, with a specified capacity. + * + *

The {@code parseColumn} method will use {@code appendNoGrow} function, so the initial + * capacity should be set properly so that the builder can hold all expected elements. + * + *

The type returned from {@code parseSingleValue} should be consistent with the types that the + * builder returned here expects - it should never return a value that cannot be accepted by the + * builder. + */ + protected abstract Builder makeBuilderWithCapacity(long capacity); + + @Override + public WithProblems parseColumn(StringStorage sourceStorage) { + Builder builder = makeBuilderWithCapacity(sourceStorage.size()); + var aggregator = new ProblemAggregator(); + + for (int i = 0; i < sourceStorage.size(); ++i) { + String cell = sourceStorage.getItem(i); + if (cell != null) { + Object parsed = parseSingleValue(cell, aggregator); + builder.appendNoGrow(parsed); + } else { + builder.appendNoGrow(null); + } + } + + return new WithProblems<>(builder.seal(), aggregator.getAggregatedProblems()); + } +} diff --git a/std-bits/table/src/main/java/org/enso/table/parsing/IntegerParser.java b/std-bits/table/src/main/java/org/enso/table/parsing/IntegerParser.java index 2280acc3fc3b..18a78365baf6 100644 --- a/std-bits/table/src/main/java/org/enso/table/parsing/IntegerParser.java +++ b/std-bits/table/src/main/java/org/enso/table/parsing/IntegerParser.java @@ -2,9 +2,9 @@ import org.enso.table.data.column.builder.object.Builder; import org.enso.table.data.column.builder.object.NumericBuilder; -import org.enso.table.parsing.problems.NumericProblemAggregator; +import org.enso.table.parsing.problems.ProblemAggregator; -public class IntegerParser extends DatatypeParser { +public class IntegerParser extends IncrementalDatatypeParser { private final String thousandsSeparator; private final boolean leadingZerosAllowed; @@ -18,7 +18,7 @@ public IntegerParser(final String thousandsSeparator, final boolean leadingZeros } @Override - public Object parseSingleValue(String text, NumericProblemAggregator problemAggregator) { + protected Object parseSingleValue(String text, ProblemAggregator problemAggregator) { if (thousandsSeparator != null && (text.startsWith(thousandsSeparator) || text.endsWith(thousandsSeparator))) { problemAggregator.reportInvalidFormat(text); @@ -55,12 +55,7 @@ private boolean hasLeadingZeros(String s) { } @Override - public Builder makeBuilderWithCapacity(long capacity) { + protected Builder makeBuilderWithCapacity(long capacity) { return NumericBuilder.createLongBuilder((int) capacity); } - - @Override - public NumericProblemAggregator makeProblemAggregator() { - return new NumericProblemAggregator(); - } } diff --git a/std-bits/table/src/main/java/org/enso/table/parsing/TypeInferringParser.java b/std-bits/table/src/main/java/org/enso/table/parsing/TypeInferringParser.java new file mode 100644 index 000000000000..e80ada9caa8f --- /dev/null +++ b/std-bits/table/src/main/java/org/enso/table/parsing/TypeInferringParser.java @@ -0,0 +1,52 @@ +package org.enso.table.parsing; + +import org.enso.table.data.column.builder.object.Builder; +import org.enso.table.data.column.storage.Storage; +import org.enso.table.data.column.storage.StringStorage; +import org.enso.table.parsing.problems.ProblemAggregator; +import org.enso.table.read.WithProblems; + +/** + * The type inferring parser tries to parse the given column using a set of provided parsers. It + * returns the result of the first parser that succeeds without reporting any problems. + * + *

If all parsers from the set reported problems, the fallback parser is used and its result is + * returned regardless of any problems. + */ +public class TypeInferringParser implements DatatypeParser { + + private final IncrementalDatatypeParser[] baseParsers; + private final DatatypeParser fallbackParser; + + public TypeInferringParser( + IncrementalDatatypeParser[] baseParsers, DatatypeParser fallbackParser) { + this.baseParsers = baseParsers; + this.fallbackParser = fallbackParser; + } + + @Override + public WithProblems parseColumn(StringStorage sourceStorage) { + parsers: + for (IncrementalDatatypeParser parser : baseParsers) { + Builder builder = parser.makeBuilderWithCapacity(sourceStorage.size()); + var aggregator = new ProblemAggregator(); + + for (int i = 0; i < sourceStorage.size(); ++i) { + String cell = sourceStorage.getItem(i); + if (cell != null) { + Object parsed = parser.parseSingleValue(cell, aggregator); + if (aggregator.hasProblems()) { + continue parsers; + } + builder.appendNoGrow(parsed); + } else { + builder.appendNoGrow(null); + } + } + + return new WithProblems<>(builder.seal(), aggregator.getAggregatedProblems()); + } + + return fallbackParser.parseColumn(sourceStorage); + } +} diff --git a/std-bits/table/src/main/java/org/enso/table/parsing/WhitespaceStrippingParser.java b/std-bits/table/src/main/java/org/enso/table/parsing/WhitespaceStrippingParser.java index 2a496b469612..6108746e138e 100644 --- a/std-bits/table/src/main/java/org/enso/table/parsing/WhitespaceStrippingParser.java +++ b/std-bits/table/src/main/java/org/enso/table/parsing/WhitespaceStrippingParser.java @@ -3,26 +3,25 @@ import org.enso.table.data.column.builder.object.Builder; import org.enso.table.parsing.problems.ProblemAggregator; -public class WhitespaceStrippingParser extends DatatypeParser { - private final DatatypeParser innerParser; +/** + * An incremental parser which wraps another parser of that type, delegating the parsing logic to + * it, but first transforming the input text by stripping any leading and trailing whitespace. + */ +public class WhitespaceStrippingParser extends IncrementalDatatypeParser { + private final IncrementalDatatypeParser innerParser; - public WhitespaceStrippingParser(DatatypeParser innerParser) { + public WhitespaceStrippingParser(IncrementalDatatypeParser innerParser) { this.innerParser = innerParser; } @Override - public Object parseSingleValue(String text, PA problemAggregator) { + protected Object parseSingleValue(String text, ProblemAggregator problemAggregator) { String stripped = text.strip(); return innerParser.parseSingleValue(stripped, problemAggregator); } @Override - public Builder makeBuilderWithCapacity(long capacity) { + protected Builder makeBuilderWithCapacity(long capacity) { return innerParser.makeBuilderWithCapacity(capacity); } - - @Override - public PA makeProblemAggregator() { - return innerParser.makeProblemAggregator(); - } } diff --git a/std-bits/table/src/main/java/org/enso/table/parsing/problems/InvalidFormatProblemAggregator.java b/std-bits/table/src/main/java/org/enso/table/parsing/problems/InvalidFormatProblemAggregator.java deleted file mode 100644 index 4d43cf8c39d4..000000000000 --- a/std-bits/table/src/main/java/org/enso/table/parsing/problems/InvalidFormatProblemAggregator.java +++ /dev/null @@ -1,28 +0,0 @@ -package org.enso.table.parsing.problems; - -import java.util.ArrayList; -import java.util.List; - -/** - * A base problem aggregator that allows reporting the most generic {@code InvalidFormat} problem. - */ -public class InvalidFormatProblemAggregator implements ProblemAggregator { - - private final List invalidFormatCells = new ArrayList<>(); - - /** - * Reports a cell with an invalid format. - * - *

The reports are aggregated and finally a single problem containing all invalid cell for the - * given column is reported. - */ - public void reportInvalidFormat(String cell) { - invalidFormatCells.add(cell); - } - - @Override - public List getAggregatedProblems() { - if (invalidFormatCells.isEmpty()) return List.of(); - else return List.of(new InvalidFormat(invalidFormatCells)); - } -} diff --git a/std-bits/table/src/main/java/org/enso/table/parsing/problems/NumericProblemAggregator.java b/std-bits/table/src/main/java/org/enso/table/parsing/problems/NumericProblemAggregator.java deleted file mode 100644 index fe5a09ee385a..000000000000 --- a/std-bits/table/src/main/java/org/enso/table/parsing/problems/NumericProblemAggregator.java +++ /dev/null @@ -1,27 +0,0 @@ -package org.enso.table.parsing.problems; - -import java.util.ArrayList; -import java.util.List; - -/** - * A problem aggregator capable of reporting {@code InvalidFormat} and {@code LeadingZeros} - * problems. - */ -public class NumericProblemAggregator extends InvalidFormatProblemAggregator { - private final List leadingZerosCells = new ArrayList<>(); - - public void reportLeadingZeroes(String cell) { - leadingZerosCells.add(cell); - } - - @Override - public List getAggregatedProblems() { - List problems = new ArrayList<>(super.getAggregatedProblems()); - - if (!leadingZerosCells.isEmpty()) { - problems.add(new LeadingZeros(leadingZerosCells)); - } - - return problems; - } -} diff --git a/std-bits/table/src/main/java/org/enso/table/parsing/problems/ProblemAggregator.java b/std-bits/table/src/main/java/org/enso/table/parsing/problems/ProblemAggregator.java index c0476a5dd113..ad34d11b2287 100644 --- a/std-bits/table/src/main/java/org/enso/table/parsing/problems/ProblemAggregator.java +++ b/std-bits/table/src/main/java/org/enso/table/parsing/problems/ProblemAggregator.java @@ -1,14 +1,57 @@ package org.enso.table.parsing.problems; +import java.util.ArrayList; import java.util.List; /** - * A base class for strategies for aggregating problems. + * An aggregator for parsing problems. * *

Each strategy exposes a method that returns a summary of the problems. The particular methods * for reporting each problem are defined in particular subclasses. */ -public interface ProblemAggregator { +public class ProblemAggregator { + + private final List invalidFormatCells = new ArrayList<>(); + private final List leadingZerosCells = new ArrayList<>(); + + /** + * Reports a cell with an invalid format. + * + *

The reports are aggregated and finally a single problem containing all invalid cell for the + * given column is reported. + */ + public void reportInvalidFormat(String cell) { + invalidFormatCells.add(cell); + } + + public void reportLeadingZeroes(String cell) { + leadingZerosCells.add(cell); + } + + /** + * Checks if there are any problems already reported. + * + *

This method returns true if and only if {@code getAggregatedProblems} would return a + * non-empty list. + */ + public boolean hasProblems() { + return !invalidFormatCells.isEmpty() || !leadingZerosCells.isEmpty(); + } + /** Return an aggregated summary of problems that have been reported. */ - List getAggregatedProblems(); + public List getAggregatedProblems() { + List problems = new ArrayList<>(); + + if (!invalidFormatCells.isEmpty()) { + problems.add(new InvalidFormat(invalidFormatCells)); + } + + if (!leadingZerosCells.isEmpty()) { + problems.add(new LeadingZeros(leadingZerosCells)); + } + + assert problems.isEmpty() == !hasProblems(); + + return problems; + } } diff --git a/test/Table_Tests/src/Parse_Values_Spec.enso b/test/Table_Tests/src/Parse_Values_Spec.enso index d0b8fa555c6e..bbbeec02b0ea 100644 --- a/test/Table_Tests/src/Parse_Values_Spec.enso +++ b/test/Table_Tests/src/Parse_Values_Spec.enso @@ -11,7 +11,7 @@ import Standard.Test.Problems import Standard.Visualization from Standard.Table.Data.Data_Formatter as Data_Formatter_Module import Data_Formatter -from Standard.Table.Data.Column_Type_Selection as Column_Type_Selection_Module import Column_Type_Selection +from Standard.Table.Data.Column_Type_Selection as Column_Type_Selection_Module import Column_Type_Selection, Auto from Standard.Table.Error as Table_Errors import Invalid_Format, Leading_Zeros, Missing_Input_Columns, Column_Indexes_Out_Of_Range, Duplicate_Type_Selector @@ -55,14 +55,14 @@ spec = Test.group "Table.parse_values" <| opts = Data_Formatter allow_leading_zeros=True t1_parsed_zeros = [0, 0, 0, 1, -1, 1, 0, 10, 12345, Nothing] - t6 = t1.parse_values parser=opts column_types=[Column_Type_Selection 0 Integer] + t6 = t1.parse_values value_formatter=opts column_types=[Column_Type_Selection 0 Integer] t6.at "ints" . to_vector . should_equal t1_parsed_zeros Warning.get_all t6 . should_equal [] - t7 = t1.parse_values parser=opts column_types=[Column_Type_Selection 0 Decimal] + t7 = t1.parse_values value_formatter=opts column_types=[Column_Type_Selection 0 Decimal] t7.at "ints" . to_vector . should_equal t1_parsed_zeros Warning.get_all t7 . should_equal [] - t8 = t2.parse_values parser=opts column_types=[Column_Type_Selection 0 Decimal] + t8 = t2.parse_values value_formatter=opts column_types=[Column_Type_Selection 0 Decimal] t8.at "floats" . to_vector . should_equal [0.0, 0.0, 0.0, 1.0, -10.0, 1.0] Warning.get_all t8 . should_equal [] @@ -158,9 +158,52 @@ spec = Test.group "Table.parse_values" <| problems = [(Duplicate_Type_Selector "floats" ambiguous=True), (Duplicate_Type_Selector "bools" ambiguous=False)] Problems.test_problem_handling action problems tester - Test.specify "should guess the datatype for columns" pending="TODO" <| - # TODO (next PR): ints, decimals, int+dec, just text, all dates, mixed dates, ints+text, ints in quotes - Error.throw "TODO" + Test.specify "should guess the datatype for columns" <| + c1 = ["ints", ["1", " +2", "-123", Nothing]] + c2 = ["ints0", ["01", "02 ", Nothing, "-1"]] + c3 = ["floats", [" 1.0 ", "2.2", Nothing, "-1.0"]] + c4 = ["bools", ["true", " False", Nothing, "True"]] + c5 = ["floats+ints", ["1", "2.2 ", "-1.0", Nothing]] + c6 = ["text", ["foobar", "foo", "", Nothing]] + c7 = ["dates", ["2022-10-01", " 2000-01-01", "1999-01-02", Nothing]] + c8 = ["datetimes", ["2022-10-01 01:02:03 ", "2000-01-01 01:02:03", "1999-01-02 01:02:03", Nothing]] + c9 = ["times", ["01:02:03", " 00:00:00 ", "01:02:03", Nothing]] + c10 = ["mixeddates", ["2022-10-01", "2000-01-01 01:02:03", "01:02:03", Nothing]] + c11 = ["text+ints", ["1", "2", " foobar", Nothing]] + t = Table.new [c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11] + t2 = t.parse_values + + Warning.get_all t2 . should_equal [] + t2.at "ints" . to_vector . should_equal [1, 2, -123, Nothing] + t2.at "ints" . to_vector . first . should_be_an Integer + t2.at "ints0" . to_vector . should_equal ["01", "02", Nothing, "-1"] + t2.at "floats" . to_vector . should_equal [1.0, 2.2, Nothing, -1.0] + t2.at "bools" . to_vector . should_equal [True, False, Nothing, True] + t2.at "floats+ints" . to_vector . should_equal [1.0, 2.2, -1.0, Nothing] + t2.at "text" . to_vector . should_equal ["foobar", "foo", "", Nothing] + t2.at "dates" . to_vector . map date_as_vector . should_equal [[2022, 10, 1], [2000, 1, 1], [1999, 1, 2], Nothing] + t2.at "datetimes" . to_vector . map datetime_as_vector . should_equal [[2022, 10, 1, 1, 2, 3, 0], [2000, 1, 1, 1, 2, 3, 0], [1999, 1, 2, 1, 2, 3, 0], Nothing] + t2.at "times" . to_vector . map time_as_vector . should_equal [[1, 2, 3, 0], [0, 0, 0, 0], [1, 2, 3, 0], Nothing] + t2.at "mixeddates" . to_vector . should_equal ["2022-10-01", "2000-01-01 01:02:03", "01:02:03", Nothing] + t2.at "text+ints" . to_vector . should_equal ["1", "2", "foobar", Nothing] + + t3 = Table.new [["bools", ["1", "0", "True"]], ["ints", ["1", "0", "0"]]] . parse_values (Data_Formatter true_values=["1", "True"] false_values=["0", "False"]) + t3.at "bools" . to_vector . should_equal [True, False, True] + t3.at "ints" . to_vector . should_equal [1, 0, 0] + + t4 = Table.new [c2] . parse_values (Data_Formatter allow_leading_zeros=True) + t4 . at "ints0" . to_vector . should_equal [1, 2, Nothing, -1] + + t5 = t.parse_values column_types=[Column_Type_Selection "ints" Decimal, Column_Type_Selection "floats" Auto, Column_Type_Selection "text+ints" Auto] + t5.at "ints" . to_vector . should_equal [1.0, 2.0, -123.0, Nothing] + # `ints` are requested to be parsed as decimals. + t5.at "ints" . to_vector . first . should_be_a Decimal + # `floats` are auto-detected as decimals. + t5.at "floats" . to_vector . should_equal [1.0, 2.2, Nothing, -1.0] + # `text+ints` is attempted to be parsed (hence whitespace is stripped), but it only fits the text type. + t5.at "text+ints" . to_vector . should_equal ["1", "2", "foobar", Nothing] + # `bools` are not requested to be parsed, so they are kept as-is, with leading whitespace etc. + t5.at "bools" . to_vector . should_equal ["true", " False", Nothing, "True"] Test.specify "should allow to specify a thousands separator and a custom decimal point" <| opts = Data_Formatter decimal_point=',' thousand_separator='_' @@ -234,8 +277,16 @@ spec = Test.group "Table.parse_values" <| expected_warnings.append (Invalid_Format "times" Time_Of_Day.Time_Of_Day ["11:00:00 ", " 00:00:00", "00 : 00 : 00"]) warnings.should_contain_the_same_elements_as expected_warnings.to_vector - Test.specify "should fallback to text if whitespace is present and trimming is turned off" pending="TODO" <| - ## TODO next PR - Error.throw "TODO" + Test.specify "should fallback to text if whitespace is present and trimming is turned off" <| + c1 = ["1", " +2", "-123", Nothing] + c2 = [" 1.0 ", "2.2", Nothing, "-1.0"] + c3 = ["true", " False", Nothing, "True"] + t = Table.new [["ints", c1], ["floats", c2], ["bools", c3]] + t2 = t.parse_values (Data_Formatter trim_values=False) + + Warning.get_all t2 . should_equal [] + t2.at "ints" . to_vector . should_equal c1 + t2.at "floats" . to_vector . should_equal c2 + t2.at "bools" . to_vector . should_equal c3 main = Test.Suite.run_main here.spec