Skip to content

Commit

Permalink
Automatic inference of value types when parsing table columns (#3462)
Browse files Browse the repository at this point in the history
  • Loading branch information
radeusgd authored May 20, 2022
1 parent 0073f46 commit ff7700e
Show file tree
Hide file tree
Showing 18 changed files with 347 additions and 200 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@
specified type.][3455]
- [Promote with, take, finalize to be methods of Managed_Resource
instance][3460]
- [Implemented automatic type detection for `Table.parse_values`.][3462]

[debug-shortcuts]:
https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
Expand Down Expand Up @@ -192,6 +193,7 @@
[3457]: https://github.com/enso-org/enso/pull/3457
[3455]: https://github.com/enso-org/enso/pull/3455
[3460]: https://github.com/enso-org/enso/pull/3460
[3462]: https://github.com/enso-org/enso/pull/3462

#### Enso Compiler

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -670,10 +670,10 @@ type Table
## Parsing values is not supported in database tables, the table has to be
materialized first with `to_dataframe`.
parse_values : Data_Formatter -> (Nothing | [Column_Type_Selection]) -> Problem_Behavior -> Table
parse_values parser=Data_Formatter column_types=Nothing on_problems=Report_Warning =
parse_values value_formatter=Data_Formatter column_types=Nothing on_problems=Report_Warning =
## Avoid unused arguments warning. We cannot rename arguments to `_`,
because we need to keep the API consistent with the in-memory table.
_ = [parser, column_types, on_problems]
_ = [value_formatter, column_types, on_problems]
msg = "Parsing values is not supported in database tables, the table has to be materialized first with `to_dataframe`."
Error.throw (Unsupported_Database_Operation_Error msg)

Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,19 @@
from Standard.Base import all

from Standard.Base.Data.Time.Date as Date_Module import Date
from Standard.Base.Data.Time as Time_Module import Time
from Standard.Base.Data.Time.Time_Of_Day as Time_Of_Day_Module import Time_Of_Day

polyglot java import org.enso.table.parsing.IntegerParser
polyglot java import org.enso.table.parsing.DecimalParser
polyglot java import org.enso.table.parsing.BooleanParser
polyglot java import org.enso.table.parsing.DateParser
polyglot java import org.enso.table.parsing.TimeParser
polyglot java import org.enso.table.parsing.DateTimeParser
polyglot java import org.enso.table.parsing.WhitespaceStrippingParser
polyglot java import org.enso.table.parsing.IdentityParser
polyglot java import org.enso.table.parsing.TypeInferringParser

## Specifies options for reading text data in a table to more specific types and
serializing them back.

Expand Down Expand Up @@ -27,3 +41,55 @@ from Standard.Base import all
- true_values: Values representing True.
- false_values: Values representing False.
type Data_Formatter trim_values:Boolean=True allow_leading_zeros:Boolean=False decimal_point:Text='.' thousand_separator:Text='' datetime_formats:[Text]=["yyyy-MM-dd HH:mm:ss"] date_formats:[Text]=["yyyy-MM-dd"] time_formats:[Text]=["HH:mm:ss"] locale:Locale=Locale.default true_values:[Text]=["True","true","TRUE"] false_values:[Text]=["False","false","FALSE"]

## PRIVATE
   Returns the thousand separator in the form handed to the numeric parsers:
   `Nothing` when the configured separator is empty, and the configured text
   otherwise.
Data_Formatter.get_thousand_separator =
    separator = this.thousand_separator
    if separator.is_empty then Nothing else separator

## PRIVATE
   Applies the `trim_values` setting to a freshly created parser.

   When `trim_values` is enabled, `base_parser` is wrapped in a
   `WhitespaceStrippingParser`; otherwise `base_parser` is returned unchanged.
Data_Formatter.wrap_base_parser base_parser =
    if this.trim_values then WhitespaceStrippingParser.new base_parser else base_parser

## PRIVATE
   Creates an `IntegerParser` configured with this formatter's thousand
   separator and `allow_leading_zeros` setting, then passes it through
   `wrap_base_parser`.
Data_Formatter.make_integer_parser =
    integer_parser = IntegerParser.new this.get_thousand_separator this.allow_leading_zeros
    this.wrap_base_parser integer_parser

## PRIVATE
   Creates a `DecimalParser` configured with this formatter's decimal point,
   thousand separator and `allow_leading_zeros` setting, then passes it
   through `wrap_base_parser`.
Data_Formatter.make_decimal_parser =
    decimal_parser = DecimalParser.new this.decimal_point this.get_thousand_separator this.allow_leading_zeros
    this.wrap_base_parser decimal_parser

## PRIVATE
   Creates a `BooleanParser` from the configured `true_values` and
   `false_values` (both converted to arrays), then passes it through
   `wrap_base_parser`.
Data_Formatter.make_boolean_parser =
    boolean_parser = BooleanParser.new this.true_values.to_array this.false_values.to_array
    this.wrap_base_parser boolean_parser

## PRIVATE
   Creates a `DateParser` from the configured `date_formats` (converted to an
   array) and the Java locale of `locale`, then passes it through
   `wrap_base_parser`.
Data_Formatter.make_date_parser =
    date_parser = DateParser.new this.date_formats.to_array this.locale.java_locale
    this.wrap_base_parser date_parser

## PRIVATE
   Creates an `IdentityParser` and passes it through `wrap_base_parser`, so
   the `trim_values` setting is applied to it in the same way as to the other
   parsers created by this formatter.
Data_Formatter.make_identity_parser = this.wrap_base_parser IdentityParser.new

## PRIVATE
   Creates a `DateTimeParser` from the configured `datetime_formats`
   (converted to an array) and the Java locale of `locale`, then passes it
   through `wrap_base_parser`.
Data_Formatter.make_datetime_parser =
    datetime_parser = DateTimeParser.new this.datetime_formats.to_array this.locale.java_locale
    this.wrap_base_parser datetime_parser

## PRIVATE
   Creates a `TimeParser` from the configured `time_formats` (converted to an
   array) and the Java locale of `locale`, then passes it through
   `wrap_base_parser`.
Data_Formatter.make_time_parser =
    time_parser = TimeParser.new this.time_formats.to_array this.locale.java_locale
    this.wrap_base_parser time_parser

## PRIVATE
   Creates the parser for the requested target `datatype`.

   `Integer`, `Decimal` and `Boolean` are matched directly by the `case`
   expression. `Date`, `Time` and `Time_Of_Day` are instead compared with
   `==` inside the fallback branch: `Date` yields the date parser, `Time`
   the date-time parser and `Time_Of_Day` the time parser. Any other value
   results in an `Illegal_Argument_Error` whose message includes the text
   representation of `datatype`.
Data_Formatter.make_datatype_parser datatype = case datatype of
Integer -> this.make_integer_parser
Decimal -> this.make_decimal_parser
Boolean -> this.make_boolean_parser
_ ->
if datatype == Date then this.make_date_parser else
if datatype == Time then this.make_datetime_parser else
if datatype == Time_Of_Day then this.make_time_parser else
Error.throw (Illegal_Argument_Error "Unsupported datatype: "+datatype.to_text)

## PRIVATE
   Creates the parser used for automatic type detection.

   A `TypeInferringParser` is built from the integer, decimal, date-time,
   date, time and boolean parsers of this formatter (passed in that order),
   with the identity parser supplied as the fallback.
   NOTE(review): presumably the candidates are tried in the given order -
   confirm against `TypeInferringParser`.
Data_Formatter.make_auto_parser =
    candidates = [this.make_integer_parser, this.make_decimal_parser, this.make_datetime_parser, this.make_date_parser, this.make_time_parser, this.make_boolean_parser]
    fallback = this.make_identity_parser
    TypeInferringParser.new candidates.to_array fallback
29 changes: 3 additions & 26 deletions distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,6 @@ import Standard.Table.Data.Column
import Standard.Table.Io.Csv
import Standard.Visualization
from Standard.Base.Data.Time.Date as Date_Module import Date
from Standard.Base.Data.Time as Time_Module import Time
from Standard.Base.Data.Time.Time_Of_Day as Time_Of_Day_Module import Time_Of_Day
import Standard.Table.Io.Spreadsheet_Write_Mode
import Standard.Table.Io.Format
import Standard.Table.Internal.Table_Helpers
Expand All @@ -31,14 +29,6 @@ polyglot java import org.enso.table.operations.OrderBuilder
polyglot java import org.enso.table.format.csv.Writer as Csv_Writer
polyglot java import org.enso.table.format.xlsx.Writer as Spreadsheet_Writer

polyglot java import org.enso.table.parsing.IntegerParser
polyglot java import org.enso.table.parsing.DecimalParser
polyglot java import org.enso.table.parsing.BooleanParser
polyglot java import org.enso.table.parsing.DateParser
polyglot java import org.enso.table.parsing.TimeParser
polyglot java import org.enso.table.parsing.DateTimeParser
polyglot java import org.enso.table.parsing.WhitespaceStrippingParser

## Creates a new table from a vector of `[name, items]` pairs.

Arguments:
Expand Down Expand Up @@ -553,7 +543,7 @@ type Table
a leading 0). However, settings in the `Data_Formatter` can
control this.
parse_values : Data_Formatter -> (Nothing | [Column_Type_Selection]) -> Problem_Behavior -> Table
parse_values parser=Data_Formatter column_types=Nothing on_problems=Report_Warning =
parse_values value_formatter=Data_Formatter column_types=Nothing on_problems=Report_Warning =
columns = this.columns
problem_builder = Vector.new_builder

Expand Down Expand Up @@ -595,22 +585,9 @@ type Table

new_columns = columns.zip expected_types column-> expected_type-> case expected_type of
Nothing -> column
Auto -> Error.unimplemented "Automatic datatype inference is not implemented yet."
_ ->
parse_options = parser
thousand_separator = if parse_options.thousand_separator.is_empty then Nothing else parse_options.thousand_separator
base_parser = case expected_type of
Integer -> IntegerParser.new thousand_separator parse_options.allow_leading_zeros
Decimal -> DecimalParser.new parse_options.decimal_point thousand_separator parse_options.allow_leading_zeros
Boolean -> BooleanParser.new parse_options.true_values.to_array parse_options.false_values.to_array
_ ->
if expected_type == Date then DateParser.new parse_options.date_formats.to_array parse_options.locale.java_locale else
if expected_type == Time then DateTimeParser.new parse_options.datetime_formats.to_array parse_options.locale.java_locale else
if expected_type == Time_Of_Day then TimeParser.new parse_options.time_formats.to_array parse_options.locale.java_locale else
Error.throw (Illegal_Argument_Error "Unsupported target datatype: "+expected_type.to_text)
parser = case parse_options.trim_values of
False -> base_parser
True -> WhitespaceStrippingParser.new base_parser
parser = if expected_type == Auto then value_formatter.make_auto_parser else
value_formatter.make_datatype_parser expected_type
storage = column.java_column.getStorage
new_storage_and_problems = parser.parseColumn storage
new_storage = new_storage_and_problems.value
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ public StringStorageBuilder() {

/** @inheritDoc */
@Override
public StorageBuilder parseAndAppend(String value) {
public StringStorageBuilder parseAndAppend(String value) {
ensureAppendable();
data[size++] = value;
return this;
Expand All @@ -45,7 +45,7 @@ private void ensureAppendable() {

/** @inheritDoc */
@Override
public Storage seal() {
public StringStorage seal() {
return new StringStorage(data, size);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@
import java.util.Locale;
import org.enso.table.data.column.builder.object.Builder;
import org.enso.table.data.column.builder.object.ObjectBuilder;
import org.enso.table.parsing.problems.InvalidFormatProblemAggregator;
import org.enso.table.parsing.problems.ProblemAggregator;

public abstract class BaseTimeParser extends DatatypeParser<InvalidFormatProblemAggregator> {
public abstract class BaseTimeParser extends IncrementalDatatypeParser {
protected interface ParseStrategy {
Object parse(String text, DateTimeFormatter formatter) throws DateTimeParseException;
}
Expand All @@ -25,7 +25,7 @@ protected BaseTimeParser(String[] formats, Locale locale, ParseStrategy parseStr
}

@Override
public Object parseSingleValue(String text, InvalidFormatProblemAggregator problemAggregator) {
protected Object parseSingleValue(String text, ProblemAggregator problemAggregator) {
for (var formatter : formatters) {
try {
return parseStrategy.parse(text, formatter);
Expand All @@ -38,14 +38,9 @@ public Object parseSingleValue(String text, InvalidFormatProblemAggregator probl
}

@Override
public Builder makeBuilderWithCapacity(long capacity) {
protected Builder makeBuilderWithCapacity(long capacity) {
// Once datetime gets first-class support in our dataframes, a more specific builder type should
// be used.
return new ObjectBuilder((int) capacity);
}

@Override
public InvalidFormatProblemAggregator makeProblemAggregator() {
return new InvalidFormatProblemAggregator();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@

import org.enso.table.data.column.builder.object.BoolBuilder;
import org.enso.table.data.column.builder.object.Builder;
import org.enso.table.parsing.problems.InvalidFormatProblemAggregator;
import org.enso.table.parsing.problems.ProblemAggregator;
import org.graalvm.collections.EconomicSet;

public class BooleanParser extends DatatypeParser<InvalidFormatProblemAggregator> {
public class BooleanParser extends IncrementalDatatypeParser {

private final EconomicSet<String> trueValues;
private final EconomicSet<String> falseValues;
Expand All @@ -22,7 +22,7 @@ public BooleanParser(String[] trueValues, String[] falseValues) {
}

@Override
public Object parseSingleValue(String text, InvalidFormatProblemAggregator problemAggregator) {
protected Object parseSingleValue(String text, ProblemAggregator problemAggregator) {
// TODO we may want to use equality checks taking Unicode Normalization into account, to be
// revised in: https://www.pivotaltracker.com/story/show/182166382
if (trueValues.contains(text)) return true;
Expand All @@ -33,12 +33,7 @@ public Object parseSingleValue(String text, InvalidFormatProblemAggregator probl
}

@Override
public Builder makeBuilderWithCapacity(long capacity) {
protected Builder makeBuilderWithCapacity(long capacity) {
return new BoolBuilder((int) capacity);
}

@Override
public InvalidFormatProblemAggregator makeProblemAggregator() {
return new InvalidFormatProblemAggregator();
}
}
Original file line number Diff line number Diff line change
@@ -1,67 +1,14 @@
package org.enso.table.parsing;

import org.enso.table.data.column.builder.object.Builder;
import org.enso.table.data.column.storage.Storage;
import org.enso.table.data.column.storage.StringStorage;
import org.enso.table.parsing.problems.ProblemAggregator;
import org.enso.table.read.WithProblems;

/**
* A base type for a datatype parsing strategy.
*
* <p>It specifies the strategy for parsing text cells into some target type, reporting issues and
* building the resulting table column.
*
* @param <PA> the specific problem aggregator type; the type is refined to be able to handle
* various strategies for aggregating problems, depending on the particular datatype
*/
public abstract class DatatypeParser<PA extends ProblemAggregator> {

/**
* Parses a single cell.
*
* @param text the text contents to parse, it will never be null in the default implementation -
* null values are just passed as-is without any parsing attempts by default
* @param problemAggregator an instance of the problem aggregator, used for reporting parsing
* problems
* @return the parsed value or null if the value could not be parsed or could be parsed but should
* be treated as missing value
*/
public abstract Object parseSingleValue(String text, PA problemAggregator);

/**
* Creates a new column builder expecting the specific datatype, with a specified capacity.
*
* <p>The {@code parseColumn} method will use {@code appendNoGrow} function, so the initial
* capacity should be set properly so that the builder can hold all expected elements.
*
* <p>The type returned from {@code parseSingleValue} should be consistent with the types that the
* builder returned here expects - it should never return a value that cannot be accepted by the
* builder.
*/
public abstract Builder makeBuilderWithCapacity(long capacity);

/** Creates a new instance of the specific problem aggregator type. */
public abstract PA makeProblemAggregator();

/** A base type for a parser capable of parsing a column of text values into some other type. */
public interface DatatypeParser {
/**
* Parses a column of texts (represented as a {@code StringStorage}) and returns a new storage,
* containing the parsed elements.
*/
public WithProblems<Storage> parseColumn(StringStorage sourceStorage) {
Builder builder = makeBuilderWithCapacity(sourceStorage.size());
PA aggregator = makeProblemAggregator();

for (int i = 0; i < sourceStorage.size(); ++i) {
String cell = sourceStorage.getItem(i);
if (cell != null) {
Object parsed = parseSingleValue(cell, aggregator);
builder.appendNoGrow(parsed);
} else {
builder.appendNoGrow(null);
}
}

return new WithProblems<>(builder.seal(), aggregator.getAggregatedProblems());
}
WithProblems<Storage> parseColumn(StringStorage sourceStorage);
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@
import java.text.ParsePosition;
import org.enso.table.data.column.builder.object.Builder;
import org.enso.table.data.column.builder.object.NumericBuilder;
import org.enso.table.parsing.problems.NumericProblemAggregator;
import org.enso.table.parsing.problems.ProblemAggregator;

public class DecimalParser extends DatatypeParser<NumericProblemAggregator> {
public class DecimalParser extends IncrementalDatatypeParser {
private final String thousandsSeparator;
private final char decimalPoint;
private final DecimalFormat decimalFormat;
Expand Down Expand Up @@ -38,7 +38,7 @@ public DecimalParser(
}

@Override
public Object parseSingleValue(String text, NumericProblemAggregator problemAggregator) {
protected Object parseSingleValue(String text, ProblemAggregator problemAggregator) {
if (thousandsSeparator != null
&& (text.startsWith(thousandsSeparator) || text.endsWith(thousandsSeparator))) {
problemAggregator.reportInvalidFormat(text);
Expand Down Expand Up @@ -84,12 +84,7 @@ private boolean hasLeadingZeros(String s) {
}

@Override
public Builder makeBuilderWithCapacity(long capacity) {
protected Builder makeBuilderWithCapacity(long capacity) {
return NumericBuilder.createDoubleBuilder((int) capacity);
}

@Override
public NumericProblemAggregator makeProblemAggregator() {
return new NumericProblemAggregator();
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
package org.enso.table.parsing;

import java.util.List;
import org.enso.table.data.column.builder.object.StringBuilder;
import org.enso.table.data.column.storage.Storage;
import org.enso.table.data.column.storage.StringStorage;
import org.enso.table.parsing.problems.ProblemAggregator;
import org.enso.table.read.WithProblems;

/**
 * A pass-through parser: every text value is kept exactly as it was given.
 *
 * <p>Useful as a fallback.
 */
public class IdentityParser extends IncrementalDatatypeParser {

  /**
   * Returns the given text itself.
   *
   * @param text the cell contents
   * @param problemAggregator not used by this parser; no problems are reported
   * @return {@code text}, unmodified
   */
  @Override
  public Object parseSingleValue(String text, ProblemAggregator problemAggregator) {
    return text;
  }

  /**
   * Creates a string builder for the given capacity.
   *
   * @param capacity the requested capacity; it is narrowed to {@code int} before use
   * @return a new {@code StringBuilder} created with that capacity
   */
  @Override
  public StringBuilder makeBuilderWithCapacity(long capacity) {
    final int initialCapacity = (int) capacity;
    return new StringBuilder(initialCapacity);
  }

  /**
   * Returns the source storage itself, paired with an empty list of problems. No per-cell parsing
   * is performed here.
   *
   * @param sourceStorage the column of text values
   * @return {@code sourceStorage} wrapped together with an empty problem list
   */
  @Override
  public WithProblems<Storage> parseColumn(StringStorage sourceStorage) {
    final Storage sameStorage = sourceStorage;
    return new WithProblems<>(sameStorage, List.of());
  }
}
Loading

0 comments on commit ff7700e

Please sign in to comment.