Skip to content

Commit

Permalink
Parsing values with known types (#3455)
Browse files Browse the repository at this point in the history
  • Loading branch information
radeusgd authored May 18, 2022
1 parent 78e7d69 commit 8430ce2
Show file tree
Hide file tree
Showing 40 changed files with 955 additions and 65 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,8 @@
- [Implemented `compute` method on `Vector` for statistics calculations.][3442]
- [Promote get and put to be methods of Ref type rather than of Ref
module][3457]
- [Implemented `Table.parse_values`, parsing text columns according to a
specified type.][3455]

[debug-shortcuts]:
https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
Expand Down Expand Up @@ -186,6 +188,7 @@
[3430]: https://github.com/enso-org/enso/pull/3430
[3442]: https://github.com/enso-org/enso/pull/3442
[3457]: https://github.com/enso-org/enso/pull/3457
[3455]: https://github.com/enso-org/enso/pull/3455

#### Enso Compiler

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,15 +63,6 @@ type Boolean
not : Boolean
not = @Builtin_Method "Boolean.not"

## Generates a human-readable text representation of the boolean.

> Example
Converting the value True to text.

True.to_text
to_text : Text
to_text = @Builtin_Method "Boolean.to_text"

## The if-then-else control flow operator that executes one of two branches
based on a conditional.

Expand Down
3 changes: 3 additions & 0 deletions distribution/lib/Standard/Base/0.0.0-dev/src/Data/Vector.enso
Original file line number Diff line number Diff line change
Expand Up @@ -980,6 +980,9 @@ type Vector
json = this.take_start 100 . to_json
json.to_text

## PRIVATE
type Wrapped_Error error

type Builder

## PRIVATE
Expand Down
4 changes: 2 additions & 2 deletions distribution/lib/Standard/Base/0.0.0-dev/src/Meta.enso
Original file line number Diff line number Diff line change
Expand Up @@ -322,8 +322,8 @@ is_a value typ = if typ == Any then True else
Constructor _ ->
meta_typ = here.meta typ
case meta_typ of
Atom _ -> meta_val.constructor == meta_typ.constructor
Constructor _ -> meta_val.constructor == meta_typ
Atom _ -> meta_val == meta_typ.constructor
Constructor _ -> meta_val == meta_typ
_ -> False
Error _ -> typ == Error
Unresolved_Symbol _ -> typ == Unresolved_Symbol
Expand Down
11 changes: 11 additions & 0 deletions distribution/lib/Standard/Database/0.0.0-dev/src/Data/Table.enso
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ from Standard.Table.Data.Table import No_Such_Column_Error
from Standard.Table.Data.Order_Rule as Order_Rule_Module import Order_Rule
from Standard.Table.Data.Column_Selector as Column_Selector_Module import Column_Selector, By_Index
from Standard.Table.Data.Sort_Method as Sort_Method_Module import Sort_Method
from Standard.Table.Data.Data_Formatter as Data_Formatter_Module import Data_Formatter
from Standard.Base.Error.Problem_Behavior as Problem_Behavior_Module import Problem_Behavior, Report_Warning
from Standard.Database.Error as Database_Errors import Unsupported_Database_Operation_Error
import Standard.Table.Data.Column_Mapping
Expand Down Expand Up @@ -666,6 +667,16 @@ type Table
on_problems.attach_problems_before problems <|
this.updated_context_and_columns new_ctx new_columns

## Parsing values is not supported in database tables, the table has to be
   materialized first with `to_dataframe`.

   The arguments mirror the ones accepted by the in-memory table and are not
   used; the call always fails with an `Unsupported_Database_Operation_Error`.
parse_values : Data_Formatter -> (Nothing | [Column_Type_Selection]) -> Problem_Behavior -> Table
parse_values parser=Data_Formatter column_types=Nothing on_problems=Report_Warning =
    ## Reference the arguments to avoid the unused arguments warning. They
       cannot be renamed to `_`, because the API has to stay consistent with
       the in-memory table.
    _ = [parser, column_types, on_problems]
    Error.throw (Unsupported_Database_Operation_Error "Parsing values is not supported in database tables, the table has to be materialized first with `to_dataframe`.")

## UNSTABLE

Returns a new Table without rows that contained missing values in any of
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from Standard.Base import all
import Standard.Base.Data.Time

## The type indicating that the column type should be inferred automatically,
based on the values present in the column.

The most specific type which is valid for all values in a column is chosen:
- if all values are integers, `Integer` is chosen,
- if all values are decimals or integers, `Decimal` is chosen,
- if all values are booleans, `Boolean` is chosen,
- if the values are all the same time type (a date, a time or a date-time),
the corresponding type is chosen, `Date`, `Time_Of_Day` or `Time`,
respectively,
- otherwise, `Text` is chosen as a fallback and the column is kept as-is
without parsing.
type Auto

## Specifies the desired datatype for parsing a particular column.

Arguments:
- column: the column selector which can either be the column name or the
index.
- datatype: The desired datatype for the column or `Auto` to infer the type
from the data.
type Column_Type_Selection (column:Text|Integer) datatype:(Auto|Integer|Decimal|Date|Time|Time_Of_Day|Boolean)=Auto
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
from Standard.Base import all

## Specifies options for reading text data in a table to more specific types and
serializing them back.

Arguments:
- trim_values: Trim whitespace before parsing.
- allow_leading_zeros: Specifies how to treat numeric values starting with
leading zeroes. Defaults to `False`, because converting such
values to numbers is a lossy operation - after converting such a number
back to text the leading zeroes will get lost. If leading zeroes are not
allowed and the column contains any values with leading zeroes, it will not
get automatically converted to numbers, remaining as text. However, if the
column is specifically requested to be converted to a numeric column, only
a warning will be issued indicating that some leading zeroes were present,
but the conversion will proceed.
- decimal_point: The character used to separate the integer part from the
fractional part of a number. Defaults to '.'. Can be changed for example to
',' to allow for European format.
- thousand_separator: A separator that can be used to separate groups of
digits in numbers. For example, it can be set to ',' to allow for notation
like '1,000,000.0'.
- datetime_formats: Expected datetime formats.
- date_formats: Expected date formats.
- time_formats: Expected time formats.
- locale: The locale to use when parsing dates and times.
- true_values: Values representing True.
- false_values: Values representing False.
type Data_Formatter trim_values:Boolean=True allow_leading_zeros:Boolean=False decimal_point:Text='.' thousand_separator:Text='' datetime_formats:[Text]=["yyyy-MM-dd HH:mm:ss"] date_formats:[Text]=["yyyy-MM-dd"] time_formats:[Text]=["HH:mm:ss"] locale:Locale=Locale.default true_values:[Text]=["True","true","TRUE"] false_values:[Text]=["False","false","FALSE"]
100 changes: 98 additions & 2 deletions distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso
Original file line number Diff line number Diff line change
Expand Up @@ -4,26 +4,41 @@ import Standard.Base.System.Platform
import Standard.Table.Data.Column
import Standard.Table.Io.Csv
import Standard.Visualization
import Standard.Base.Data.Time.Date
from Standard.Base.Data.Time.Date as Date_Module import Date
from Standard.Base.Data.Time as Time_Module import Time
from Standard.Base.Data.Time.Time_Of_Day as Time_Of_Day_Module import Time_Of_Day
import Standard.Table.Io.Spreadsheet_Write_Mode
import Standard.Table.Io.Format
import Standard.Table.Internal.Table_Helpers
import Standard.Table.Internal.Aggregate_Column_Helper
import Standard.Table.Internal.Parse_Values_Helper

from Standard.Table.Data.Order_Rule as Order_Rule_Module import Order_Rule
from Standard.Table.Data.Column_Selector as Column_Selector_Module import Column_Selector, By_Index
from Standard.Table.Data.Column_Type_Selection as Column_Type_Selection_Module import Column_Type_Selection, Auto
from Standard.Table.Data.Data_Formatter as Data_Formatter_Module import Data_Formatter
from Standard.Table.Data.Sort_Method as Sort_Method_Module import Sort_Method
from Standard.Base.Error.Problem_Behavior as Problem_Behavior_Module import Problem_Behavior, Report_Warning
from Standard.Table.Error as Error_Module import Missing_Input_Columns, Column_Indexes_Out_Of_Range, Duplicate_Type_Selector
import Standard.Table.Data.Column_Mapping
import Standard.Table.Data.Position

import Standard.Table.Data.Aggregate_Column

polyglot java import org.enso.table.data.table.Table as Java_Table
polyglot java import org.enso.table.data.table.Column as Java_Column
polyglot java import org.enso.table.operations.OrderBuilder
polyglot java import org.enso.table.format.csv.Writer as Csv_Writer
polyglot java import org.enso.table.format.xlsx.Writer as Spreadsheet_Writer

polyglot java import org.enso.table.parsing.IntegerParser
polyglot java import org.enso.table.parsing.DecimalParser
polyglot java import org.enso.table.parsing.BooleanParser
polyglot java import org.enso.table.parsing.DateParser
polyglot java import org.enso.table.parsing.TimeParser
polyglot java import org.enso.table.parsing.DateTimeParser
polyglot java import org.enso.table.parsing.WhitespaceStrippingParser

## Creates a new table from a vector of `[name, items]` pairs.

Arguments:
Expand Down Expand Up @@ -527,6 +542,87 @@ type Table
problems = java_table.getProblems
Aggregate_Column_Helper.parse_aggregated_problems problems

## Parses columns within a Table to a specific value type.
By default, it looks at all `Text` columns and attempts to deduce the
type (columns with other types are not affected). If `column_types` are
provided, only selected columns are parsed, according to the specified
type.

NOTE: automatic type deduction (`Auto`) is not implemented yet; any column
whose expected type is `Auto` currently results in an "unimplemented"
error.

The default parser options only parse values where the process is
reversible (e.g., 0123 would not be converted to an integer as there is
a leading 0). However, settings in the `Data_Formatter` can
control this.

Arguments:
- parser: The `Data_Formatter` whose options configure the parsers.
- column_types: Either `Nothing`, in which case every column is treated as
`Auto`, or a vector of `Column_Type_Selection` choosing columns by name or
by index (negative indices count from the end) together with the target
datatype.
- on_problems: Specifies how the problems gathered during parsing are
attached to the result.
parse_values : Data_Formatter -> (Nothing | [Column_Type_Selection]) -> Problem_Behavior -> Table
parse_values parser=Data_Formatter column_types=Nothing on_problems=Report_Warning =
columns = this.columns
problem_builder = Vector.new_builder

## Determines the datatype requested for the column at `index`; `Nothing` means that the column is left unchanged.
find_datatype index column =
matching_input = column_types.filter selection->
selector = selection.column
## A negative `Integer` selector is resolved relative to the number of columns.
case selector of
Text -> column.name == selector
Integer -> if selector >= 0 then index == selector else
index == columns.length + selector
if matching_input.length == 0 then Nothing else
if matching_input.length == 1 then matching_input.first.datatype else
## Several selectors matched: this is always reported, and the column is parsed only if they all agree on the type.
first_type = matching_input.first.datatype
ambiguous = matching_input.exists s-> s.datatype != first_type
problem_builder.append (Duplicate_Type_Selector column.name ambiguous)
if ambiguous then Nothing else first_type

## Without explicit selections every column is marked as `Auto`.
expected_types = case column_types of
Nothing -> columns.map _->Auto
_ ->
## Validate the selectors first, collecting unknown column names and invalid indices.
missing_columns = Vector.new_builder
invalid_indices = Vector.new_builder
column_types.each selection->
selector = selection.column
case selector of
Integer ->
valid = Table_Helpers.is_index_valid columns.length selector
if valid.not then
invalid_indices.append selector
Text ->
found = columns.exists col-> col.name == selector
if found.not then
missing_columns.append selector
if missing_columns.is_empty.not then
problem_builder.append (Missing_Input_Columns missing_columns.to_vector)
if invalid_indices.is_empty.not then
problem_builder.append (Column_Indexes_Out_Of_Range invalid_indices.to_vector)
columns.map_with_index find_datatype

## Build the resulting columns, parsing only those that have a concrete expected type.
new_columns = columns.zip expected_types column-> expected_type-> case expected_type of
Nothing -> column
Auto -> Error.unimplemented "Automatic datatype inference is not implemented yet."
_ ->
## The options are kept under a separate name, because `parser` is rebound below.
parse_options = parser
## An empty separator text means that no thousand separator is used; it is passed on as `Nothing`.
thousand_separator = if parse_options.thousand_separator.is_empty then Nothing else parse_options.thousand_separator
## Select the Java parser for the expected type; an unsupported type yields an `Illegal_Argument_Error`.
base_parser = case expected_type of
Integer -> IntegerParser.new thousand_separator parse_options.allow_leading_zeros
Decimal -> DecimalParser.new parse_options.decimal_point thousand_separator parse_options.allow_leading_zeros
Boolean -> BooleanParser.new parse_options.true_values.to_array parse_options.false_values.to_array
_ ->
if expected_type == Date then DateParser.new parse_options.date_formats.to_array parse_options.locale.java_locale else
if expected_type == Time then DateTimeParser.new parse_options.datetime_formats.to_array parse_options.locale.java_locale else
if expected_type == Time_Of_Day then TimeParser.new parse_options.time_formats.to_array parse_options.locale.java_locale else
Error.throw (Illegal_Argument_Error "Unsupported target datatype: "+expected_type.to_text)
## When `trim_values` is set, wrap the parser in a `WhitespaceStrippingParser`.
parser = case parse_options.trim_values of
False -> base_parser
True -> WhitespaceStrippingParser.new base_parser
storage = column.java_column.getStorage
new_storage_and_problems = parser.parseColumn storage
new_storage = new_storage_and_problems.value
## Translate the problems reported by the Java parser and collect them with the other problems.
problems = Vector.Vector new_storage_and_problems.problems . map (Parse_Values_Helper.translate_parsing_problem column.name expected_type)
problems.each problem_builder.append
Column.Column (Java_Column.new column.name column.java_column.getIndex new_storage)

## TODO [RW] this case of is a workaround for wrong dataflow handling on arrays, it can be removed once the PR fixing it is merged, the relevant PR is:
https://github.com/enso-org/enso/pull/3400
result = here.new new_columns
on_problems.attach_problems_after result problem_builder.to_vector

## ALIAS Filter Rows
ALIAS Mask Columns

Expand Down Expand Up @@ -1264,7 +1360,7 @@ Text.write_to_spreadsheet cell = cell.setCellValue this
Arguments:
- cell: an instance of `org.apache.poi.ss.usermodel.Cell`, the value of
which should be set by this method.
Date.Date.write_to_spreadsheet cell = cell.setCellValue this.internal_local_date
Date.write_to_spreadsheet cell = cell.setCellValue this.internal_local_date



Expand Down
17 changes: 17 additions & 0 deletions distribution/lib/Standard/Table/0.0.0-dev/src/Error.enso
Original file line number Diff line number Diff line change
Expand Up @@ -118,3 +118,20 @@ type Invalid_Location (location:Text)
Invalid_Location.to_display_text : Text
Invalid_Location.to_display_text =
"The location '"+this.location+"' is not valid."

## Indicates that some values did not match the expected datatype format.
type Invalid_Format column:Text (datatype:(Integer|Number|Date|Time|Time_Of_Day|Boolean)) (cells:[Text])

## Generates a human-readable description of the problem, stating how many
   cells of the column did not match the format of the expected datatype.
Invalid_Format.to_display_text : Text
Invalid_Format.to_display_text =
    ## `length` is an Integer, so it has to be converted to text explicitly
       before it can be concatenated with the rest of the message.
    this.cells.length.to_text+" cells in column "+this.column+" had invalid format for datatype "+this.datatype.to_text+"."

## Indicates that some values contained leading zeros even though these were not allowed.
type Leading_Zeros column:Text (datatype:(Integer|Number|Date|Time|Time_Of_Day|Boolean)) (cells:[Text])

## Indicates that multiple `Column_Type_Selection` selectors match the same column.

If all matching selectors indicate the same type, the warning is reported but
a parse is attempted anyway. If mixed types are requested, the column is not
parsed due to ambiguity.
type Duplicate_Type_Selector column:Text ambiguous:Boolean
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@ from Standard.Table.Io.File_Format import Infer

polyglot java import org.enso.table.read.DelimitedReader
polyglot java import org.enso.table.read.ParsingFailedException
polyglot java import org.enso.table.read.InvalidRow
polyglot java import org.enso.table.read.MismatchedQuote
polyglot java import org.enso.table.read.AdditionalInvalidRows
polyglot java import org.enso.table.parsing.problems.InvalidRow
polyglot java import org.enso.table.parsing.problems.MismatchedQuote
polyglot java import org.enso.table.parsing.problems.AdditionalInvalidRows
polyglot java import java.lang.IllegalArgumentException
polyglot java import java.io.IOException
polyglot java import com.univocity.parsers.common.TextParsingException
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from Standard.Base import all

from Standard.Table.Error as Table_Errors import Invalid_Format, Leading_Zeros

polyglot java import org.enso.table.parsing.problems.InvalidRow
polyglot java import org.enso.table.parsing.problems.InvalidFormat
polyglot java import org.enso.table.parsing.problems.LeadingZeros
polyglot java import org.enso.table.parsing.problems.MismatchedQuote
polyglot java import org.enso.table.parsing.problems.AdditionalInvalidRows

## Converts a problem reported by the Java parsers into the corresponding Enso
   problem type.

   Arguments:
   - column_name: the name of the column that was being parsed.
   - expected_datatype: the datatype the column was being parsed to.
   - problem: the Java problem object to translate.

   If the problem is neither an `InvalidFormat` nor a `LeadingZeros` instance,
   an `Illegal_State_Error` is thrown.
translate_parsing_problem column_name expected_datatype problem =
    to_invalid_format = java_problem->
        Invalid_Format column_name expected_datatype (Vector.Vector java_problem.cells)
    to_leading_zeros = java_problem->
        Leading_Zeros column_name expected_datatype (Vector.Vector java_problem.cells)
    ## Pairs of a Java problem class and the function translating its instances.
    known_translations = [[InvalidFormat, to_invalid_format], [LeadingZeros, to_leading_zeros]]
    matching = known_translations.find entry-> Java.is_instance problem entry.first
    selected = matching.catch _->
        Error.throw (Illegal_State_Error "Reported an unknown problem type: "+problem.to_text)
    selected.second problem

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -6772,7 +6772,7 @@ object IR {
case object UnresolvedSequenceMacro extends Reason {
override def explain(originalName: Name): String =
"No definition for the sequence macro could be found. Try" +
" importing the default definition from the Base.Data.Vector module."
" importing the default definition from the Standard.Base module."
}

/** An error coming from an unknown annotation name.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,20 @@

/** A builder for boolean columns. */
public class BoolBuilder extends TypedBuilder {
private final BitSet vals = new BitSet();
private final BitSet isNa = new BitSet();
private final BitSet vals;
private final BitSet isNa;
int size = 0;

public BoolBuilder() {
vals = new BitSet();
isNa = new BitSet();
}

public BoolBuilder(int capacity) {
vals = new BitSet(capacity);
isNa = new BitSet(capacity);
}

@Override
public void appendNoGrow(Object o) {
if (o == null) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import java.util.Comparator;
import java.util.function.BiConsumer;
import java.util.function.Function;

import org.apache.poi.ss.usermodel.Cell;
import org.enso.table.data.column.builder.object.StringBuilder;
import org.enso.table.data.column.operation.map.MapOpStorage;
Expand Down Expand Up @@ -33,7 +32,9 @@ public String getItem(long idx) {
return (String) super.getItem(idx);
}

/** @inheritDoc */
/**
* @inheritDoc
*/
@Override
public long getType() {
return Type.STRING;
Expand Down
Loading

0 comments on commit 8430ce2

Please sign in to comment.