Parsing values with known types (#3455)

Implements https://www.pivotaltracker.com/story/show/181824146
enso-org · May 18, 2022 · 8430ce2 · 8430ce2
1 parent 78e7d69
commit 8430ce2
Show file tree

Hide file tree

Showing 40 changed files with 955 additions and 65 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -120,6 +120,8 @@
 - [Implemented `compute` method on `Vector` for statistics calculations.][3442]
 - [Promote get and put to be methods of Ref type rather than of Ref
   module][3457]
+- [Implemented `Table.parse_values`, parsing text columns according to a
+  specified type.][3455]
 
 [debug-shortcuts]:
   https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
@@ -186,6 +188,7 @@
 [3430]: https://github.com/enso-org/enso/pull/3430
 [3442]: https://github.com/enso-org/enso/pull/3442
 [3457]: https://github.com/enso-org/enso/pull/3457
+[3455]: https://github.com/enso-org/enso/pull/3455
 
 #### Enso Compiler
 

diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Boolean.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Boolean.enso
@@ -63,15 +63,6 @@ type Boolean
     not : Boolean
     not = @Builtin_Method "Boolean.not"
 
-    ## Generates a human-readable text representation of the boolean.
-
-       > Example
-         Converting the value True to text.
-
-             True.to_text
-    to_text : Text
-    to_text = @Builtin_Method "Boolean.to_text"
-
     ## The if-then-else control flow operator that executes one of two branches
        based on a conditional.
 

diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Vector.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Vector.enso
@@ -980,6 +980,9 @@ type Vector
         json = this.take_start 100 . to_json
         json.to_text
 
+## PRIVATE
+type Wrapped_Error error
+
 type Builder
 
     ## PRIVATE

diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Meta.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Meta.enso
@@ -322,8 +322,8 @@ is_a value typ = if typ == Any then True else
                     Constructor _ ->
                         meta_typ = here.meta typ
                         case meta_typ of
-                            Atom _ -> meta_val.constructor == meta_typ.constructor
-                            Constructor _ -> meta_val.constructor == meta_typ
+                            Atom _ -> meta_val == meta_typ.constructor
+                            Constructor _ -> meta_val == meta_typ
                             _ -> False
                     Error _ -> typ == Error
                     Unresolved_Symbol _ -> typ == Unresolved_Symbol

diff --git a/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Table.enso b/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Table.enso
@@ -17,6 +17,7 @@ from Standard.Table.Data.Table import No_Such_Column_Error
 from Standard.Table.Data.Order_Rule as Order_Rule_Module import Order_Rule
 from Standard.Table.Data.Column_Selector as Column_Selector_Module import Column_Selector, By_Index
 from Standard.Table.Data.Sort_Method as Sort_Method_Module import Sort_Method
+from Standard.Table.Data.Data_Formatter as Data_Formatter_Module import Data_Formatter
 from Standard.Base.Error.Problem_Behavior as Problem_Behavior_Module import Problem_Behavior, Report_Warning
 from Standard.Database.Error as Database_Errors import Unsupported_Database_Operation_Error
 import Standard.Table.Data.Column_Mapping
@@ -666,6 +667,16 @@ type Table
             on_problems.attach_problems_before problems <|
                 this.updated_context_and_columns new_ctx new_columns
 
+    ## Parsing values is not supported in database tables, the table has to be
+       materialized first with `to_dataframe`.
+    parse_values : Data_Formatter -> (Nothing | [Column_Type_Selection]) -> Problem_Behavior -> Table
+    parse_values parser=Data_Formatter column_types=Nothing on_problems=Report_Warning =
+        ## Avoid unused arguments warning. We cannot rename arguments to `_`,
+           because we need to keep the API consistent with the in-memory table.
+        _ = [parser, column_types, on_problems]
+        msg = "Parsing values is not supported in database tables, the table has to be materialized first with `to_dataframe`."
+        Error.throw (Unsupported_Database_Operation_Error msg)
+
     ## UNSTABLE
 
        Returns a new Table without rows that contained missing values in any of

diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column_Type_Selection.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column_Type_Selection.enso
@@ -0,0 +1,25 @@
+from Standard.Base import all
+import Standard.Base.Data.Time
+
+## The type representing inferring the column type automatically based on values
+   present in the column.
+
+   The most specific type which is valid for all values in a column is chosen:
+   - if all values are integers, `Integer` is chosen,
+   - if all values are decimals or integers, `Decimal` is chosen,
+   - if all values are booleans, `Boolean` is chosen,
+   - if the values are all the same time type (a date, a time or a date-time),
+     the corresponding type is chosen, `Date`, `Time_Of_Day` or `Time`,
+     respectively,
+   - otherwise, `Text` is chosen as a fallback and the column is kept as-is
+     without parsing.
+type Auto
+
+## Specifies the desired datatype for parsing a particular column.
+
+   Arguments:
+   - column: the column selector which can either be the column name or the
+     index.
+   - datatype: The desired datatype for the column or `Auto` to infer the type
+     from the data.
+type Column_Type_Selection (column:Text|Integer) datatype:(Auto|Integer|Decimal|Date|Time|Time_Of_Day|Boolean)=Auto
diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Data_Formatter.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Data_Formatter.enso
@@ -0,0 +1,29 @@
+from Standard.Base import all
+
+## Specifies options for reading text data in a table to more specific types and
+   serializing them back.
+
+   Arguments:
+   - trim_values: Trim whitespace before parsing.
+   - allow_leading_zeros: Specifies how to treat numeric values starting with
+     leading zeroes. Defaults to `False`, because converting such
+     values to numbers is a lossy operation - after converting such a number
+     back to text the leading zeroes will get lost. If leading zeroes are not
+     allowed and the column contains any values with leading zeroes, it will not
+     get automatically converted to numbers, remaining as text. However, if the
+     column is specifically requested to be converted to a numeric column, only
+     a warning will be issued indicating that some leading zeroes were present,
+     but the conversion will proceed.
+   - decimal_point: The character used to separate the integer part from the
+     fractional part of a number. Defaults to '.'. Can be changed for example to
+     ',' to allow for European format.
+   - thousand_separator: A separator that can be used to separate groups of
+     digits in numbers. For example, it can be set to ',' to allow for notation
+     like '1,000,000.0'.
+   - datetime_formats: Expected datetime formats.
+   - date_formats: Expected date formats.
+   - time_formats: Expected time formats.
+   - locale: The locale to use when parsing dates and times.
+   - true_values: Values representing True.
+   - false_values: Values representing False.
+type Data_Formatter trim_values:Boolean=True allow_leading_zeros:Boolean=False decimal_point:Text='.' thousand_separator:Text='' datetime_formats:[Text]=["yyyy-MM-dd HH:mm:ss"] date_formats:[Text]=["yyyy-MM-dd"] time_formats:[Text]=["HH:mm:ss"] locale:Locale=Locale.default true_values:[Text]=["True","true","TRUE"] false_values:[Text]=["False","false","FALSE"]
diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso
@@ -4,26 +4,41 @@ import Standard.Base.System.Platform
 import Standard.Table.Data.Column
 import Standard.Table.Io.Csv
 import Standard.Visualization
-import Standard.Base.Data.Time.Date
+from Standard.Base.Data.Time.Date as Date_Module import Date
+from Standard.Base.Data.Time as Time_Module import Time
+from Standard.Base.Data.Time.Time_Of_Day as Time_Of_Day_Module import Time_Of_Day
 import Standard.Table.Io.Spreadsheet_Write_Mode
 import Standard.Table.Io.Format
 import Standard.Table.Internal.Table_Helpers
 import Standard.Table.Internal.Aggregate_Column_Helper
+import Standard.Table.Internal.Parse_Values_Helper
 
 from Standard.Table.Data.Order_Rule as Order_Rule_Module import Order_Rule
 from Standard.Table.Data.Column_Selector as Column_Selector_Module import Column_Selector, By_Index
+from Standard.Table.Data.Column_Type_Selection as Column_Type_Selection_Module import Column_Type_Selection, Auto
+from Standard.Table.Data.Data_Formatter as Data_Formatter_Module import Data_Formatter
 from Standard.Table.Data.Sort_Method as Sort_Method_Module import Sort_Method
 from Standard.Base.Error.Problem_Behavior as Problem_Behavior_Module import Problem_Behavior, Report_Warning
+from Standard.Table.Error as Error_Module import Missing_Input_Columns, Column_Indexes_Out_Of_Range, Duplicate_Type_Selector
 import Standard.Table.Data.Column_Mapping
 import Standard.Table.Data.Position
 
 import Standard.Table.Data.Aggregate_Column
 
 polyglot java import org.enso.table.data.table.Table as Java_Table
+polyglot java import org.enso.table.data.table.Column as Java_Column
 polyglot java import org.enso.table.operations.OrderBuilder
 polyglot java import org.enso.table.format.csv.Writer as Csv_Writer
 polyglot java import org.enso.table.format.xlsx.Writer as Spreadsheet_Writer
 
+polyglot java import org.enso.table.parsing.IntegerParser
+polyglot java import org.enso.table.parsing.DecimalParser
+polyglot java import org.enso.table.parsing.BooleanParser
+polyglot java import org.enso.table.parsing.DateParser
+polyglot java import org.enso.table.parsing.TimeParser
+polyglot java import org.enso.table.parsing.DateTimeParser
+polyglot java import org.enso.table.parsing.WhitespaceStrippingParser
+
 ## Creates a new table from a vector of `[name, items]` pairs.
 
    Arguments:
@@ -527,6 +542,87 @@ type Table
                 problems = java_table.getProblems
                 Aggregate_Column_Helper.parse_aggregated_problems problems
 
+    ## Parses columns within a Table to a specific value type.
+       By default, it looks at all `Text` columns and attempts to deduce the
+       type (columns with other types are not affected). If `column_types` are
+       provided, only selected columns are parsed, according to the specified
+       type.
+
+       The default parser options only parse values where the process is
+       reversible (e.g., 0123 would not be converted to an integer as there is
+       a leading 0). However, settings in the `Data_Formatter` can
+       control this.
+    parse_values : Data_Formatter -> (Nothing | [Column_Type_Selection]) -> Problem_Behavior -> Table
+    parse_values parser=Data_Formatter column_types=Nothing on_problems=Report_Warning =
+        columns = this.columns
+        problem_builder = Vector.new_builder
+
+        find_datatype index column =
+            matching_input = column_types.filter selection->
+                selector = selection.column
+                case selector of
+                    Text -> column.name == selector
+                    Integer -> if selector >= 0 then index == selector else
+                        index == columns.length + selector
+            if matching_input.length == 0 then Nothing else
+                if matching_input.length == 1 then matching_input.first.datatype else
+                    first_type = matching_input.first.datatype
+                    ambiguous = matching_input.exists s-> s.datatype != first_type
+                    problem_builder.append (Duplicate_Type_Selector column.name ambiguous)
+                    if ambiguous then Nothing else first_type
+
+        expected_types = case column_types of
+            Nothing -> columns.map _->Auto
+            _ ->
+                missing_columns = Vector.new_builder
+                invalid_indices = Vector.new_builder
+                column_types.each selection->
+                    selector = selection.column
+                    case selector of
+                        Integer ->
+                            valid = Table_Helpers.is_index_valid columns.length selector
+                            if valid.not then
+                                invalid_indices.append selector
+                        Text ->
+                            found = columns.exists col-> col.name == selector
+                            if found.not then
+                                missing_columns.append selector
+                if missing_columns.is_empty.not then
+                    problem_builder.append (Missing_Input_Columns missing_columns.to_vector)
+                if invalid_indices.is_empty.not then
+                    problem_builder.append (Column_Indexes_Out_Of_Range invalid_indices.to_vector)
+                columns.map_with_index find_datatype
+
+        new_columns = columns.zip expected_types column-> expected_type-> case expected_type of
+            Nothing -> column
+            Auto -> Error.unimplemented "Automatic datatype inference is not implemented yet."
+            _ ->
+                parse_options = parser
+                thousand_separator = if parse_options.thousand_separator.is_empty then Nothing else parse_options.thousand_separator
+                base_parser = case expected_type of
+                    Integer -> IntegerParser.new thousand_separator parse_options.allow_leading_zeros
+                    Decimal -> DecimalParser.new parse_options.decimal_point thousand_separator parse_options.allow_leading_zeros
+                    Boolean -> BooleanParser.new parse_options.true_values.to_array parse_options.false_values.to_array
+                    _ ->
+                        if expected_type == Date then DateParser.new parse_options.date_formats.to_array parse_options.locale.java_locale else
+                            if expected_type == Time then DateTimeParser.new parse_options.datetime_formats.to_array parse_options.locale.java_locale else
+                                if expected_type == Time_Of_Day then TimeParser.new parse_options.time_formats.to_array parse_options.locale.java_locale else
+                                    Error.throw (Illegal_Argument_Error "Unsupported target datatype: "+expected_type.to_text)
+                parser = case parse_options.trim_values of
+                    False -> base_parser
+                    True -> WhitespaceStrippingParser.new base_parser
+                storage = column.java_column.getStorage
+                new_storage_and_problems = parser.parseColumn storage
+                new_storage = new_storage_and_problems.value
+                problems = Vector.Vector new_storage_and_problems.problems . map (Parse_Values_Helper.translate_parsing_problem column.name expected_type)
+                problems.each problem_builder.append
+                Column.Column (Java_Column.new column.name column.java_column.getIndex new_storage)
+
+        ## TODO [RW] this case of is a workaround for wrong dataflow handling on arrays, it can be removed once the PR fixing it is merged, the relevant PR is:
+           https://github.com/enso-org/enso/pull/3400
+        result = here.new new_columns
+        on_problems.attach_problems_after result problem_builder.to_vector
+
     ## ALIAS Filter Rows
        ALIAS Mask Columns
 
@@ -1264,7 +1360,7 @@ Text.write_to_spreadsheet cell = cell.setCellValue this
    Arguments:
    - cell: an instance of `org.apache.poi.ss.usermodel.Cell`, the value of
      which should be set by this method.
-Date.Date.write_to_spreadsheet cell = cell.setCellValue this.internal_local_date
+Date.write_to_spreadsheet cell = cell.setCellValue this.internal_local_date
 
 
 

diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Error.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Error.enso
@@ -118,3 +118,20 @@ type Invalid_Location (location:Text)
 Invalid_Location.to_display_text : Text
 Invalid_Location.to_display_text =
     "The location '"+this.location+"' is not valid."
+
+## Indicates that some values did not match the expected datatype format.
+type Invalid_Format column:Text (datatype:(Integer|Number|Date|Time|Time_Of_Day|Boolean)) (cells:[Text])
+
+Invalid_Format.to_display_text : Text
+Invalid_Format.to_display_text =
+    this.cells.length+" cells in column "+this.column+" had invalid format for datatype "+this.datatype.to_text+"."
+
+## Indicates that some values contained leading zeros even though these were not allowed.
+type Leading_Zeros column:Text (datatype:(Integer|Number|Date|Time|Time_Of_Day|Boolean)) (cells:[Text])
+
+## Indicates that multiple `Column_Type_Selector` match the same column.
+
+   If all matching selectors indicate the same type, the warning is reported but
+   a parse is attempted anyway. If mixed types are requested, the column is not
+   parsed due to ambiguity.
+type Duplicate_Type_Selector column:Text ambiguous:Boolean
diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Reader.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Reader.enso
@@ -9,9 +9,9 @@ from Standard.Table.Io.File_Format import Infer
 
 polyglot java import org.enso.table.read.DelimitedReader
 polyglot java import org.enso.table.read.ParsingFailedException
-polyglot java import org.enso.table.read.InvalidRow
-polyglot java import org.enso.table.read.MismatchedQuote
-polyglot java import org.enso.table.read.AdditionalInvalidRows
+polyglot java import org.enso.table.parsing.problems.InvalidRow
+polyglot java import org.enso.table.parsing.problems.MismatchedQuote
+polyglot java import org.enso.table.parsing.problems.AdditionalInvalidRows
 polyglot java import java.lang.IllegalArgumentException
 polyglot java import java.io.IOException
 polyglot java import com.univocity.parsers.common.TextParsingException

diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Parse_Values_Helper.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Parse_Values_Helper.enso
@@ -0,0 +1,19 @@
+from Standard.Base import all
+
+from Standard.Table.Error as Table_Errors import Invalid_Format, Leading_Zeros
+
+polyglot java import org.enso.table.parsing.problems.InvalidRow
+polyglot java import org.enso.table.parsing.problems.InvalidFormat
+polyglot java import org.enso.table.parsing.problems.LeadingZeros
+polyglot java import org.enso.table.parsing.problems.MismatchedQuote
+polyglot java import org.enso.table.parsing.problems.AdditionalInvalidRows
+
+translate_parsing_problem column_name expected_datatype problem =
+    invalid_format = [InvalidFormat, (java_problem-> Invalid_Format column_name expected_datatype (Vector.Vector java_problem.cells))]
+    leading_zeros = [LeadingZeros, (java_problem-> Leading_Zeros column_name expected_datatype (Vector.Vector java_problem.cells))]
+    translations = [invalid_format, leading_zeros]
+    found = translations.find t->
+        Java.is_instance problem t.first
+    translation = found.catch _->
+        Error.throw (Illegal_State_Error "Reported an unknown problem type: "+problem.to_text)
+    translation.second problem
diff --git a/...e/runtime/src/main/java/org/enso/interpreter/node/expression/builtin/bool/ToTextNode.java b/...e/runtime/src/main/java/org/enso/interpreter/node/expression/builtin/bool/ToTextNode.java
diff --git a/engine/runtime/src/main/scala/org/enso/compiler/core/IR.scala b/engine/runtime/src/main/scala/org/enso/compiler/core/IR.scala
@@ -6772,7 +6772,7 @@ object IR {
       case object UnresolvedSequenceMacro extends Reason {
         override def explain(originalName: Name): String =
           "No definition for the sequence macro could be found. Try" +
-          " importing the default definition from the Base.Data.Vector module."
+          " importing the default definition from the Standard.Base module."
       }
 
       /** An error coming from an unknown annotation name.

diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/builder/object/BoolBuilder.java b/std-bits/table/src/main/java/org/enso/table/data/column/builder/object/BoolBuilder.java
@@ -6,10 +6,20 @@
 
 /** A builder for boolean columns. */
 public class BoolBuilder extends TypedBuilder {
-  private final BitSet vals = new BitSet();
-  private final BitSet isNa = new BitSet();
+  private final BitSet vals;
+  private final BitSet isNa;
   int size = 0;
 
+  public BoolBuilder() {
+    vals = new BitSet();
+    isNa = new BitSet();
+  }
+
+  public BoolBuilder(int capacity) {
+    vals = new BitSet(capacity);
+    isNa = new BitSet(capacity);
+  }
+
   @Override
   public void appendNoGrow(Object o) {
     if (o == null) {

diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java
@@ -4,7 +4,6 @@
 import java.util.Comparator;
 import java.util.function.BiConsumer;
 import java.util.function.Function;
-
 import org.apache.poi.ss.usermodel.Cell;
 import org.enso.table.data.column.builder.object.StringBuilder;
 import org.enso.table.data.column.operation.map.MapOpStorage;
@@ -33,7 +32,9 @@ public String getItem(long idx) {
     return (String) super.getItem(idx);
   }
 
-  /** @inheritDoc */
+  /**
+   * @inheritDoc
+   */
   @Override
   public long getType() {
     return Type.STRING;