From 0350762386d176c41dc717b3f0ebdda54bdc4ce1 Mon Sep 17 00:00:00 2001 From: James Dunkerley Date: Thu, 20 Apr 2023 17:04:59 +0100 Subject: [PATCH] Add `replace`, `trim` to Column. Better number parsing. (#6253) - Add `replace` with same syntax as on `Text` to an in-memory `Column`. - Add `trim` with same syntax as on `Text` to an in-memory `Column`. - Add `trim` to in-database `Column`. - Added `is_supported` to dialects and exposed the dialect consistently on the `Connection`. - Add `write_table` support to `JSON_File` allowing `Table.write` to write JSON. - Updated the parsing for integers and decimals: - Support for currency symbols. - Support for brackets for negative numbers. - Automatic detection of decimal points and thousand separators. - Tighter rules for scientific and thousand separated numbers. - Remove `replace_text` from `Table`. - Remove `write_json` from `Table`. --- CHANGELOG.md | 3 + .../0.0.0-dev/src/Data/Text/Extensions.enso | 9 +- .../Database/0.0.0-dev/src/Data/Column.enso | 58 ++++ .../Database/0.0.0-dev/src/Data/SQL.enso | 12 + .../src/Internal/Base_Generator.enso | 6 + .../Postgres/Postgres_Connection.enso | 4 + .../Internal/Postgres/Postgres_Dialect.enso | 22 +- .../Internal/SQLite/SQLite_Connection.enso | 4 + .../src/Internal/SQLite/SQLite_Dialect.enso | 24 +- .../lib/Standard/Table/0.0.0-dev/package.yaml | 1 + .../Table/0.0.0-dev/src/Data/Column.enso | 106 +++++- .../0.0.0-dev/src/Data/Data_Formatter.enso | 46 ++- .../Table/0.0.0-dev/src/Data/Table.enso | 150 +++------ .../0.0.0-dev/src/Data/Table_Conversions.enso | 29 ++ .../Standard/Table/0.0.0-dev/src/Errors.enso | 16 - .../0.0.0-dev/src/Internal/Java_Problems.enso | 3 - .../src/Internal/Parse_Values_Helper.enso | 5 +- .../org/enso/table/parsing/DecimalParser.java | 105 ------ .../org/enso/table/parsing/IntegerParser.java | 61 ---- .../org/enso/table/parsing/NumberParser.java | 303 ++++++++++++++++++ .../table/parsing/problems/LeadingZeros.java | 8 - .../problems/NoOpProblemAggregator.java | 3 - .../parsing/problems/ProblemAggregator.java | 3 - .../problems/ProblemAggregatorImpl.java | 9 - .../problems/SimplifiedProblemAggregator.java | 5 - .../Column_Operations_Spec.enso | 63 +++- .../src/Formatting/Data_Formatter_Spec.enso | 4 +- .../src/Formatting/Parse_Values_Spec.enso | 115 +++++-- test/Table_Tests/src/IO/Formats_Spec.enso | 40 ++- test/Table_Tests/src/IO/Json_Spec.enso | 8 - .../Table_Tests/src/In_Memory/Table_Spec.enso | 59 ---- 31 files changed, 826 insertions(+), 458 deletions(-) delete mode 100644 std-bits/table/src/main/java/org/enso/table/parsing/DecimalParser.java delete mode 100644 std-bits/table/src/main/java/org/enso/table/parsing/IntegerParser.java create mode 100644 std-bits/table/src/main/java/org/enso/table/parsing/NumberParser.java delete mode 100644 std-bits/table/src/main/java/org/enso/table/parsing/problems/LeadingZeros.java diff --git a/CHANGELOG.md b/CHANGELOG.md index 271435a81f32..16a11dd97edc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -385,6 +385,8 @@ - [Implemented `Table.union` for the Database backend.][6204] - [Array & Vector have the same methods & behavior][6218] - [Implemented `Table.split` and `Table.tokenize` for in-memory tables.][6233] +- [Added `trim` and `replace` to `Column`. 
Enhanced number parsing with support + for thousands and decimal point automatic detection.][6253] [debug-shortcuts]: https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug @@ -583,6 +585,7 @@ [6077]: https://github.com/enso-org/enso/pull/6077 [6218]: https://github.com/enso-org/enso/pull/6218 [6233]: https://github.com/enso-org/enso/pull/6233 +[6253]: https://github.com/enso-org/enso/pull/6253 #### Enso Compiler diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso index 95d7ffc615b9..fef524fb9ae6 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso @@ -1068,12 +1068,11 @@ Text.pad self length=0 with_pad=' ' at=Location.End = case at of whitespace, from the start, the end, or both ends of the input. Arguments: - - trim_characters: A Text containing characters that should be removed or a - predicate taking single character strings and specifying if they should be - removed. By default, this should be any Unicode whitespace characters and - all line terminator characters. - - from: The location of where to trim the input. By default, this function + - where: The location of where to trim the input. By default, this function trims both ends of the input. + - what: Either a `Text` containing characters that should be removed or a + predicate taking single character strings and specifying if they should be + removed. By default, all Unicode whitespace is removed. > Example Trimming whitespace from a string. diff --git a/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Column.enso b/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Column.enso index 42afd4a5a820..236ea652187c 100644 --- a/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Column.enso +++ b/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Column.enso @@ -811,6 +811,58 @@ type Column new_name = self.naming_helpers.binary_operation_name "like" self pattern self.make_binary_op "LIKE" pattern new_name + ## This function removes the specified characters, by default any + whitespace, from the start, the end, or both ends of the input. + + Arguments: + - where: The location of where to trim the input. By default, this + function trims both ends of the input. + - what: A `Text` (or text `Column`) containing characters that should be + removed. By default, spaces, tabs, returns and new lines are removed. + trim : Location -> Column | Text -> Column + trim self where=Location.Both what='' = Value_Type.expect_text self.value_type <| check_text_argument what "what" <| + new_name = self.naming_helpers.function_name "trim" [self] + operator = case where of + Location.Both -> "TRIM" + Location.Start -> "LTRIM" + Location.End -> "RTRIM" + if self.connection.dialect.is_supported operator then self.make_binary_op operator what new_name else + Error.throw (Unsupported_Database_Operation.Error ("`Column.trim "+where.to_text+"` is not supported by this connection.")) + + ## Replaces the first, or all occurrences of `term` with `new_text` in each + row. If `term` is empty, the function returns the table unchanged. + + This method follows the exact replacement semantics of the + `Text.replace` method. + + Arguments: + - term: The term to find. + - replacement: The text to replace matches with. + - case_sensitivity: Specifies if the text values should be compared case + sensitively. 
+ - only_first: If True, only replace the first match. + - use_regex: If true, the term is used as a regular expression. + + > Example + Replace dashes with underscores. + + column.replace "-" "_" + + > Example + Remove leading and trailing spaces from cells. + + column.replace "^\s*(.*?)\s*$" "$1" use_regex=True + + > Example + Replace texts in quotes with parentheses. + + column.replace '"(.*?)"' '($1)' use_regex=True + replace : Text | Column -> Text | Column -> Case_Sensitivity -> Boolean -> Boolean -> Column + replace self term="" new_text="" case_sensitivity=Case_Sensitivity.Sensitive only_first=False use_regex=False = + _ = [term, new_text, case_sensitivity, only_first, use_regex] + msg = "`Column.replace` is not yet implemented." + Error.throw (Unsupported_Database_Operation.Error msg) + ## Gets the year as a number from the date stored in the column. Applies only to columns that hold the `Date` or `Date_Time` types. @@ -1089,3 +1141,9 @@ make_equality_check_with_floating_point_handling column other op = result = column.make_binary_op op other Problem_Behavior.Report_Warning.attach_problems_after result <| Java_Problems.parse_aggregated_problems problem_builder.getProblems + +## PRIVATE +check_text_argument val field ~action = case val of + _ : Text -> action + _ : Column -> Value_Type.expect_text val.value_type <| action + _ -> Error.throw (Illegal_Argument.Error "The `"+field+"` must be Text or a Text Column.") diff --git a/distribution/lib/Standard/Database/0.0.0-dev/src/Data/SQL.enso b/distribution/lib/Standard/Database/0.0.0-dev/src/Data/SQL.enso index 229d210111ea..c5429fc6d7c0 100644 --- a/distribution/lib/Standard/Database/0.0.0-dev/src/Data/SQL.enso +++ b/distribution/lib/Standard/Database/0.0.0-dev/src/Data/SQL.enso @@ -105,6 +105,18 @@ type Builder is_empty : Boolean is_empty self = self.fragments.is_empty + ## UNSTABLE + + Checks if a code fragment is a constant. + is_constant : Boolean + is_constant self = + if self.fragments.length != 1 then False else + case self.fragments of + Vector_Builder.Leaf vec -> case vec.at 0 of + SQL_Fragment.Interpolation _ -> True + _ -> False + _ -> False + ## PRIVATE ADVANCED Builds a SQL statement. diff --git a/distribution/lib/Standard/Database/0.0.0-dev/src/Internal/Base_Generator.enso b/distribution/lib/Standard/Database/0.0.0-dev/src/Internal/Base_Generator.enso index 723ba07e48b9..0ed276359ec3 100644 --- a/distribution/lib/Standard/Database/0.0.0-dev/src/Internal/Base_Generator.enso +++ b/distribution/lib/Standard/Database/0.0.0-dev/src/Internal/Base_Generator.enso @@ -40,6 +40,12 @@ type Internal_Dialect new_map = mappings.fold self.operation_map (m -> el -> m.insert (el.at 0) (el.at 1)) Internal_Dialect.Value new_map self.wrap_identifier + ## PRIVATE + Checks if an operation is supported by the dialect. + is_supported : Text -> Boolean + is_supported self operation = + self.operation_map.contains_key operation + ## PRIVATE A helper function to create a binary operator. 
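The in-database `trim` above only emits `TRIM`/`LTRIM`/`RTRIM` when `connection.dialect.is_supported` confirms the operator is available for the current backend. A minimal usage sketch follows, assuming a hypothetical database column `db_column` obtained from a connection:

    # Trim whitespace from both ends (the default), pushed down as SQL TRIM.
    both = db_column.trim

    # Trim a custom character set from the start only, using LTRIM.
    left = db_column.trim where=Location.Start what=' _'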
diff --git a/distribution/lib/Standard/Database/0.0.0-dev/src/Internal/Postgres/Postgres_Connection.enso b/distribution/lib/Standard/Database/0.0.0-dev/src/Internal/Postgres/Postgres_Connection.enso index d8badf6f1ecc..95bac49793f9 100644 --- a/distribution/lib/Standard/Database/0.0.0-dev/src/Internal/Postgres/Postgres_Connection.enso +++ b/distribution/lib/Standard/Database/0.0.0-dev/src/Internal/Postgres/Postgres_Connection.enso @@ -154,6 +154,10 @@ type Postgres_Connection upload_table self name table temporary=True batch_size=1000 = Panic.recover Illegal_State <| self.connection.upload_table name table temporary batch_size + ## PRIVATE + Access the dialect. + dialect self = self.connection.dialect + ## PRIVATE Creates a Postgres connection based on a URL, properties and a dialect. diff --git a/distribution/lib/Standard/Database/0.0.0-dev/src/Internal/Postgres/Postgres_Dialect.enso b/distribution/lib/Standard/Database/0.0.0-dev/src/Internal/Postgres/Postgres_Dialect.enso index a698436946f6..6f4766c6ae2d 100644 --- a/distribution/lib/Standard/Database/0.0.0-dev/src/Internal/Postgres/Postgres_Dialect.enso +++ b/distribution/lib/Standard/Database/0.0.0-dev/src/Internal/Postgres/Postgres_Dialect.enso @@ -162,10 +162,16 @@ type Postgres_Dialect _ = aggregate True + ## PRIVATE + Checks if an operation is supported by the dialect. + is_supported : Text -> Boolean + is_supported self operation = + self.internal_generator_dialect.is_supported operation + ## PRIVATE make_internal_generator_dialect = cases = [["LOWER", Base_Generator.make_function "LOWER"], ["UPPER", Base_Generator.make_function "UPPER"]] - text = [starts_with, contains, ends_with, agg_shortest, agg_longest, make_case_sensitive]+concat_ops+cases + text = [starts_with, contains, ends_with, agg_shortest, agg_longest, make_case_sensitive]+concat_ops+cases+trim_ops counts = [agg_count_is_null, agg_count_empty, agg_count_not_empty, ["COUNT_DISTINCT", agg_count_distinct], ["COUNT_DISTINCT_INCLUDE_NULL", agg_count_distinct_include_null]] arith_extensions = [is_nan, decimal_div, mod_op, ["ROW_MIN", Base_Generator.make_function "LEAST"], ["ROW_MAX", Base_Generator.make_function "GREATEST"]] bool = [bool_or] @@ -264,6 +270,20 @@ concat_ops = concat = Base_Generator.make_concat make_raw_concat_expr make_contains_expr [["CONCAT", concat (has_quote=False)], ["CONCAT_QUOTE_IF_NEEDED", concat (has_quote=True)]] +## PRIVATE +trim_ops = + whitespace = "' ' || CHR(9) || CHR(10) || CHR(13)" + make_fn fn_name = Base_Generator.lift_binary_op fn_name input-> chars-> case chars of + Nothing -> Builder.code fn_name+"(" ++ input ++ ", " ++ whitespace ++ ")" + _ -> + case chars.is_constant of + True -> + const = chars.fragments.vec.first.object + if const.is_nothing || const.is_empty then Builder.code fn_name+"(" ++ input ++ ", " ++ whitespace ++ ")" else + Builder.code fn_name+"(" ++ input ++ ", " ++ chars ++ ")" + False -> + Builder.code "CASE WHEN " ++ chars ++ " IS NULL OR " ++ chars ++ " = '' THEN " ++ fn_name ++ "(" ++ input ++ ") ELSE " ++ fn_name ++ "(" ++ input ++ ", " ++ chars ++ ") END" + [make_fn "TRIM", make_fn "LTRIM", make_fn "RTRIM"] ## PRIVATE agg_count_distinct args = if args.is_empty then (Error.throw (Illegal_Argument.Error "COUNT_DISTINCT requires at least one argument.")) else diff --git a/distribution/lib/Standard/Database/0.0.0-dev/src/Internal/SQLite/SQLite_Connection.enso b/distribution/lib/Standard/Database/0.0.0-dev/src/Internal/SQLite/SQLite_Connection.enso index 7d4b2ab01b21..8e7ba1de5592 100644 --- 
a/distribution/lib/Standard/Database/0.0.0-dev/src/Internal/SQLite/SQLite_Connection.enso +++ b/distribution/lib/Standard/Database/0.0.0-dev/src/Internal/SQLite/SQLite_Connection.enso @@ -147,6 +147,10 @@ type SQLite_Connection upload_table self name table temporary=True batch_size=1000 = Panic.recover Illegal_State <| self.connection.upload_table name table temporary batch_size + ## PRIVATE + Access the dialect. + dialect self = self.connection.dialect + ## PRIVATE Creates a SQLite connection based on a JDBC URL and properties. diff --git a/distribution/lib/Standard/Database/0.0.0-dev/src/Internal/SQLite/SQLite_Dialect.enso b/distribution/lib/Standard/Database/0.0.0-dev/src/Internal/SQLite/SQLite_Dialect.enso index 41dd8548206b..6e903c5ee437 100644 --- a/distribution/lib/Standard/Database/0.0.0-dev/src/Internal/SQLite/SQLite_Dialect.enso +++ b/distribution/lib/Standard/Database/0.0.0-dev/src/Internal/SQLite/SQLite_Dialect.enso @@ -188,9 +188,17 @@ type SQLite_Dialect Average _ _ -> True Median _ _ -> unsupported "Median" + ## PRIVATE + Checks if an operation is supported by the dialect. + is_supported : Text -> Boolean + is_supported self operation = + self.internal_generator_dialect.is_supported operation + + + ## PRIVATE make_internal_generator_dialect = - text = [starts_with, contains, ends_with, make_case_sensitive]+concat_ops + text = [starts_with, contains, ends_with, make_case_sensitive]+concat_ops+trim_ops counts = [agg_count_is_null, agg_count_empty, agg_count_not_empty, ["COUNT_DISTINCT", agg_count_distinct], ["COUNT_DISTINCT_INCLUDE_NULL", agg_count_distinct_include_null]] stats = [agg_stddev_pop, agg_stddev_samp] arith_extensions = [decimal_div, mod_op] @@ -261,6 +269,20 @@ concat_ops = concat = Base_Generator.make_concat make_raw_concat_expr make_contains_expr [["CONCAT", concat (has_quote=False)], ["CONCAT_QUOTE_IF_NEEDED", concat (has_quote=True)]] +## PRIVATE +trim_ops = + whitespace = "' ' || CHAR(9) || CHAR(10) || CHAR(13)" + make_fn fn_name = Base_Generator.lift_binary_op fn_name input-> chars-> case chars of + Nothing -> Builder.code fn_name+"(" ++ input ++ ", " ++ whitespace ++ ")" + _ -> + case chars.is_constant of + True -> + const = chars.fragments.vec.first.object + if const.is_nothing || const.is_empty then Builder.code fn_name+"(" ++ input ++ ", " ++ whitespace ++ ")" else + Builder.code fn_name+"(" ++ input ++ ", " ++ chars ++ ")" + False -> + Builder.code "CASE WHEN " ++ chars ++ " IS NULL OR " ++ chars ++ " == '' THEN " ++ fn_name ++ "(" ++ input ++ ") ELSE " ++ fn_name ++ "(" ++ input ++ ", " ++ chars ++ ") END" + [make_fn "TRIM", make_fn "LTRIM", make_fn "RTRIM"] ## PRIVATE agg_count_distinct args = case args.length == 1 of diff --git a/distribution/lib/Standard/Table/0.0.0-dev/package.yaml b/distribution/lib/Standard/Table/0.0.0-dev/package.yaml index 383e63698216..3f96ef6f2ae9 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/package.yaml +++ b/distribution/lib/Standard/Table/0.0.0-dev/package.yaml @@ -19,6 +19,7 @@ component-groups: - Standard.Base.Select: exports: - Standard.Table.Data.Table.Table.at + - Standard.Table.Data.Table.Table.get - Standard.Table.Data.Table.Table.select_columns - Standard.Table.Data.Table.Table.remove_columns - Standard.Table.Data.Table.Table.reorder_columns diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso index b8b59be0f611..b7cc7439cde9 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso +++ 
b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso @@ -1,4 +1,5 @@ from Standard.Base import all +from Standard.Base import all import Standard.Base.Data.Array_Proxy.Array_Proxy import Standard.Base.Errors.Common.Index_Out_Of_Bounds import Standard.Base.Errors.Illegal_Argument.Illegal_Argument @@ -913,6 +914,75 @@ type Column like self pattern = run_vectorized_binary_op self "like" (_ -> _ -> Error.throw (Illegal_State.Error "The `Like` operation should only be used on Text columns.")) pattern + ## This function removes the specified characters, by default any + whitespace, from the start, the end, or both ends of the input. + + Arguments: + - where: The location of where to trim the input. By default, this + function trims both ends of the input. + - what: A `Text` (or text `Column`) containing characters that should be + removed. By default, all whitespace is removed. + trim : Location -> Column | Text -> Column + trim self where=Location.Both what='' = Value_Type.expect_text self.value_type <| + new_name = Naming_Helpers.function_name "trim" [self] + + trim_get = wrap_text_argument what 'what' + + trim_get.if_not_error <| + trim_fn t w = if w.is_empty then t.trim where else + t.trim where w + + self_vec = self.to_vector + mapped = self_vec.map_with_index i->t-> trim_fn t (trim_get i) + Column.from_vector new_name mapped + + ## Replaces the first, or all occurrences of `term` with `new_text` in each + row. If `term` is empty, the function returns the table unchanged. + + This method follows the exact replacement semantics of the + `Text.replace` method. + + Arguments: + - term: The term to find. + - replacement: The text to replace matches with. + - case_sensitivity: Specifies if the text values should be compared case + sensitively. + - only_first: If True, only replace the first match. + - use_regex: If true, the term is used as a regular expression. + + > Example + Replace dashes with underscores. + + column.replace "-" "_" + + > Example + Remove leading and trailing spaces from cells. + + column.replace "^\s*(.*?)\s*$" "$1" use_regex=True + + > Example + Replace texts in quotes with parentheses. + + column.replace '"(.*?)"' '($1)' use_regex=True + replace : Text | Column -> Text | Column -> Case_Sensitivity -> Boolean -> Boolean -> Column + replace self term="" new_text="" case_sensitivity=Case_Sensitivity.Sensitive only_first=False use_regex=False = + Value_Type.expect_text self.value_type <| + term_fn = wrap_text_argument term 'term' + new_text_fn = wrap_text_argument new_text 'new_text' + + term_fn.if_not_error <| new_text_fn.if_not_error <| + new_name = Naming_Helpers.function_name "replace" [self, term, new_text] + + do_replace index input = + term = term_fn index + if term.is_nothing || term.is_empty then input else + replace = new_text_fn index + input.replace term replace case_sensitivity only_first use_regex + + self_vec = self.to_vector + mapped = self_vec.map_with_index do_replace + Column.from_vector new_name mapped + ## Gets the year as a number from the date stored in the column. Applies only to columns that hold the `Date` or `Date_Time` types. @@ -921,7 +991,6 @@ type Column year self = Value_Type.expect_has_date self.value_type related_column=self.name <| simple_unary_op self "year" - ## Gets the month as a number (1-12) from the date stored in the column. Applies only to columns that hold the `Date` or `Date_Time` types. 
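The in-memory `trim` and `replace` added above follow the `Text.trim` and `Text.replace` semantics row by row. A small illustrative sketch, using `Column.from_vector` (as used elsewhere in this change) to build a throwaway column:

    col = Column.from_vector "txt" ["  a-b  ", "c-d  "]

    # Strip whitespace from both ends, or only from the start.
    col.trim
    col.trim where=Location.Start

    # Replace dashes with underscores, as in the docstring example.
    col.replace "-" "_"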
@@ -1002,10 +1071,29 @@ type Column - If some values in the column did not match the expected datatype format, an `Invalid_Format` problem is reported. The problematic cells are replaced with `Nothing`. - - If parsing a numeric column and the selected format does not allow - leading zeros (the default) and such cells are found, a - `Leading_Zeros` problem is reported. These cells are replaced with - `Nothing`. + + ? Number Formats + + If parsing a column to a number, by default, the parser will attempt + to find the most appropriate format for the column. This is done by + finding the format that parses the longest set without an issue from + the first record. + + It will try the following separators in British, German, French and + Swiss order. + + - Thousand separators must be followed by groups of 3 numbers. + - Scientific notation is only allowed on decimals and must be on a + value between -10 and 10. The notation is an `E` followed by an + integer and must be enabled on the `Data_Formatter`, + + The following formats are supported: + - Sign (+/-) followed by Number (e.g. +1,234.56) + - Using brackets to indicate a negative number (e.g. (1,234.56)) + - Currency symbols (if not in Auto mode) can be placed before or after + the sign and number. + - If using brackets, the currency symbol must be placed after the + opening bracket. > Example Parse dates in a column in the format `yyyy-MM-dd` (the default format). @@ -1577,3 +1665,11 @@ run_vectorized_binary_case_text_op left op other case_sensitivity fallback new_n simple_unary_op column op_name = new_name = Naming_Helpers.function_name op_name [column] run_vectorized_unary_op column op_name (_ -> Error.throw (Illegal_State.Error "Missing vectorized implementation for `"+op_name+"`. This is a bug in the Table library.")) new_name + +## PRIVATE +wrap_text_argument val field = case val of + _ : Text -> (_-> val) + _ : Column -> Value_Type.expect_text val.value_type <| + storage = val.java_column.getStorage + i-> storage.getItemBoxed i + _ -> Error.throw (Illegal_Argument.Error "The `"+field+"` must be Text or a Text Column.") diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Data_Formatter.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Data_Formatter.enso index 91b7190ddd3b..e321a52ccabe 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Data_Formatter.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Data_Formatter.enso @@ -4,8 +4,6 @@ import Standard.Base.Errors.Illegal_Argument.Illegal_Argument import project.Internal.Parse_Values_Helper from project.Data.Type.Value_Type import Value_Type, Auto, Bits -polyglot java import org.enso.table.parsing.IntegerParser -polyglot java import org.enso.table.parsing.DecimalParser polyglot java import org.enso.table.parsing.BooleanParser polyglot java import org.enso.table.parsing.DateParser polyglot java import org.enso.table.parsing.TimeOfDayParser @@ -13,6 +11,7 @@ polyglot java import org.enso.table.parsing.DateTimeParser polyglot java import org.enso.table.parsing.WhitespaceStrippingParser polyglot java import org.enso.table.parsing.IdentityParser polyglot java import org.enso.table.parsing.TypeInferringParser +polyglot java import org.enso.table.parsing.NumberParser polyglot java import org.enso.table.formatting.AnyObjectFormatter polyglot java import org.enso.table.formatting.BooleanFormatter @@ -36,30 +35,26 @@ type Data_Formatter Arguments: - trim_values: Trim whitespace before parsing. 
- - allow_leading_zeros: Specifies how to treat numeric values starting with - leading zeroes. Defaults to `False`, because converting such - values to numbers is a lossy operation - after converting such a number - back to text the leading zeroes will get lost. If leading zeroes are not - allowed and the column contains any values with leading zeroes, it will not - get automatically converted to numbers, remaining as text. However, if the - column is specifically requested to be converted to a numeric column, only - a warning will be issued indicating that some leading zeroes were present, - but the conversion will proceed. + - allow_leading_zeros: Specifies how to treat numeric values starting + with leading zeroes when detecting the type of a column automatically. + If set to `False` (the default), then if found values will be left as + text. If the type is a number then leading zeroes are accepted + regardless of this setting. - decimal_point: The character used to separate the integer part from the - fractional part of a number. Defaults to '.'. Can be changed for example to - ',' to allow for European format. - - thousand_separator: A separator that can be used to separate groups of digits in numbers. - - allow_exponential_notation: Specifies if exponential notation is allowed. + fractional part of a number. If `Auto` then the format for numbers will + be guessed automatically. Can be changed for example to ',' to allow + for European format. - thousand_separator: A separator that can be used to separate groups of - digits in numbers. For example, it can be set to ',' to allow for notation - like '1,000,000.0'. + digits in numbers (must be in groups of 3 digits). + - allow_exponential_notation: Specifies if exponential notation is + allowed. - datetime_formats: Expected datetime formats. - date_formats: Expected date formats. - time_formats: Expected time formats. - datetime_locale: The locale to use when parsing dates and times. - true_values: Values representing True. - false_values: Values representing False. - Value trim_values:Boolean=True allow_leading_zeros:Boolean=False decimal_point:Text='.' 
thousand_separator:Text='' allow_exponential_notation:Boolean=False datetime_formats:(Vector Text)=["ENSO_ZONED_DATE_TIME"] date_formats:(Vector Text)=["ISO_LOCAL_DATE"] time_formats:(Vector Text)=["ISO_LOCAL_TIME"] datetime_locale:Locale=Locale.default true_values:(Vector Text)=["True","true","TRUE"] false_values:(Vector Text)=["False","false","FALSE"] + Value trim_values:Boolean=True allow_leading_zeros:Boolean=False decimal_point:Text|Auto=Auto thousand_separator:Text='' allow_exponential_notation:Boolean=False datetime_formats:(Vector Text)=["ENSO_ZONED_DATE_TIME"] date_formats:(Vector Text)=["ISO_LOCAL_DATE"] time_formats:(Vector Text)=["ISO_LOCAL_TIME"] datetime_locale:Locale=Locale.default true_values:(Vector Text)=["True","true","TRUE"] false_values:(Vector Text)=["False","false","FALSE"] ## PRIVATE ADVANCED @@ -180,12 +175,14 @@ type Data_Formatter WhitespaceStrippingParser.new base_parser ## PRIVATE - make_integer_parser self = self.wrap_base_parser <| - IntegerParser.new self.get_thousand_separator self.allow_leading_zeros + make_integer_parser self auto_mode=False = + separator = if self.thousand_separator.is_empty then Nothing else self.thousand_separator + NumberParser.createIntegerParser auto_mode.not (auto_mode.not || self.allow_leading_zeros) self.trim_values separator ## PRIVATE - make_decimal_parser self = self.wrap_base_parser <| - DecimalParser.new self.decimal_point self.get_thousand_separator self.allow_leading_zeros self.allow_exponential_notation + make_decimal_parser self auto_mode=False = + if self.decimal_point == Auto then NumberParser.createAutoDecimalParser auto_mode.not (auto_mode.not || self.allow_leading_zeros) self.trim_values self.allow_exponential_notation else + NumberParser.createFixedDecimalParser auto_mode.not (auto_mode.not || self.allow_leading_zeros) self.trim_values self.allow_exponential_notation self.thousand_separator self.decimal_point ## PRIVATE make_boolean_parser self = self.wrap_base_parser <| @@ -240,7 +237,7 @@ type Data_Formatter ## PRIVATE get_specific_type_parsers self = - [self.make_integer_parser, self.make_decimal_parser, self.make_date_time_parser, self.make_date_parser, self.make_time_of_day_parser, self.make_boolean_parser] + [self.make_integer_parser True, self.make_decimal_parser True, self.make_date_time_parser, self.make_date_parser, self.make_time_of_day_parser, self.make_boolean_parser] ## PRIVATE make_auto_parser self = @@ -253,7 +250,8 @@ type Data_Formatter ## PRIVATE make_decimal_formatter self = - DecimalFormatter.new self.get_thousand_separator self.decimal_point + decimal_point = if self.decimal_point == Auto then '.' 
else self.decimal_point + DecimalFormatter.new self.get_thousand_separator decimal_point ## PRIVATE make_date_formatter self = diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso index c36b1b818bf8..bb993556ce2b 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso @@ -14,6 +14,8 @@ import project.Data.Column.Column import project.Data.Column as Column_Module import project.Data.Column_Selector.Column_Selector import project.Data.Data_Formatter.Data_Formatter +import project.Data.Expression.Expression +import project.Data.Expression.Expression_Error import project.Data.Join_Condition.Join_Condition import project.Data.Join_Kind.Join_Kind import project.Data.Match_Columns.Match_Columns @@ -23,6 +25,8 @@ import project.Data.Report_Unmatched.Report_Unmatched import project.Data.Row.Row import project.Data.Set_Mode.Set_Mode import project.Data.Sort_Column.Sort_Column +import project.Data.Table_Conversions +import project.Delimited.Delimited_Format.Delimited_Format import project.Internal.Aggregate_Column_Helper import project.Internal.Java_Problems import project.Internal.Join_Helpers @@ -34,9 +38,6 @@ import project.Internal.Table_Helpers import project.Internal.Table_Helpers.Table_Column_Helper import project.Internal.Unique_Name_Strategy.Unique_Name_Strategy import project.Internal.Widget_Helpers -import project.Data.Expression.Expression -import project.Data.Expression.Expression_Error -import project.Delimited.Delimited_Format.Delimited_Format from project.Data.Type.Value_Type import Value_Type, Auto from project.Internal.Rows_View import Rows_View @@ -788,30 +789,49 @@ type Table - If some values in a column did not match the expected datatype format, an `Invalid_Format` problem is reported. The problematic cells are replaced with `Nothing`. - - If parsing a numeric column and the selected format does not allow - leading zeros (the default) and such cells are found, a - `Leading_Zeros` problem is reported. These cells are replaced with - `Nothing`. - > Example - Parse the first and last columns containing Yes/No values as booleans. + ? Number Formats + + If parsing a column to a number, by default, the parser will attempt + to find the most appropriate format for the column. This is done by + finding the format that parses the longest set without an issue from + the first record. + + It will try the following separators in British, German, French and + Swiss order. + + - Thousand separators must be followed by groups of 3 numbers. + - Scientific notation is only allowed on decimals and must be on a + value between -10 and 10. The notation is an `E` followed by an + integer and must be enabled on the `Data_Formatter`, + + The following formats are supported: + - Sign (+/-) followed by Number (e.g. +1,234.56) + - Using brackets to indicate a negative number (e.g. (1,234.56)) + - Currency symbols (if not in Auto mode) can be placed before or after + the sign and number. + - If using brackets, the currency symbol must be placed after the + opening bracket. + + > Example + Parse the first and last columns containing Yes/No values as booleans. - table.parse columns=[0, -1] type=Boolean format="Yes|No" + table.parse columns=[0, -1] type=Boolean format="Yes|No" - > Example - Parse dates in a column in the format `yyyy-MM-dd` (the default format). 
+ > Example + Parse dates in a column in the format `yyyy-MM-dd` (the default format). - table.parse "birthday" Date + table.parse "birthday" Date - > Example - Parse dates in a column in the format `dd/MM/yyyy`. + > Example + Parse dates in a column in the format `dd/MM/yyyy`. - table.parse "birthday" Date 'dd/MM/yyyy' + table.parse "birthday" Date 'dd/MM/yyyy' - > Example - Parse all columns inferring their types, using `,` as the decimal point for numbers. + > Example + Parse all columns inferring their types, using `,` as the decimal point for numbers. - table.parse format=(Data_Formatter.Value.with_number_formatting decimal_point=',') + table.parse format=(Data_Formatter.Value.with_number_formatting decimal_point=',') parse : Text | Integer | Column_Selector | Vector (Text | Integer | Column_Selector) -> Value_Type | Auto -> Text | Data_Formatter -> Boolean -> Problem_Behavior -> Table parse self columns=(self.columns . filter (c-> c.value_type.is_text) . map .name) type=Auto format=Data_Formatter.Value error_on_missing_columns=True on_problems=Report_Warning = formatter = case format of @@ -847,78 +867,6 @@ type Table parse_problem_builder.attach_problems_before on_problems <| Table.new new_columns - ## Replaces the first, or all occurrences of `term` with `new_text` in each - text row of selected columns. - If `term` is empty, the function returns the table unchanged. - - This method follows the exact replacement semantics of the - `Text.replace` method. - - Arguments: - - columns: Column selection criteria or a column name or index. - - term: The term to find. - - new_text: The new text to replace occurrences of `term` with. - If use_regex is true, `new_text` can include replacement patterns - (such as `$`) for a marked group. - - case_insensitive: Enables or disables case-insensitive matching. Case - insensitive matching behaves as if it normalises the case of all input - text before matching on it. - - only_first: If True, only replace the first match. - - use_regex: If true, the term is used as a regular expression. - - on_problems: Specifies how to handle if a problem occurs, raising as a - warning by default. - - The following problems can occur: - - If a column in columns is not in the input table, a `Missing_Input_Columns`. - - If a column index is out of range, a `Column_Indexes_Out_Of_Range`. - - If a column in columns does not have a storage type of `Text`, or `Any`, - thus it is guaranteed that it can't contain any text values, a - `Invalid_Value_Type`. - - > Example - Replace dashes with underscores in a column named "variable_names". - - table.replace_text "variable_names" "-" "_" - - > Example - Remove leading and trailing spaces from cells in multiple columns. - - table.replace_text By_Name ["foo", "bar"] "^\s*(.*?)\s*$" "$1" use_regex=True - - > Example - Replace texts in quotes with parentheses in column at index 1. 
- - table.replace_text 1 '"(.*?)"' '($1)' use_regex=True - replace_text : Text | Integer | Column_Selector | Vector (Integer | Text | Column_Selector) -> Text -> Text -> Case_Sensitivity -> Boolean -> Boolean -> Problem_Behavior -> Table - replace_text self columns=[0] term="" new_text="" case_sensitivity=Case_Sensitivity.Sensitive only_first=False use_regex=False on_problems=Problem_Behavior.Report_Warning = if term.is_empty then self else - problem_builder = Problem_Builder.new - - selection = self.columns_helper.select_columns_helper columns reorder=False problem_builder - selected_names = Map.from_vector (selection.map column-> [column.name, True]) - - ## TODO [RW] we should inherit the parent type here, but extend fixed length strings to varied length - To be done in #6106. - map_preserve_name column f = column.map f . rename column.name - do_replace = _.replace term new_text case_sensitivity=case_sensitivity only_first=only_first use_regex=use_regex - do_replace_only_text = case _ of - item : Text -> do_replace item - item -> item - - transform column = case column.value_type of - Value_Type.Char _ _ -> map_preserve_name column do_replace - Value_Type.Mixed -> map_preserve_name column do_replace_only_text - _ -> - problem = Invalid_Value_Type.Error Value_Type.Char column.value_type - problem_builder.report_other_warning problem - column - - new_columns = self.columns.map column-> - is_selected = selected_names.get column.name False - if is_selected then transform column else column - - result = Table.new new_columns - problem_builder.attach_problems_after on_problems result - ## Splits a column of text into a set of new columns. The original column will be removed from the table. The new columns will be named with the name of the input column with a @@ -1793,26 +1741,6 @@ type Table mask = OrderBuilder.buildReversedMask self.row_count Table.Value <| self.java_table.applyMask mask - ## ALIAS Write JSON - UNSTABLE - - Writes this table to a specified file, serialized into JSON. The JSON - serialization is such that the result is an array, in which every entry - is an object representing a single row, with column names as keys. - - Arguments: - - file: the file to write data to. If the file exists, it will be - overwritten. - - > Example - Write a table to a JSON file. - - import Standard.Examples - - example_to_json = Examples.inventory_table.write_json (enso_project.data / 'example.json') - write_json : File -> Nothing - write_json self file = self.to_json.write file - ## This function writes a table from memory into a file. The specific behavior of the various `File_Format`s is specified below. 
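With `replace_text` and `write_json` removed above, the equivalent flows go through `Column.replace` and the generic `Table.write`, while number parsing can still be pinned to a fixed format via the `Data_Formatter`. A sketch, assuming `table` is an in-memory table and that the JSON format is inferred from the `.json` extension:

    import Standard.Examples

    # Parse text columns using ',' as the decimal point instead of auto-detection.
    parsed = table.parse format=(Data_Formatter.Value.with_number_formatting decimal_point=',')

    # Write a table as JSON through the generic write path (replaces `write_json`);
    # assumes the format is picked from the '.json' extension.
    Examples.inventory_table.write (enso_project.data / 'example.json')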
diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table_Conversions.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table_Conversions.enso index fb9b7c0412d3..6935731ef5b8 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table_Conversions.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table_Conversions.enso @@ -4,6 +4,7 @@ import Standard.Base.Errors.Illegal_Argument.Illegal_Argument import Standard.Base.Errors.Unimplemented.Unimplemented import project.Data.Table.Table +import project.Data.Match_Columns.Match_Columns import project.Delimited.Delimited_Format.Delimited_Format import project.Errors.Invalid_JSON_Format import project.Internal.Delimited_Reader @@ -89,3 +90,31 @@ Table.from_objects value fields=Nothing = Table.new (used_fields.zip used_values) _ : Array -> Table.from_objects (Vector.from_polyglot_array value) fields _ -> Error.throw (Illegal_Argument.Error "Invalid value for Table.from_objects. Currently must be one of JS_Object, Vector, Array, Number, Boolean, Text and Nothing are supported (got "+(Meta.get_simple_type_name value)+").") + + +## PRIVATE + ADVANCED + Implements the `Table.write` for this `JSON_File`. + + Arguments: + - file: The file to write to. + - table: The table to write. + - on_existing_file: What to do if the file already exists. + - match_columns: How to match columns between the table and the file. + Not used for JSON. + - on_problems: What to do if there are problems reading the file. +JSON_File.write_table : File -> Table -> Existing_File_Behavior -> Match_Columns -> Problem_Behavior -> File +JSON_File.write_table self file table on_existing_file match_columns on_problems = + _ = [match_columns, on_problems] + if file.exists.not then table.to_json.write file else + case on_existing_file of + Existing_File_Behavior.Append -> + ## Special handling - Need to remove the closing bracket and append. + old_text = file.read_text.trim + case old_text.ends_with "]" && old_text.starts_with "[" of + True -> + new_text = old_text.drop (Last 1) + "," + table.to_json.drop (First 1) + new_text.write file on_existing_file=Existing_File_Behavior.Overwrite on_problems=on_problems + False -> + Error.throw (Invalid_JSON_Format.Error old_text "File already exists and is not a JSON array.") + _ -> table.to_json.write file on_existing_file=on_existing_file on_problems=on_problems diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Errors.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Errors.enso index e478139156df..0960ad936397 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Errors.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Errors.enso @@ -257,22 +257,6 @@ type Invalid_Format to_display_text self = self.cells.length+" cells in column "+self.column+" had invalid format for type "+self.value_type.to_text+"." -## Indicates that some values contained leading zeros even though these were not allowed. - - Arguments: - - column: the column in which the problematic cells appeared, if applicable. - It may be empty if the value is parsed outside of a context of a column. - - datatype: The expected datatype. - - cells: Contents of the cells that contained leading zeros. -type Leading_Zeros - ## PRIVATE - Error column:(Text|Nothing) (datatype:(Integer|Number|Date|Time|Time_Of_Day|Boolean)) (cells:[Text]) - - ## PRIVATE - Pretty print the leading zeros error. - to_display_text : Text - to_display_text self = "Leading zeros in column "+self.column+" with datatype "+self.value_type.to_text+"." 
- ## Indicates that an empty file was encountered, so no data could be loaded. type Empty_File_Error ## PRIVATE diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Java_Problems.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Java_Problems.enso index 92af7500ab35..fe6d6d18afa7 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Java_Problems.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Java_Problems.enso @@ -11,7 +11,6 @@ polyglot java import org.enso.table.data.table.problems.InvalidAggregation polyglot java import org.enso.table.data.table.problems.UnquotedDelimiter polyglot java import org.enso.table.data.table.problems.UnquotedCharactersInOutput polyglot java import org.enso.table.parsing.problems.InvalidFormat -polyglot java import org.enso.table.parsing.problems.LeadingZeros polyglot java import org.enso.table.parsing.problems.InvalidRow polyglot java import org.enso.table.parsing.problems.AdditionalInvalidRows polyglot java import org.enso.table.util.problems.DuplicateNames @@ -41,8 +40,6 @@ translate_problem p = case p of Invalid_Output_Column_Names.Error (Vector.from_polyglot_array p.invalidNames) _ : InvalidFormat -> Panic.throw (Illegal_Argument.Error "InvalidFormat should be translated using the Parse_Values_Helper.translate_parsing_problem instead. This is a bug in the Table library.") - _ : LeadingZeros -> - Panic.throw (Illegal_Argument.Error "LeadingZeros should be translated using the Parse_Values_Helper.translate_parsing_problem instead. This is a bug in the Table library.") _ -> Panic.throw (Illegal_Argument.Error "Encountered an unknown problem type when converting Java problems into Enso. This is a bug in the Table library. The unexpected problem was: "+p.to_text) diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Parse_Values_Helper.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Parse_Values_Helper.enso index fef6b288d2d7..0464e15e49e3 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Parse_Values_Helper.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Parse_Values_Helper.enso @@ -1,10 +1,9 @@ from Standard.Base import all import Standard.Base.Errors.Illegal_State.Illegal_State -from project.Errors import Invalid_Format, Leading_Zeros +from project.Errors import Invalid_Format polyglot java import org.enso.table.parsing.problems.InvalidFormat -polyglot java import org.enso.table.parsing.problems.LeadingZeros ## PRIVATE Translates a parse related problem additionally enriching it with expected @@ -12,7 +11,5 @@ polyglot java import org.enso.table.parsing.problems.LeadingZeros translate_parsing_problem expected_value_type problem = case problem of java_problem : InvalidFormat -> Invalid_Format.Error java_problem.column expected_value_type (Vector.from_polyglot_array java_problem.cells) - java_problem : LeadingZeros -> - Leading_Zeros.Error java_problem.column expected_value_type (Vector.from_polyglot_array java_problem.cells) _ -> Panic.throw (Illegal_State.Error "Reported an unknown problem type: "+problem.to_text) diff --git a/std-bits/table/src/main/java/org/enso/table/parsing/DecimalParser.java b/std-bits/table/src/main/java/org/enso/table/parsing/DecimalParser.java deleted file mode 100644 index 7860c77f1f84..000000000000 --- a/std-bits/table/src/main/java/org/enso/table/parsing/DecimalParser.java +++ /dev/null @@ -1,105 +0,0 @@ -package org.enso.table.parsing; - -import org.enso.table.data.column.builder.object.Builder; -import 
org.enso.table.data.column.builder.object.NumericBuilder; -import org.enso.table.formatting.DecimalFormatter; -import org.enso.table.parsing.problems.ProblemAggregator; - -import java.text.DecimalFormat; -import java.text.ParsePosition; - -public class DecimalParser extends IncrementalDatatypeParser { - private final String thousandsSeparator; - private final char decimalPoint; - private final DecimalFormat decimalFormat; - private final boolean leadingZerosAllowed; - private final boolean scientificNotationAllowed; - - public DecimalParser( - final String decimalPoint, - final String thousandsSeparator, - final boolean leadingZerosAllowed, - boolean scientificNotationAllowed) { - this.leadingZerosAllowed = leadingZerosAllowed; - this.scientificNotationAllowed = scientificNotationAllowed; - - if (decimalPoint.length() != 1) { - throw new IllegalArgumentException( - "The `decimalPoint` should consist of exactly one code point."); - } else { - this.decimalPoint = decimalPoint.charAt(0); - } - - if (thousandsSeparator != null && thousandsSeparator.length() != 1) { - throw new IllegalArgumentException( - "The `thousandsSeparator` should consist of exactly one code point."); - } - this.thousandsSeparator = thousandsSeparator; - - decimalFormat = new DecimalFormat(); - var symbols = decimalFormat.getDecimalFormatSymbols(); - symbols.setDecimalSeparator(this.decimalPoint); - symbols.setInfinity(DecimalFormatter.INFINITY); - decimalFormat.setDecimalFormatSymbols(symbols); - } - - @Override - protected Object parseSingleValue(String text, ProblemAggregator problemAggregator) { - if (thousandsSeparator != null - && (text.startsWith(thousandsSeparator) || text.endsWith(thousandsSeparator))) { - problemAggregator.reportInvalidFormat(text); - return null; - } - - String replaced = thousandsSeparator == null ? text : text.replace(thousandsSeparator, ""); - - // If the number starts with a plus, we need to remove it because DecimalFormat does not like - // it. But we also ensure that we do not let through a "+-" by accident. - if (replaced.length() >= 2 && replaced.charAt(0) == '+' && replaced.charAt(1) != '-') { - replaced = replaced.substring(1); - } - - ParsePosition pos = new ParsePosition(0); - Number result = decimalFormat.parse(replaced, pos); - if (result == null || pos.getIndex() != replaced.length()) { - problemAggregator.reportInvalidFormat(text); - return null; - } - - if (!leadingZerosAllowed && hasLeadingZeros(replaced)) { - problemAggregator.reportLeadingZeroes(text); - return null; - } - - if (!scientificNotationAllowed && hasExponentSymbol(replaced)) { - problemAggregator.reportInvalidFormat(text); - return null; - } - - return result.doubleValue(); - } - - /** - * Assumes that the provided string represents a valid integer, in particular that it is not - * empty. 
- */ - private boolean hasLeadingZeros(String s) { - int firstDigitPos = 0; - if (s.charAt(0) == '+' || s.charAt(0) == '-') { - firstDigitPos = 1; - } - - return s.charAt(firstDigitPos) == '0' - && firstDigitPos + 1 < s.length() - && s.charAt(firstDigitPos + 1) != decimalPoint; - } - - private boolean hasExponentSymbol(String s) { - return s.contains(decimalFormat.getDecimalFormatSymbols().getExponentSeparator()); - } - - @Override - protected Builder makeBuilderWithCapacity(int capacity) { - return NumericBuilder.createDoubleBuilder(capacity); - } -} diff --git a/std-bits/table/src/main/java/org/enso/table/parsing/IntegerParser.java b/std-bits/table/src/main/java/org/enso/table/parsing/IntegerParser.java deleted file mode 100644 index 04df8485706f..000000000000 --- a/std-bits/table/src/main/java/org/enso/table/parsing/IntegerParser.java +++ /dev/null @@ -1,61 +0,0 @@ -package org.enso.table.parsing; - -import org.enso.table.data.column.builder.object.Builder; -import org.enso.table.data.column.builder.object.NumericBuilder; -import org.enso.table.parsing.problems.ProblemAggregator; - -public class IntegerParser extends IncrementalDatatypeParser { - private final String thousandsSeparator; - private final boolean leadingZerosAllowed; - - public IntegerParser(final String thousandsSeparator, final boolean leadingZerosAllowed) { - this.leadingZerosAllowed = leadingZerosAllowed; - if (thousandsSeparator != null && thousandsSeparator.length() != 1) { - throw new IllegalArgumentException( - "The `thousandsSeparator` should consist of exactly one code point."); - } - this.thousandsSeparator = thousandsSeparator; - } - - @Override - protected Object parseSingleValue(String text, ProblemAggregator problemAggregator) { - if (thousandsSeparator != null - && (text.startsWith(thousandsSeparator) || text.endsWith(thousandsSeparator))) { - problemAggregator.reportInvalidFormat(text); - return null; - } - - String replaced = thousandsSeparator == null ? text : text.replace(thousandsSeparator, ""); - try { - long value = Long.parseLong(replaced); - - if (!leadingZerosAllowed && hasLeadingZeros(replaced)) { - problemAggregator.reportLeadingZeroes(text); - return null; - } - - return value; - } catch (NumberFormatException exception) { - problemAggregator.reportInvalidFormat(text); - return null; - } - } - - /** - * Assumes that the provided string represents a valid integer, in particular that it is not - * empty. 
- */ - private boolean hasLeadingZeros(String s) { - int firstDigitPos = 0; - if (s.charAt(0) == '+' || s.charAt(0) == '-') { - firstDigitPos = 1; - } - - return s.charAt(firstDigitPos) == '0' && firstDigitPos + 1 < s.length(); - } - - @Override - protected Builder makeBuilderWithCapacity(int capacity) { - return NumericBuilder.createLongBuilder(capacity); - } -} diff --git a/std-bits/table/src/main/java/org/enso/table/parsing/NumberParser.java b/std-bits/table/src/main/java/org/enso/table/parsing/NumberParser.java new file mode 100644 index 000000000000..cbff6cb9b8ab --- /dev/null +++ b/std-bits/table/src/main/java/org/enso/table/parsing/NumberParser.java @@ -0,0 +1,303 @@ +package org.enso.table.parsing; + +import org.enso.table.data.column.builder.object.Builder; +import org.enso.table.data.column.builder.object.NumericBuilder; +import org.enso.table.data.column.storage.Storage; +import org.enso.table.parsing.problems.ProblemAggregator; +import org.enso.table.parsing.problems.ProblemAggregatorImpl; +import org.enso.table.problems.WithProblems; + +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; +import java.util.regex.Pattern; + +/** A parser for numbers. + * + * This parser will attempt to work out what the decimal point and thousand + * separators used in the input. It will try various ways of formatting a number + * and can be set to allow for scientific notation, currency symbols. + * + * If parsing a column it will select the format that parses the longest set + * without an issue from the top and then apply this format to all the rows. + * + * The separators will be tried in British, German, French and Swiss order. + * - Thousand separator must be followed by groups of 3 numbers. + * - Scientific notation is only allowed on decimals and must be on a value + * between -10 and 10. The notation is an `E` followed by an integer. + * + * The following formats are supported: + * - Sign (+/-) followed by Number (e.g. +1,234.56) + * - Using brackets to indicate a negative number (e.g. (1,234.56)) + * - Currency symbols (if enabled) can be placed before or after the sign and + * number. + * - If using brackets, the currency symbol must be placed after the opening + * bracket. + * */ +public class NumberParser extends IncrementalDatatypeParser { + private final static String SIGN = "(?[-+])?"; + private final static String BRACKETS = "(?\\((?=.*\\)\\s*$))?\\s*"; + private final static String BRACKET_CLOSE = "\\)?"; + private final static String CCY = "(?[^0-9(),. '+-]+)"; + private final static String EXP = "(?[eE][+-]?\\d+)?"; + private final static String SPACE = "\\s*"; + private final static String[] SEPARATORS = new String[] {",.", ".,", " ,", "',"}; + + private final static Map PATTERNS = new HashMap<>(); + + private static Pattern getPattern(boolean allowDecimal, boolean allowCurrency, boolean allowScientific, boolean trimValues, int index) { + int allowedSet = (allowCurrency ? ALLOWED_CCY_PATTERNS : ALLOWED_NON_CCY_PATTERNS); + int separatorsIndex = index / allowedSet; + int patternIndex = index % allowedSet; + + if (separatorsIndex >= SEPARATORS.length) { + return null; + } + + var separators = SEPARATORS[separatorsIndex]; + return getPattern(allowDecimal, allowCurrency, allowScientific, trimValues, patternIndex, separators); + } + + /** The number of patterns that are allowed for non-currency numbers. */ + private static final int ALLOWED_NON_CCY_PATTERNS = 2; + + /** The number of patterns that are allowed for currency numbers. 
*/ + private static final int ALLOWED_CCY_PATTERNS = 6; + + private static Pattern getPattern(boolean allowDecimal, boolean allowCurrency, boolean allowScientific, boolean trimValues, int patternIndex, String separators) { + if (allowScientific && !allowDecimal) { + throw new IllegalArgumentException("Scientific notation requires decimal numbers."); + } + + if (patternIndex >= (allowCurrency ? ALLOWED_CCY_PATTERNS : ALLOWED_NON_CCY_PATTERNS)) { + return null; + } + + var INTEGER = "(?(\\d*)" + (separators.length() == 1 ? "" : "|(\\d{1,3}([" + separators.charAt(0) + "]\\d{3})*)") + ")"; + + var decimalPoint = (separators.length() == 1 ? separators : separators.charAt(1)); + var NUMBER = INTEGER + (allowDecimal ? "(?[" + decimalPoint + "]\\d*)?" : "") + (allowScientific ? EXP : ""); + + var pattern = switch (patternIndex) { + case 0 -> SIGN + NUMBER; + case 1 -> BRACKETS + NUMBER + BRACKET_CLOSE; + case 2 -> SIGN + CCY + SPACE + NUMBER; + case 3 -> CCY + SPACE + SIGN + NUMBER; + case 4 -> SIGN + NUMBER + CCY; + case 5 -> BRACKETS + CCY + SPACE + NUMBER + BRACKET_CLOSE; + default -> throw new IllegalArgumentException("Invalid pattern index: " + patternIndex); + }; + + if (trimValues) { + pattern = SPACE + pattern + SPACE; + } + + return PATTERNS.computeIfAbsent("^" + pattern + "$", Pattern::compile); + } + + private final boolean allowDecimal; + private final boolean allowCurrency; + private final boolean allowLeadingZeros; + private final boolean allowScientific; + private final boolean trimValues; + private final String separators; + + /** + * Creates a new integer instance of this parser. + * + * @param allowCurrency whether to allow currency symbols + * @param allowLeadingZeros whether to allow leading zeros + * @param trimValues whether to trim the input values + * @param thousandSeparator the thousand separator to use + */ + public static NumberParser createIntegerParser(boolean allowCurrency, boolean allowLeadingZeros, boolean trimValues, String thousandSeparator) { + var separator = thousandSeparator == null ? null : (thousandSeparator + '_'); + return new NumberParser(false, allowCurrency, allowLeadingZeros, trimValues, false, separator); + } + + /** + * Creates a new decimal instance of this parser. + * + * @param allowCurrency whether to allow currency symbols + * @param allowLeadingZeros whether to allow leading zeros + * @param trimValues whether to trim the input values + * @param allowScientific whether to allow scientific notation + */ + public static NumberParser createAutoDecimalParser(boolean allowCurrency, boolean allowLeadingZeros, boolean trimValues, boolean allowScientific) { + return new NumberParser(true, allowCurrency, allowLeadingZeros, trimValues, allowScientific, null); + } + + /** + * Creates a new decimal instance of this parser. 
+ * + * @param allowCurrency whether to allow currency symbols + * @param allowLeadingZeros whether to allow leading zeros + * @param trimValues whether to trim the input values + * @param allowScientific whether to allow scientific notation + * @param thousandSeparator the thousand separator to use + * @param decimalSeparator the decimal separator to use + */ + public static NumberParser createFixedDecimalParser(boolean allowCurrency, boolean allowLeadingZeros, boolean trimValues, boolean allowScientific, String thousandSeparator, String decimalSeparator) { + if (decimalSeparator == null || decimalSeparator.length() != 1) { + throw new IllegalArgumentException("Decimal separator must be a single character."); + } + + thousandSeparator = thousandSeparator == null ? "" : thousandSeparator; + return new NumberParser(true, allowCurrency, allowLeadingZeros, trimValues, allowScientific, thousandSeparator + decimalSeparator); + } + + private NumberParser(boolean allowDecimal, boolean allowCurrency, boolean allowLeadingZeros, boolean trimValues, boolean allowScientific, String separators) { + this.allowDecimal = allowDecimal; + this.allowCurrency = allowCurrency; + this.allowLeadingZeros = allowLeadingZeros; + this.trimValues = trimValues; + this.allowScientific = allowScientific; + this.separators = separators; + } + + /** + * Creates a Pattern for the given index. + * The index will be decoded into a specific set of separators (unless fixed + * separators are used) and then paired with on of the valid patterns for + * the given parser. + */ + private Pattern patternForIndex(int index) { + return separators == null + ? getPattern(allowDecimal, allowCurrency, allowScientific, trimValues, index) + : getPattern(allowDecimal, allowCurrency, allowScientific, trimValues, index, separators); + } + + @Override + protected Object parseSingleValue(String text, ProblemAggregator problemAggregator) { + int index = 0; + var pattern = patternForIndex(index); + while (pattern != null) { + var value = innerParseSingleValue(text, pattern); + if (value != null) { + return value; + } + + index++; + pattern = patternForIndex(index); + } + + problemAggregator.reportInvalidFormat(text); + return null; + } + + @Override + public WithProblems> parseColumn(String columnName, Storage sourceStorage) { + int index = 0; + var pattern = patternForIndex(index); + + int bestIndex = 0; + int bestCount = -1; + while (pattern != null) { + Builder builder = makeBuilderWithCapacity(sourceStorage.size()); + int failedAt = parseColumnWithPattern(pattern, sourceStorage, builder, null); + if (failedAt == -1) { + return new WithProblems<>(builder.seal(), Collections.emptyList()); + } + + if (failedAt > bestCount) { + bestCount = failedAt; + bestIndex = index; + } + + index++; + pattern = patternForIndex(index); + } + + Builder fallback = makeBuilderWithCapacity(sourceStorage.size()); + ProblemAggregator aggregator = new ProblemAggregatorImpl(columnName); + parseColumnWithPattern(patternForIndex(bestIndex), sourceStorage, fallback, aggregator); + return new WithProblems<>(fallback.seal(), aggregator.getAggregatedProblems()); + } + + private int parseColumnWithPattern(Pattern pattern, Storage sourceStorage, Builder builder, ProblemAggregator aggregator) { + for (int i = 0; i < sourceStorage.size(); i++) { + var text = sourceStorage.getItemBoxed(i); + if (text == null) { + builder.appendNulls(1); + } else { + var value = innerParseSingleValue(text, pattern); + if (value != null) { + builder.appendNoGrow(value); + } else { + if (aggregator 
== null) { + return i; + } + + aggregator.reportInvalidFormat(text); + builder.appendNulls(1); + } + } + } + return -1; + } + + @Override + protected Builder makeBuilderWithCapacity(int capacity) { + return allowDecimal + ? NumericBuilder.createDoubleBuilder(capacity) + : NumericBuilder.createLongBuilder(capacity); + } + + private Object innerParseSingleValue(String text, Pattern pattern) { + if (allowDecimal) { + var trimmed = trimValues ? text.trim() : text; + if (trimmed.equals("NaN")) { + return Double.NaN; + } + if (trimmed.equals("Infinity")) { + return Double.POSITIVE_INFINITY; + } + if (trimmed.equals("-Infinity")) { + return Double.NEGATIVE_INFINITY; + } + } + + var parsed = pattern.matcher(text); + if (!parsed.matches()) { + return null; + } + + try { + var sign = parsed.group("sign"); + var sign_value = sign != null && !sign.equals("+") ? -1 : 1; + + var integer = parsed.group("integer").replaceAll("\\D", ""); + + if (!allowLeadingZeros && integer.length() > 1 && integer.charAt(0) == '0') { + return null; + } + + if (allowDecimal) { + var decimal = parsed.group("decimal"); + var decimalPrepared = decimal == null ? "" : ("." + decimal.substring(1)); + + if (integer.equals("") && decimalPrepared.equals("")) { + return null; + } + + integer = integer.equals("") ? "0" : integer; + + if (allowScientific) { + var exp = parsed.group("exp"); + if (exp != null) { + if (integer.length() > 1) { + return null; + } + decimalPrepared = decimalPrepared + exp; + } + } + + return sign_value * Double.parseDouble(integer + decimalPrepared); + } + + return integer.equals("") ? null : sign_value * Long.parseLong(integer); + } catch (NumberFormatException e) { + throw new IllegalStateException("Java parse failed to parse number: " + text, e); + } + } +} diff --git a/std-bits/table/src/main/java/org/enso/table/parsing/problems/LeadingZeros.java b/std-bits/table/src/main/java/org/enso/table/parsing/problems/LeadingZeros.java deleted file mode 100644 index 3e9cfe609bf0..000000000000 --- a/std-bits/table/src/main/java/org/enso/table/parsing/problems/LeadingZeros.java +++ /dev/null @@ -1,8 +0,0 @@ -package org.enso.table.parsing.problems; - -import org.enso.table.problems.Problem; - -import java.util.List; - -/** Indicates that some values contained leading zeros when leading zeros where not allowed in the given numeric conversion. 
*/
-public record LeadingZeros(String column, List<String> cells) implements Problem {}
diff --git a/std-bits/table/src/main/java/org/enso/table/parsing/problems/NoOpProblemAggregator.java b/std-bits/table/src/main/java/org/enso/table/parsing/problems/NoOpProblemAggregator.java
index bb0ab2808fc7..1b31b1297c72 100644
--- a/std-bits/table/src/main/java/org/enso/table/parsing/problems/NoOpProblemAggregator.java
+++ b/std-bits/table/src/main/java/org/enso/table/parsing/problems/NoOpProblemAggregator.java
@@ -10,9 +10,6 @@ public class NoOpProblemAggregator implements ProblemAggregator {
   @Override
   public void reportInvalidFormat(String cell) {}
 
-  @Override
-  public void reportLeadingZeroes(String cell) {}
-
   @Override
   public void reportMismatchedQuote(String cellText) {}
 
diff --git a/std-bits/table/src/main/java/org/enso/table/parsing/problems/ProblemAggregator.java b/std-bits/table/src/main/java/org/enso/table/parsing/problems/ProblemAggregator.java
index 4f315f032c76..f7d55232a4ea 100644
--- a/std-bits/table/src/main/java/org/enso/table/parsing/problems/ProblemAggregator.java
+++ b/std-bits/table/src/main/java/org/enso/table/parsing/problems/ProblemAggregator.java
@@ -15,9 +15,6 @@ public interface ProblemAggregator {
    */
   void reportInvalidFormat(String cell);
 
-  /** Reports a cell containing unexpected leading zeros. */
-  void reportLeadingZeroes(String cell);
-
   /** Reports that a mismatched quote has been encountered. */
   void reportMismatchedQuote(String cellText);
 
diff --git a/std-bits/table/src/main/java/org/enso/table/parsing/problems/ProblemAggregatorImpl.java b/std-bits/table/src/main/java/org/enso/table/parsing/problems/ProblemAggregatorImpl.java
index bd9c416b8f9b..872849c73949 100644
--- a/std-bits/table/src/main/java/org/enso/table/parsing/problems/ProblemAggregatorImpl.java
+++ b/std-bits/table/src/main/java/org/enso/table/parsing/problems/ProblemAggregatorImpl.java
@@ -19,11 +19,6 @@ public void reportInvalidFormat(String cell) {
     invalidFormatCells.add(cell);
   }
 
-  @Override
-  public void reportLeadingZeroes(String cell) {
-    leadingZerosCells.add(cell);
-  }
-
   @Override
   public void reportMismatchedQuote(String cellText) {
     throw new MismatchedQuote(cellText);
@@ -42,10 +37,6 @@ public List<Problem> getAggregatedProblems() {
       problems.add(new InvalidFormat(relatedColumnName, invalidFormatCells));
     }
 
-    if (!leadingZerosCells.isEmpty()) {
-      problems.add(new LeadingZeros(relatedColumnName, leadingZerosCells));
-    }
-
     assert problems.isEmpty() == !hasProblems();
 
     return problems;
diff --git a/std-bits/table/src/main/java/org/enso/table/parsing/problems/SimplifiedProblemAggregator.java b/std-bits/table/src/main/java/org/enso/table/parsing/problems/SimplifiedProblemAggregator.java
index 37abd1ce7704..6b7a010c1ff3 100644
--- a/std-bits/table/src/main/java/org/enso/table/parsing/problems/SimplifiedProblemAggregator.java
+++ b/std-bits/table/src/main/java/org/enso/table/parsing/problems/SimplifiedProblemAggregator.java
@@ -13,11 +13,6 @@ public void reportInvalidFormat(String cell) {
     hasProblems = true;
  }
 
-  @Override
-  public void reportLeadingZeroes(String cell) {
-    hasProblems = true;
-  }
-
   @Override
   public void reportMismatchedQuote(String cellText) {
     hasProblems = true;
diff --git a/test/Table_Tests/src/Common_Table_Operations/Column_Operations_Spec.enso b/test/Table_Tests/src/Common_Table_Operations/Column_Operations_Spec.enso
index 86ec7e2263b2..4eecae25ed0c 100644
--- a/test/Table_Tests/src/Common_Table_Operations/Column_Operations_Spec.enso
+++ 
b/test/Table_Tests/src/Common_Table_Operations/Column_Operations_Spec.enso @@ -205,6 +205,7 @@ spec setup = t3 = table_builder [["s1", ["foobar", "bar", "baz", "BAB", Nothing]], ["s2", ["foo", "ar", "a", "b", Nothing]]] s1 = t3.at "s1" s2 = t3.at "s2" + Test.specify "should handle basic Text operations" <| s1.starts_with s2 . to_vector . should_equal [True, False, False, False, Nothing] s1.starts_with s2 Case_Sensitivity.Insensitive . to_vector . should_equal [True, False, False, True, Nothing] @@ -224,7 +225,67 @@ spec setup = s1.ends_with "a" . to_vector . should_equal [False, False, False, False, Nothing] s1.ends_with "b" Case_Sensitivity.Insensitive . to_vector . should_equal [False, False, False, True, Nothing] - Test.group prefix+"Colum Operations - Names" <| + Test.group prefix+"Column Operations - Text Replace" <| + if setup.is_database.not then + t4 = table_builder [["A", ["Alpha", "Bravo", "Charlie", "Delta", "Echo", "Foxtrot"]], ["B", ["A","O","a","E","o","O"]], ["C", [1,2,3,4,5,6]], ["D", ['',Nothing,'',Nothing,'','']]] + a = t4.at "A" + b = t4.at "B" + c = t4.at "C" + d = t4.at "D" + + Test.specify "should allow simple replacing" <| + a.replace "a" "#" . to_vector . should_equal ["Alph#", "Br#vo", "Ch#rlie", "Delt#", "Echo", "Foxtrot"] + a.replace "o" "#" . to_vector . should_equal ["Alpha", "Brav#", "Charlie", "Delta", "Ech#", "F#xtr#t"] + a.replace b "#" . to_vector . should_equal ["#lpha", "Bravo", "Ch#rlie", "Delta", "Ech#", "Foxtrot"] + a.replace "o" "#" only_first=True . to_vector . should_equal ["Alpha", "Brav#", "Charlie", "Delta", "Ech#", "F#xtrot"] + a.replace "a" "#" Case_Sensitivity.Insensitive . to_vector . should_equal ["#lph#", "Br#vo", "Ch#rlie", "Delt#", "Echo", "Foxtrot"] + a.replace b "#" Case_Sensitivity.Insensitive . to_vector . should_equal ["#lph#", "Brav#", "Ch#rlie", "D#lta", "Ech#", "F#xtr#t"] + a.replace b "#" Case_Sensitivity.Insensitive only_first=True . to_vector . should_equal ["#lpha", "Brav#", "Ch#rlie", "D#lta", "Ech#", "F#xtrot"] + + Test.specify "should allow regex based replacing" <| + a.replace "[aeiou]" "#" use_regex=True . to_vector . should_equal ["Alph#", "Br#v#", "Ch#rl##", "D#lt#", "Ech#", "F#xtr#t"] + a.replace "[aeiou]" "#" use_regex=True . to_vector . should_equal ["Alph#", "Br#v#", "Ch#rl##", "D#lt#", "Ech#", "F#xtr#t"] + a.replace "([aeiou])(.*?)[aeiou]" "$1$2$1" use_regex=True . to_vector . should_equal ["Alpha", "Brava", "Charlae", "Delte", "Echo", "Foxtrot"] + + Test.specify "should only allow replace on Text columns" <| + c.replace "a" "#" . should_fail_with Invalid_Value_Type + a.replace 1 "#" . should_fail_with Illegal_Argument + a.replace c "#" . should_fail_with Invalid_Value_Type + a.replace "a" 1 . should_fail_with Illegal_Argument + a.replace "a" c . should_fail_with Invalid_Value_Type + + Test.specify "should not replace if Empty term" <| + a.replace '' "#" . to_vector . should_equal ["Alpha", "Bravo", "Charlie", "Delta", "Echo", "Foxtrot"] + a.replace '' "#" use_regex=True . to_vector . should_equal ["Alpha", "Bravo", "Charlie", "Delta", "Echo", "Foxtrot"] + a.replace d "#" . to_vector . should_equal ["Alpha", "Bravo", "Charlie", "Delta", "Echo", "Foxtrot"] + a.replace d "#" use_regex=True . to_vector . 
should_equal ["Alpha", "Bravo", "Charlie", "Delta", "Echo", "Foxtrot"] + + Test.group prefix+"Column Operations - Text Trim" <| + t5 = table_builder [["A", [" A ", ' \t\n\rA\r\n\t ', "xxxAxx"]], ["B", [" ",' \t',"x"]], ["C", [1,2,3]]] + a = t5.at "A" + b = t5.at "B" + c = t5.at "C" + + Test.specify "should trim whitespace by default" <| + a.trim . to_vector . should_equal ["A", "A", "xxxAxx"] + a.trim Location.Start . to_vector . should_equal ["A ", 'A\r\n\t ', "xxxAxx"] + a.trim Location.End . to_vector . should_equal [" A", ' \t\n\rA', "xxxAxx"] + + Test.specify "should trim custom characters" <| + a.trim what='x' . to_vector . should_equal [" A ", ' \t\n\rA\r\n\t ', "A"] + a.trim what='x' Location.Start . to_vector . should_equal [" A ", ' \t\n\rA\r\n\t ', "Axx"] + a.trim what='x' Location.End . to_vector . should_equal [" A ", ' \t\n\rA\r\n\t ', "xxxA"] + a.trim what=' ' . to_vector . should_equal ["A", '\t\n\rA\r\n\t', "xxxAxx"] + a.trim what=' \t' . to_vector . should_equal ["A", '\n\rA\r\n', "xxxAxx"] + a.trim what=' \r' . to_vector . should_equal ["A", '\t\n\rA\r\n\t', "xxxAxx"] + a.trim what=b . to_vector . should_equal ["A", '\n\rA\r\n', "A"] + + Test.specify "should only allow trim on Text columns" <| + c.trim what="a" . should_fail_with Invalid_Value_Type + a.trim what=1 . should_fail_with Illegal_Argument + a.trim what=c . should_fail_with Invalid_Value_Type + + Test.group prefix+"Column Operations - Names" <| t = table_builder [["a", [1, 2, 3]], ["b", ['x', 'y', 'z']], ["c", [1.0, 2.0, 3.0]], ["d", [True, False, True]]] Test.specify "arithmetic" <| ((t.at "a") + 42) . name . should_equal "[a] + 42" diff --git a/test/Table_Tests/src/Formatting/Data_Formatter_Spec.enso b/test/Table_Tests/src/Formatting/Data_Formatter_Spec.enso index 3e58c6d23c32..3fa504df6a2a 100644 --- a/test/Table_Tests/src/Formatting/Data_Formatter_Spec.enso +++ b/test/Table_Tests/src/Formatting/Data_Formatter_Spec.enso @@ -47,8 +47,8 @@ spec = formatter.parse "123" . should_equal 123 formatter.parse "1_000_000" . should_equal 1000000 formatter.parse "1_000_000_000" . should_equal (1000 * 1000 * 1000) - formatter.parse "100_0_0_0" . should_equal 100000 - formatter.parse "1__00_000,0" . should_equal 100000.0 + formatter.parse "100_0_0_0" . should_equal "100_0_0_0" + formatter.parse "1__00_000,0" . should_equal "1__00_000,0" formatter.parse "-100_000,0000" . should_equal -100000.0 formatter.parse "0" . should_equal 0 formatter.parse "-1,0" . should_equal -1.0 diff --git a/test/Table_Tests/src/Formatting/Parse_Values_Spec.enso b/test/Table_Tests/src/Formatting/Parse_Values_Spec.enso index ff6ef4a29b67..937d7b4f7789 100644 --- a/test/Table_Tests/src/Formatting/Parse_Values_Spec.enso +++ b/test/Table_Tests/src/Formatting/Parse_Values_Spec.enso @@ -15,7 +15,7 @@ spec = Test.specify "should correctly parse integers" <| t1 = Table.new [["ints", ["0", "+0", "-0", "+1", "-1", "1", "000", "0010", "12345", Nothing]]] t2 = t1.parse type=Value_Type.Integer - t2.at "ints" . to_vector . should_equal [0, 0, 0, 1, -1, 1, Nothing, Nothing, 12345, Nothing] + t2.at "ints" . to_vector . should_equal [0, 0, 0, 1, -1, 1, 0, 10, 12345, Nothing] Test.specify "should correctly parse decimals" <| t1 = Table.new [["ints", ["0", "+0", "-0", "+1", "-1", "1", "12345", Nothing]]] @@ -31,35 +31,34 @@ spec = t6 = t5.parse type=Value_Type.Float t6.at "floats" . to_vector . 
should_equal [0.0, 0.0, 1.0, 0.1, 0.123, -0.1, 0.1, 0.0, 0.1234, Nothing, 11111111.111] - Test.specify "should warn on leading zeros in numbers, if asked" <| + Test.specify "should parse leading zeros in numbers" <| t1 = Table.new [["ints", ["0", "+00", "-00", "+01", "-01", "01", "000", "0010", "12345", Nothing]]] t2 = Table.new [["floats", ["0.0000", ".0", "00.", "01.0", "-0010.0000", "1.0000"]]] - t1_parsed = [0, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, 12345, Nothing] - t1_zeros = ["+00", "-00", "+01", "-01", "01", "000", "0010"] + t1_parsed = [0, 0, 0, 1, -1, 1, 0, 10, 12345, Nothing] t3 = t1.parse type=Value_Type.Integer t3.at "ints" . to_vector . should_equal t1_parsed - Problems.get_attached_warnings t3 . should_equal [Leading_Zeros.Error "ints" Value_Type.Integer t1_zeros] + Problems.assume_no_problems t3 t4 = t1.parse type=Value_Type.Float t4.at "ints" . to_vector . should_equal t1_parsed - Problems.get_attached_warnings t4 . should_equal [Leading_Zeros.Error "ints" Value_Type.Float t1_zeros] + Problems.assume_no_problems t4 t5 = t2.parse type=Value_Type.Float - t5.at "floats" . to_vector . should_equal [0.0, 0.0, Nothing, Nothing, Nothing, 1.0] - Problems.get_attached_warnings t5 . should_equal [Leading_Zeros.Error "floats" Value_Type.Float ["00.", "01.0", '-0010.0000']] + t5.at "floats" . to_vector . should_equal [0.0, 0.0, 0, 1, -10, 1.0] + Problems.assume_no_problems t5 opts = Data_Formatter.Value allow_leading_zeros=True t1_parsed_zeros = [0, 0, 0, 1, -1, 1, 0, 10, 12345, Nothing] - t6 = t1.parse format=opts type=Value_Type.Integer + t6 = t1.parse format=opts t6.at "ints" . to_vector . should_equal t1_parsed_zeros Problems.assume_no_problems t6 - t7 = t1.parse format=opts type=Value_Type.Float + t7 = t1.parse format=opts t7.at "ints" . to_vector . should_equal t1_parsed_zeros Problems.assume_no_problems t7 - t8 = t2.parse format=opts type=Value_Type.Float + t8 = t2.parse format=opts t8.at "floats" . to_vector . should_equal [0.0, 0.0, 0.0, 1.0, -10.0, 1.0] Problems.assume_no_problems t8 @@ -217,13 +216,13 @@ spec = t2 = t1.parse format=opts t2.at "floats" . to_vector . should_equal [0.0, 0.0, 0.0, 1.5, -1.2, 1.0, 0.0, 10000.0, 0.0] - t3 = Table.new [["xs", ["1,2", "1.3", "_0", "0_", "1_0_0"]]] + t3 = Table.new [["xs", ["1,2", "1.3", "_0", "0_", "1_0_0", "1_000"]]] t4 = t3.parse format=opts type=Value_Type.Float - t4.at "xs" . to_vector . should_equal [1.2, Nothing, Nothing, Nothing, 100.0] - Problems.get_attached_warnings t4 . should_equal [Invalid_Format.Error "xs" Value_Type.Float ["1.3", "_0", "0_"]] + t4.at "xs" . to_vector . should_equal [1.2, Nothing, Nothing, Nothing, Nothing, 1000] + Problems.get_attached_warnings t4 . should_equal [Invalid_Format.Error "xs" Value_Type.Float ["1.3", "_0", "0_", "1_0_0"]] t5 = t3.parse format=opts type=Value_Type.Integer - t5.at "xs" . to_vector . should_equal [Nothing, Nothing, Nothing, Nothing, 100] - Problems.get_attached_warnings t5 . should_equal [Invalid_Format.Error "xs" Value_Type.Integer ["1,2", "1.3", "_0", "0_"]] + t5.at "xs" . to_vector . should_equal [Nothing, Nothing, Nothing, Nothing, Nothing, 1000] + Problems.get_attached_warnings t5 . 
should_equal [Invalid_Format.Error "xs" Value_Type.Integer ["1,2", "1.3", "_0", "0_", "1_0_0"]] Test.specify "should allow to specify custom values for booleans" <| opts_1 = Data_Formatter.Value true_values=["1", "YES"] false_values=["0"] @@ -395,22 +394,59 @@ spec = c1 = Column.from_vector "ints" ["0", "+0", "-0", "+1", "-1", "1", "000", "0010", "12345", Nothing] c2 = c1.parse type=Value_Type.Integer c2.name.should_equal c1.name - c2 . to_vector . should_equal [0, 0, 0, 1, -1, 1, Nothing, Nothing, 12345, Nothing] + c2 . to_vector . should_equal [0, 0, 0, 1, -1, 1, 0, 10, 12345, Nothing] c2.value_type.should_equal Value_Type.Integer - Problems.expect_warning Leading_Zeros c2 + Problems.assume_no_problems c2 c3 = c1.parse type=Value_Type.Integer format=(Data_Formatter.Value.with_number_formatting allow_leading_zeros=True) c3.to_vector . should_equal [0, 0, 0, 1, -1, 1, 0, 10, 12345, Nothing] Problems.assume_no_problems c3 + Test.specify "should correctly parse integers in US formats" <| + cUS = Column.from_vector "ints" ["1", "000123", "-1234", "1234567", "123e456"] + pUS = cUS.parse type=Value_Type.Integer + pUS.to_vector . should_equal [1, 123, -1234, 1234567, Nothing] + Problems.expect_warning (Invalid_Format.Error "ints" Value_Type.Integer ["123e456"]) pUS + + cUS2 = Column.from_vector "ints" ["1", "000123", "-1,234", "1,234,567", "12,34,56"] + pUS2 = cUS2.parse type=Value_Type.Integer + pUS2.to_vector . should_equal [1, 123, -1234, 1234567, Nothing] + Problems.expect_warning (Invalid_Format.Error "ints" Value_Type.Integer ["12,34,56"]) pUS2 + + cUS3 = Column.from_vector "ints" ["1", "(000,123)", "-1,234", "1,234,567", "12,34,56"] + pUS3 = cUS3.parse type=Value_Type.Integer + pUS3.to_vector . should_equal [1, -123, Nothing, 1234567, Nothing] + Problems.expect_warning (Invalid_Format.Error "ints" Value_Type.Integer ["-1,234", "12,34,56"]) pUS3 + + cUS4 = Column.from_vector "ints" ["$1234", "$1,234", "$1,234,567","-$1,234", "($1,234,567)"] + pUS4 = cUS4.parse type=Value_Type.Integer + pUS4.to_vector . should_equal [1234, 1234, 1234567, -1234, Nothing] + Problems.expect_warning (Invalid_Format.Error "ints" Value_Type.Integer ["($1,234,567)"]) pUS4 + + Test.specify "should correctly parse integers in European formats" <| + cDE = Column.from_vector "ints" ["1", "000123", "-1.234", "1.234.567", "12.34.56"] + pDE = cDE.parse type=Value_Type.Integer + pDE.to_vector . should_equal [1, 123, -1234, 1234567, Nothing] + Problems.expect_warning (Invalid_Format.Error "ints" Value_Type.Integer ["12.34.56"]) pDE + + cFR = Column.from_vector "ints" ["€1", "€000123", "€-1 234", "€ 1 234 567", "€ 12 34 56"] + pFR = cFR.parse type=Value_Type.Integer + pFR.to_vector . should_equal [1, 123, -1234, 1234567, Nothing] + Problems.expect_warning (Invalid_Format.Error "ints" Value_Type.Integer ["€ 12 34 56"]) pFR + + cSW = Column.from_vector "ints" ["1", "000123", "-1'234", "1'234'567", "(123'456)"] + pSW = cSW.parse type=Value_Type.Integer + pSW.to_vector . should_equal [1, 123, -1234, 1234567, Nothing] + Problems.expect_warning (Invalid_Format.Error "ints" Value_Type.Integer ["(123'456)"]) pSW + Test.specify "should correctly parse decimals" <| c1 = Column.from_vector "ints" ["0", "+0", "-0", "+1", "-1", "1", "000", "0010", "12345", Nothing] c2 = c1.parse Value_Type.Float c2.name.should_equal c1.name - c2.to_vector . should_equal [0, 0, 0, 1, -1, 1, Nothing, Nothing, 12345, Nothing] + c2.to_vector . 
should_equal [0, 0, 0, 1, -1, 1, 0, 10, 12345, Nothing]
             c2.value_type.should_equal Value_Type.Float
-            c2.to_vector . map .to_text . should_equal ["0.0", "0.0", "-0.0", "1.0", "-1.0", "1.0", "Nothing", "Nothing", "12345.0", "Nothing"]
-            Problems.expect_warning Leading_Zeros c2
+            c2.to_vector . map .to_text . should_equal ["0.0", "0.0", "-0.0", "1.0", "-1.0", "1.0", "0.0", "10.0", "12345.0", "Nothing"]
+            Problems.assume_no_problems c2
 
             c3 = Column.from_vector "floats" ["0.0", "+0.0", "-0.0", "+1.0", "-1.0", "1.0", "0.0000", "10.", "12345."]
             c4 = c3.parse Value_Type.Float
@@ -423,6 +459,43 @@ spec =
             c6.to_vector . should_equal [0.0, 0.0, 1.0, 0.1, 0.123, -0.1, 0.1, 0.0, 0.1234, Nothing, 11111111.111]
             Problems.assume_no_problems c6
 
+        Test.specify "should correctly parse decimals in US formats" <|
+            cUS = Column.from_vector "floats" ["1.23", "000123", "-12.34", "123.4567", "123e456"]
+            pUS = cUS.parse type=Value_Type.Float
+            pUS.to_vector . should_equal [1.23, 123, -12.34, 123.4567, Nothing]
+            Problems.expect_warning (Invalid_Format.Error "floats" Value_Type.Float ["123e456"]) pUS
+
+            cUS2 = Column.from_vector "floats" ["1.23", "000123", "-1,234.567", "1,234,567.789", "12,34.56"]
+            pUS2 = cUS2.parse type=Value_Type.Float
+            pUS2.to_vector . should_equal [1.23, 123, -1234.567, 1234567.789, Nothing]
+            Problems.expect_warning (Invalid_Format.Error "floats" Value_Type.Float ["12,34.56"]) pUS2
+
+            cUS3 = Column.from_vector "floats" ["1.23", "(000,123)", "-1,234.567", "1,234,567.789", "12,34.56"]
+            pUS3 = cUS3.parse type=Value_Type.Float
+            pUS3.to_vector . should_equal [1.23, -123, Nothing, 1234567.789, Nothing]
+            Problems.expect_warning (Invalid_Format.Error "floats" Value_Type.Float ["-1,234.567", "12,34.56"]) pUS3
+
+            cUS4 = Column.from_vector "floats" ["$12.34", "$1,234", "$1,234,567.789", "-$1,234.96", "($1,234,567)"]
+            pUS4 = cUS4.parse type=Value_Type.Float
+            pUS4.to_vector . should_equal [12.34, 1234, 1234567.789, -1234.96, Nothing]
+            Problems.expect_warning (Invalid_Format.Error "floats" Value_Type.Float ["($1,234,567)"]) pUS4
+
+        Test.specify "should correctly parse decimals in European formats" <|
+            cDE = Column.from_vector "floats" ["1,23", "000123", "-1.234,567", "1.234.567,789", "12.34,56"]
+            pDE = cDE.parse type=Value_Type.Float
+            pDE.to_vector . should_equal [1.23, 123, -1234.567, 1234567.789, Nothing]
+            Problems.expect_warning (Invalid_Format.Error "floats" Value_Type.Float ["12.34,56"]) pDE
+
+            cFR = Column.from_vector "floats" ["€1,23", "€000123", "€-1 234,567", "€ 1 234 567,789", "€ 12 34,56"]
+            pFR = cFR.parse type=Value_Type.Float
+            pFR.to_vector . should_equal [1.23, 123, -1234.567, 1234567.789, Nothing]
+            Problems.expect_warning (Invalid_Format.Error "floats" Value_Type.Float ["€ 12 34,56"]) pFR
+
+            cSW = Column.from_vector "floats" ["1,23", "000123", "-1'234,567", "1'234'567", "(123'456)"]
+            pSW = cSW.parse type=Value_Type.Float
+            pSW.to_vector . 
should_equal [1.23, 123, -1234.567, 1234567, Nothing] + Problems.expect_warning (Invalid_Format.Error "floats" Value_Type.Float ["(123'456)"]) pSW + Test.specify "should correctly parse booleans" <| c1 = Column.from_vector "bools" ["true", "false", "True", "TRUE", "FALSE", Nothing, "False"] c2 = c1.parse type=Value_Type.Boolean diff --git a/test/Table_Tests/src/IO/Formats_Spec.enso b/test/Table_Tests/src/IO/Formats_Spec.enso index e35ffccfbcdd..af435ebc97c7 100644 --- a/test/Table_Tests/src/IO/Formats_Spec.enso +++ b/test/Table_Tests/src/IO/Formats_Spec.enso @@ -1,6 +1,8 @@ from Standard.Base import all import Standard.Base.Errors.File_Error.File_Error -from Standard.Table import Table + +from Standard.Table import all +import Standard.Table.Errors.Invalid_JSON_Format from Standard.Test import Test, Test_Suite import Standard.Test.Extensions @@ -11,8 +13,9 @@ import project.Util spec = Test.group 'Various File Format support on Table' <| t1 = Table.new [["X", [1, 2, 3]]] transient = enso_project.data / "transient" - Test.specify "should be able to be written as CSV, Excel" <| + simple_empty = enso_project.data/'simple_empty.csv' . read + Test.specify "should be able to be written as CSV, Excel" <| f1 = transient / "test2.csv" f2 = transient / "test3.xlsx" [f1, f2].each f-> @@ -21,13 +24,44 @@ spec = Test.group 'Various File Format support on Table' <| f.exists.should_be_true f.delete - Test.specify "should be able to be written as JSON using Table.write" pending="Currently Table.write cannot autodetect that JSON writing is supported and write_json is used as a workaround." <| + Test.specify "should be able to be written as JSON using Table.write" <| f1 = transient / "test1.json" f1.delete_if_exists t1.write f1 . should_succeed f1.exists.should_be_true f1.delete + Test.specify 'should write JSON tables' <| + simple_empty = enso_project.data/'simple_empty.csv' . read + out = transient / 'out.json' + out.delete_if_exists + simple_empty.write out . should_equal out + Table.from_objects (Json.parse out.read_text) ['a', 'b', 'c'] . should_equal simple_empty + out.delete_if_exists + + Test.specify 'should append to JSON tables' <| + out = transient / 'out.json' + out.delete_if_exists + simple_empty.write out . should_equal out + simple_empty.write out on_existing_file=Existing_File_Behavior.Append . should_equal out + Table.from_objects (Json.parse out.read_text) ['a', 'b', 'c'] . row_count . should_equal 2*simple_empty.row_count + out.delete_if_exists + + Test.specify 'should fail to append to JSON non-arrays' <| + out = transient / 'out.json' + out.delete_if_exists + '1'.write out + simple_empty.write out on_existing_file=Existing_File_Behavior.Append . should_fail_with Invalid_JSON_Format + out.delete_if_exists + + '"Hello World"'.write out + simple_empty.write out on_existing_file=Existing_File_Behavior.Append . should_fail_with Invalid_JSON_Format + out.delete_if_exists + + '{}'.write out + simple_empty.write out on_existing_file=Existing_File_Behavior.Append . 
should_fail_with Invalid_JSON_Format + out.delete_if_exists + Test.specify "should fail gracefully when provided with an unsupported format" <| f1 = (transient / "test4.unknown-format") f1.delete_if_exists diff --git a/test/Table_Tests/src/IO/Json_Spec.enso b/test/Table_Tests/src/IO/Json_Spec.enso index 27dd63dbd228..673db873b0d2 100644 --- a/test/Table_Tests/src/IO/Json_Spec.enso +++ b/test/Table_Tests/src/IO/Json_Spec.enso @@ -9,19 +9,11 @@ import project.Util spec = Test.group 'JSON conversion' <| clothes = enso_project.data/'clothes.csv' . read - simple_empty = enso_project.data/'simple_empty.csv' . read Test.specify 'should convert tables to a format compatible with Table.from_objects' <| clothes_json = clothes.to_json Table.from_objects (Json.parse clothes_json) ['Id', 'Name', 'Quantity', 'Rating', 'Price'] . should_equal clothes - Test.specify 'should write JSON tables to disk' <| - out = enso_project.data / 'out.json' - out.delete_if_exists - simple_empty.write_json out . should_equal out - Table.from_objects (Json.parse out.read_text) ['a', 'b', 'c'] . should_equal simple_empty - out.delete_if_exists - Test.specify "should allow converting a JSON array into a table" <| r_1 = JS_Object.from_pairs [['foo', 20], ['bar', 'baz'], ['baz', False]] r_2 = JS_Object.from_pairs [['bar', 'xyz'], ['baz', True]] diff --git a/test/Table_Tests/src/In_Memory/Table_Spec.enso b/test/Table_Tests/src/In_Memory/Table_Spec.enso index 78b2fa0a6457..4412fb2760b3 100644 --- a/test/Table_Tests/src/In_Memory/Table_Spec.enso +++ b/test/Table_Tests/src/In_Memory/Table_Spec.enso @@ -364,13 +364,6 @@ spec = r1.at "Y" . to_vector . to_text . should_equal "[, Nothing, NaN]" r1.at "X" . to_vector . should_equal [1, 2, 4] - # TODO this could be moved to Common_Table_Operations once replace_text is implemented for Database too - t2 = Table.new [["X", [1, 2, 3, 4, 5]], ["Y", ["", Nothing, Nothing, Nothing, ""]], ["Z", ["", "---", "-1", "", "foobar"]]] - r2 = t2.replace_text (Column_Selector.Blank_Columns when_any=True) "-" "A" - r2.at "X" . to_vector . should_equal [1, 2, 3, 4, 5] - r2.at "Y" . to_vector . should_equal ["", Nothing, Nothing, Nothing, ""] - r2.at "Z" . to_vector . should_equal ["", "AAA", "A1", "", "foobar"] - Test.group "Info" <| Test.specify "should return Table information" <| a = ["strs", ["a", "b", Nothing, "a"]] @@ -609,58 +602,6 @@ spec = problems = [Duplicate_Output_Column_Names.Error ["A", "A", "A"]] Problems.test_problem_handling action problems tester - Test.group "Table.replace_text" <| - Test.specify "should replace text in full-text table columns" <| - bools = ["bools", [False, False, True, True, False]] - texts = ["texts", ["foo", "foo", "bar", "baz", "spam"]] - table = Table.new [bools, texts] - actual = table.replace_text "texts" "a" "o" - actual.at "bools" . to_vector . should_equal [False, False, True, True, False] - actual.at "texts" . to_vector . should_equal ["foo", "foo", "bor", "boz", "spom"] - Problems.assume_no_problems actual - - Test.specify "should replace text in mixed columns" <| - bools = ["bools", [False, False, True, True, False]] - mixed = ["mixed", ["foo", 5, "bar", False, "spam"]] - table = Table.new [bools, mixed] - actual = table.replace_text "mixed" "a" "o" - actual.at "bools" . to_vector . should_equal [False, False, True, True, False] - actual.at "mixed" . to_vector . 
should_equal ["foo", 5, "bor", False, "spom"] - Problems.assume_no_problems actual - - Test.specify "should support operating on multiple columns at once" <| - bools = ["bools", [False, False, True]] - texts1 = ["texts1", ["foo", "bar", "baz"]] - texts2 = ["texts2", ["baz", "quux", "spam"]] - table = Table.new [bools, texts1, texts2] - actual = table.replace_text ["texts1", "texts2"] "a" "o" - actual.at "bools" . to_vector . should_equal [False, False, True] - actual.at "texts1" . to_vector . should_equal ["foo", "bor", "boz"] - actual.at "texts2" . to_vector . should_equal ["boz", "quux", "spom"] - Problems.assume_no_problems actual - - Test.specify "should support regex replacement" <| - bools = ["bools", [False, False, True, True]] - texts = ["texts", ["foo", "bar", "baz", "spam"]] - table = Table.new [bools, texts] - actual = table.replace_text "texts" "(a|o)" "$1e" use_regex=True - actual.at "texts" . to_vector . should_equal ["foeoe", "baer", "baez", "spaem"] - Problems.assume_no_problems actual - - Test.specify 'should return warnings and errors when passed a non-existent column' <| - table = Table.new [["bools", [False, True]], ["texts", ["foo", "bar"]]] - action = table.replace_text "invalid_name" "a" "b" on_problems=_ - tester = _.should_equal table - problems = [Missing_Input_Columns.Error ['invalid_name']] - Problems.test_problem_handling action problems tester - - Test.specify "should return warnings and errors when selected non-text column" <| - table = Table.new [["bools", [False, True]], ["texts", ["foo", "bar"]]] - action = table.replace_text "bools" "a" "b" on_problems=_ - tester = _.should_equal table - problems = [Invalid_Value_Type.Error Value_Type.Char Value_Type.Boolean] - Problems.test_problem_handling action problems tester - Test.group "[In-Memory] Table.aggregate" <| Test.specify "should return columns with correct types" <| dates = ["dates", [Date.new 1999, Date.new 2000, Date.new 2000, Date.new 2000]]