enso-org · mergify · Dec 21, 2023 · Dec 20, 2023 · Dec 21, 2023 · Dec 21, 2023
diff --git a/.gitignore b/.gitignore
@@ -62,6 +62,8 @@ node_modules/
 !.idea/runConfigurations/
 !.vscode
 .vscode/*
+!.vs
+.vs/*
 .metals/
 *.swp
 .projections.json

@@ -1175,6 +1175,26 @@ type Column
         new_name = self.naming_helper.function_name "ends_with" [self, other]
         make_text_case_op self "ends_with" other case_sensitivity new_name
 
+    ## GROUP Standard.Base.Text
+       ICON preparation
+       Calculates the text length of each element of the column.
+
+       In the Database backends, the default text length method of the
+       particular database is used.
+
+       In the in-memory backend, this will give you the grapheme length of the
+       string.
+
+       > Example
+         Compute the length of each value of a text column.
+
+             import Standard.Examples
+
+             example_text_length =
+                Examples.text_column_1.text_length
+    text_length : Column
+    text_length self =
+        Value_Type.expect_text self <|
+          new_name = self.naming_helper.function_name "text_length" [self]
+          # The operation key is "LENGTH": that is the name under which the
+          # SQL dialect registers the operation and under which its result
+          # type is declared to be an integer.
+          self.make_unary_op "LENGTH" new_name
+
     ## GROUP Standard.Base.Logical
        Checks for each element of the column if it contains `other`.
 

@@ -189,7 +189,7 @@ base_dialect =
     functions = [["COALESCE", make_function "COALESCE"], ["ROW_MIN", make_function "MIN"], ["ROW_MAX", make_function "MAX"]]
     agg = [fun "MAX", fun "MIN", fun "AVG", fun "SUM"]
     counts = [fun "COUNT", ["COUNT_ROWS", make_constant "COUNT(*)"]]
-    text = [is_empty, bin "LIKE", simple_equals_ignore_case, fold_case, make_case_sensitive]
+    text = [is_empty, bin "LIKE", simple_equals_ignore_case, fold_case, make_case_sensitive, length]
     nulls = [["IS_NULL", make_right_unary_op "IS NULL"], ["FILL_NULL", make_function "COALESCE"]]
     contains = [["IS_IN", make_is_in], ["IS_IN_COLUMN", make_is_in_column]]
     types = [simple_cast]
@@ -368,6 +368,10 @@ generate_from_part dialect from_spec = case from_spec of
 fold_case = lift_unary_op "FOLD_CASE" arg->
     Builder.code "LOWER(UPPER(" ++ arg ++ "))"
 
+## PRIVATE
+   Generates the SQL for the "LENGTH" operation by wrapping its single
+   argument in a call to the SQL `length` function.
+length = lift_unary_op "LENGTH" str->
+    Builder.code "length(" ++ str ++ ")"
+
 ## PRIVATE
 make_case_sensitive = lift_unary_op "MAKE_CASE_SENSITIVE" _->
     Error.throw <| Unsupported_Database_Operation.Error ("Case sensitive operations are not currently supported by this connection.")

@@ -185,7 +185,7 @@ operations_map =
     always_boolean_ops = ["==", "!=", "equals_ignore_case", ">=", "<=", "<", ">", "BETWEEN", "AND", "OR", "NOT", "IS_NULL", "IS_EMPTY", "LIKE", "IS_IN", "IS_IN_COLUMN", "starts_with", "ends_with", "contains", "BOOL_OR", "IS_INF"]
     always_floating_ops = ["/", "mod", "AVG", "STDDEV_POP", "STDDEV_SAMP", "ROUND"]
     always_text_ops = ["ADD_TEXT", "CONCAT", "CONCAT_QUOTE_IF_NEEDED", "MAKE_CASE_SENSITIVE", "FOLD_CASE", "TRIM", "LTRIM", "RTRIM", "REPLACE"]
-    always_integer_ops = ["COUNT", "COUNT_IS_NULL", "COUNT_DISTINCT", "COUNT_DISTINCT_INCLUDE_NULL", "COUNT_EMPTY", "COUNT_NOT_EMPTY", "COUNT_ROWS", "ROW_NUMBER", "ROW_NUMBER_IN_GROUP"]
+    always_integer_ops = ["COUNT", "COUNT_IS_NULL", "COUNT_DISTINCT", "COUNT_DISTINCT_INCLUDE_NULL", "COUNT_EMPTY", "COUNT_NOT_EMPTY", "COUNT_ROWS", "ROW_NUMBER", "ROW_NUMBER_IN_GROUP", "LENGTH"]
     same_as_first = ["TRUNCATE", "CEIL", "FLOOR"]
     arithmetic_ops = ["ADD_NUMBER", "-", "*", "^", "%", "SUM"]
     merge_input_types_ops = ["ROW_MAX", "ROW_MIN", "MAX", "MIN", "FILL_NULL", "COALESCE"]

@@ -38,6 +38,7 @@ polyglot java import org.enso.table.data.table.Column as Java_Column
 polyglot java import org.enso.table.error.ValueTypeMismatchException
 polyglot java import org.enso.table.operations.OrderBuilder
 polyglot java import org.enso.table.parsing.problems.ParseProblemAggregator
+polyglot java import org.enso.base.Text_Utils
 
 type Column
     ## GROUP Standard.Base.Input
@@ -1237,6 +1238,26 @@ type Column
         new_name = naming_helper.function_name "ends_with" [self, other]
         run_vectorized_binary_case_text_op self Java_Storage.Maps.ENDS_WITH other case_sensitivity (a -> b -> a.ends_with b case_sensitivity) new_name
 
+    ## GROUP Standard.Base.Text
+       ICON preparation
+       Computes the length of the text held in each element of the column.
+
+       In the in-memory backend, the result is the grapheme length of each
+       string.
+
+       In the Database backends, the default text length method of the
+       particular database is used instead.
+
+       > Example
+         Get the lengths of the values of a text column.
+
+             import Standard.Examples
+
+             example_text_length =
+                Examples.text_column_1.text_length
+    text_length : Column
+    text_length self =
+        Value_Type.expect_text self <|
+          result_name = naming_helper.function_name "text_length" [self]
+          run_vectorized_unary_op self Java_Storage.Maps.TEXT_LENGTH result_name expected_result_type=Value_Type.Integer
+
     ## GROUP Standard.Base.Logical
        Checks for each element of the column if it contains `other`.
 

@@ -109,6 +109,7 @@ public static final class Maps {
     public static final String IS_EMPTY = "is_empty";
     public static final String STARTS_WITH = "starts_with";
     public static final String ENDS_WITH = "ends_with";
+    public static final String TEXT_LENGTH = "text_length";
     public static final String CONTAINS = "contains";
     public static final String LIKE = "like";
     public static final String IS_IN = "is_in";

@@ -1,12 +1,14 @@
 package org.enso.table.data.column.storage;
 
 import java.util.BitSet;
+
 import org.enso.base.Text_Utils;
 import org.enso.table.data.column.builder.StringBuilder;
 import org.enso.table.data.column.operation.map.BinaryMapOperation;
 import org.enso.table.data.column.operation.map.MapOperationProblemAggregator;
 import org.enso.table.data.column.operation.map.MapOperationStorage;
 import org.enso.table.data.column.operation.map.UnaryMapOperation;
+import org.enso.table.data.column.operation.map.numeric.UnaryIntegerOp;
 import org.enso.table.data.column.operation.map.text.LikeOp;
 import org.enso.table.data.column.operation.map.text.StringBooleanOp;
 import org.enso.table.data.column.operation.map.text.StringIsInOp;
@@ -135,6 +137,13 @@ protected boolean doString(String a, String b) {
             return Text_Utils.ends_with(a, b);
           }
         });
+    // Maps.TEXT_LENGTH: maps each string to the value returned by
+    // Text_Utils.grapheme_length for it.
+    t.add(
+        new UnaryIntegerOp<>(Maps.TEXT_LENGTH) {
+          @Override
+          protected long doOperation(String text) {
+            return Text_Utils.grapheme_length(text);
+          }
+        });
     t.add(
         new StringBooleanOp(Maps.CONTAINS) {
           @Override

@@ -827,6 +827,21 @@ spec setup =
                 s1.like (s2+"%r") . to_vector . should_equal [True, False, False, False, Nothing]
                 s1.like "%r%" . to_vector . should_equal [True, True, False, False, Nothing]
 
+        Test.specify "should handle operation text_length" <|
+            table = table_builder [["strings", ["foobar", "", Nothing, "👩‍🔬", "café", "It was the best of times, it was the worst of times, it was the age of wisdom, it was the age of foolishness, it was the epoch of belief, it was the epoch of incredulity, it was the season of light, it was the season of darkness, it was the spring of hope, it was the winter of despair."]]]
+            strings_column = table.at "strings"
+            lengths = strings_column.text_length
+            lengths.name . should_equal "text_length([strings])"
+
+            # The backends disagree only on the emoji: in-memory counts it as one grapheme, Database backends report 3.
+            case setup.is_database of
+                False -> lengths . to_vector . should_equal [6, 0, Nothing, 1, 4, 286] # in-memory: grapheme length
+                True -> lengths . to_vector . should_equal [6, 0, Nothing, 3, 4, 286]  # database: the database's own length function
+
+        Test.specify "text_length should error on non-string columns" <|
+            # text_length is guarded by Value_Type.expect_text, so a numeric column must be rejected.
+            numeric_table = table_builder [["numbers", [1, 2, 3]]]
+            numbers_column = numeric_table.at "numbers"
+            numbers_column.text_length . should_fail_with Invalid_Value_Type
+
         Test.specify "should handle operations like is_empty, is_blank, fill_empty" <|
             with_mixed_columns_if_supported [["s", ["", " ", "  ", Nothing, "foo"]], ["letters", ["a", "b", "c", "d", "e"]]] t->
                 s = t.at "s"