From 3a83aa40287a5a0e0c9e7358304fd474ed8b0797 Mon Sep 17 00:00:00 2001 From: Gil Forsyth Date: Tue, 17 Sep 2024 06:23:54 -0400 Subject: [PATCH] refactor(padding): follow python string padding conventions (#10096) BREAKING CHANGE: String padding operations now follow Python semantics and leave strings greater than the padding length untouched. --- .../test_string_builtins/lpad_char/out.sql | 2 +- .../test_string_builtins/lpad_default/out.sql | 2 +- .../test_string_builtins/rpad_char/out.sql | 2 +- .../test_string_builtins/rpad_default/out.sql | 2 +- ibis/backends/impala/tests/test_exprs.py | 4 +- ibis/backends/sql/compilers/base.py | 8 +- .../sql/compilers/bigquery/__init__.py | 2 - ibis/backends/sql/compilers/clickhouse.py | 10 ++ ibis/backends/sql/compilers/datafusion.py | 14 ++ ibis/backends/sql/compilers/duckdb.py | 14 ++ ibis/backends/sql/compilers/flink.py | 14 ++ ibis/backends/sql/compilers/mssql.py | 2 +- ibis/backends/sql/compilers/oracle.py | 8 +- ibis/backends/sqlite/udf.py | 4 +- .../test_sql/test_no_cart_join/out.sql | 19 ++- ibis/backends/tests/test_string.py | 137 +++++++++++++++--- 16 files changed, 206 insertions(+), 38 deletions(-) diff --git a/ibis/backends/impala/tests/snapshots/test_string_builtins/test_string_builtins/lpad_char/out.sql b/ibis/backends/impala/tests/snapshots/test_string_builtins/test_string_builtins/lpad_char/out.sql index 83db8a156a2f..79413b65c4aa 100644 --- a/ibis/backends/impala/tests/snapshots/test_string_builtins/test_string_builtins/lpad_char/out.sql +++ b/ibis/backends/impala/tests/snapshots/test_string_builtins/test_string_builtins/lpad_char/out.sql @@ -1,3 +1,3 @@ SELECT - LPAD(`t0`.`string_col`, 1, 'a') AS `LPad(string_col, 1, 'a')` + LPAD(`t0`.`string_col`, GREATEST(LENGTH(`t0`.`string_col`), 1), 'a') AS `LPad(string_col, 1, 'a')` FROM `functional_alltypes` AS `t0` \ No newline at end of file diff --git a/ibis/backends/impala/tests/snapshots/test_string_builtins/test_string_builtins/lpad_default/out.sql b/ibis/backends/impala/tests/snapshots/test_string_builtins/test_string_builtins/lpad_default/out.sql index 8776196103c9..d575554de10f 100644 --- a/ibis/backends/impala/tests/snapshots/test_string_builtins/test_string_builtins/lpad_default/out.sql +++ b/ibis/backends/impala/tests/snapshots/test_string_builtins/test_string_builtins/lpad_default/out.sql @@ -1,3 +1,3 @@ SELECT - LPAD(`t0`.`string_col`, 25, ' ') AS `LPad(string_col, 25, ' ')` + LPAD(`t0`.`string_col`, GREATEST(LENGTH(`t0`.`string_col`), 25), ' ') AS `LPad(string_col, 25, ' ')` FROM `functional_alltypes` AS `t0` \ No newline at end of file diff --git a/ibis/backends/impala/tests/snapshots/test_string_builtins/test_string_builtins/rpad_char/out.sql b/ibis/backends/impala/tests/snapshots/test_string_builtins/test_string_builtins/rpad_char/out.sql index 23314bf1cafa..762059fc07fc 100644 --- a/ibis/backends/impala/tests/snapshots/test_string_builtins/test_string_builtins/rpad_char/out.sql +++ b/ibis/backends/impala/tests/snapshots/test_string_builtins/test_string_builtins/rpad_char/out.sql @@ -1,3 +1,3 @@ SELECT - RPAD(`t0`.`string_col`, 1, 'a') AS `RPad(string_col, 1, 'a')` + RPAD(`t0`.`string_col`, GREATEST(LENGTH(`t0`.`string_col`), 1), 'a') AS `RPad(string_col, 1, 'a')` FROM `functional_alltypes` AS `t0` \ No newline at end of file diff --git a/ibis/backends/impala/tests/snapshots/test_string_builtins/test_string_builtins/rpad_default/out.sql b/ibis/backends/impala/tests/snapshots/test_string_builtins/test_string_builtins/rpad_default/out.sql index c2f18f32a5ce..440043d095e5 100644 --- a/ibis/backends/impala/tests/snapshots/test_string_builtins/test_string_builtins/rpad_default/out.sql +++ b/ibis/backends/impala/tests/snapshots/test_string_builtins/test_string_builtins/rpad_default/out.sql @@ -1,3 +1,3 @@ SELECT - RPAD(`t0`.`string_col`, 25, ' ') AS `RPad(string_col, 25, ' ')` + RPAD(`t0`.`string_col`, GREATEST(LENGTH(`t0`.`string_col`), 25), ' ') AS `RPad(string_col, 25, ' ')` FROM `functional_alltypes` AS `t0` \ No newline at end of file diff --git a/ibis/backends/impala/tests/test_exprs.py b/ibis/backends/impala/tests/test_exprs.py index 84949ba314be..98e548d4a28b 100644 --- a/ibis/backends/impala/tests/test_exprs.py +++ b/ibis/backends/impala/tests/test_exprs.py @@ -295,9 +295,9 @@ def test_decimal_builtins_2(con, func, expected): (L("0123").translate("012", "abc"), "abc3"), (L("abcd").find("a"), 0), (L("baaaab").find("b", 2), 5), - (L("abcd").lpad(1, "-"), "a"), + (L("abcd").lpad(1, "-"), "abcd"), (L("abcd").lpad(5), " abcd"), - (L("abcd").rpad(1, "-"), "a"), + (L("abcd").rpad(1, "-"), "abcd"), (L("abcd").rpad(5), "abcd "), (L("abcd").find_in_set(["a", "b", "abcd"]), 2), (L(", ").join(["a", "b"]), "a, b"), diff --git a/ibis/backends/sql/compilers/base.py b/ibis/backends/sql/compilers/base.py index b4af50aa3ceb..f8ceaf71f5f3 100644 --- a/ibis/backends/sql/compilers/base.py +++ b/ibis/backends/sql/compilers/base.py @@ -331,7 +331,6 @@ class SQLGlotCompiler(abc.ABC): ops.IsInf: "isinf", ops.IsNan: "isnan", ops.JSONGetItem: "json_extract", - ops.LPad: "lpad", LastValue: "last_value", ops.Levenshtein: "levenshtein", ops.Ln: "ln", @@ -347,7 +346,6 @@ class SQLGlotCompiler(abc.ABC): ops.PercentRank: "percent_rank", ops.Pi: "pi", ops.Power: "pow", - ops.RPad: "rpad", ops.Radians: "radians", ops.RegexSearch: "regexp_like", ops.RegexSplit: "regexp_split", @@ -985,6 +983,12 @@ def visit_RStrip(self, op, *, arg): def visit_LStrip(self, op, *, arg): return self.f.ltrim(arg, string.whitespace) + def visit_LPad(self, op, *, arg, length, pad): + return self.f.lpad(arg, self.f.greatest(self.f.length(arg), length), pad) + + def visit_RPad(self, op, *, arg, length, pad): + return self.f.rpad(arg, self.f.greatest(self.f.length(arg), length), pad) + def visit_Substring(self, op, *, arg, start, length): if isinstance(op.length, ops.Literal) and (value := op.length.value) < 0: raise com.IbisInputError( diff --git a/ibis/backends/sql/compilers/bigquery/__init__.py b/ibis/backends/sql/compilers/bigquery/__init__.py index b9c0ed990fde..0e8f3a7d3017 100644 --- a/ibis/backends/sql/compilers/bigquery/__init__.py +++ b/ibis/backends/sql/compilers/bigquery/__init__.py @@ -183,8 +183,6 @@ class BigQueryCompiler(SQLGlotCompiler): ops.IsInf: "is_inf", ops.IsNan: "is_nan", ops.Log10: "log10", - ops.LPad: "lpad", - ops.RPad: "rpad", ops.Levenshtein: "edit_distance", ops.Modulus: "mod", ops.RegexReplace: "regexp_replace", diff --git a/ibis/backends/sql/compilers/clickhouse.py b/ibis/backends/sql/compilers/clickhouse.py index f7fa8af6fd87..3b7ca75147b5 100644 --- a/ibis/backends/sql/compilers/clickhouse.py +++ b/ibis/backends/sql/compilers/clickhouse.py @@ -480,6 +480,16 @@ def visit_Strip(self, op, *, arg): this=arg, position="BOTH", expression=sge.Literal.string(whitespace) ) + def visit_LPad(self, op, *, arg, length, pad): + return self.f.leftPadUTF8( + arg, self.f.greatest(self.f.lengthUTF8(arg), length), pad + ) + + def visit_RPad(self, op, *, arg, length, pad): + return self.f.rightPadUTF8( + arg, self.f.greatest(self.f.lengthUTF8(arg), length), pad + ) + def visit_DayOfWeekIndex(self, op, *, arg): weekdays = len(calendar.day_name) return (((self.f.toDayOfWeek(arg) - 1) % weekdays) + weekdays) % weekdays diff --git a/ibis/backends/sql/compilers/datafusion.py b/ibis/backends/sql/compilers/datafusion.py index b12060a0b447..b527663a6394 100644 --- a/ibis/backends/sql/compilers/datafusion.py +++ b/ibis/backends/sql/compilers/datafusion.py @@ -207,6 +207,20 @@ def visit_RegexSearch(self, op, *, arg, pattern): def visit_StringContains(self, op, *, haystack, needle): return self.f.strpos(haystack, needle) > sg.exp.convert(0) + def visit_LPad(self, op, *, arg, length, pad): + return self.if_( + length <= self.f.length(arg), + arg, + self.f.concat(self.f.repeat(pad, length - self.f.length(arg)), arg), + ) + + def visit_RPad(self, op, *, arg, length, pad): + return self.if_( + length <= self.f.length(arg), + arg, + self.f.concat(arg, self.f.repeat(pad, length - self.f.length(arg))), + ) + def visit_ExtractFragment(self, op, *, arg): return self.f.extract_url_field(arg, "fragment") diff --git a/ibis/backends/sql/compilers/duckdb.py b/ibis/backends/sql/compilers/duckdb.py index 76082a5c29eb..da0ecd45dc7e 100644 --- a/ibis/backends/sql/compilers/duckdb.py +++ b/ibis/backends/sql/compilers/duckdb.py @@ -568,6 +568,20 @@ def visit_Hash(self, op, *, arg): def visit_StringConcat(self, op, *, arg): return reduce(lambda x, y: sge.DPipe(this=x, expression=y), arg) + def visit_LPad(self, op, *, arg, length, pad): + return self.if_( + length <= self.f.length(arg), + arg, + self.f.concat(self.f.repeat(pad, length - self.f.length(arg)), arg), + ) + + def visit_RPad(self, op, *, arg, length, pad): + return self.if_( + length <= self.f.length(arg), + arg, + self.f.concat(arg, self.f.repeat(pad, length - self.f.length(arg))), + ) + def visit_StringSlice(self, op, *, arg, start, end): if start is not None: start += 1 diff --git a/ibis/backends/sql/compilers/flink.py b/ibis/backends/sql/compilers/flink.py index ff50258b327c..9d9b0a45f69c 100644 --- a/ibis/backends/sql/compilers/flink.py +++ b/ibis/backends/sql/compilers/flink.py @@ -518,6 +518,20 @@ def visit_StringFind(self, op, *, arg, substr, start, end): return self.f.instr(arg, substr) + def visit_LPad(self, op, *, arg, length, pad): + return self.if_( + length <= self.f.length(arg), + arg, + self.f.concat(self.f.repeat(pad, length - self.f.length(arg)), arg), + ) + + def visit_RPad(self, op, *, arg, length, pad): + return self.if_( + length <= self.f.length(arg), + arg, + self.f.concat(arg, self.f.repeat(pad, length - self.f.length(arg))), + ) + def visit_StartsWith(self, op, *, arg, start): return self.f.left(arg, self.f.char_length(start)).eq(start) diff --git a/ibis/backends/sql/compilers/mssql.py b/ibis/backends/sql/compilers/mssql.py index cadc8b30cc1b..70215fed821a 100644 --- a/ibis/backends/sql/compilers/mssql.py +++ b/ibis/backends/sql/compilers/mssql.py @@ -528,7 +528,7 @@ def visit_LPad(self, op, *, arg, length, pad): return self.if_( length <= self.f.length(arg), arg, - self.f.left( + self.f.right( self.f.concat(self.f.replicate(pad, length - self.f.length(arg)), arg), length, ), diff --git a/ibis/backends/sql/compilers/oracle.py b/ibis/backends/sql/compilers/oracle.py index 103f5deeec7f..faee258b10e4 100644 --- a/ibis/backends/sql/compilers/oracle.py +++ b/ibis/backends/sql/compilers/oracle.py @@ -82,8 +82,6 @@ class OracleCompiler(SQLGlotCompiler): ops.BitXor: "bit_xor_agg", ops.BitwiseAnd: "bitand", ops.Hash: "ora_hash", - ops.LPad: "lpad", - ops.RPad: "rpad", ops.StringAscii: "ascii", ops.Mode: "stats_mode", } @@ -275,6 +273,12 @@ def visit_StringContains(self, op, *, haystack, needle): def visit_StringJoin(self, op, *, arg, sep): return self.f.concat(*toolz.interpose(sep, arg)) + def visit_LPad(self, op, *, arg, length, pad): + return self.f.lpad(arg, self.f.greatest(self.f.length(arg), length), pad) + + def visit_RPad(self, op, *, arg, length, pad): + return self.f.rpad(arg, self.f.greatest(self.f.length(arg), length), pad) + ## Aggregate stuff def visit_Correlation(self, op, *, left, right, where, how): diff --git a/ibis/backends/sqlite/udf.py b/ibis/backends/sqlite/udf.py index afaf1490ea19..536b622b2b05 100644 --- a/ibis/backends/sqlite/udf.py +++ b/ibis/backends/sqlite/udf.py @@ -258,12 +258,12 @@ def _ibis_string_ascii(string): @udf def _ibis_rpad(string, width, pad): - return string.ljust(width, pad)[:width] + return string.ljust(width, pad) @udf def _ibis_lpad(string, width, pad): - return string.rjust(width, pad)[:width] + return string.rjust(width, pad) @udf diff --git a/ibis/backends/tests/sql/snapshots/test_sql/test_no_cart_join/out.sql b/ibis/backends/tests/sql/snapshots/test_sql/test_no_cart_join/out.sql index 9139f2ed68a5..b586ab67ebbd 100644 --- a/ibis/backends/tests/sql/snapshots/test_sql/test_no_cart_join/out.sql +++ b/ibis/backends/tests/sql/snapshots/test_sql/test_no_cart_join/out.sql @@ -19,9 +19,22 @@ FROM ( "t1"."ancestor_level_number", "t1"."ancestor_node_sort_order", "t1"."descendant_node_natural_key", - LPAD('-', ( - "t1"."ancestor_level_number" - 1 - ) * 7, '-') || "t1"."ancestor_level_name" AS "product_level_name" + CASE + WHEN ( + ( + "t1"."ancestor_level_number" - 1 + ) * 7 + ) <= LENGTH('-') + THEN '-' + ELSE CONCAT( + REPEAT('-', ( + ( + "t1"."ancestor_level_number" - 1 + ) * 7 + ) - LENGTH('-')), + '-' + ) + END || "t1"."ancestor_level_name" AS "product_level_name" FROM "products" AS "t1" ) AS "t4" ON "t2"."product_id" = "t4"."descendant_node_natural_key" diff --git a/ibis/backends/tests/test_string.py b/ibis/backends/tests/test_string.py index 8e44e31d86e9..cb51c30aa273 100644 --- a/ibis/backends/tests/test_string.py +++ b/ibis/backends/tests/test_string.py @@ -1079,55 +1079,69 @@ def string_temp_table(backend, con): ), param( lambda t: t.string_col.rpad(4, "-"), - lambda t: t.str[:4].str.pad(4, side="right", fillchar="-"), + lambda t: t.str.pad(4, side="right", fillchar="-"), id="rpad", marks=[ pytest.mark.notyet( - ["flink", "oracle"], + ["oracle"], raises=AssertionError, - reason="Treats len(๐Ÿ) == 2 so padding is off", + reason="Treats len(๐Ÿ) == 2", ), pytest.mark.notyet( - ["impala"], + ["impala", "mysql"], raises=AssertionError, - reason="Treats len(๐Ÿ) == 4, len(ร‰รฉ) == 4", + reason="Treats len(๐Ÿ) == 4 and accented characters as len 2", ), + ], + ), + param( + lambda t: t.string_col.rpad(8, "-"), + lambda t: t.str.pad(8, side="right", fillchar="-"), + id="rpad_gt", + marks=[ pytest.mark.notyet( - ["mssql", "polars"], + ["oracle"], raises=AssertionError, - reason="Python style padding, e.g. doesn't trim strings to pad-length", + reason="Treats len(๐Ÿ) == 2", ), pytest.mark.notyet( - ["clickhouse"], + ["impala", "mysql"], raises=AssertionError, - reason="Can use rightPadUTF8 instead", + reason="Treats len(๐Ÿ) == 4 and accented characters as len 2", ), ], ), param( lambda t: t.string_col.lpad(4, "-"), - lambda t: t.str[:4].str.pad(4, side="left", fillchar="-"), - id="lpad", + lambda t: t.str.pad(4, side="left", fillchar="-"), + id="lpad_lt", marks=[ pytest.mark.notyet( - ["flink", "oracle"], + ["oracle"], raises=AssertionError, - reason="Treats len(๐Ÿ) == 2 so padding is off", + reason="Treats len(๐Ÿ) == 2", ), pytest.mark.notyet( - ["impala"], + ["impala", "mysql"], raises=AssertionError, - reason="Treats len(๐Ÿ) == 4, len(ร‰รฉ) == 4", + reason="Treats len(๐Ÿ) == 4 and accented characters as len 2", ), + ], + ), + param( + lambda t: t.string_col.lpad(8, "-"), + lambda t: t.str.pad(8, side="left", fillchar="-"), + id="lpad_gt", + marks=[ pytest.mark.notyet( - ["mssql", "polars"], + ["oracle"], raises=AssertionError, - reason="Python style padding, e.g. doesn't trim strings to pad-length", + reason="Treats len(๐Ÿ) == 2", ), pytest.mark.notyet( - ["clickhouse"], + ["impala", "mysql"], raises=AssertionError, - reason="Can use leftPadUTF8 instead", + reason="Treats len(๐Ÿ) == 4 and accented characters as len 2", ), ], ), @@ -1279,7 +1293,9 @@ def string_temp_table(backend, con): ), ], ) -def test_string_methods_no_regex(string_temp_table, backend, result_mut, expected_func): +def test_string_methods_accents_and_emoji( + string_temp_table, backend, result_mut, expected_func +): """ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”“ โ”ƒ string_col โ”ƒ @@ -1304,3 +1320,84 @@ def test_string_methods_no_regex(string_temp_table, backend, result_mut, expecte expected = expected_func(series) backend.assert_series_equal(result, expected) + + +@pytest.fixture(scope="session") +def string_temp_table_no_complications(backend, con): + better_strings = pd.DataFrame( + { + "string_col": [ + "AbC\t", + "\n123\n ", + "abc, 123", + "123", + "aBc", + ], + "index_col": [0, 1, 2, 3, 4], + } + ) + + temp_table_name = gen_name("strings") + temp = backend.name() not in ["exasol", "impala", "pyspark", "risingwave", "trino"] + if backend.name() == "datafusion": + temp = None + if backend.name() == "druid": + yield "I HATE DRUID" + else: + t = con.create_table(temp_table_name, better_strings, temp=temp) + yield t + con.drop_table(temp_table_name, force=True) + + +@pytest.mark.never(["druid"], reason="can't create tables") +@pytest.mark.parametrize( + "result_mut, expected_func", + [ + param( + lambda t: t.string_col.rpad(4, "-"), + lambda t: t.str.pad(4, side="right", fillchar="-"), + id="rpad_lt", + ), + param( + lambda t: t.string_col.rpad(8, "-"), + lambda t: t.str.pad(8, side="right", fillchar="-"), + id="rpad_gt", + ), + param( + lambda t: t.string_col.lpad(4, "-"), + lambda t: t.str.pad(4, side="left", fillchar="-"), + id="lpad_lt", + ), + param( + lambda t: t.string_col.lpad(8, "-"), + lambda t: t.str.pad(8, side="left", fillchar="-"), + id="lpad_gt", + ), + ], +) +def test_string_methods_no_accents_and_no_emoji( + string_temp_table_no_complications, backend, result_mut, expected_func +): + """ + โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”“ + โ”ƒ string_col โ”ƒ + โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ + โ”‚ string โ”‚ + โ”œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ค + โ”‚ AbC\t โ”‚ + โ”‚ \n123\n โ”‚ + โ”‚ abc, 123 โ”‚ + โ”‚ 123 โ”‚ + โ”‚ aBc โ”‚ + โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ + """ + # TODO: figure out a better organization for this + t = string_temp_table_no_complications + series = t.order_by(t.index_col).string_col.name("tmp").to_pandas() + + expr = t.mutate(string_col=result_mut).order_by(t.index_col) + result = expr.string_col.name("tmp").to_pandas() + + expected = expected_func(series) + + backend.assert_series_equal(result, expected)