From 6f9948b8c027f782431805331de174c4092de40a Mon Sep 17 00:00:00 2001 From: Trent Hauck Date: Thu, 28 Mar 2024 13:40:36 -0700 Subject: [PATCH 1/3] feat: pass SessionState not SessionConfig to FunctionFactory::create (#9837) --- datafusion-examples/examples/function_factory.rs | 7 ++++--- datafusion/core/src/execution/context/mod.rs | 4 ++-- .../tests/user_defined/user_defined_scalar_functions.rs | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/datafusion-examples/examples/function_factory.rs b/datafusion-examples/examples/function_factory.rs index 6c033e6c8eef..a7c8558c6da8 100644 --- a/datafusion-examples/examples/function_factory.rs +++ b/datafusion-examples/examples/function_factory.rs @@ -16,8 +16,9 @@ // under the License. use datafusion::error::Result; -use datafusion::execution::config::SessionConfig; -use datafusion::execution::context::{FunctionFactory, RegisterFunction, SessionContext}; +use datafusion::execution::context::{ + FunctionFactory, RegisterFunction, SessionContext, SessionState, +}; use datafusion_common::tree_node::{Transformed, TreeNode}; use datafusion_common::{exec_err, internal_err, DataFusionError}; use datafusion_expr::simplify::ExprSimplifyResult; @@ -91,7 +92,7 @@ impl FunctionFactory for CustomFunctionFactory { /// the function instance. 
async fn create( &self, - _state: &SessionConfig, + _state: &SessionState, statement: CreateFunction, ) -> Result { let f: ScalarFunctionWrapper = statement.try_into()?; diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs index 116e45c8c130..31f390607f04 100644 --- a/datafusion/core/src/execution/context/mod.rs +++ b/datafusion/core/src/execution/context/mod.rs @@ -794,7 +794,7 @@ impl SessionContext { let function_factory = &state.function_factory; match function_factory { - Some(f) => f.create(state.config(), stmt).await?, + Some(f) => f.create(&state, stmt).await?, _ => Err(DataFusionError::Configuration( "Function factory has not been configured".into(), ))?, @@ -1288,7 +1288,7 @@ pub trait FunctionFactory: Sync + Send { /// Handles creation of user defined function specified in [CreateFunction] statement async fn create( &self, - state: &SessionConfig, + state: &SessionState, statement: CreateFunction, ) -> Result; } diff --git a/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs b/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs index b525e4fc6341..86be887198ae 100644 --- a/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs +++ b/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs @@ -747,7 +747,7 @@ struct CustomFunctionFactory {} impl FunctionFactory for CustomFunctionFactory { async fn create( &self, - _state: &SessionConfig, + _state: &SessionState, statement: CreateFunction, ) -> Result { let f: ScalarFunctionWrapper = statement.try_into()?; From 81c96fc3db0ea35638278f32df066be63b745a51 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 28 Mar 2024 17:37:25 -0600 Subject: [PATCH 2/3] Prepare 37.0.0 Release (#9697) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * bump version * changelog * Update configs.md * Update Cargo.toml * Update 37.0.0.md * Update 37.0.0.md * Update 37.0.0.md * update 
changelog * update changelog --------- Co-authored-by: Daniël Heres --- Cargo.toml | 30 +-- datafusion-cli/Cargo.lock | 24 +-- datafusion-cli/Cargo.toml | 4 +- datafusion/CHANGELOG.md | 1 + dev/changelog/37.0.0.md | 347 ++++++++++++++++++++++++++++++ docs/source/user-guide/configs.md | 2 +- 6 files changed, 378 insertions(+), 30 deletions(-) create mode 100644 dev/changelog/37.0.0.md diff --git a/Cargo.toml b/Cargo.toml index c3dade8bc6c5..8e89e5ef3b85 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -49,7 +49,7 @@ license = "Apache-2.0" readme = "README.md" repository = "https://github.com/apache/arrow-datafusion" rust-version = "1.72" -version = "36.0.0" +version = "37.0.0" [workspace.dependencies] # We turn off default-features for some dependencies here so the workspaces which inherit them can @@ -71,20 +71,20 @@ bytes = "1.4" chrono = { version = "0.4.34", default-features = false } ctor = "0.2.0" dashmap = "5.4.0" -datafusion = { path = "datafusion/core", version = "36.0.0", default-features = false } -datafusion-common = { path = "datafusion/common", version = "36.0.0", default-features = false } -datafusion-common-runtime = { path = "datafusion/common-runtime", version = "36.0.0" } -datafusion-execution = { path = "datafusion/execution", version = "36.0.0" } -datafusion-expr = { path = "datafusion/expr", version = "36.0.0" } -datafusion-functions = { path = "datafusion/functions", version = "36.0.0" } -datafusion-functions-array = { path = "datafusion/functions-array", version = "36.0.0" } -datafusion-optimizer = { path = "datafusion/optimizer", version = "36.0.0", default-features = false } -datafusion-physical-expr = { path = "datafusion/physical-expr", version = "36.0.0", default-features = false } -datafusion-physical-plan = { path = "datafusion/physical-plan", version = "36.0.0" } -datafusion-proto = { path = "datafusion/proto", version = "36.0.0" } -datafusion-sql = { path = "datafusion/sql", version = "36.0.0" } -datafusion-sqllogictest = { path = 
"datafusion/sqllogictest", version = "36.0.0" } -datafusion-substrait = { path = "datafusion/substrait", version = "36.0.0" } +datafusion = { path = "datafusion/core", version = "37.0.0", default-features = false } +datafusion-common = { path = "datafusion/common", version = "37.0.0", default-features = false } +datafusion-common-runtime = { path = "datafusion/common-runtime", version = "37.0.0" } +datafusion-execution = { path = "datafusion/execution", version = "37.0.0" } +datafusion-expr = { path = "datafusion/expr", version = "37.0.0" } +datafusion-functions = { path = "datafusion/functions", version = "37.0.0" } +datafusion-functions-array = { path = "datafusion/functions-array", version = "37.0.0" } +datafusion-optimizer = { path = "datafusion/optimizer", version = "37.0.0", default-features = false } +datafusion-physical-expr = { path = "datafusion/physical-expr", version = "37.0.0", default-features = false } +datafusion-physical-plan = { path = "datafusion/physical-plan", version = "37.0.0" } +datafusion-proto = { path = "datafusion/proto", version = "37.0.0" } +datafusion-sql = { path = "datafusion/sql", version = "37.0.0" } +datafusion-sqllogictest = { path = "datafusion/sqllogictest", version = "37.0.0" } +datafusion-substrait = { path = "datafusion/substrait", version = "37.0.0" } doc-comment = "0.3" env_logger = "0.11" futures = "0.3" diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index ba60c04cea55..0277d23f4de0 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -1116,7 +1116,7 @@ dependencies = [ [[package]] name = "datafusion" -version = "36.0.0" +version = "37.0.0" dependencies = [ "ahash", "apache-avro", @@ -1167,7 +1167,7 @@ dependencies = [ [[package]] name = "datafusion-cli" -version = "36.0.0" +version = "37.0.0" dependencies = [ "arrow", "assert_cmd", @@ -1195,7 +1195,7 @@ dependencies = [ [[package]] name = "datafusion-common" -version = "36.0.0" +version = "37.0.0" dependencies = [ "ahash", 
"apache-avro", @@ -1215,14 +1215,14 @@ dependencies = [ [[package]] name = "datafusion-common-runtime" -version = "36.0.0" +version = "37.0.0" dependencies = [ "tokio", ] [[package]] name = "datafusion-execution" -version = "36.0.0" +version = "37.0.0" dependencies = [ "arrow", "chrono", @@ -1241,7 +1241,7 @@ dependencies = [ [[package]] name = "datafusion-expr" -version = "36.0.0" +version = "37.0.0" dependencies = [ "ahash", "arrow", @@ -1256,7 +1256,7 @@ dependencies = [ [[package]] name = "datafusion-functions" -version = "36.0.0" +version = "37.0.0" dependencies = [ "arrow", "base64 0.22.0", @@ -1279,7 +1279,7 @@ dependencies = [ [[package]] name = "datafusion-functions-array" -version = "36.0.0" +version = "37.0.0" dependencies = [ "arrow", "arrow-array", @@ -1297,7 +1297,7 @@ dependencies = [ [[package]] name = "datafusion-optimizer" -version = "36.0.0" +version = "37.0.0" dependencies = [ "arrow", "async-trait", @@ -1313,7 +1313,7 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" -version = "36.0.0" +version = "37.0.0" dependencies = [ "ahash", "arrow", @@ -1346,7 +1346,7 @@ dependencies = [ [[package]] name = "datafusion-physical-plan" -version = "36.0.0" +version = "37.0.0" dependencies = [ "ahash", "arrow", @@ -1375,7 +1375,7 @@ dependencies = [ [[package]] name = "datafusion-sql" -version = "36.0.0" +version = "37.0.0" dependencies = [ "arrow", "arrow-array", diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index da744a06f3aa..18e14357314e 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "datafusion-cli" description = "Command Line Client for DataFusion query engine." 
-version = "36.0.0" +version = "37.0.0" authors = ["Apache Arrow "] edition = "2021" keywords = ["arrow", "datafusion", "query", "sql"] @@ -35,7 +35,7 @@ async-trait = "0.1.41" aws-config = "0.55" aws-credential-types = "0.55" clap = { version = "3", features = ["derive", "cargo"] } -datafusion = { path = "../datafusion/core", version = "36.0.0", features = [ +datafusion = { path = "../datafusion/core", version = "37.0.0", features = [ "avro", "crypto_expressions", "datetime_expressions", diff --git a/datafusion/CHANGELOG.md b/datafusion/CHANGELOG.md index 2d09782a3982..c111375e3058 100644 --- a/datafusion/CHANGELOG.md +++ b/datafusion/CHANGELOG.md @@ -19,6 +19,7 @@ # Changelog +- [37.0.0](../dev/changelog/37.0.0.md) - [36.0.0](../dev/changelog/36.0.0.md) - [35.0.0](../dev/changelog/35.0.0.md) - [34.0.0](../dev/changelog/34.0.0.md) diff --git a/dev/changelog/37.0.0.md b/dev/changelog/37.0.0.md new file mode 100644 index 000000000000..b1fcd5fdf008 --- /dev/null +++ b/dev/changelog/37.0.0.md @@ -0,0 +1,347 @@ + + +## [37.0.0](https://github.com/apache/arrow-datafusion/tree/37.0.0) (2024-03-28) + +[Full Changelog](https://github.com/apache/arrow-datafusion/compare/36.0.0...37.0.0) + +**Breaking changes:** + +- refactor: Change `SchemaProvider::table` to return `Result` rather than `Option<..>` [#9307](https://github.com/apache/arrow-datafusion/pull/9307) (crepererum) +- feat: issue_9285: port builtin reg function into datafusion-function-\* (1/3 regexpmatch) [#9329](https://github.com/apache/arrow-datafusion/pull/9329) (Lordworms) +- Cache common plan properties to eliminate recursive calls in physical plan [#9346](https://github.com/apache/arrow-datafusion/pull/9346) (mustafasrepo) +- Consolidate `TreeNode` transform and rewrite APIs [#8891](https://github.com/apache/arrow-datafusion/pull/8891) (peter-toth) +- Extend argument types for udf `return_type_from_exprs` [#9522](https://github.com/apache/arrow-datafusion/pull/9522) (jayzhan211) +- Systematic Configuration 
in 'Create External Table' and 'Copy To' Options [#9382](https://github.com/apache/arrow-datafusion/pull/9382) (metesynnada) +- Move trim functions (btrim, ltrim, rtrim) to datafusion_functions, make expr_fn API consistent [#9730](https://github.com/apache/arrow-datafusion/pull/9730) (Omega359) + +**Performance related:** + +- perf: improve to_field performance [#9722](https://github.com/apache/arrow-datafusion/pull/9722) (haohuaijin) + +**Implemented enhancements:** + +- feat: support for defining ARRAY columns in `CREATE TABLE` [#9381](https://github.com/apache/arrow-datafusion/pull/9381) (jonahgao) +- feat: support `unnest` in FROM clause [#9355](https://github.com/apache/arrow-datafusion/pull/9355) (jonahgao) +- feat: support nvl2 function [#9364](https://github.com/apache/arrow-datafusion/pull/9364) (guojidan) +- feat: issue #9224 substitute tilde in table path [#9259](https://github.com/apache/arrow-datafusion/pull/9259) (Lordworms) +- feat: replace std Instant with wasm-compatible wrapper [#9189](https://github.com/apache/arrow-datafusion/pull/9189) (waynexia) +- feat: support `unnest` with additional columns [#9400](https://github.com/apache/arrow-datafusion/pull/9400) (jonahgao) +- feat: Support `EscapedStringLiteral`, update sqlparser to `0.44.0` [#9268](https://github.com/apache/arrow-datafusion/pull/9268) (JasonLi-cn) +- feat: add support for fixed list wildcard in type signature [#9312](https://github.com/apache/arrow-datafusion/pull/9312) (universalmind303) +- feat: Add projection to HashJoinExec. 
[#9236](https://github.com/apache/arrow-datafusion/pull/9236) (my-vegetable-has-exploded) +- feat: function name hints for UDFs [#9407](https://github.com/apache/arrow-datafusion/pull/9407) (SteveLauC) +- feat: Introduce convert Expr to SQL string API and basic feature [#9517](https://github.com/apache/arrow-datafusion/pull/9517) (backkem) +- feat: implement more expr_to_sql functionality [#9578](https://github.com/apache/arrow-datafusion/pull/9578) (devinjdangelo) +- feat: implement aggregation and subquery plans to SQL [#9606](https://github.com/apache/arrow-datafusion/pull/9606) (devinjdangelo) +- feat: track memory usage for recursive CTE, enable recursive CTEs by default [#9619](https://github.com/apache/arrow-datafusion/pull/9619) (jonahgao) +- feat: Between expr to sql string [#9803](https://github.com/apache/arrow-datafusion/pull/9803) (sebastian2296) +- feat: Expose `array_empty` and `list_empty` functions as alias of `empty` function [#9807](https://github.com/apache/arrow-datafusion/pull/9807) (erenavsarogullari) +- feat: Not expr to string [#9802](https://github.com/apache/arrow-datafusion/pull/9802) (sebastian2296) +- feat: pass SessionState not SessionConfig to FunctionFactory::create [#9837](https://github.com/apache/arrow-datafusion/pull/9837) (tshauck) + +**Fixed bugs:** + +- fix: use `JoinSet` to make spawned tasks cancel-safe [#9318](https://github.com/apache/arrow-datafusion/pull/9318) (DDtKey) +- fix: nvl function's return type [#9357](https://github.com/apache/arrow-datafusion/pull/9357) (guojidan) +- fix: panic in isnan() when no args are given [#9377](https://github.com/apache/arrow-datafusion/pull/9377) (SteveLauC) +- fix: using test data sample for catalog example [#9372](https://github.com/apache/arrow-datafusion/pull/9372) (korowa) +- fix: sort_batch function unsupported mixed types with list [#9410](https://github.com/apache/arrow-datafusion/pull/9410) (JasonLi-cn) +- fix: casting to ARRAY types failed 
[#9441](https://github.com/apache/arrow-datafusion/pull/9441) (jonahgao) +- fix: reading from partitioned `json` & `arrow` tables [#9431](https://github.com/apache/arrow-datafusion/pull/9431) (korowa) +- fix: coalesce function should return correct data type [#9459](https://github.com/apache/arrow-datafusion/pull/9459) (viirya) +- fix: `generate_series` and `range` panic on edge cases [#9503](https://github.com/apache/arrow-datafusion/pull/9503) (jonahgao) +- fix: `substr_index` not handling negative occurrence correctly [#9475](https://github.com/apache/arrow-datafusion/pull/9475) (jonahgao) +- fix: support two argument TRIM [#9521](https://github.com/apache/arrow-datafusion/pull/9521) (tshauck) +- fix: incorrect null handling in `range` and `generate_series` [#9574](https://github.com/apache/arrow-datafusion/pull/9574) (jonahgao) +- fix: recursive cte hangs on joins [#9687](https://github.com/apache/arrow-datafusion/pull/9687) (jonahgao) +- fix: parallel parquet can underflow when max_record_batch_rows < execution.batch_size [#9737](https://github.com/apache/arrow-datafusion/pull/9737) (devinjdangelo) +- fix: change placeholder errors from Internal to Plan [#9745](https://github.com/apache/arrow-datafusion/pull/9745) (erratic-pattern) +- fix: ensure mutual compatibility of the two input schemas from recursive CTEs [#9795](https://github.com/apache/arrow-datafusion/pull/9795) (jonahgao) + +**Documentation updates:** + +- docs: put flatten in top fn list [#9376](https://github.com/apache/arrow-datafusion/pull/9376) (SteveLauC) +- Update documentation so list_to_string alias to point to array_to_string [#9374](https://github.com/apache/arrow-datafusion/pull/9374) (monkwire) +- Uplift keys/dependencies to use more workspace inheritance [#9293](https://github.com/apache/arrow-datafusion/pull/9293) (Jefffrey) +- docs: update contributor guide (migration to sqllogictest is done) [#9408](https://github.com/apache/arrow-datafusion/pull/9408) (SteveLauC) +- Move the 
to_timestamp\* functions to datafusion-functions [#9388](https://github.com/apache/arrow-datafusion/pull/9388) (Omega359) +- NEW Logo [#9385](https://github.com/apache/arrow-datafusion/pull/9385) (pinarbayata) +- Minor: docs: rm duplicate words. [#9449](https://github.com/apache/arrow-datafusion/pull/9449) (my-vegetable-has-exploded) +- Update contributor guide with updated scalar function howto [#9438](https://github.com/apache/arrow-datafusion/pull/9438) (Omega359) +- docs: fix extraneous char in array functions table of contents [#9560](https://github.com/apache/arrow-datafusion/pull/9560) (tshauck) +- doc: Add missing doc link [#9631](https://github.com/apache/arrow-datafusion/pull/9631) (Weijun-H) +- chore: remove repetitive word `the the` --> `the` in docs / comments [#9673](https://github.com/apache/arrow-datafusion/pull/9673) (InventiveCoder) +- Update example-usage.md to remove reference to simd and rust nightly. [#9677](https://github.com/apache/arrow-datafusion/pull/9677) (Omega359) +- Minor: Improve documentation for `LogicalPlan::expressions` [#9698](https://github.com/apache/arrow-datafusion/pull/9698) (alamb) +- Add Minimum Supported Rust Version policy to docs [#9681](https://github.com/apache/arrow-datafusion/pull/9681) (alamb) +- doc: Updated known users list and usage dependency description [#9718](https://github.com/apache/arrow-datafusion/pull/9718) (comphead) + +**Merged pull requests:** + +- refactor: Change `SchemaProvider::table` to return `Result` rather than `Option<..>` [#9307](https://github.com/apache/arrow-datafusion/pull/9307) (crepererum) +- fix write_partitioned_parquet_results test case bug [#9360](https://github.com/apache/arrow-datafusion/pull/9360) (guojidan) +- fix: use `JoinSet` to make spawned tasks cancel-safe [#9318](https://github.com/apache/arrow-datafusion/pull/9318) (DDtKey) +- Update nix requirement from 0.27.1 to 0.28.0 [#9344](https://github.com/apache/arrow-datafusion/pull/9344) (dependabot[bot]) +- Replace usages 
of internal_err with exec_err where appropriate [#9241](https://github.com/apache/arrow-datafusion/pull/9241) (Omega359) +- feat : Support for deregistering user defined functions [#9239](https://github.com/apache/arrow-datafusion/pull/9239) (mobley-trent) +- fix: nvl function's return type [#9357](https://github.com/apache/arrow-datafusion/pull/9357) (guojidan) +- refactor: move acos() to function crate [#9297](https://github.com/apache/arrow-datafusion/pull/9297) (SteveLauC) +- docs: put flatten in top fn list [#9376](https://github.com/apache/arrow-datafusion/pull/9376) (SteveLauC) +- Update documentation so list_to_string alias to point to array_to_string [#9374](https://github.com/apache/arrow-datafusion/pull/9374) (monkwire) +- feat: issue_9285: port builtin reg function into datafusion-function-\* (1/3 regexpmatch) [#9329](https://github.com/apache/arrow-datafusion/pull/9329) (Lordworms) +- Add test to verify issue #9161 [#9265](https://github.com/apache/arrow-datafusion/pull/9265) (jonahgao) +- refactor: fix error macros hygiene (always import `DataFusionError`) [#9366](https://github.com/apache/arrow-datafusion/pull/9366) (crepererum) +- feat: support for defining ARRAY columns in `CREATE TABLE` [#9381](https://github.com/apache/arrow-datafusion/pull/9381) (jonahgao) +- fix: panic in isnan() when no args are given [#9377](https://github.com/apache/arrow-datafusion/pull/9377) (SteveLauC) +- feat: support `unnest` in FROM clause [#9355](https://github.com/apache/arrow-datafusion/pull/9355) (jonahgao) +- feat: support nvl2 function [#9364](https://github.com/apache/arrow-datafusion/pull/9364) (guojidan) +- refactor: move asin() to function crate [#9379](https://github.com/apache/arrow-datafusion/pull/9379) (SteveLauC) +- fix: using test data sample for catalog example [#9372](https://github.com/apache/arrow-datafusion/pull/9372) (korowa) +- delete tail space, fix `error: unused import: DataFusionError` 
[#9386](https://github.com/apache/arrow-datafusion/pull/9386) (Tangruilin) +- Run cargo-fmt on `datafusion-functions/core` [#9367](https://github.com/apache/arrow-datafusion/pull/9367) (alamb) +- Cache common plan properties to eliminate recursive calls in physical plan [#9346](https://github.com/apache/arrow-datafusion/pull/9346) (mustafasrepo) +- Run cargo-fmt on all of `datafusion-functions` [#9390](https://github.com/apache/arrow-datafusion/pull/9390) (alamb) +- feat: issue #9224 substitute tilde in table path [#9259](https://github.com/apache/arrow-datafusion/pull/9259) (Lordworms) +- port range function and change gen_series logic [#9352](https://github.com/apache/arrow-datafusion/pull/9352) (Lordworms) +- [MINOR]: Generate physical plan, instead of logical plan in the bench test [#9383](https://github.com/apache/arrow-datafusion/pull/9383) (mustafasrepo) +- Add `to_date` function [#9019](https://github.com/apache/arrow-datafusion/pull/9019) (Tangruilin) +- Minor: clarify performance in docs for `ScalarUDF`, `ScalarUDAF` and `ScalarUDWF` [#9384](https://github.com/apache/arrow-datafusion/pull/9384) (alamb) +- feat: replace std Instant with wasm-compatible wrapper [#9189](https://github.com/apache/arrow-datafusion/pull/9189) (waynexia) +- Uplift keys/dependencies to use more workspace inheritance [#9293](https://github.com/apache/arrow-datafusion/pull/9293) (Jefffrey) +- Improve documentation for ExecutionPlanProperties, use consistent field name [#9389](https://github.com/apache/arrow-datafusion/pull/9389) (alamb) +- Doc: Workaround for Running cargo test locally without significant memory [#9402](https://github.com/apache/arrow-datafusion/pull/9402) (devinjdangelo) +- feat: support `unnest` with additional columns [#9400](https://github.com/apache/arrow-datafusion/pull/9400) (jonahgao) +- Minor: improve the display name of `unnest` expressions [#9412](https://github.com/apache/arrow-datafusion/pull/9412) (jonahgao) +- Minor: Move function signature check to 
planning stage [#9401](https://github.com/apache/arrow-datafusion/pull/9401) (2010YOUY01) +- chore(deps): update substrait requirement from 0.24.0 to 0.25.1 [#9406](https://github.com/apache/arrow-datafusion/pull/9406) (dependabot[bot]) +- docs: update contributor guide (migration to sqllogictest is done) [#9408](https://github.com/apache/arrow-datafusion/pull/9408) (SteveLauC) +- Move the to_timestamp\* functions to datafusion-functions [#9388](https://github.com/apache/arrow-datafusion/pull/9388) (Omega359) +- Minor: Support LargeList List Range indexing and fix large list handling in ConstEvaluator [#9393](https://github.com/apache/arrow-datafusion/pull/9393) (jayzhan211) +- NEW Logo [#9385](https://github.com/apache/arrow-datafusion/pull/9385) (pinarbayata) +- Handle serde for ScalarUDF [#9395](https://github.com/apache/arrow-datafusion/pull/9395) (yyy1000) +- Minor: Add tests with `sqrt` with negative argument [#9426](https://github.com/apache/arrow-datafusion/pull/9426) (caicancai) +- Move SpawnedTask from datafusion_physical_plan to new `datafusion_common_runtime` crate [#9414](https://github.com/apache/arrow-datafusion/pull/9414) (mustafasrepo) +- Re-export datafusion-functions-array [#9433](https://github.com/apache/arrow-datafusion/pull/9433) (andygrove) +- Minor: Support LargeList for ListIndex [#9424](https://github.com/apache/arrow-datafusion/pull/9424) (PsiACE) +- move ArrayDims, ArrayNdims and Cardinality to datafusion-function-crate [#9425](https://github.com/apache/arrow-datafusion/pull/9425) (Weijun-H) +- refactor: make instr() an alias of strpos() [#9396](https://github.com/apache/arrow-datafusion/pull/9396) (SteveLauC) +- Add test case for invalid tz in timestamp literal [#9429](https://github.com/apache/arrow-datafusion/pull/9429) (MohamedAbdeen21) +- Minor: simplify call [#9434](https://github.com/apache/arrow-datafusion/pull/9434) (alamb) +- Support IGNORE NULLS for LEAD window function 
[#9419](https://github.com/apache/arrow-datafusion/pull/9419) (comphead) +- fix sqllogicaltest result [#9444](https://github.com/apache/arrow-datafusion/pull/9444) (jackwener) +- Minor: docs: rm duplicate words. [#9449](https://github.com/apache/arrow-datafusion/pull/9449) (my-vegetable-has-exploded) +- minor: fix cargo clippy some warning [#9442](https://github.com/apache/arrow-datafusion/pull/9442) (jackwener) +- port regexp_like function and port related tests [#9397](https://github.com/apache/arrow-datafusion/pull/9397) (Lordworms) +- fix: sort_batch function unsupported mixed types with list [#9410](https://github.com/apache/arrow-datafusion/pull/9410) (JasonLi-cn) +- refactor: add `join_unwind` to `SpawnedTask` [#9422](https://github.com/apache/arrow-datafusion/pull/9422) (DDtKey) +- Ignore null LEAD support for small batch sizes. [#9445](https://github.com/apache/arrow-datafusion/pull/9445) (mustafasrepo) +- fix: casting to ARRAY types failed [#9441](https://github.com/apache/arrow-datafusion/pull/9441) (jonahgao) +- fix: reading from partitioned `json` & `arrow` tables [#9431](https://github.com/apache/arrow-datafusion/pull/9431) (korowa) +- feat: Support `EscapedStringLiteral`, update sqlparser to `0.44.0` [#9268](https://github.com/apache/arrow-datafusion/pull/9268) (JasonLi-cn) +- Minor: fix LEAD test description [#9451](https://github.com/apache/arrow-datafusion/pull/9451) (comphead) +- Consolidate `TreeNode` transform and rewrite APIs [#8891](https://github.com/apache/arrow-datafusion/pull/8891) (peter-toth) +- Support `Date32` arguments for `generate_series` [#9420](https://github.com/apache/arrow-datafusion/pull/9420) (Lordworms) +- Minor: change doc for range [#9455](https://github.com/apache/arrow-datafusion/pull/9455) (Lordworms) +- doc: add missing function index in scalar_expression.md [#9462](https://github.com/apache/arrow-datafusion/pull/9462) (Weijun-H) +- build: Update bigdecimal version in `Cargo.toml` 
[#9471](https://github.com/apache/arrow-datafusion/pull/9471) (comphead) +- chore(deps): update base64 requirement from 0.21 to 0.22 [#9446](https://github.com/apache/arrow-datafusion/pull/9446) (dependabot[bot]) +- Port regexp_replace functions and related tests [#9454](https://github.com/apache/arrow-datafusion/pull/9454) (Lordworms) +- Update contributor guide with updated scalar function howto [#9438](https://github.com/apache/arrow-datafusion/pull/9438) (Omega359) +- feat: add support for fixed list wildcard in type signature [#9312](https://github.com/apache/arrow-datafusion/pull/9312) (universalmind303) +- Add a `ScalarUDFImpl::simplify()` API, move `SimplifyInfo` et al to datafusion_expr [#9304](https://github.com/apache/arrow-datafusion/pull/9304) (jayzhan211) +- Implement IGNORE NULLS for FIRST_VALUE [#9411](https://github.com/apache/arrow-datafusion/pull/9411) (huaxingao) +- Add pluggable handler for `CREATE FUNCTION` [#9333](https://github.com/apache/arrow-datafusion/pull/9333) (milenkovicm) +- Enable configurable display of partition sizes in the explain statement [#9474](https://github.com/apache/arrow-datafusion/pull/9474) (jayzhan211) +- Reduce casts for LEAD/LAG [#9468](https://github.com/apache/arrow-datafusion/pull/9468) (comphead) +- [CI build] fix chrono suggestions [#9486](https://github.com/apache/arrow-datafusion/pull/9486) (comphead) +- Make regex dependency optional in datafusion-functions, add CI checks for function packages [#9473](https://github.com/apache/arrow-datafusion/pull/9473) (alamb) +- fix: coalesce function should return correct data type [#9459](https://github.com/apache/arrow-datafusion/pull/9459) (viirya) +- LEAD/LAG calculate default value once [#9485](https://github.com/apache/arrow-datafusion/pull/9485) (comphead) +- chore: simplify the return type of `validate_data_types()` [#9491](https://github.com/apache/arrow-datafusion/pull/9491) (waynexia) +- minor: use arrow-rs casting from Float to Timestamp 
[#9500](https://github.com/apache/arrow-datafusion/pull/9500) (comphead) +- chore(deps): update substrait requirement from 0.25.1 to 0.27.0 [#9502](https://github.com/apache/arrow-datafusion/pull/9502) (dependabot[bot]) +- fix: `generate_series` and `range` panic on edge cases [#9503](https://github.com/apache/arrow-datafusion/pull/9503) (jonahgao) +- Fix nondeterministic behaviour of schema nullability of lag window query [#9508](https://github.com/apache/arrow-datafusion/pull/9508) (mustafasrepo) +- Add `to_unixtime` function [#9077](https://github.com/apache/arrow-datafusion/pull/9077) (Tangruilin) +- Minor: fixed transformed state in UDF Simplify [#9484](https://github.com/apache/arrow-datafusion/pull/9484) (alamb) +- test: port strpos test in physical_expr/src/functions to sqllogictest [#9439](https://github.com/apache/arrow-datafusion/pull/9439) (SteveLauC) +- Port ArrayHas family to `functions-array` [#9496](https://github.com/apache/arrow-datafusion/pull/9496) (jayzhan211) +- port array_empty and array_length to datafusion-function-array crate [#9510](https://github.com/apache/arrow-datafusion/pull/9510) (Weijun-H) +- fix: `substr_index` not handling negative occurrence correctly [#9475](https://github.com/apache/arrow-datafusion/pull/9475) (jonahgao) +- [minor] extract collect file statistics method and add doc [#9490](https://github.com/apache/arrow-datafusion/pull/9490) (Ted-Jiang) +- test: sqllogictests for multiple tables join [#9480](https://github.com/apache/arrow-datafusion/pull/9480) (korowa) +- Add support for ignore nulls for LEAD, LAG in WindowAggExec [#9498](https://github.com/apache/arrow-datafusion/pull/9498) (Lordworms) +- Minor: Improve log expr description [#9516](https://github.com/apache/arrow-datafusion/pull/9516) (caicancai) +- port flatten to datafusion-function-array [#9523](https://github.com/apache/arrow-datafusion/pull/9523) (Weijun-H) +- feat: Add projection to HashJoinExec. 
[#9236](https://github.com/apache/arrow-datafusion/pull/9236) (my-vegetable-has-exploded) +- Add example for `FunctionFactory` [#9482](https://github.com/apache/arrow-datafusion/pull/9482) (milenkovicm) +- Move date_part, date_trunc, date_bin functions to datafusion-functions [#9435](https://github.com/apache/arrow-datafusion/pull/9435) (Omega359) +- fix: support two argument TRIM [#9521](https://github.com/apache/arrow-datafusion/pull/9521) (tshauck) +- Remove physical expr of ListIndex and ListRange, convert to `array_element` and `array_slice` functions [#9492](https://github.com/apache/arrow-datafusion/pull/9492) (jayzhan211) +- feat: function name hints for UDFs [#9407](https://github.com/apache/arrow-datafusion/pull/9407) (SteveLauC) +- Minor: Improve documentation for registering `AnalyzerRule` [#9520](https://github.com/apache/arrow-datafusion/pull/9520) (alamb) +- Extend argument types for udf `return_type_from_exprs` [#9522](https://github.com/apache/arrow-datafusion/pull/9522) (jayzhan211) +- move make_array array_append array_prepend array_concat function to datafusion-functions-array crate [#9504](https://github.com/apache/arrow-datafusion/pull/9504) (guojidan) +- Port `StringToArray` to `function-arrays` subcrate [#9543](https://github.com/apache/arrow-datafusion/pull/9543) (erenavsarogullari) +- Minor: remove `..` pattern matching in sql planner [#9531](https://github.com/apache/arrow-datafusion/pull/9531) (alamb) +- Minor: Fix document Interval syntax [#9542](https://github.com/apache/arrow-datafusion/pull/9542) (yyy1000) +- Port `struct` to datafusion-functions [#9546](https://github.com/apache/arrow-datafusion/pull/9546) (yyy1000) +- UDAF and UDWF support aliases [#9489](https://github.com/apache/arrow-datafusion/pull/9489) (lewiszlw) +- docs: fix extraneous char in array functions table of contents [#9560](https://github.com/apache/arrow-datafusion/pull/9560) (tshauck) +- [MINOR]: Fix undeterministic test 
[#9559](https://github.com/apache/arrow-datafusion/pull/9559) (mustafasrepo) +- Port `arrow_typeof` to datafusion-function [#9524](https://github.com/apache/arrow-datafusion/pull/9524) (yyy1000) +- feat: Introduce convert Expr to SQL string API and basic feature [#9517](https://github.com/apache/arrow-datafusion/pull/9517) (backkem) +- Port `ArraySort` to `function-arrays` subcrate [#9551](https://github.com/apache/arrow-datafusion/pull/9551) (erenavsarogullari) +- refactor: unify some plan optimization in CommonSubexprEliminate [#9556](https://github.com/apache/arrow-datafusion/pull/9556) (jackwener) +- Port `ArrayDistinct` to `functions-array` subcrate [#9549](https://github.com/apache/arrow-datafusion/pull/9549) (erenavsarogullari) +- Minor: add a sql_planner benchmarks to reflecte select many field on a huge table [#9536](https://github.com/apache/arrow-datafusion/pull/9536) (haohuaijin) +- Support IGNORE NULLS for FIRST/LAST window function [#9470](https://github.com/apache/arrow-datafusion/pull/9470) (huaxingao) +- Systematic Configuration in 'Create External Table' and 'Copy To' Options [#9382](https://github.com/apache/arrow-datafusion/pull/9382) (metesynnada) +- fix: incorrect null handling in `range` and `generate_series` [#9574](https://github.com/apache/arrow-datafusion/pull/9574) (jonahgao) +- Update README.md [#9572](https://github.com/apache/arrow-datafusion/pull/9572) (Abdullahsab3) +- Port tan, tanh to datafusion-functions [#9535](https://github.com/apache/arrow-datafusion/pull/9535) (ongchi) +- feat(9493): provide access to FileMetaData for files written with ParquetSink [#9548](https://github.com/apache/arrow-datafusion/pull/9548) (wiedld) +- Export datafusion-functions UDFs publically [#9585](https://github.com/apache/arrow-datafusion/pull/9585) (alamb) +- Update the comment and Add a check [#9571](https://github.com/apache/arrow-datafusion/pull/9571) (colommar) +- Port `ArrayRepeat` to `functions-array` subcrate 
[#9568](https://github.com/apache/arrow-datafusion/pull/9568) (erenavsarogullari) +- Fix ApproxPercentileAccumulator on zero values [#9582](https://github.com/apache/arrow-datafusion/pull/9582) (Dandandan) +- Add `FunctionRewrite` API, Move Array specific rewrites to `datafusion_functions_array` [#9583](https://github.com/apache/arrow-datafusion/pull/9583) (alamb) +- Move from_unixtime, now, current_date, current_time functions to datafusion-functions [#9537](https://github.com/apache/arrow-datafusion/pull/9537) (Omega359) +- minor: update Debug trait impl for WindowsFrame [#9587](https://github.com/apache/arrow-datafusion/pull/9587) (comphead) +- Initial support LogicalPlan to SQL String [#9596](https://github.com/apache/arrow-datafusion/pull/9596) (backkem) +- refactor: use a common macro to define math UDFs [#9598](https://github.com/apache/arrow-datafusion/pull/9598) (jonahgao) +- Move all `crypto` related functions to `datafusion-functions` [#9590](https://github.com/apache/arrow-datafusion/pull/9590) (Lordworms) +- Remove physical expr of NamedStructField, convert to `get_field` function call [#9563](https://github.com/apache/arrow-datafusion/pull/9563) (yyy1000) +- Add `/benchmark` github command to comparison benchmark between base and pr commit [#9461](https://github.com/apache/arrow-datafusion/pull/9461) (gruuya) +- support unnest as subexpression [#9592](https://github.com/apache/arrow-datafusion/pull/9592) (YjyJeff) +- feat: implement more expr_to_sql functionality [#9578](https://github.com/apache/arrow-datafusion/pull/9578) (devinjdangelo) +- Port `ArrayResize` to `functions-array` subcrate [#9570](https://github.com/apache/arrow-datafusion/pull/9570) (erenavsarogullari) +- Move make_date, to_char to datafusion-functions [#9601](https://github.com/apache/arrow-datafusion/pull/9601) (Omega359) +- Fix to_timestamp benchmark [#9608](https://github.com/apache/arrow-datafusion/pull/9608) (Omega359) +- feat: implement aggregation and subquery plans to SQL 
[#9606](https://github.com/apache/arrow-datafusion/pull/9606) (devinjdangelo) +- Port ArrayElem/Slice/PopFront/Back into `functions-array` [#9615](https://github.com/apache/arrow-datafusion/pull/9615) (jayzhan211) +- Minor: Remove datafusion-functions-array dependency from datafusion-optimizer [#9621](https://github.com/apache/arrow-datafusion/pull/9621) (alamb) +- Enable TTY during bench data generation [#9626](https://github.com/apache/arrow-datafusion/pull/9626) (gruuya) +- Remove constant expressions from SortExprs in the SortExec [#9618](https://github.com/apache/arrow-datafusion/pull/9618) (mustafasrepo) +- Try fixing missing results name in the benchmark step [#9632](https://github.com/apache/arrow-datafusion/pull/9632) (gruuya) +- feat: track memory usage for recursive CTE, enable recursive CTEs by default [#9619](https://github.com/apache/arrow-datafusion/pull/9619) (jonahgao) +- doc: Add missing doc link [#9631](https://github.com/apache/arrow-datafusion/pull/9631) (Weijun-H) +- Add explicit move of PR bench results if they were placed in HEAD dir [#9636](https://github.com/apache/arrow-datafusion/pull/9636) (gruuya) +- Add `array_reverse` function to datafusion-function-\* crate [#9630](https://github.com/apache/arrow-datafusion/pull/9630) (Weijun-H) +- Move parts of `InListSimplifier` simplify rules to `Simplifier` [#9628](https://github.com/apache/arrow-datafusion/pull/9628) (jayzhan211) +- Port Array Union and Intersect to `functions-array` [#9629](https://github.com/apache/arrow-datafusion/pull/9629) (jayzhan211) +- Port `ArrayPosition` and `ArrayPositions` to `functions-array` subcrate [#9617](https://github.com/apache/arrow-datafusion/pull/9617) (erenavsarogullari) +- Optimize make_date (#9089) [#9600](https://github.com/apache/arrow-datafusion/pull/9600) (vojtechtoman) +- Support AT TIME ZONE clause [#9647](https://github.com/apache/arrow-datafusion/pull/9647) (tinfoil-knight) +- Window Linear Mode use smaller buffers 
[#9597](https://github.com/apache/arrow-datafusion/pull/9597) (mustafasrepo) +- Port `ArrayExcept` to `functions-array` subcrate [#9634](https://github.com/apache/arrow-datafusion/pull/9634) (erenavsarogullari) +- chore: improve array expression doc and clean up array_expression.rs [#9650](https://github.com/apache/arrow-datafusion/pull/9650) (Weijun-H) +- Minor: remove clone in `exprlist_to_fields` [#9657](https://github.com/apache/arrow-datafusion/pull/9657) (jayzhan211) +- Port `ArrayRemove`, `ArrayRemoveN`, `ArrayRemoveAll` to `functions-array` subcrate [#9656](https://github.com/apache/arrow-datafusion/pull/9656) (erenavsarogullari) +- Minor: Remove redundant dependencies from `datafusion-functions/Cargo.toml` [#9622](https://github.com/apache/arrow-datafusion/pull/9622) (alamb) +- Support IGNORE NULLS for NTH_VALUE window function [#9625](https://github.com/apache/arrow-datafusion/pull/9625) (huaxingao) +- Improve Robustness of Unparser Testing and Implementation [#9623](https://github.com/apache/arrow-datafusion/pull/9623) (devinjdangelo) +- Adding Constant Check for FilterExec [#9649](https://github.com/apache/arrow-datafusion/pull/9649) (Lordworms) +- chore(deps-dev): bump follow-redirects from 1.15.4 to 1.15.6 in /datafusion/wasmtest/datafusion-wasm-app [#9609](https://github.com/apache/arrow-datafusion/pull/9609) (dependabot[bot]) +- move array_replace family functions to datafusion-function-array crate [#9651](https://github.com/apache/arrow-datafusion/pull/9651) (Weijun-H) +- chore: remove repetitive word `the the` --> `the` in docs / comments [#9673](https://github.com/apache/arrow-datafusion/pull/9673) (InventiveCoder) +- Update example-usage.md to remove reference to simd and rust nightly. 
[#9677](https://github.com/apache/arrow-datafusion/pull/9677) (Omega359) +- [MINOR]: Remove some `.unwrap`s from nth_value.rs file [#9674](https://github.com/apache/arrow-datafusion/pull/9674) (mustafasrepo) +- minor: Remove deprecated methods [#9627](https://github.com/apache/arrow-datafusion/pull/9627) (comphead) +- Migrate `arrow_cast` to a UDF [#9610](https://github.com/apache/arrow-datafusion/pull/9610) (alamb) +- parquet: Add row*groups_matched*{statistics,bloom_filter} statistics [#9640](https://github.com/apache/arrow-datafusion/pull/9640) (progval) +- Make COPY TO align with CREATE EXTERNAL TABLE [#9604](https://github.com/apache/arrow-datafusion/pull/9604) (metesynnada) +- Support "A column is known to be entirely NULL" in `PruningPredicate` [#9223](https://github.com/apache/arrow-datafusion/pull/9223) (appletreeisyellow) +- Suppress self update for windows CI runner [#9661](https://github.com/apache/arrow-datafusion/pull/9661) (jayzhan211) +- add schema to SQL ast builder [#9624](https://github.com/apache/arrow-datafusion/pull/9624) (sardination) +- core/tests/parquet/row_group_pruning.rs: Add tests for strings [#9642](https://github.com/apache/arrow-datafusion/pull/9642) (progval) +- Fix incorrect results with multiple `COUNT(DISTINCT..)` aggregates on dictionaries [#9679](https://github.com/apache/arrow-datafusion/pull/9679) (alamb) +- parquet: Add support for Bloom filters on binary columns [#9644](https://github.com/apache/arrow-datafusion/pull/9644) (progval) +- Update Arrow/Parquet to `51.0.0`, tonic to `0.11` [#9613](https://github.com/apache/arrow-datafusion/pull/9613) (tustvold) +- Move inlist rule to expr_simplifier [#9692](https://github.com/apache/arrow-datafusion/pull/9692) (jayzhan211) +- Support Serde for ScalarUDF in Physical Expressions [#9436](https://github.com/apache/arrow-datafusion/pull/9436) (yyy1000) +- Support Union types in `ScalarValue` [#9683](https://github.com/apache/arrow-datafusion/pull/9683) (avantgardnerio) +- parquet: 
Add support for row group pruning on FixedSizeBinary [#9646](https://github.com/apache/arrow-datafusion/pull/9646) (progval) +- Minor: Improve documentation for `LogicalPlan::expressions` [#9698](https://github.com/apache/arrow-datafusion/pull/9698) (alamb) +- Make builtin window function output datatype to be derived from schema [#9686](https://github.com/apache/arrow-datafusion/pull/9686) (comphead) +- refactor: Extract `array_to_string` and `string_to_array` from `functions-array` subcrate' s `kernels` and `udf` containers [#9704](https://github.com/apache/arrow-datafusion/pull/9704) (erenavsarogullari) +- Add Minimum Supported Rust Version policy to docs [#9681](https://github.com/apache/arrow-datafusion/pull/9681) (alamb) +- doc: Add DataFusion profiling documentation for MacOS [#9711](https://github.com/apache/arrow-datafusion/pull/9711) (comphead) +- Minor: add ticket reference to commented out test [#9715](https://github.com/apache/arrow-datafusion/pull/9715) (alamb) +- Minor: Rename path from `common_runtime` to `common-runtime` [#9717](https://github.com/apache/arrow-datafusion/pull/9717) (alamb) +- Use object_store:BufWriter to replace put_multipart [#9648](https://github.com/apache/arrow-datafusion/pull/9648) (yyy1000) +- Fix COPY TO failing on passing format options through CLI [#9709](https://github.com/apache/arrow-datafusion/pull/9709) (tinfoil-knight) +- fix: recursive cte hangs on joins [#9687](https://github.com/apache/arrow-datafusion/pull/9687) (jonahgao) +- Move `starts_with`, `to_hex`,` trim`, `upper` to datafusion-functions (and add string_expressions) [#9541](https://github.com/apache/arrow-datafusion/pull/9541) (Tangruilin) +- Support for `extract(x from time)` / `date_part` from time types [#8693](https://github.com/apache/arrow-datafusion/pull/8693) (Jefffrey) +- doc: Updated known users list and usage dependency description [#9718](https://github.com/apache/arrow-datafusion/pull/9718) (comphead) +- Minor: improve documentation for 
`CommonSubexprEliminate` [#9700](https://github.com/apache/arrow-datafusion/pull/9700) (alamb) +- build: modify code to comply with latest clippy requirement [#9725](https://github.com/apache/arrow-datafusion/pull/9725) (comphead) +- Minor: return internal error rather than panic on unexpected error in COUNT DISTINCT [#9712](https://github.com/apache/arrow-datafusion/pull/9712) (alamb) +- fix(9678): short circuiting prevented population of visited stack, for common subexpr elimination optimization [#9685](https://github.com/apache/arrow-datafusion/pull/9685) (wiedld) +- perf: improve to_field performance [#9722](https://github.com/apache/arrow-datafusion/pull/9722) (haohuaijin) +- Minor: Run ScalarValue size test on aarch again [#9728](https://github.com/apache/arrow-datafusion/pull/9728) (alamb) +- Move trim functions (btrim, ltrim, rtrim) to datafusion_functions, make expr_fn API consistent [#9730](https://github.com/apache/arrow-datafusion/pull/9730) (Omega359) +- make format prefix optional for format options in COPY [#9723](https://github.com/apache/arrow-datafusion/pull/9723) (tinfoil-knight) +- refactor: Extract `range` and `gen_series` functions from `functions-array` subcrate' s `kernels` and `udf` containers [#9720](https://github.com/apache/arrow-datafusion/pull/9720) (erenavsarogullari) +- Move ascii function to datafusion_functions [#9740](https://github.com/apache/arrow-datafusion/pull/9740) (PsiACE) +- adding expr to string for IsNotNull IsTrue IsFalse and IsUnkown [#9739](https://github.com/apache/arrow-datafusion/pull/9739) (Lordworms) +- fix: parallel parquet can underflow when max_record_batch_rows < execution.batch_size [#9737](https://github.com/apache/arrow-datafusion/pull/9737) (devinjdangelo) +- support format in options of COPY command [#9744](https://github.com/apache/arrow-datafusion/pull/9744) (tinfoil-knight) +- Move lower, octet_length to datafusion-functions [#9747](https://github.com/apache/arrow-datafusion/pull/9747) (Omega359) +- 
Fixed missing trim() in rust api [#9749](https://github.com/apache/arrow-datafusion/pull/9749) (Omega359) +- refactor: Extract `array_length`, `array_reverse` and `array_sort` functions from `functions-array` subcrate' s `kernels` and `udf` containers [#9751](https://github.com/apache/arrow-datafusion/pull/9751) (erenavsarogullari) +- refactor: Extract `array_empty` and `array_repeat` functions from `functions-array` subcrate' s `kernels` and `udf` containers [#9762](https://github.com/apache/arrow-datafusion/pull/9762) (erenavsarogullari) +- Minor: remove an outdated TODO in `TypeCoercion` [#9752](https://github.com/apache/arrow-datafusion/pull/9752) (jonahgao) +- refactor: Extract `array_resize` and `cardinality` functions from `functions-array` subcrate' s `kernels` and `udf` containers [#9766](https://github.com/apache/arrow-datafusion/pull/9766) (erenavsarogullari) +- fix: change placeholder errors from Internal to Plan [#9745](https://github.com/apache/arrow-datafusion/pull/9745) (erratic-pattern) +- Move levenshtein, uuid, overlay to datafusion-functions [#9760](https://github.com/apache/arrow-datafusion/pull/9760) (Omega359) +- improve null handling for to_char [#9689](https://github.com/apache/arrow-datafusion/pull/9689) (tinfoil-knight) +- Add Expr->String for ScalarFunction and InList [#9759](https://github.com/apache/arrow-datafusion/pull/9759) (yyy1000) +- Move repeat, replace, split_part to datafusion_functions [#9784](https://github.com/apache/arrow-datafusion/pull/9784) (Omega359) +- refactor: Extract `array_dims`, `array_ndims` and `flatten` functions from `functions-array` subcrate' s `kernels` and `udf` containers [#9786](https://github.com/apache/arrow-datafusion/pull/9786) (erenavsarogullari) +- Minor: Improve documentation about `ColumnarValues::values_to_array` [#9774](https://github.com/apache/arrow-datafusion/pull/9774) (alamb) +- Fix panic in `struct` function with mixed scalar/array arguments 
[#9775](https://github.com/apache/arrow-datafusion/pull/9775) (alamb) +- refactor: Apply minor refactorings to `functions-array` crate [#9788](https://github.com/apache/arrow-datafusion/pull/9788) (erenavsarogullari) +- Move bit_length and chr functions to datafusion_functions [#9782](https://github.com/apache/arrow-datafusion/pull/9782) (PsiACE) +- Support tencent cloud COS storage in `datafusion-cli` [#9734](https://github.com/apache/arrow-datafusion/pull/9734) (harveyyue) +- Make it easier to register configuration extension ... [#9781](https://github.com/apache/arrow-datafusion/pull/9781) (milenkovicm) +- Expr to Sql : Case [#9798](https://github.com/apache/arrow-datafusion/pull/9798) (yyy1000) +- feat: Between expr to sql string [#9803](https://github.com/apache/arrow-datafusion/pull/9803) (sebastian2296) +- feat: Expose `array_empty` and `list_empty` functions as alias of `empty` function [#9807](https://github.com/apache/arrow-datafusion/pull/9807) (erenavsarogullari) +- Support Expr `Like` to sql [#9805](https://github.com/apache/arrow-datafusion/pull/9805) (Weijun-H) +- feat: Not expr to string [#9802](https://github.com/apache/arrow-datafusion/pull/9802) (sebastian2296) +- [Minor]: Move some repetitive codes to functions(proto) [#9811](https://github.com/apache/arrow-datafusion/pull/9811) (mustafasrepo) +- Implement IGNORE NULLS for LAST_VALUE [#9801](https://github.com/apache/arrow-datafusion/pull/9801) (huaxingao) +- [MINOR]: Move some repetitive codes to functions [#9810](https://github.com/apache/arrow-datafusion/pull/9810) (mustafasrepo) +- fix: ensure mutual compatibility of the two input schemas from recursive CTEs [#9795](https://github.com/apache/arrow-datafusion/pull/9795) (jonahgao) +- Add support for constant expression evaluation in limit [#9790](https://github.com/apache/arrow-datafusion/pull/9790) (mustafasrepo) +- Projection Pushdown through user defined LogicalPlan nodes. 
[#9690](https://github.com/apache/arrow-datafusion/pull/9690) (mustafasrepo) +- chore(deps): update substrait requirement from 0.27.0 to 0.28.0 [#9809](https://github.com/apache/arrow-datafusion/pull/9809) (dependabot[bot]) +- Run TPC-H SF10 during PR benchmarks [#9822](https://github.com/apache/arrow-datafusion/pull/9822) (gruuya) +- Expose `parser` on DFParser to enable user controlled parsing [#9729](https://github.com/apache/arrow-datafusion/pull/9729) (tshauck) +- Disable parallel reading for gziped ndjson file [#9799](https://github.com/apache/arrow-datafusion/pull/9799) (Lordworms) +- Optimize to_timestamp (with format) (#9090) [#9833](https://github.com/apache/arrow-datafusion/pull/9833) (vojtechtoman) +- Create unicode module in datafusion/functions/src/unicode and unicode_expressions feature flag, move char_length function [#9825](https://github.com/apache/arrow-datafusion/pull/9825) (Omega359) +- [Minor] Update TCPDS tests, remove some #[ignore]d tests [#9829](https://github.com/apache/arrow-datafusion/pull/9829) (Dandandan) +- doc: Adding baseline benchmark example [#9827](https://github.com/apache/arrow-datafusion/pull/9827) (comphead) +- Add name method to execution plan [#9793](https://github.com/apache/arrow-datafusion/pull/9793) (matthewmturner) +- chore(deps-dev): bump express from 4.18.2 to 4.19.2 in /datafusion/wasmtest/datafusion-wasm-app [#9826](https://github.com/apache/arrow-datafusion/pull/9826) (dependabot[bot]) +- feat: pass SessionState not SessionConfig to FunctionFactory::create [#9837](https://github.com/apache/arrow-datafusion/pull/9837) (tshauck) diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index 492be93caf0c..a95f2f802dfb 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -64,7 +64,7 @@ Environment variables are read during `SessionConfig` initialisation so they mus | datafusion.execution.parquet.statistics_enabled | NULL | Sets if statistics are enabled 
for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. If NULL, uses default parquet writer setting | | datafusion.execution.parquet.max_statistics_size | NULL | Sets max statistics size for any column. If NULL, uses default parquet writer setting | | datafusion.execution.parquet.max_row_group_size | 1048576 | Target maximum number of rows in each row group (defaults to 1M rows). Writing larger row groups requires more memory to write, but can get better compression and be faster to read. | -| datafusion.execution.parquet.created_by | datafusion version 36.0.0 | Sets "created by" property | +| datafusion.execution.parquet.created_by | datafusion version 37.0.0 | Sets "created by" property | | datafusion.execution.parquet.column_index_truncate_length | NULL | Sets column index truncate length | | datafusion.execution.parquet.data_page_row_count_limit | 18446744073709551615 | Sets best effort maximum number of rows in data page | | datafusion.execution.parquet.encoding | NULL | Sets default encoding for any column Valid values are: plain, plain_dictionary, rle, bit_packed, delta_binary_packed, delta_length_byte_array, delta_byte_array, rle_dictionary, and byte_stream_split. These values are not case sensitive. If NULL, uses default parquet writer setting | From 09f5a544d25f36ff1d65cc377123aee9b0e8f538 Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Thu, 28 Mar 2024 22:56:15 -0400 Subject: [PATCH 3/3] move Left, Lpad, Reverse, Right, Rpad functions to datafusion_functions (#9841) * Fix to_timestamp benchmark * Remove reference to simd and nightly build as simd is no longer an available feature in DataFusion and building with nightly may not be a good recommendation when getting started. * Fixed missing trim() function. 
* Create unicode module in datafusion/functions/src/unicode and unicode_expressions feature flag, move char_length function * move Left, Lpad, Reverse, Right, Rpad functions to datafusion_functions * Code cleanup from PR review. --- datafusion/expr/src/built_in_function.rs | 50 +- datafusion/expr/src/expr_fn.rs | 21 - datafusion/functions/src/unicode/left.rs | 236 +++++++ datafusion/functions/src/unicode/lpad.rs | 369 +++++++++++ datafusion/functions/src/unicode/mod.rs | 44 +- datafusion/functions/src/unicode/reverse.rs | 149 +++++ datafusion/functions/src/unicode/right.rs | 238 +++++++ datafusion/functions/src/unicode/rpad.rs | 361 +++++++++++ datafusion/physical-expr/src/functions.rs | 606 ------------------ datafusion/physical-expr/src/planner.rs | 4 +- .../physical-expr/src/unicode_expressions.rs | 263 +------- datafusion/proto/proto/datafusion.proto | 10 +- datafusion/proto/src/generated/pbjson.rs | 15 - datafusion/proto/src/generated/prost.rs | 20 +- .../proto/src/logical_plan/from_proto.rs | 53 +- datafusion/proto/src/logical_plan/to_proto.rs | 5 - 16 files changed, 1428 insertions(+), 1016 deletions(-) create mode 100644 datafusion/functions/src/unicode/left.rs create mode 100644 datafusion/functions/src/unicode/lpad.rs create mode 100644 datafusion/functions/src/unicode/reverse.rs create mode 100644 datafusion/functions/src/unicode/right.rs create mode 100644 datafusion/functions/src/unicode/rpad.rs diff --git a/datafusion/expr/src/built_in_function.rs b/datafusion/expr/src/built_in_function.rs index eefbc131a27b..196d278dc70e 100644 --- a/datafusion/expr/src/built_in_function.rs +++ b/datafusion/expr/src/built_in_function.rs @@ -111,18 +111,8 @@ pub enum BuiltinScalarFunction { EndsWith, /// initcap InitCap, - /// left - Left, - /// lpad - Lpad, /// random Random, - /// reverse - Reverse, - /// right - Right, - /// rpad - Rpad, /// strpos Strpos, /// substr @@ -220,12 +210,7 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::ConcatWithSeparator => 
Volatility::Immutable, BuiltinScalarFunction::EndsWith => Volatility::Immutable, BuiltinScalarFunction::InitCap => Volatility::Immutable, - BuiltinScalarFunction::Left => Volatility::Immutable, - BuiltinScalarFunction::Lpad => Volatility::Immutable, BuiltinScalarFunction::Radians => Volatility::Immutable, - BuiltinScalarFunction::Reverse => Volatility::Immutable, - BuiltinScalarFunction::Right => Volatility::Immutable, - BuiltinScalarFunction::Rpad => Volatility::Immutable, BuiltinScalarFunction::Strpos => Volatility::Immutable, BuiltinScalarFunction::Substr => Volatility::Immutable, BuiltinScalarFunction::Translate => Volatility::Immutable, @@ -264,17 +249,8 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::InitCap => { utf8_to_str_type(&input_expr_types[0], "initcap") } - BuiltinScalarFunction::Left => utf8_to_str_type(&input_expr_types[0], "left"), - BuiltinScalarFunction::Lpad => utf8_to_str_type(&input_expr_types[0], "lpad"), BuiltinScalarFunction::Pi => Ok(Float64), BuiltinScalarFunction::Random => Ok(Float64), - BuiltinScalarFunction::Reverse => { - utf8_to_str_type(&input_expr_types[0], "reverse") - } - BuiltinScalarFunction::Right => { - utf8_to_str_type(&input_expr_types[0], "right") - } - BuiltinScalarFunction::Rpad => utf8_to_str_type(&input_expr_types[0], "rpad"), BuiltinScalarFunction::EndsWith => Ok(Boolean), BuiltinScalarFunction::Strpos => { utf8_to_int_type(&input_expr_types[0], "strpos/instr/position") @@ -361,28 +337,9 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::Coalesce => { Signature::variadic_equal(self.volatility()) } - BuiltinScalarFunction::InitCap | BuiltinScalarFunction::Reverse => { + BuiltinScalarFunction::InitCap => { Signature::uniform(1, vec![Utf8, LargeUtf8], self.volatility()) } - BuiltinScalarFunction::Lpad | BuiltinScalarFunction::Rpad => { - Signature::one_of( - vec![ - Exact(vec![Utf8, Int64]), - Exact(vec![LargeUtf8, Int64]), - Exact(vec![Utf8, Int64, Utf8]), - Exact(vec![LargeUtf8, Int64, Utf8]), - 
Exact(vec![Utf8, Int64, LargeUtf8]), - Exact(vec![LargeUtf8, Int64, LargeUtf8]), - ], - self.volatility(), - ) - } - BuiltinScalarFunction::Left | BuiltinScalarFunction::Right => { - Signature::one_of( - vec![Exact(vec![Utf8, Int64]), Exact(vec![LargeUtf8, Int64])], - self.volatility(), - ) - } BuiltinScalarFunction::EndsWith | BuiltinScalarFunction::Strpos => { Signature::one_of( @@ -580,11 +537,6 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::ConcatWithSeparator => &["concat_ws"], BuiltinScalarFunction::EndsWith => &["ends_with"], BuiltinScalarFunction::InitCap => &["initcap"], - BuiltinScalarFunction::Left => &["left"], - BuiltinScalarFunction::Lpad => &["lpad"], - BuiltinScalarFunction::Reverse => &["reverse"], - BuiltinScalarFunction::Right => &["right"], - BuiltinScalarFunction::Rpad => &["rpad"], BuiltinScalarFunction::Strpos => &["strpos", "instr", "position"], BuiltinScalarFunction::Substr => &["substr"], BuiltinScalarFunction::Translate => &["translate"], diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs index 654464798625..21dab72855e5 100644 --- a/datafusion/expr/src/expr_fn.rs +++ b/datafusion/expr/src/expr_fn.rs @@ -578,25 +578,11 @@ scalar_expr!(Atan2, atan2, y x, "inverse tangent of a division given in the argu scalar_expr!(Log, log, base x, "logarithm of a `x` for a particular `base`"); scalar_expr!(InitCap, initcap, string, "converts the first letter of each word in `string` in uppercase and the remaining characters in lowercase"); -scalar_expr!(Left, left, string n, "returns the first `n` characters in the `string`"); -scalar_expr!(Reverse, reverse, string, "reverses the `string`"); -scalar_expr!(Right, right, string n, "returns the last `n` characters in the `string`"); scalar_expr!(EndsWith, ends_with, string suffix, "whether the `string` ends with the `suffix`"); scalar_expr!(Strpos, strpos, string substring, "finds the position from where the `substring` matches the `string`"); scalar_expr!(Substr, substr, 
string position, "substring from the `position` to the end"); scalar_expr!(Substr, substring, string position length, "substring from the `position` with `length` characters"); scalar_expr!(Translate, translate, string from to, "replaces the characters in `from` with the counterpart in `to`"); -//use vec as parameter -nary_scalar_expr!( - Lpad, - lpad, - "fill up a string to the length by prepending the characters" -); -nary_scalar_expr!( - Rpad, - rpad, - "fill up a string to the length by appending the characters" -); nary_scalar_expr!(Coalesce, coalesce, "returns `coalesce(args...)`, which evaluates to the value of the first [Expr] which is not NULL"); //there is a func concat_ws before, so use concat_ws_expr as name.c nary_scalar_expr!( @@ -1028,13 +1014,6 @@ mod test { test_scalar_expr!(Gcd, gcd, arg_1, arg_2); test_scalar_expr!(Lcm, lcm, arg_1, arg_2); test_scalar_expr!(InitCap, initcap, string); - test_scalar_expr!(Left, left, string, count); - test_nary_scalar_expr!(Lpad, lpad, string, count); - test_nary_scalar_expr!(Lpad, lpad, string, count, characters); - test_scalar_expr!(Reverse, reverse, string); - test_scalar_expr!(Right, right, string, count); - test_nary_scalar_expr!(Rpad, rpad, string, count); - test_nary_scalar_expr!(Rpad, rpad, string, count, characters); test_scalar_expr!(EndsWith, ends_with, string, characters); test_scalar_expr!(Strpos, strpos, string, substring); test_scalar_expr!(Substr, substr, string, position); diff --git a/datafusion/functions/src/unicode/left.rs b/datafusion/functions/src/unicode/left.rs new file mode 100644 index 000000000000..473589fdc8aa --- /dev/null +++ b/datafusion/functions/src/unicode/left.rs @@ -0,0 +1,236 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; +use std::cmp::Ordering; +use std::sync::Arc; + +use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait}; +use arrow::datatypes::DataType; + +use datafusion_common::cast::{as_generic_string_array, as_int64_array}; +use datafusion_common::exec_err; +use datafusion_common::Result; +use datafusion_expr::TypeSignature::Exact; +use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; + +use crate::utils::{make_scalar_function, utf8_to_str_type}; + +#[derive(Debug)] +pub(super) struct LeftFunc { + signature: Signature, +} + +impl LeftFunc { + pub fn new() -> Self { + use DataType::*; + Self { + signature: Signature::one_of( + vec![Exact(vec![Utf8, Int64]), Exact(vec![LargeUtf8, Int64])], + Volatility::Immutable, + ), + } + } +} + +impl ScalarUDFImpl for LeftFunc { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "left" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + utf8_to_str_type(&arg_types[0], "left") + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result { + match args[0].data_type() { + DataType::Utf8 => make_scalar_function(left::, vec![])(args), + DataType::LargeUtf8 => make_scalar_function(left::, vec![])(args), + other => exec_err!("Unsupported data type {other:?} for function left"), + } + } +} + +/// Returns first n 
characters in the string, or when n is negative, returns all but last |n| characters. +/// left('abcde', 2) = 'ab' +/// The implementation uses UTF-8 code points as characters +pub fn left(args: &[ArrayRef]) -> Result { + let string_array = as_generic_string_array::(&args[0])?; + let n_array = as_int64_array(&args[1])?; + let result = string_array + .iter() + .zip(n_array.iter()) + .map(|(string, n)| match (string, n) { + (Some(string), Some(n)) => match n.cmp(&0) { + Ordering::Less => { + let len = string.chars().count() as i64; + Some(if n.abs() < len { + string.chars().take((len + n) as usize).collect::() + } else { + "".to_string() + }) + } + Ordering::Equal => Some("".to_string()), + Ordering::Greater => { + Some(string.chars().take(n as usize).collect::()) + } + }, + _ => None, + }) + .collect::>(); + + Ok(Arc::new(result) as ArrayRef) +} + +#[cfg(test)] +mod tests { + use arrow::array::{Array, StringArray}; + use arrow::datatypes::DataType::Utf8; + + use datafusion_common::{Result, ScalarValue}; + use datafusion_expr::{ColumnarValue, ScalarUDFImpl}; + + use crate::unicode::left::LeftFunc; + use crate::utils::test::test_function; + + #[test] + fn test_functions() -> Result<()> { + test_function!( + LeftFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("abcde")), + ColumnarValue::Scalar(ScalarValue::from(2i64)), + ], + Ok(Some("ab")), + &str, + Utf8, + StringArray + ); + test_function!( + LeftFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("abcde")), + ColumnarValue::Scalar(ScalarValue::from(200i64)), + ], + Ok(Some("abcde")), + &str, + Utf8, + StringArray + ); + test_function!( + LeftFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("abcde")), + ColumnarValue::Scalar(ScalarValue::from(-2i64)), + ], + Ok(Some("abc")), + &str, + Utf8, + StringArray + ); + test_function!( + LeftFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("abcde")), + ColumnarValue::Scalar(ScalarValue::from(-200i64)), + ], + Ok(Some("")), + &str, + 
Utf8, + StringArray + ); + test_function!( + LeftFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("abcde")), + ColumnarValue::Scalar(ScalarValue::from(0i64)), + ], + Ok(Some("")), + &str, + Utf8, + StringArray + ); + test_function!( + LeftFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::Utf8(None)), + ColumnarValue::Scalar(ScalarValue::from(2i64)), + ], + Ok(None), + &str, + Utf8, + StringArray + ); + test_function!( + LeftFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("abcde")), + ColumnarValue::Scalar(ScalarValue::Int64(None)), + ], + Ok(None), + &str, + Utf8, + StringArray + ); + test_function!( + LeftFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("joséésoj")), + ColumnarValue::Scalar(ScalarValue::from(5i64)), + ], + Ok(Some("joséé")), + &str, + Utf8, + StringArray + ); + test_function!( + LeftFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("joséésoj")), + ColumnarValue::Scalar(ScalarValue::from(-3i64)), + ], + Ok(Some("joséé")), + &str, + Utf8, + StringArray + ); + #[cfg(not(feature = "unicode_expressions"))] + test_function!( + LeftFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("abcde")), + ColumnarValue::Scalar(ScalarValue::from(2i64)), + ], + internal_err!( + "function left requires compilation with feature flag: unicode_expressions." + ), + &str, + Utf8, + StringArray + ); + + Ok(()) + } +} diff --git a/datafusion/functions/src/unicode/lpad.rs b/datafusion/functions/src/unicode/lpad.rs new file mode 100644 index 000000000000..76a8e68cca25 --- /dev/null +++ b/datafusion/functions/src/unicode/lpad.rs @@ -0,0 +1,369 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; +use std::sync::Arc; + +use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait}; +use arrow::datatypes::DataType; +use datafusion_common::cast::{as_generic_string_array, as_int64_array}; +use unicode_segmentation::UnicodeSegmentation; + +use crate::utils::{make_scalar_function, utf8_to_str_type}; +use datafusion_common::{exec_err, Result}; +use datafusion_expr::TypeSignature::Exact; +use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; + +#[derive(Debug)] +pub(super) struct LPadFunc { + signature: Signature, +} + +impl LPadFunc { + pub fn new() -> Self { + use DataType::*; + Self { + signature: Signature::one_of( + vec![ + Exact(vec![Utf8, Int64]), + Exact(vec![LargeUtf8, Int64]), + Exact(vec![Utf8, Int64, Utf8]), + Exact(vec![LargeUtf8, Int64, Utf8]), + Exact(vec![Utf8, Int64, LargeUtf8]), + Exact(vec![LargeUtf8, Int64, LargeUtf8]), + ], + Volatility::Immutable, + ), + } + } +} + +impl ScalarUDFImpl for LPadFunc { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "lpad" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + utf8_to_str_type(&arg_types[0], "lpad") + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result { + match args[0].data_type() { + DataType::Utf8 => make_scalar_function(lpad::, vec![])(args), + 
DataType::LargeUtf8 => make_scalar_function(lpad::<i64>, vec![])(args),
+            other => exec_err!("Unsupported data type {other:?} for function lpad"),
+        }
+    }
+}
+
+/// Extends the string to length 'length' by prepending the characters fill (a space by default). If the string is already longer than length then it is truncated (on the right).
+/// lpad('hi', 5, 'xy') = 'xyxhi'
+///
+/// `args` holds two or three arrays: the string, the `Int64` target length and,
+/// optionally, the fill string. A null in any input produces a null output row.
+/// The string is measured and truncated in grapheme clusters, while the fill
+/// is repeated code point by code point.
+///
+/// # Errors
+/// Returns an execution error when a requested length exceeds `i32::MAX` or
+/// when `args` does not contain two or three arrays.
+pub fn lpad<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
+    match args.len() {
+        2 => {
+            let string_array = as_generic_string_array::<T>(&args[0])?;
+            let length_array = as_int64_array(&args[1])?;
+
+            let result = string_array
+                .iter()
+                .zip(length_array.iter())
+                .map(|(string, length)| match (string, length) {
+                    (Some(string), Some(length)) => {
+                        if length > i32::MAX as i64 {
+                            return exec_err!(
+                                "lpad requested length {length} too large"
+                            );
+                        }
+
+                        // Negative lengths behave like zero.
+                        let length = if length < 0 { 0 } else { length as usize };
+                        if length == 0 {
+                            Ok(Some("".to_string()))
+                        } else {
+                            let graphemes = string.graphemes(true).collect::<Vec<&str>>();
+                            if length < graphemes.len() {
+                                Ok(Some(graphemes[..length].concat()))
+                            } else {
+                                let mut s: String = " ".repeat(length - graphemes.len());
+                                s.push_str(string);
+                                Ok(Some(s))
+                            }
+                        }
+                    }
+                    _ => Ok(None),
+                })
+                .collect::<Result<GenericStringArray<T>>>()?;
+
+            Ok(Arc::new(result) as ArrayRef)
+        }
+        3 => {
+            let string_array = as_generic_string_array::<T>(&args[0])?;
+            let length_array = as_int64_array(&args[1])?;
+            // NOTE(review): the fill array is downcast with the same offset type `T`
+            // as the string array; confirm that mixed Utf8/LargeUtf8 argument lists
+            // are coerced before they reach this function.
+            let fill_array = as_generic_string_array::<T>(&args[2])?;
+
+            let result = string_array
+                .iter()
+                .zip(length_array.iter())
+                .zip(fill_array.iter())
+                .map(|((string, length), fill)| match (string, length, fill) {
+                    (Some(string), Some(length), Some(fill)) => {
+                        if length > i32::MAX as i64 {
+                            return exec_err!(
+                                "lpad requested length {length} too large"
+                            );
+                        }
+
+                        // Negative lengths behave like zero.
+                        let length = if length < 0 { 0 } else { length as usize };
+                        if length == 0 {
+                            Ok(Some("".to_string()))
+                        } else {
+                            let graphemes = string.graphemes(true).collect::<Vec<&str>>();
+
+                            if length < graphemes.len() {
+                                Ok(Some(graphemes[..length].concat()))
+                            } else if fill.is_empty() {
+                                // Nothing to pad with: return the input unchanged.
+                                Ok(Some(string.to_string()))
+                            } else {
+                                // Build the padding by cycling through the fill
+                                // characters, then append the original string.
+                                let mut s = fill
+                                    .chars()
+                                    .cycle()
+                                    .take(length - graphemes.len())
+                                    .collect::<String>();
+                                s.push_str(string);
+                                Ok(Some(s))
+                            }
+                        }
+                    }
+                    _ => Ok(None),
+                })
+                .collect::<Result<GenericStringArray<T>>>()?;
+
+            Ok(Arc::new(result) as ArrayRef)
+        }
+        other => exec_err!(
+            "lpad was called with {other} arguments. It requires at least 2 and at most 3."
+        ),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use arrow::array::{Array, StringArray};
+    use arrow::datatypes::DataType::Utf8;
+
+    use datafusion_common::{Result, ScalarValue};
+    use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
+
+    use crate::unicode::lpad::LPadFunc;
+    use crate::utils::test::test_function;
+
+    #[test]
+    fn test_functions() -> Result<()> {
+        test_function!(
+            LPadFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::from("josé")),
+                ColumnarValue::Scalar(ScalarValue::from(5i64)),
+            ],
+            Ok(Some(" josé")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            LPadFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::from("hi")),
+                ColumnarValue::Scalar(ScalarValue::from(5i64)),
+            ],
+            Ok(Some("   hi")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            LPadFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::from("hi")),
+                ColumnarValue::Scalar(ScalarValue::from(0i64)),
+            ],
+            Ok(Some("")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            LPadFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::from("hi")),
+                ColumnarValue::Scalar(ScalarValue::Int64(None)),
+            ],
+            Ok(None),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            LPadFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::Utf8(None)),
+                ColumnarValue::Scalar(ScalarValue::from(5i64)),
+            ],
+            Ok(None),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+ LPadFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("hi")), + ColumnarValue::Scalar(ScalarValue::from(5i64)), + ColumnarValue::Scalar(ScalarValue::from("xy")), + ], + Ok(Some("xyxhi")), + &str, + Utf8, + StringArray + ); + test_function!( + LPadFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("hi")), + ColumnarValue::Scalar(ScalarValue::from(21i64)), + ColumnarValue::Scalar(ScalarValue::from("abcdef")), + ], + Ok(Some("abcdefabcdefabcdefahi")), + &str, + Utf8, + StringArray + ); + test_function!( + LPadFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("hi")), + ColumnarValue::Scalar(ScalarValue::from(5i64)), + ColumnarValue::Scalar(ScalarValue::from(" ")), + ], + Ok(Some(" hi")), + &str, + Utf8, + StringArray + ); + test_function!( + LPadFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("hi")), + ColumnarValue::Scalar(ScalarValue::from(5i64)), + ColumnarValue::Scalar(ScalarValue::from("")), + ], + Ok(Some("hi")), + &str, + Utf8, + StringArray + ); + test_function!( + LPadFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::Utf8(None)), + ColumnarValue::Scalar(ScalarValue::from(5i64)), + ColumnarValue::Scalar(ScalarValue::from("xy")), + ], + Ok(None), + &str, + Utf8, + StringArray + ); + test_function!( + LPadFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("hi")), + ColumnarValue::Scalar(ScalarValue::Int64(None)), + ColumnarValue::Scalar(ScalarValue::from("xy")), + ], + Ok(None), + &str, + Utf8, + StringArray + ); + test_function!( + LPadFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("hi")), + ColumnarValue::Scalar(ScalarValue::from(5i64)), + ColumnarValue::Scalar(ScalarValue::Utf8(None)), + ], + Ok(None), + &str, + Utf8, + StringArray + ); + test_function!( + LPadFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("josé")), + ColumnarValue::Scalar(ScalarValue::from(10i64)), + ColumnarValue::Scalar(ScalarValue::from("xy")), + ], + Ok(Some("xyxyxyjosé")), + &str, + Utf8, + StringArray + 
); + test_function!( + LPadFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("josé")), + ColumnarValue::Scalar(ScalarValue::from(10i64)), + ColumnarValue::Scalar(ScalarValue::from("éñ")), + ], + Ok(Some("éñéñéñjosé")), + &str, + Utf8, + StringArray + ); + #[cfg(not(feature = "unicode_expressions"))] + test_function!( + LPadFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("josé")), + ColumnarValue::Scalar(ScalarValue::from(5i64)), + ], + internal_err!( + "function lpad requires compilation with feature flag: unicode_expressions." + ), + &str, + Utf8, + StringArray + ); + Ok(()) + } +} diff --git a/datafusion/functions/src/unicode/mod.rs b/datafusion/functions/src/unicode/mod.rs index 291de3843903..ea4e70a92199 100644 --- a/datafusion/functions/src/unicode/mod.rs +++ b/datafusion/functions/src/unicode/mod.rs @@ -22,6 +22,11 @@ use std::sync::Arc; use datafusion_expr::ScalarUDF; mod character_length; +mod left; +mod lpad; +mod reverse; +mod right; +mod rpad; // create UDFs make_udf_function!( @@ -29,6 +34,11 @@ make_udf_function!( CHARACTER_LENGTH, character_length ); +make_udf_function!(left::LeftFunc, LEFT, left); +make_udf_function!(lpad::LPadFunc, LPAD, lpad); +make_udf_function!(right::RightFunc, RIGHT, right); +make_udf_function!(reverse::ReverseFunc, REVERSE, reverse); +make_udf_function!(rpad::RPadFunc, RPAD, rpad); pub mod expr_fn { use datafusion_expr::Expr; @@ -47,9 +57,41 @@ pub mod expr_fn { pub fn length(string: Expr) -> Expr { character_length(string) } + + #[doc = "returns the first `n` characters in the `string`"] + pub fn left(string: Expr, n: Expr) -> Expr { + super::left().call(vec![string, n]) + } + + #[doc = "fill up a string to the length by prepending the characters"] + pub fn lpad(args: Vec) -> Expr { + super::lpad().call(args) + } + + #[doc = "reverses the `string`"] + pub fn reverse(string: Expr) -> Expr { + super::reverse().call(vec![string]) + } + + #[doc = "returns the last `n` characters in the `string`"] + pub fn 
right(string: Expr, n: Expr) -> Expr { + super::right().call(vec![string, n]) + } + + #[doc = "fill up a string to the length by appending the characters"] + pub fn rpad(args: Vec) -> Expr { + super::rpad().call(args) + } } /// Return a list of all functions in this package pub fn functions() -> Vec> { - vec![character_length()] + vec![ + character_length(), + left(), + lpad(), + reverse(), + right(), + rpad(), + ] } diff --git a/datafusion/functions/src/unicode/reverse.rs b/datafusion/functions/src/unicode/reverse.rs new file mode 100644 index 000000000000..42ca6e0d17c3 --- /dev/null +++ b/datafusion/functions/src/unicode/reverse.rs @@ -0,0 +1,149 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+use std::any::Any;
+use std::sync::Arc;
+
+use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait};
+use arrow::datatypes::DataType;
+
+use datafusion_common::cast::as_generic_string_array;
+use datafusion_common::{exec_err, Result};
+use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility};
+
+use crate::utils::{make_scalar_function, utf8_to_str_type};
+
+/// Scalar UDF registered under the name `reverse`.
+#[derive(Debug)]
+pub(super) struct ReverseFunc {
+    signature: Signature,
+}
+
+impl ReverseFunc {
+    /// Creates the function with a signature of exactly one `Utf8` or
+    /// `LargeUtf8` argument and immutable volatility.
+    pub fn new() -> Self {
+        use DataType::*;
+        Self {
+            signature: Signature::uniform(
+                1,
+                vec![Utf8, LargeUtf8],
+                Volatility::Immutable,
+            ),
+        }
+    }
+}
+
+impl ScalarUDFImpl for ReverseFunc {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "reverse"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    /// The return type is derived from the type of the first argument.
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        utf8_to_str_type(&arg_types[0], "reverse")
+    }
+
+    /// Dispatches on the first argument's type: `Utf8` uses 32-bit offsets,
+    /// `LargeUtf8` uses 64-bit offsets; any other type is an execution error.
+    fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
+        match args[0].data_type() {
+            DataType::Utf8 => make_scalar_function(reverse::<i32>, vec![])(args),
+            DataType::LargeUtf8 => make_scalar_function(reverse::<i64>, vec![])(args),
+            other => {
+                exec_err!("Unsupported data type {other:?} for function reverse")
+            }
+        }
+    }
+}
+
+/// Reverses the order of the characters in the string.
+/// reverse('abcde') = 'edcba'
+/// The implementation uses UTF-8 code points as characters
+///
+/// Because reversal works on code points, a combining mark ends up attached to
+/// the character that preceded its base character in the input. Null rows
+/// stay null.
+pub fn reverse<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
+    let string_array = as_generic_string_array::<T>(&args[0])?;
+
+    let result = string_array
+        .iter()
+        .map(|string| string.map(|string: &str| string.chars().rev().collect::<String>()))
+        .collect::<GenericStringArray<T>>();
+
+    Ok(Arc::new(result) as ArrayRef)
+}
+
+#[cfg(test)]
+mod tests {
+    use arrow::array::{Array, StringArray};
+    use arrow::datatypes::DataType::Utf8;
+
+    use datafusion_common::{Result, ScalarValue};
+    use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
+
+    use crate::unicode::reverse::ReverseFunc;
+    use crate::utils::test::test_function;
+
+    #[test]
+    fn test_functions() -> Result<()> {
+        test_function!(
+            ReverseFunc::new(),
+            &[ColumnarValue::Scalar(ScalarValue::from("abcde"))],
+            Ok(Some("edcba")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            ReverseFunc::new(),
+            &[ColumnarValue::Scalar(ScalarValue::from("loẅks"))],
+            Ok(Some("sk̈wol")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            ReverseFunc::new(),
+            &[ColumnarValue::Scalar(ScalarValue::from("loẅks"))],
+            Ok(Some("sk̈wol")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            ReverseFunc::new(),
+            &[ColumnarValue::Scalar(ScalarValue::Utf8(None))],
+            Ok(None),
+            &str,
+            Utf8,
+            StringArray
+        );
+        #[cfg(not(feature = "unicode_expressions"))]
+        test_function!(
+            ReverseFunc::new(),
+            &[ColumnarValue::Scalar(ScalarValue::from("abcde"))],
+            internal_err!(
+                "function reverse requires compilation with feature flag: unicode_expressions."
+            ),
+            &str,
+            Utf8,
+            StringArray
+        );
+
+        Ok(())
+    }
+}
diff --git a/datafusion/functions/src/unicode/right.rs b/datafusion/functions/src/unicode/right.rs
new file mode 100644
index 000000000000..d1bd976342b2
--- /dev/null
+++ b/datafusion/functions/src/unicode/right.rs
@@ -0,0 +1,238 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; +use std::cmp::{max, Ordering}; +use std::sync::Arc; + +use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait}; +use arrow::datatypes::DataType; + +use datafusion_common::cast::{as_generic_string_array, as_int64_array}; +use datafusion_common::exec_err; +use datafusion_common::Result; +use datafusion_expr::TypeSignature::Exact; +use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; + +use crate::utils::{make_scalar_function, utf8_to_str_type}; + +#[derive(Debug)] +pub(super) struct RightFunc { + signature: Signature, +} + +impl RightFunc { + pub fn new() -> Self { + use DataType::*; + Self { + signature: Signature::one_of( + vec![Exact(vec![Utf8, Int64]), Exact(vec![LargeUtf8, Int64])], + Volatility::Immutable, + ), + } + } +} + +impl ScalarUDFImpl for RightFunc { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "right" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + utf8_to_str_type(&arg_types[0], "right") + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result { + match args[0].data_type() { + DataType::Utf8 => make_scalar_function(right::, vec![])(args), + DataType::LargeUtf8 => make_scalar_function(right::, 
vec![])(args),
+            other => exec_err!("Unsupported data type {other:?} for function right"),
+        }
+    }
+}
+
+/// Returns last n characters in the string, or when n is negative, returns all but first |n| characters.
+/// right('abcde', 2) = 'de'
+/// right('abcde', -2) = 'cde'
+/// The implementation uses UTF-8 code points as characters
+///
+/// `args[0]` is the string array (its offset width is selected by `T`) and
+/// `args[1]` is the `Int64` count array. A null in either input produces a
+/// null output row.
+pub fn right<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
+    let string_array = as_generic_string_array::<T>(&args[0])?;
+    let n_array = as_int64_array(&args[1])?;
+
+    let result = string_array
+        .iter()
+        .zip(n_array.iter())
+        .map(|(string, n)| match (string, n) {
+            (Some(string), Some(n)) => match n.cmp(&0) {
+                Ordering::Less => Some(
+                    string
+                        .chars()
+                        // `unsigned_abs` is safe for `i64::MIN`; a magnitude that does
+                        // not fit in `usize` (32-bit targets) skips everything instead
+                        // of being silently truncated.
+                        .skip(usize::try_from(n.unsigned_abs()).unwrap_or(usize::MAX))
+                        .collect::<String>(),
+                ),
+                Ordering::Equal => Some("".to_string()),
+                Ordering::Greater => Some(
+                    string
+                        .chars()
+                        // Skip the leading characters so that at most `n` remain.
+                        .skip(max(string.chars().count() as i64 - n, 0) as usize)
+                        .collect::<String>(),
+                ),
+            },
+            _ => None,
+        })
+        .collect::<GenericStringArray<T>>();
+
+    Ok(Arc::new(result) as ArrayRef)
+}
+
+#[cfg(test)]
+mod tests {
+    use arrow::array::{Array, StringArray};
+    use arrow::datatypes::DataType::Utf8;
+
+    use datafusion_common::{Result, ScalarValue};
+    use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
+
+    use crate::unicode::right::RightFunc;
+    use crate::utils::test::test_function;
+
+    #[test]
+    fn test_functions() -> Result<()> {
+        test_function!(
+            RightFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::from("abcde")),
+                ColumnarValue::Scalar(ScalarValue::from(2i64)),
+            ],
+            Ok(Some("de")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            RightFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::from("abcde")),
+                ColumnarValue::Scalar(ScalarValue::from(200i64)),
+            ],
+            Ok(Some("abcde")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            RightFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::from("abcde")),
+                ColumnarValue::Scalar(ScalarValue::from(-2i64)),
+            ],
+            Ok(Some("cde")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            RightFunc::new(),
+            &[
+
ColumnarValue::Scalar(ScalarValue::from("abcde")), + ColumnarValue::Scalar(ScalarValue::from(-200i64)), + ], + Ok(Some("")), + &str, + Utf8, + StringArray + ); + test_function!( + RightFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("abcde")), + ColumnarValue::Scalar(ScalarValue::from(0i64)), + ], + Ok(Some("")), + &str, + Utf8, + StringArray + ); + test_function!( + RightFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::Utf8(None)), + ColumnarValue::Scalar(ScalarValue::from(2i64)), + ], + Ok(None), + &str, + Utf8, + StringArray + ); + test_function!( + RightFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("abcde")), + ColumnarValue::Scalar(ScalarValue::Int64(None)), + ], + Ok(None), + &str, + Utf8, + StringArray + ); + test_function!( + RightFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("joséésoj")), + ColumnarValue::Scalar(ScalarValue::from(5i64)), + ], + Ok(Some("éésoj")), + &str, + Utf8, + StringArray + ); + test_function!( + RightFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("joséésoj")), + ColumnarValue::Scalar(ScalarValue::from(-3i64)), + ], + Ok(Some("éésoj")), + &str, + Utf8, + StringArray + ); + #[cfg(not(feature = "unicode_expressions"))] + test_function!( + RightFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("abcde")), + ColumnarValue::Scalar(ScalarValue::from(2i64)), + ], + internal_err!( + "function right requires compilation with feature flag: unicode_expressions." + ), + &str, + Utf8, + StringArray + ); + + Ok(()) + } +} diff --git a/datafusion/functions/src/unicode/rpad.rs b/datafusion/functions/src/unicode/rpad.rs new file mode 100644 index 000000000000..070278c90b2f --- /dev/null +++ b/datafusion/functions/src/unicode/rpad.rs @@ -0,0 +1,361 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; +use std::sync::Arc; + +use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait}; +use arrow::datatypes::DataType; +use datafusion_common::cast::{as_generic_string_array, as_int64_array}; +use unicode_segmentation::UnicodeSegmentation; + +use crate::utils::{make_scalar_function, utf8_to_str_type}; +use datafusion_common::{exec_err, Result}; +use datafusion_expr::TypeSignature::Exact; +use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; + +#[derive(Debug)] +pub(super) struct RPadFunc { + signature: Signature, +} + +impl RPadFunc { + pub fn new() -> Self { + use DataType::*; + Self { + signature: Signature::one_of( + vec![ + Exact(vec![Utf8, Int64]), + Exact(vec![LargeUtf8, Int64]), + Exact(vec![Utf8, Int64, Utf8]), + Exact(vec![LargeUtf8, Int64, Utf8]), + Exact(vec![Utf8, Int64, LargeUtf8]), + Exact(vec![LargeUtf8, Int64, LargeUtf8]), + ], + Volatility::Immutable, + ), + } + } +} + +impl ScalarUDFImpl for RPadFunc { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "rpad" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + utf8_to_str_type(&arg_types[0], "rpad") + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result { + match args[0].data_type() { + DataType::Utf8 => make_scalar_function(rpad::, vec![])(args), + 
DataType::LargeUtf8 => make_scalar_function(rpad::<i64>, vec![])(args),
+            other => exec_err!("Unsupported data type {other:?} for function rpad"),
+        }
+    }
+}
+
+/// Extends the string to length 'length' by appending the characters fill (a space by default). If the string is already longer than length then it is truncated.
+/// rpad('hi', 5, 'xy') = 'hixyx'
+///
+/// `args` holds two or three arrays: the string, the `Int64` target length and,
+/// optionally, the fill string. A null in any input produces a null output row.
+/// The string is measured and truncated in grapheme clusters, while the fill
+/// is repeated code point by code point.
+///
+/// # Errors
+/// Returns an execution error when a requested length exceeds `i32::MAX` or
+/// when `args` does not contain two or three arrays.
+pub fn rpad<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
+    match args.len() {
+        2 => {
+            let string_array = as_generic_string_array::<T>(&args[0])?;
+            let length_array = as_int64_array(&args[1])?;
+
+            let result = string_array
+                .iter()
+                .zip(length_array.iter())
+                .map(|(string, length)| match (string, length) {
+                    (Some(string), Some(length)) => {
+                        if length > i32::MAX as i64 {
+                            return exec_err!(
+                                "rpad requested length {length} too large"
+                            );
+                        }
+
+                        // Negative lengths behave like zero.
+                        let length = if length < 0 { 0 } else { length as usize };
+                        if length == 0 {
+                            Ok(Some("".to_string()))
+                        } else {
+                            let graphemes = string.graphemes(true).collect::<Vec<&str>>();
+                            if length < graphemes.len() {
+                                Ok(Some(graphemes[..length].concat()))
+                            } else {
+                                let mut s = string.to_string();
+                                s.push_str(" ".repeat(length - graphemes.len()).as_str());
+                                Ok(Some(s))
+                            }
+                        }
+                    }
+                    _ => Ok(None),
+                })
+                .collect::<Result<GenericStringArray<T>>>()?;
+            Ok(Arc::new(result) as ArrayRef)
+        }
+        3 => {
+            let string_array = as_generic_string_array::<T>(&args[0])?;
+            let length_array = as_int64_array(&args[1])?;
+            // NOTE(review): the fill array is downcast with the same offset type `T`
+            // as the string array; confirm that mixed Utf8/LargeUtf8 argument lists
+            // are coerced before they reach this function.
+            let fill_array = as_generic_string_array::<T>(&args[2])?;
+
+            let result = string_array
+                .iter()
+                .zip(length_array.iter())
+                .zip(fill_array.iter())
+                .map(|((string, length), fill)| match (string, length, fill) {
+                    (Some(string), Some(length), Some(fill)) => {
+                        if length > i32::MAX as i64 {
+                            return exec_err!(
+                                "rpad requested length {length} too large"
+                            );
+                        }
+
+                        // Negative lengths behave like zero.
+                        let length = if length < 0 { 0 } else { length as usize };
+                        let graphemes = string.graphemes(true).collect::<Vec<&str>>();
+
+                        if length < graphemes.len() {
+                            Ok(Some(graphemes[..length].concat()))
+                        } else if fill.is_empty() {
+                            // Nothing to pad with: return the input unchanged.
+                            Ok(Some(string.to_string()))
+                        } else {
+                            // Append the fill characters cyclically until the
+                            // requested length is reached.
+                            let mut s = string.to_string();
+                            s.extend(fill.chars().cycle().take(length - graphemes.len()));
+                            Ok(Some(s))
+                        }
+                    }
+                    _ => Ok(None),
+                })
+                .collect::<Result<GenericStringArray<T>>>()?;
+
+            Ok(Arc::new(result) as ArrayRef)
+        }
+        other => exec_err!(
+            "rpad was called with {other} arguments. It requires at least 2 and at most 3."
+        ),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use arrow::array::{Array, StringArray};
+    use arrow::datatypes::DataType::Utf8;
+
+    use datafusion_common::{Result, ScalarValue};
+    use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
+
+    use crate::unicode::rpad::RPadFunc;
+    use crate::utils::test::test_function;
+
+    #[test]
+    fn test_functions() -> Result<()> {
+        test_function!(
+            RPadFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::from("josé")),
+                ColumnarValue::Scalar(ScalarValue::from(5i64)),
+            ],
+            Ok(Some("josé ")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            RPadFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::from("hi")),
+                ColumnarValue::Scalar(ScalarValue::from(5i64)),
+            ],
+            Ok(Some("hi   ")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            RPadFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::from("hi")),
+                ColumnarValue::Scalar(ScalarValue::from(0i64)),
+            ],
+            Ok(Some("")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            RPadFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::from("hi")),
+                ColumnarValue::Scalar(ScalarValue::Int64(None)),
+            ],
+            Ok(None),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            RPadFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::Utf8(None)),
+                ColumnarValue::Scalar(ScalarValue::from(5i64)),
+            ],
+            Ok(None),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            RPadFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::from("hi")),
+
ColumnarValue::Scalar(ScalarValue::from(5i64)), + ColumnarValue::Scalar(ScalarValue::from("xy")), + ], + Ok(Some("hixyx")), + &str, + Utf8, + StringArray + ); + test_function!( + RPadFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("hi")), + ColumnarValue::Scalar(ScalarValue::from(21i64)), + ColumnarValue::Scalar(ScalarValue::from("abcdef")), + ], + Ok(Some("hiabcdefabcdefabcdefa")), + &str, + Utf8, + StringArray + ); + test_function!( + RPadFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("hi")), + ColumnarValue::Scalar(ScalarValue::from(5i64)), + ColumnarValue::Scalar(ScalarValue::from(" ")), + ], + Ok(Some("hi ")), + &str, + Utf8, + StringArray + ); + test_function!( + RPadFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("hi")), + ColumnarValue::Scalar(ScalarValue::from(5i64)), + ColumnarValue::Scalar(ScalarValue::from("")), + ], + Ok(Some("hi")), + &str, + Utf8, + StringArray + ); + test_function!( + RPadFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::Utf8(None)), + ColumnarValue::Scalar(ScalarValue::from(5i64)), + ColumnarValue::Scalar(ScalarValue::from("xy")), + ], + Ok(None), + &str, + Utf8, + StringArray + ); + test_function!( + RPadFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("hi")), + ColumnarValue::Scalar(ScalarValue::Int64(None)), + ColumnarValue::Scalar(ScalarValue::from("xy")), + ], + Ok(None), + &str, + Utf8, + StringArray + ); + test_function!( + RPadFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("hi")), + ColumnarValue::Scalar(ScalarValue::from(5i64)), + ColumnarValue::Scalar(ScalarValue::Utf8(None)), + ], + Ok(None), + &str, + Utf8, + StringArray + ); + test_function!( + RPadFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("josé")), + ColumnarValue::Scalar(ScalarValue::from(10i64)), + ColumnarValue::Scalar(ScalarValue::from("xy")), + ], + Ok(Some("joséxyxyxy")), + &str, + Utf8, + StringArray + ); + test_function!( + RPadFunc::new(), + &[ + 
ColumnarValue::Scalar(ScalarValue::from("josé")), + ColumnarValue::Scalar(ScalarValue::from(10i64)), + ColumnarValue::Scalar(ScalarValue::from("éñ")), + ], + Ok(Some("josééñéñéñ")), + &str, + Utf8, + StringArray + ); + #[cfg(not(feature = "unicode_expressions"))] + test_function!( + RPadFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("josé")), + ColumnarValue::Scalar(ScalarValue::from(5i64)), + ], + internal_err!( + "function rpad requires compilation with feature flag: unicode_expressions." + ), + &str, + Utf8, + StringArray + ); + + Ok(()) + } +} diff --git a/datafusion/physical-expr/src/functions.rs b/datafusion/physical-expr/src/functions.rs index 9adc8536341d..c1b4900e399a 100644 --- a/datafusion/physical-expr/src/functions.rs +++ b/datafusion/physical-expr/src/functions.rs @@ -270,67 +270,6 @@ pub fn create_physical_fun( exec_err!("Unsupported data type {other:?} for function initcap") } }), - BuiltinScalarFunction::Left => Arc::new(|args| match args[0].data_type() { - DataType::Utf8 => { - let func = invoke_if_unicode_expressions_feature_flag!(left, i32, "left"); - make_scalar_function_inner(func)(args) - } - DataType::LargeUtf8 => { - let func = invoke_if_unicode_expressions_feature_flag!(left, i64, "left"); - make_scalar_function_inner(func)(args) - } - other => exec_err!("Unsupported data type {other:?} for function left"), - }), - BuiltinScalarFunction::Lpad => Arc::new(|args| match args[0].data_type() { - DataType::Utf8 => { - let func = invoke_if_unicode_expressions_feature_flag!(lpad, i32, "lpad"); - make_scalar_function_inner(func)(args) - } - DataType::LargeUtf8 => { - let func = invoke_if_unicode_expressions_feature_flag!(lpad, i64, "lpad"); - make_scalar_function_inner(func)(args) - } - other => exec_err!("Unsupported data type {other:?} for function lpad"), - }), - BuiltinScalarFunction::Reverse => Arc::new(|args| match args[0].data_type() { - DataType::Utf8 => { - let func = - invoke_if_unicode_expressions_feature_flag!(reverse, i32, 
"reverse"); - make_scalar_function_inner(func)(args) - } - DataType::LargeUtf8 => { - let func = - invoke_if_unicode_expressions_feature_flag!(reverse, i64, "reverse"); - make_scalar_function_inner(func)(args) - } - other => { - exec_err!("Unsupported data type {other:?} for function reverse") - } - }), - BuiltinScalarFunction::Right => Arc::new(|args| match args[0].data_type() { - DataType::Utf8 => { - let func = - invoke_if_unicode_expressions_feature_flag!(right, i32, "right"); - make_scalar_function_inner(func)(args) - } - DataType::LargeUtf8 => { - let func = - invoke_if_unicode_expressions_feature_flag!(right, i64, "right"); - make_scalar_function_inner(func)(args) - } - other => exec_err!("Unsupported data type {other:?} for function right"), - }), - BuiltinScalarFunction::Rpad => Arc::new(|args| match args[0].data_type() { - DataType::Utf8 => { - let func = invoke_if_unicode_expressions_feature_flag!(rpad, i32, "rpad"); - make_scalar_function_inner(func)(args) - } - DataType::LargeUtf8 => { - let func = invoke_if_unicode_expressions_feature_flag!(rpad, i64, "rpad"); - make_scalar_function_inner(func)(args) - } - other => exec_err!("Unsupported data type {other:?} for function rpad"), - }), BuiltinScalarFunction::EndsWith => Arc::new(|args| match args[0].data_type() { DataType::Utf8 => { make_scalar_function_inner(string_expressions::ends_with::)(args) @@ -691,551 +630,6 @@ mod tests { Utf8, StringArray ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Left, - &[lit("abcde"), lit(ScalarValue::Int8(Some(2))),], - Ok(Some("ab")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Left, - &[lit("abcde"), lit(ScalarValue::Int64(Some(200))),], - Ok(Some("abcde")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Left, - &[lit("abcde"), lit(ScalarValue::Int64(Some(-2))),], - Ok(Some("abc")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = 
"unicode_expressions")] - test_function!( - Left, - &[lit("abcde"), lit(ScalarValue::Int64(Some(-200))),], - Ok(Some("")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Left, - &[lit("abcde"), lit(ScalarValue::Int64(Some(0))),], - Ok(Some("")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Left, - &[ - lit(ScalarValue::Utf8(None)), - lit(ScalarValue::Int64(Some(2))), - ], - Ok(None), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Left, - &[lit("abcde"), lit(ScalarValue::Int64(None)),], - Ok(None), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Left, - &[lit("joséésoj"), lit(ScalarValue::Int64(Some(5))),], - Ok(Some("joséé")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Left, - &[lit("joséésoj"), lit(ScalarValue::Int64(Some(-3))),], - Ok(Some("joséé")), - &str, - Utf8, - StringArray - ); - #[cfg(not(feature = "unicode_expressions"))] - test_function!( - Left, - &[ - lit("abcde"), - lit(ScalarValue::Int8(Some(2))), - ], - internal_err!( - "function left requires compilation with feature flag: unicode_expressions." 
- ), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Lpad, - &[lit("josé"), lit(ScalarValue::Int64(Some(5))),], - Ok(Some(" josé")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Lpad, - &[lit("hi"), lit(ScalarValue::Int64(Some(5))),], - Ok(Some(" hi")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Lpad, - &[lit("hi"), lit(ScalarValue::Int64(Some(0))),], - Ok(Some("")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Lpad, - &[lit("hi"), lit(ScalarValue::Int64(None)),], - Ok(None), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Lpad, - &[ - lit(ScalarValue::Utf8(None)), - lit(ScalarValue::Int64(Some(5))), - ], - Ok(None), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Lpad, - &[lit("hi"), lit(ScalarValue::Int64(Some(5))), lit("xy"),], - Ok(Some("xyxhi")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Lpad, - &[lit("hi"), lit(ScalarValue::Int64(Some(21))), lit("abcdef"),], - Ok(Some("abcdefabcdefabcdefahi")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Lpad, - &[lit("hi"), lit(ScalarValue::Int64(Some(5))), lit(" "),], - Ok(Some(" hi")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Lpad, - &[lit("hi"), lit(ScalarValue::Int64(Some(5))), lit(""),], - Ok(Some("hi")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Lpad, - &[ - lit(ScalarValue::Utf8(None)), - lit(ScalarValue::Int64(Some(5))), - lit("xy"), - ], - Ok(None), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Lpad, - &[lit("hi"), lit(ScalarValue::Int64(None)), 
lit("xy"),], - Ok(None), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Lpad, - &[ - lit("hi"), - lit(ScalarValue::Int64(Some(5))), - lit(ScalarValue::Utf8(None)), - ], - Ok(None), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Lpad, - &[lit("josé"), lit(ScalarValue::Int64(Some(10))), lit("xy"),], - Ok(Some("xyxyxyjosé")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Lpad, - &[lit("josé"), lit(ScalarValue::Int64(Some(10))), lit("éñ"),], - Ok(Some("éñéñéñjosé")), - &str, - Utf8, - StringArray - ); - #[cfg(not(feature = "unicode_expressions"))] - test_function!( - Lpad, - &[ - lit("josé"), - lit(ScalarValue::Int64(Some(5))), - ], - internal_err!( - "function lpad requires compilation with feature flag: unicode_expressions." - ), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Reverse, - &[lit("abcde")], - Ok(Some("edcba")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Reverse, - &[lit("loẅks")], - Ok(Some("sk̈wol")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Reverse, - &[lit("loẅks")], - Ok(Some("sk̈wol")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Reverse, - &[lit(ScalarValue::Utf8(None))], - Ok(None), - &str, - Utf8, - StringArray - ); - #[cfg(not(feature = "unicode_expressions"))] - test_function!( - Reverse, - &[lit("abcde")], - internal_err!( - "function reverse requires compilation with feature flag: unicode_expressions." 
- ), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Right, - &[lit("abcde"), lit(ScalarValue::Int8(Some(2))),], - Ok(Some("de")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Right, - &[lit("abcde"), lit(ScalarValue::Int64(Some(200))),], - Ok(Some("abcde")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Right, - &[lit("abcde"), lit(ScalarValue::Int64(Some(-2))),], - Ok(Some("cde")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Right, - &[lit("abcde"), lit(ScalarValue::Int64(Some(-200))),], - Ok(Some("")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Right, - &[lit("abcde"), lit(ScalarValue::Int64(Some(0))),], - Ok(Some("")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Right, - &[ - lit(ScalarValue::Utf8(None)), - lit(ScalarValue::Int64(Some(2))), - ], - Ok(None), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Right, - &[lit("abcde"), lit(ScalarValue::Int64(None)),], - Ok(None), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Right, - &[lit("joséésoj"), lit(ScalarValue::Int64(Some(5))),], - Ok(Some("éésoj")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Right, - &[lit("joséésoj"), lit(ScalarValue::Int64(Some(-3))),], - Ok(Some("éésoj")), - &str, - Utf8, - StringArray - ); - #[cfg(not(feature = "unicode_expressions"))] - test_function!( - Right, - &[ - lit("abcde"), - lit(ScalarValue::Int8(Some(2))), - ], - internal_err!( - "function right requires compilation with feature flag: unicode_expressions." 
- ), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Rpad, - &[lit("josé"), lit(ScalarValue::Int64(Some(5))),], - Ok(Some("josé ")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Rpad, - &[lit("hi"), lit(ScalarValue::Int64(Some(5))),], - Ok(Some("hi ")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Rpad, - &[lit("hi"), lit(ScalarValue::Int64(Some(0))),], - Ok(Some("")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Rpad, - &[lit("hi"), lit(ScalarValue::Int64(None)),], - Ok(None), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Rpad, - &[ - lit(ScalarValue::Utf8(None)), - lit(ScalarValue::Int64(Some(5))), - ], - Ok(None), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Rpad, - &[lit("hi"), lit(ScalarValue::Int64(Some(5))), lit("xy"),], - Ok(Some("hixyx")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Rpad, - &[lit("hi"), lit(ScalarValue::Int64(Some(21))), lit("abcdef"),], - Ok(Some("hiabcdefabcdefabcdefa")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Rpad, - &[lit("hi"), lit(ScalarValue::Int64(Some(5))), lit(" "),], - Ok(Some("hi ")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Rpad, - &[lit("hi"), lit(ScalarValue::Int64(Some(5))), lit(""),], - Ok(Some("hi")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Rpad, - &[ - lit(ScalarValue::Utf8(None)), - lit(ScalarValue::Int64(Some(5))), - lit("xy"), - ], - Ok(None), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Rpad, - &[lit("hi"), lit(ScalarValue::Int64(None)), 
lit("xy"),], - Ok(None), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Rpad, - &[ - lit("hi"), - lit(ScalarValue::Int64(Some(5))), - lit(ScalarValue::Utf8(None)), - ], - Ok(None), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Rpad, - &[lit("josé"), lit(ScalarValue::Int64(Some(10))), lit("xy"),], - Ok(Some("joséxyxyxy")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Rpad, - &[lit("josé"), lit(ScalarValue::Int64(Some(10))), lit("éñ"),], - Ok(Some("josééñéñéñ")), - &str, - Utf8, - StringArray - ); - #[cfg(not(feature = "unicode_expressions"))] - test_function!( - Rpad, - &[ - lit("josé"), - lit(ScalarValue::Int64(Some(5))), - ], - internal_err!( - "function rpad requires compilation with feature flag: unicode_expressions." - ), - &str, - Utf8, - StringArray - ); test_function!( EndsWith, &[lit("alphabet"), lit("alph"),], diff --git a/datafusion/physical-expr/src/planner.rs b/datafusion/physical-expr/src/planner.rs index 319d9ca2269a..0dbea09ffb51 100644 --- a/datafusion/physical-expr/src/planner.rs +++ b/datafusion/physical-expr/src/planner.rs @@ -335,11 +335,11 @@ mod tests { use arrow_array::{ArrayRef, BooleanArray, RecordBatch, StringArray}; use arrow_schema::{DataType, Field, Schema}; use datafusion_common::{DFSchema, Result}; - use datafusion_expr::{col, left, Literal}; + use datafusion_expr::{col, lit}; #[test] fn test_create_physical_expr_scalar_input_output() -> Result<()> { - let expr = col("letter").eq(left("APACHE".lit(), 1i64.lit())); + let expr = col("letter").eq(lit("A")); let schema = Schema::new(vec![Field::new("letter", DataType::Utf8, false)]); let df_schema = DFSchema::try_from_qualified_schema("data", &schema)?; diff --git a/datafusion/physical-expr/src/unicode_expressions.rs b/datafusion/physical-expr/src/unicode_expressions.rs index c7e4b7d7c443..faff21111a61 100644 --- 
a/datafusion/physical-expr/src/unicode_expressions.rs +++ b/datafusion/physical-expr/src/unicode_expressions.rs @@ -21,7 +21,7 @@ //! Unicode expressions -use std::cmp::{max, Ordering}; +use std::cmp::max; use std::sync::Arc; use arrow::{ @@ -36,267 +36,6 @@ use datafusion_common::{ exec_err, Result, }; -/// Returns first n characters in the string, or when n is negative, returns all but last |n| characters. -/// left('abcde', 2) = 'ab' -/// The implementation uses UTF-8 code points as characters -pub fn left(args: &[ArrayRef]) -> Result { - let string_array = as_generic_string_array::(&args[0])?; - let n_array = as_int64_array(&args[1])?; - let result = string_array - .iter() - .zip(n_array.iter()) - .map(|(string, n)| match (string, n) { - (Some(string), Some(n)) => match n.cmp(&0) { - Ordering::Less => { - let len = string.chars().count() as i64; - Some(if n.abs() < len { - string.chars().take((len + n) as usize).collect::() - } else { - "".to_string() - }) - } - Ordering::Equal => Some("".to_string()), - Ordering::Greater => { - Some(string.chars().take(n as usize).collect::()) - } - }, - _ => None, - }) - .collect::>(); - - Ok(Arc::new(result) as ArrayRef) -} - -/// Extends the string to length 'length' by prepending the characters fill (a space by default). If the string is already longer than length then it is truncated (on the right). 
-/// lpad('hi', 5, 'xy') = 'xyxhi' -pub fn lpad(args: &[ArrayRef]) -> Result { - match args.len() { - 2 => { - let string_array = as_generic_string_array::(&args[0])?; - let length_array = as_int64_array(&args[1])?; - - let result = string_array - .iter() - .zip(length_array.iter()) - .map(|(string, length)| match (string, length) { - (Some(string), Some(length)) => { - if length > i32::MAX as i64 { - return exec_err!( - "lpad requested length {length} too large" - ); - } - - let length = if length < 0 { 0 } else { length as usize }; - if length == 0 { - Ok(Some("".to_string())) - } else { - let graphemes = string.graphemes(true).collect::>(); - if length < graphemes.len() { - Ok(Some(graphemes[..length].concat())) - } else { - let mut s: String = " ".repeat(length - graphemes.len()); - s.push_str(string); - Ok(Some(s)) - } - } - } - _ => Ok(None), - }) - .collect::>>()?; - - Ok(Arc::new(result) as ArrayRef) - } - 3 => { - let string_array = as_generic_string_array::(&args[0])?; - let length_array = as_int64_array(&args[1])?; - let fill_array = as_generic_string_array::(&args[2])?; - - let result = string_array - .iter() - .zip(length_array.iter()) - .zip(fill_array.iter()) - .map(|((string, length), fill)| match (string, length, fill) { - (Some(string), Some(length), Some(fill)) => { - if length > i32::MAX as i64 { - return exec_err!( - "lpad requested length {length} too large" - ); - } - - let length = if length < 0 { 0 } else { length as usize }; - if length == 0 { - Ok(Some("".to_string())) - } else { - let graphemes = string.graphemes(true).collect::>(); - let fill_chars = fill.chars().collect::>(); - - if length < graphemes.len() { - Ok(Some(graphemes[..length].concat())) - } else if fill_chars.is_empty() { - Ok(Some(string.to_string())) - } else { - let mut s = string.to_string(); - let mut char_vector = - Vec::::with_capacity(length - graphemes.len()); - for l in 0..length - graphemes.len() { - char_vector.push( - *fill_chars.get(l % 
fill_chars.len()).unwrap(), - ); - } - s.insert_str( - 0, - char_vector.iter().collect::().as_str(), - ); - Ok(Some(s)) - } - } - } - _ => Ok(None), - }) - .collect::>>()?; - - Ok(Arc::new(result) as ArrayRef) - } - other => exec_err!( - "lpad was called with {other} arguments. It requires at least 2 and at most 3." - ), - } -} - -/// Reverses the order of the characters in the string. -/// reverse('abcde') = 'edcba' -/// The implementation uses UTF-8 code points as characters -pub fn reverse(args: &[ArrayRef]) -> Result { - let string_array = as_generic_string_array::(&args[0])?; - - let result = string_array - .iter() - .map(|string| string.map(|string: &str| string.chars().rev().collect::())) - .collect::>(); - - Ok(Arc::new(result) as ArrayRef) -} - -/// Returns last n characters in the string, or when n is negative, returns all but first |n| characters. -/// right('abcde', 2) = 'de' -/// The implementation uses UTF-8 code points as characters -pub fn right(args: &[ArrayRef]) -> Result { - let string_array = as_generic_string_array::(&args[0])?; - let n_array = as_int64_array(&args[1])?; - - let result = string_array - .iter() - .zip(n_array.iter()) - .map(|(string, n)| match (string, n) { - (Some(string), Some(n)) => match n.cmp(&0) { - Ordering::Less => Some( - string - .chars() - .skip(n.unsigned_abs() as usize) - .collect::(), - ), - Ordering::Equal => Some("".to_string()), - Ordering::Greater => Some( - string - .chars() - .skip(max(string.chars().count() as i64 - n, 0) as usize) - .collect::(), - ), - }, - _ => None, - }) - .collect::>(); - - Ok(Arc::new(result) as ArrayRef) -} - -/// Extends the string to length 'length' by appending the characters fill (a space by default). If the string is already longer than length then it is truncated. 
-/// rpad('hi', 5, 'xy') = 'hixyx' -pub fn rpad(args: &[ArrayRef]) -> Result { - match args.len() { - 2 => { - let string_array = as_generic_string_array::(&args[0])?; - let length_array = as_int64_array(&args[1])?; - - let result = string_array - .iter() - .zip(length_array.iter()) - .map(|(string, length)| match (string, length) { - (Some(string), Some(length)) => { - if length > i32::MAX as i64 { - return exec_err!( - "rpad requested length {length} too large" - ); - } - - let length = if length < 0 { 0 } else { length as usize }; - if length == 0 { - Ok(Some("".to_string())) - } else { - let graphemes = string.graphemes(true).collect::>(); - if length < graphemes.len() { - Ok(Some(graphemes[..length].concat())) - } else { - let mut s = string.to_string(); - s.push_str(" ".repeat(length - graphemes.len()).as_str()); - Ok(Some(s)) - } - } - } - _ => Ok(None), - }) - .collect::>>()?; - Ok(Arc::new(result) as ArrayRef) - } - 3 => { - let string_array = as_generic_string_array::(&args[0])?; - let length_array = as_int64_array(&args[1])?; - let fill_array = as_generic_string_array::(&args[2])?; - - let result = string_array - .iter() - .zip(length_array.iter()) - .zip(fill_array.iter()) - .map(|((string, length), fill)| match (string, length, fill) { - (Some(string), Some(length), Some(fill)) => { - if length > i32::MAX as i64 { - return exec_err!( - "rpad requested length {length} too large" - ); - } - - let length = if length < 0 { 0 } else { length as usize }; - let graphemes = string.graphemes(true).collect::>(); - let fill_chars = fill.chars().collect::>(); - - if length < graphemes.len() { - Ok(Some(graphemes[..length].concat())) - } else if fill_chars.is_empty() { - Ok(Some(string.to_string())) - } else { - let mut s = string.to_string(); - let mut char_vector = - Vec::::with_capacity(length - graphemes.len()); - for l in 0..length - graphemes.len() { - char_vector - .push(*fill_chars.get(l % fill_chars.len()).unwrap()); - } - 
s.push_str(char_vector.iter().collect::().as_str()); - Ok(Some(s)) - } - } - _ => Ok(None), - }) - .collect::>>()?; - - Ok(Arc::new(result) as ArrayRef) - } - other => exec_err!( - "rpad was called with {other} arguments. It requires at least 2 and at most 3." - ), - } -} - /// Returns starting index of specified substring within string, or zero if it's not present. (Same as position(substring in string), but note the reversed argument order.) /// strpos('high', 'ig') = 2 /// The implementation uses UTF-8 code points as characters diff --git a/datafusion/proto/proto/datafusion.proto b/datafusion/proto/proto/datafusion.proto index 766ca6633ee1..6319372d98d2 100644 --- a/datafusion/proto/proto/datafusion.proto +++ b/datafusion/proto/proto/datafusion.proto @@ -572,8 +572,8 @@ enum ScalarFunction { // 28 was DatePart // 29 was DateTrunc InitCap = 30; - Left = 31; - Lpad = 32; + // 31 was Left + // 32 was Lpad // 33 was Lower // 34 was Ltrim // 35 was MD5 @@ -583,9 +583,9 @@ enum ScalarFunction { // 39 was RegexpReplace // 40 was Repeat // 41 was Replace - Reverse = 42; - Right = 43; - Rpad = 44; + // 42 was Reverse + // 43 was Right + // 44 was Rpad // 45 was Rtrim // 46 was SHA224 // 47 was SHA256 diff --git a/datafusion/proto/src/generated/pbjson.rs b/datafusion/proto/src/generated/pbjson.rs index f2814956ef1b..7281bc9dc263 100644 --- a/datafusion/proto/src/generated/pbjson.rs +++ b/datafusion/proto/src/generated/pbjson.rs @@ -22931,12 +22931,7 @@ impl serde::Serialize for ScalarFunction { Self::Concat => "Concat", Self::ConcatWithSeparator => "ConcatWithSeparator", Self::InitCap => "InitCap", - Self::Left => "Left", - Self::Lpad => "Lpad", Self::Random => "Random", - Self::Reverse => "Reverse", - Self::Right => "Right", - Self::Rpad => "Rpad", Self::Strpos => "Strpos", Self::Substr => "Substr", Self::Translate => "Translate", @@ -22990,12 +22985,7 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "Concat", "ConcatWithSeparator", "InitCap", - "Left", - 
"Lpad", "Random", - "Reverse", - "Right", - "Rpad", "Strpos", "Substr", "Translate", @@ -23078,12 +23068,7 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "Concat" => Ok(ScalarFunction::Concat), "ConcatWithSeparator" => Ok(ScalarFunction::ConcatWithSeparator), "InitCap" => Ok(ScalarFunction::InitCap), - "Left" => Ok(ScalarFunction::Left), - "Lpad" => Ok(ScalarFunction::Lpad), "Random" => Ok(ScalarFunction::Random), - "Reverse" => Ok(ScalarFunction::Reverse), - "Right" => Ok(ScalarFunction::Right), - "Rpad" => Ok(ScalarFunction::Rpad), "Strpos" => Ok(ScalarFunction::Strpos), "Substr" => Ok(ScalarFunction::Substr), "Translate" => Ok(ScalarFunction::Translate), diff --git a/datafusion/proto/src/generated/prost.rs b/datafusion/proto/src/generated/prost.rs index ecc94fcdaf99..2fe89efb9cea 100644 --- a/datafusion/proto/src/generated/prost.rs +++ b/datafusion/proto/src/generated/prost.rs @@ -2871,8 +2871,8 @@ pub enum ScalarFunction { /// 28 was DatePart /// 29 was DateTrunc InitCap = 30, - Left = 31, - Lpad = 32, + /// 31 was Left + /// 32 was Lpad /// 33 was Lower /// 34 was Ltrim /// 35 was MD5 @@ -2882,9 +2882,9 @@ pub enum ScalarFunction { /// 39 was RegexpReplace /// 40 was Repeat /// 41 was Replace - Reverse = 42, - Right = 43, - Rpad = 44, + /// 42 was Reverse + /// 43 was Right + /// 44 was Rpad /// 45 was Rtrim /// 46 was SHA224 /// 47 was SHA256 @@ -3004,12 +3004,7 @@ impl ScalarFunction { ScalarFunction::Concat => "Concat", ScalarFunction::ConcatWithSeparator => "ConcatWithSeparator", ScalarFunction::InitCap => "InitCap", - ScalarFunction::Left => "Left", - ScalarFunction::Lpad => "Lpad", ScalarFunction::Random => "Random", - ScalarFunction::Reverse => "Reverse", - ScalarFunction::Right => "Right", - ScalarFunction::Rpad => "Rpad", ScalarFunction::Strpos => "Strpos", ScalarFunction::Substr => "Substr", ScalarFunction::Translate => "Translate", @@ -3057,12 +3052,7 @@ impl ScalarFunction { "Concat" => Some(Self::Concat), "ConcatWithSeparator" => 
Some(Self::ConcatWithSeparator), "InitCap" => Some(Self::InitCap), - "Left" => Some(Self::Left), - "Lpad" => Some(Self::Lpad), "Random" => Some(Self::Random), - "Reverse" => Some(Self::Reverse), - "Right" => Some(Self::Right), - "Rpad" => Some(Self::Rpad), "Strpos" => Some(Self::Strpos), "Substr" => Some(Self::Substr), "Translate" => Some(Self::Translate), diff --git a/datafusion/proto/src/logical_plan/from_proto.rs b/datafusion/proto/src/logical_plan/from_proto.rs index 19edd71a3a80..2c6f2e479b24 100644 --- a/datafusion/proto/src/logical_plan/from_proto.rs +++ b/datafusion/proto/src/logical_plan/from_proto.rs @@ -17,18 +17,6 @@ use std::sync::Arc; -use crate::protobuf::{ - self, - plan_type::PlanTypeEnum::{ - AnalyzedLogicalPlan, FinalAnalyzedLogicalPlan, FinalLogicalPlan, - FinalPhysicalPlan, FinalPhysicalPlanWithStats, InitialLogicalPlan, - InitialPhysicalPlan, InitialPhysicalPlanWithStats, OptimizedLogicalPlan, - OptimizedPhysicalPlan, - }, - AnalyzedLogicalPlanType, CubeNode, GroupingSetNode, OptimizedLogicalPlanType, - OptimizedPhysicalPlanType, PlaceholderNode, RollupNode, -}; - use arrow::{ array::AsArray, buffer::Buffer, @@ -38,6 +26,7 @@ use arrow::{ }, ipc::{reader::read_record_batch, root_as_message}, }; + use datafusion::execution::registry::FunctionRegistry; use datafusion_common::{ arrow_datafusion_err, internal_err, plan_datafusion_err, Column, Constraint, @@ -51,17 +40,29 @@ use datafusion_expr::{ acosh, asinh, atan, atan2, atanh, cbrt, ceil, coalesce, concat_expr, concat_ws_expr, cos, cosh, cot, degrees, ends_with, exp, expr::{self, InList, Sort, WindowFunction}, - factorial, find_in_set, floor, gcd, initcap, iszero, lcm, left, ln, log, log10, log2, + factorial, find_in_set, floor, gcd, initcap, iszero, lcm, ln, log, log10, log2, logical_plan::{PlanType, StringifiedPlan}, - lpad, nanvl, pi, power, radians, random, reverse, right, round, rpad, signum, sin, - sinh, sqrt, strpos, substr, substr_index, substring, translate, trunc, - AggregateFunction, 
Between, BinaryExpr, BuiltInWindowFunction, BuiltinScalarFunction, - Case, Cast, Expr, GetFieldAccess, GetIndexedField, GroupingSet, + nanvl, pi, power, radians, random, round, signum, sin, sinh, sqrt, strpos, substr, + substr_index, substring, translate, trunc, AggregateFunction, Between, BinaryExpr, + BuiltInWindowFunction, BuiltinScalarFunction, Case, Cast, Expr, GetFieldAccess, + GetIndexedField, GroupingSet, GroupingSet::GroupingSets, JoinConstraint, JoinType, Like, Operator, TryCast, WindowFrame, WindowFrameBound, WindowFrameUnits, }; +use crate::protobuf::{ + self, + plan_type::PlanTypeEnum::{ + AnalyzedLogicalPlan, FinalAnalyzedLogicalPlan, FinalLogicalPlan, + FinalPhysicalPlan, FinalPhysicalPlanWithStats, InitialLogicalPlan, + InitialPhysicalPlan, InitialPhysicalPlanWithStats, OptimizedLogicalPlan, + OptimizedPhysicalPlan, + }, + AnalyzedLogicalPlanType, CubeNode, GroupingSetNode, OptimizedLogicalPlanType, + OptimizedPhysicalPlanType, PlaceholderNode, RollupNode, +}; + use super::LogicalExtensionCodec; #[derive(Debug)] @@ -453,12 +454,7 @@ impl From<&protobuf::ScalarFunction> for BuiltinScalarFunction { ScalarFunction::ConcatWithSeparator => Self::ConcatWithSeparator, ScalarFunction::EndsWith => Self::EndsWith, ScalarFunction::InitCap => Self::InitCap, - ScalarFunction::Left => Self::Left, - ScalarFunction::Lpad => Self::Lpad, ScalarFunction::Random => Self::Random, - ScalarFunction::Reverse => Self::Reverse, - ScalarFunction::Right => Self::Right, - ScalarFunction::Rpad => Self::Rpad, ScalarFunction::Strpos => Self::Strpos, ScalarFunction::Substr => Self::Substr, ScalarFunction::Translate => Self::Translate, @@ -1382,26 +1378,13 @@ pub fn parse_expr( parse_expr(&args[0], registry, codec)?, parse_expr(&args[1], registry, codec)?, )), - ScalarFunction::Left => Ok(left( - parse_expr(&args[0], registry, codec)?, - parse_expr(&args[1], registry, codec)?, - )), ScalarFunction::Random => Ok(random()), - ScalarFunction::Reverse => { - 
Ok(reverse(parse_expr(&args[0], registry, codec)?)) - } - ScalarFunction::Right => Ok(right( - parse_expr(&args[0], registry, codec)?, - parse_expr(&args[1], registry, codec)?, - )), ScalarFunction::Concat => { Ok(concat_expr(parse_exprs(args, registry, codec)?)) } ScalarFunction::ConcatWithSeparator => { Ok(concat_ws_expr(parse_exprs(args, registry, codec)?)) } - ScalarFunction::Lpad => Ok(lpad(parse_exprs(args, registry, codec)?)), - ScalarFunction::Rpad => Ok(rpad(parse_exprs(args, registry, codec)?)), ScalarFunction::EndsWith => Ok(ends_with( parse_expr(&args[0], registry, codec)?, parse_expr(&args[1], registry, codec)?, diff --git a/datafusion/proto/src/logical_plan/to_proto.rs b/datafusion/proto/src/logical_plan/to_proto.rs index 11fc7362c75d..ea682a5a22f8 100644 --- a/datafusion/proto/src/logical_plan/to_proto.rs +++ b/datafusion/proto/src/logical_plan/to_proto.rs @@ -1445,12 +1445,7 @@ impl TryFrom<&BuiltinScalarFunction> for protobuf::ScalarFunction { BuiltinScalarFunction::ConcatWithSeparator => Self::ConcatWithSeparator, BuiltinScalarFunction::EndsWith => Self::EndsWith, BuiltinScalarFunction::InitCap => Self::InitCap, - BuiltinScalarFunction::Left => Self::Left, - BuiltinScalarFunction::Lpad => Self::Lpad, BuiltinScalarFunction::Random => Self::Random, - BuiltinScalarFunction::Reverse => Self::Reverse, - BuiltinScalarFunction::Right => Self::Right, - BuiltinScalarFunction::Rpad => Self::Rpad, BuiltinScalarFunction::Strpos => Self::Strpos, BuiltinScalarFunction::Substr => Self::Substr, BuiltinScalarFunction::Translate => Self::Translate,