From 6f9948b8c027f782431805331de174c4092de40a Mon Sep 17 00:00:00 2001 From: Trent Hauck Date: Thu, 28 Mar 2024 13:40:36 -0700 Subject: [PATCH 01/15] feat: pass SessionState not SessionConfig to FunctionFactory::create (#9837) --- datafusion-examples/examples/function_factory.rs | 7 ++++--- datafusion/core/src/execution/context/mod.rs | 4 ++-- .../tests/user_defined/user_defined_scalar_functions.rs | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/datafusion-examples/examples/function_factory.rs b/datafusion-examples/examples/function_factory.rs index 6c033e6c8eef..a7c8558c6da8 100644 --- a/datafusion-examples/examples/function_factory.rs +++ b/datafusion-examples/examples/function_factory.rs @@ -16,8 +16,9 @@ // under the License. use datafusion::error::Result; -use datafusion::execution::config::SessionConfig; -use datafusion::execution::context::{FunctionFactory, RegisterFunction, SessionContext}; +use datafusion::execution::context::{ + FunctionFactory, RegisterFunction, SessionContext, SessionState, +}; use datafusion_common::tree_node::{Transformed, TreeNode}; use datafusion_common::{exec_err, internal_err, DataFusionError}; use datafusion_expr::simplify::ExprSimplifyResult; @@ -91,7 +92,7 @@ impl FunctionFactory for CustomFunctionFactory { /// the function instance. 
async fn create( &self, - _state: &SessionConfig, + _state: &SessionState, statement: CreateFunction, ) -> Result { let f: ScalarFunctionWrapper = statement.try_into()?; diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs index 116e45c8c130..31f390607f04 100644 --- a/datafusion/core/src/execution/context/mod.rs +++ b/datafusion/core/src/execution/context/mod.rs @@ -794,7 +794,7 @@ impl SessionContext { let function_factory = &state.function_factory; match function_factory { - Some(f) => f.create(state.config(), stmt).await?, + Some(f) => f.create(&state, stmt).await?, _ => Err(DataFusionError::Configuration( "Function factory has not been configured".into(), ))?, @@ -1288,7 +1288,7 @@ pub trait FunctionFactory: Sync + Send { /// Handles creation of user defined function specified in [CreateFunction] statement async fn create( &self, - state: &SessionConfig, + state: &SessionState, statement: CreateFunction, ) -> Result; } diff --git a/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs b/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs index b525e4fc6341..86be887198ae 100644 --- a/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs +++ b/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs @@ -747,7 +747,7 @@ struct CustomFunctionFactory {} impl FunctionFactory for CustomFunctionFactory { async fn create( &self, - _state: &SessionConfig, + _state: &SessionState, statement: CreateFunction, ) -> Result { let f: ScalarFunctionWrapper = statement.try_into()?; From 81c96fc3db0ea35638278f32df066be63b745a51 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 28 Mar 2024 17:37:25 -0600 Subject: [PATCH 02/15] Prepare 37.0.0 Release (#9697) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * bump version * changelog * Update configs.md * Update Cargo.toml * Update 37.0.0.md * Update 37.0.0.md * Update 37.0.0.md * 
update changelog * update changelog --------- Co-authored-by: Daniël Heres --- Cargo.toml | 30 +-- datafusion-cli/Cargo.lock | 24 +-- datafusion-cli/Cargo.toml | 4 +- datafusion/CHANGELOG.md | 1 + dev/changelog/37.0.0.md | 347 ++++++++++++++++++++++++++++++ docs/source/user-guide/configs.md | 2 +- 6 files changed, 378 insertions(+), 30 deletions(-) create mode 100644 dev/changelog/37.0.0.md diff --git a/Cargo.toml b/Cargo.toml index c3dade8bc6c5..8e89e5ef3b85 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -49,7 +49,7 @@ license = "Apache-2.0" readme = "README.md" repository = "https://github.com/apache/arrow-datafusion" rust-version = "1.72" -version = "36.0.0" +version = "37.0.0" [workspace.dependencies] # We turn off default-features for some dependencies here so the workspaces which inherit them can @@ -71,20 +71,20 @@ bytes = "1.4" chrono = { version = "0.4.34", default-features = false } ctor = "0.2.0" dashmap = "5.4.0" -datafusion = { path = "datafusion/core", version = "36.0.0", default-features = false } -datafusion-common = { path = "datafusion/common", version = "36.0.0", default-features = false } -datafusion-common-runtime = { path = "datafusion/common-runtime", version = "36.0.0" } -datafusion-execution = { path = "datafusion/execution", version = "36.0.0" } -datafusion-expr = { path = "datafusion/expr", version = "36.0.0" } -datafusion-functions = { path = "datafusion/functions", version = "36.0.0" } -datafusion-functions-array = { path = "datafusion/functions-array", version = "36.0.0" } -datafusion-optimizer = { path = "datafusion/optimizer", version = "36.0.0", default-features = false } -datafusion-physical-expr = { path = "datafusion/physical-expr", version = "36.0.0", default-features = false } -datafusion-physical-plan = { path = "datafusion/physical-plan", version = "36.0.0" } -datafusion-proto = { path = "datafusion/proto", version = "36.0.0" } -datafusion-sql = { path = "datafusion/sql", version = "36.0.0" } -datafusion-sqllogictest = { path = 
"datafusion/sqllogictest", version = "36.0.0" } -datafusion-substrait = { path = "datafusion/substrait", version = "36.0.0" } +datafusion = { path = "datafusion/core", version = "37.0.0", default-features = false } +datafusion-common = { path = "datafusion/common", version = "37.0.0", default-features = false } +datafusion-common-runtime = { path = "datafusion/common-runtime", version = "37.0.0" } +datafusion-execution = { path = "datafusion/execution", version = "37.0.0" } +datafusion-expr = { path = "datafusion/expr", version = "37.0.0" } +datafusion-functions = { path = "datafusion/functions", version = "37.0.0" } +datafusion-functions-array = { path = "datafusion/functions-array", version = "37.0.0" } +datafusion-optimizer = { path = "datafusion/optimizer", version = "37.0.0", default-features = false } +datafusion-physical-expr = { path = "datafusion/physical-expr", version = "37.0.0", default-features = false } +datafusion-physical-plan = { path = "datafusion/physical-plan", version = "37.0.0" } +datafusion-proto = { path = "datafusion/proto", version = "37.0.0" } +datafusion-sql = { path = "datafusion/sql", version = "37.0.0" } +datafusion-sqllogictest = { path = "datafusion/sqllogictest", version = "37.0.0" } +datafusion-substrait = { path = "datafusion/substrait", version = "37.0.0" } doc-comment = "0.3" env_logger = "0.11" futures = "0.3" diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index ba60c04cea55..0277d23f4de0 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -1116,7 +1116,7 @@ dependencies = [ [[package]] name = "datafusion" -version = "36.0.0" +version = "37.0.0" dependencies = [ "ahash", "apache-avro", @@ -1167,7 +1167,7 @@ dependencies = [ [[package]] name = "datafusion-cli" -version = "36.0.0" +version = "37.0.0" dependencies = [ "arrow", "assert_cmd", @@ -1195,7 +1195,7 @@ dependencies = [ [[package]] name = "datafusion-common" -version = "36.0.0" +version = "37.0.0" dependencies = [ "ahash", 
"apache-avro", @@ -1215,14 +1215,14 @@ dependencies = [ [[package]] name = "datafusion-common-runtime" -version = "36.0.0" +version = "37.0.0" dependencies = [ "tokio", ] [[package]] name = "datafusion-execution" -version = "36.0.0" +version = "37.0.0" dependencies = [ "arrow", "chrono", @@ -1241,7 +1241,7 @@ dependencies = [ [[package]] name = "datafusion-expr" -version = "36.0.0" +version = "37.0.0" dependencies = [ "ahash", "arrow", @@ -1256,7 +1256,7 @@ dependencies = [ [[package]] name = "datafusion-functions" -version = "36.0.0" +version = "37.0.0" dependencies = [ "arrow", "base64 0.22.0", @@ -1279,7 +1279,7 @@ dependencies = [ [[package]] name = "datafusion-functions-array" -version = "36.0.0" +version = "37.0.0" dependencies = [ "arrow", "arrow-array", @@ -1297,7 +1297,7 @@ dependencies = [ [[package]] name = "datafusion-optimizer" -version = "36.0.0" +version = "37.0.0" dependencies = [ "arrow", "async-trait", @@ -1313,7 +1313,7 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" -version = "36.0.0" +version = "37.0.0" dependencies = [ "ahash", "arrow", @@ -1346,7 +1346,7 @@ dependencies = [ [[package]] name = "datafusion-physical-plan" -version = "36.0.0" +version = "37.0.0" dependencies = [ "ahash", "arrow", @@ -1375,7 +1375,7 @@ dependencies = [ [[package]] name = "datafusion-sql" -version = "36.0.0" +version = "37.0.0" dependencies = [ "arrow", "arrow-array", diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index da744a06f3aa..18e14357314e 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "datafusion-cli" description = "Command Line Client for DataFusion query engine." 
-version = "36.0.0" +version = "37.0.0" authors = ["Apache Arrow "] edition = "2021" keywords = ["arrow", "datafusion", "query", "sql"] @@ -35,7 +35,7 @@ async-trait = "0.1.41" aws-config = "0.55" aws-credential-types = "0.55" clap = { version = "3", features = ["derive", "cargo"] } -datafusion = { path = "../datafusion/core", version = "36.0.0", features = [ +datafusion = { path = "../datafusion/core", version = "37.0.0", features = [ "avro", "crypto_expressions", "datetime_expressions", diff --git a/datafusion/CHANGELOG.md b/datafusion/CHANGELOG.md index 2d09782a3982..c111375e3058 100644 --- a/datafusion/CHANGELOG.md +++ b/datafusion/CHANGELOG.md @@ -19,6 +19,7 @@ # Changelog +- [37.0.0](../dev/changelog/37.0.0.md) - [36.0.0](../dev/changelog/36.0.0.md) - [35.0.0](../dev/changelog/35.0.0.md) - [34.0.0](../dev/changelog/34.0.0.md) diff --git a/dev/changelog/37.0.0.md b/dev/changelog/37.0.0.md new file mode 100644 index 000000000000..b1fcd5fdf008 --- /dev/null +++ b/dev/changelog/37.0.0.md @@ -0,0 +1,347 @@ + + +## [37.0.0](https://github.com/apache/arrow-datafusion/tree/37.0.0) (2024-03-28) + +[Full Changelog](https://github.com/apache/arrow-datafusion/compare/36.0.0...37.0.0) + +**Breaking changes:** + +- refactor: Change `SchemaProvider::table` to return `Result` rather than `Option<..>` [#9307](https://github.com/apache/arrow-datafusion/pull/9307) (crepererum) +- feat: issue_9285: port builtin reg function into datafusion-function-\* (1/3 regexpmatch) [#9329](https://github.com/apache/arrow-datafusion/pull/9329) (Lordworms) +- Cache common plan properties to eliminate recursive calls in physical plan [#9346](https://github.com/apache/arrow-datafusion/pull/9346) (mustafasrepo) +- Consolidate `TreeNode` transform and rewrite APIs [#8891](https://github.com/apache/arrow-datafusion/pull/8891) (peter-toth) +- Extend argument types for udf `return_type_from_exprs` [#9522](https://github.com/apache/arrow-datafusion/pull/9522) (jayzhan211) +- Systematic Configuration 
in 'Create External Table' and 'Copy To' Options [#9382](https://github.com/apache/arrow-datafusion/pull/9382) (metesynnada) +- Move trim functions (btrim, ltrim, rtrim) to datafusion_functions, make expr_fn API consistent [#9730](https://github.com/apache/arrow-datafusion/pull/9730) (Omega359) + +**Performance related:** + +- perf: improve to_field performance [#9722](https://github.com/apache/arrow-datafusion/pull/9722) (haohuaijin) + +**Implemented enhancements:** + +- feat: support for defining ARRAY columns in `CREATE TABLE` [#9381](https://github.com/apache/arrow-datafusion/pull/9381) (jonahgao) +- feat: support `unnest` in FROM clause [#9355](https://github.com/apache/arrow-datafusion/pull/9355) (jonahgao) +- feat: support nvl2 function [#9364](https://github.com/apache/arrow-datafusion/pull/9364) (guojidan) +- feat: issue #9224 substitute tlide in table path [#9259](https://github.com/apache/arrow-datafusion/pull/9259) (Lordworms) +- feat: replace std Instant with wasm-compatible wrapper [#9189](https://github.com/apache/arrow-datafusion/pull/9189) (waynexia) +- feat: support `unnest` with additional columns [#9400](https://github.com/apache/arrow-datafusion/pull/9400) (jonahgao) +- feat: Support `EscapedStringLiteral`, update sqlparser to `0.44.0` [#9268](https://github.com/apache/arrow-datafusion/pull/9268) (JasonLi-cn) +- feat: add support for fixed list wildcard in type signature [#9312](https://github.com/apache/arrow-datafusion/pull/9312) (universalmind303) +- feat: Add projection to HashJoinExec. 
[#9236](https://github.com/apache/arrow-datafusion/pull/9236) (my-vegetable-has-exploded) +- feat: function name hints for UDFs [#9407](https://github.com/apache/arrow-datafusion/pull/9407) (SteveLauC) +- feat: Introduce convert Expr to SQL string API and basic feature [#9517](https://github.com/apache/arrow-datafusion/pull/9517) (backkem) +- feat: implement more expr_to_sql functionality [#9578](https://github.com/apache/arrow-datafusion/pull/9578) (devinjdangelo) +- feat: implement aggregation and subquery plans to SQL [#9606](https://github.com/apache/arrow-datafusion/pull/9606) (devinjdangelo) +- feat: track memory usage for recursive CTE, enable recursive CTEs by default [#9619](https://github.com/apache/arrow-datafusion/pull/9619) (jonahgao) +- feat: Between expr to sql string [#9803](https://github.com/apache/arrow-datafusion/pull/9803) (sebastian2296) +- feat: Expose `array_empty` and `list_empty` functions as alias of `empty` function [#9807](https://github.com/apache/arrow-datafusion/pull/9807) (erenavsarogullari) +- feat: Not expr to string [#9802](https://github.com/apache/arrow-datafusion/pull/9802) (sebastian2296) +- feat: pass SessionState not SessionConfig to FunctionFactory::create [#9837](https://github.com/apache/arrow-datafusion/pull/9837) (tshauck) + +**Fixed bugs:** + +- fix: use `JoinSet` to make spawned tasks cancel-safe [#9318](https://github.com/apache/arrow-datafusion/pull/9318) (DDtKey) +- fix: nvl function's return type [#9357](https://github.com/apache/arrow-datafusion/pull/9357) (guojidan) +- fix: panic in isnan() when no args are given [#9377](https://github.com/apache/arrow-datafusion/pull/9377) (SteveLauC) +- fix: using test data sample for catalog example [#9372](https://github.com/apache/arrow-datafusion/pull/9372) (korowa) +- fix: sort_batch function unsupported mixed types with list [#9410](https://github.com/apache/arrow-datafusion/pull/9410) (JasonLi-cn) +- fix: casting to ARRAY types failed 
[#9441](https://github.com/apache/arrow-datafusion/pull/9441) (jonahgao) +- fix: reading from partitioned `json` & `arrow` tables [#9431](https://github.com/apache/arrow-datafusion/pull/9431) (korowa) +- fix: coalesce function should return correct data type [#9459](https://github.com/apache/arrow-datafusion/pull/9459) (viirya) +- fix: `generate_series` and `range` panic on edge cases [#9503](https://github.com/apache/arrow-datafusion/pull/9503) (jonahgao) +- fix: `substr_index` not handling negative occurrence correctly [#9475](https://github.com/apache/arrow-datafusion/pull/9475) (jonahgao) +- fix: support two argument TRIM [#9521](https://github.com/apache/arrow-datafusion/pull/9521) (tshauck) +- fix: incorrect null handling in `range` and `generate_series` [#9574](https://github.com/apache/arrow-datafusion/pull/9574) (jonahgao) +- fix: recursive cte hangs on joins [#9687](https://github.com/apache/arrow-datafusion/pull/9687) (jonahgao) +- fix: parallel parquet can underflow when max_record_batch_rows < execution.batch_size [#9737](https://github.com/apache/arrow-datafusion/pull/9737) (devinjdangelo) +- fix: change placeholder errors from Internal to Plan [#9745](https://github.com/apache/arrow-datafusion/pull/9745) (erratic-pattern) +- fix: ensure mutual compatibility of the two input schemas from recursive CTEs [#9795](https://github.com/apache/arrow-datafusion/pull/9795) (jonahgao) + +**Documentation updates:** + +- docs: put flatten in top fn list [#9376](https://github.com/apache/arrow-datafusion/pull/9376) (SteveLauC) +- Update documentation so list_to_string alias to point to array_to_string [#9374](https://github.com/apache/arrow-datafusion/pull/9374) (monkwire) +- Uplift keys/dependencies to use more workspace inheritance [#9293](https://github.com/apache/arrow-datafusion/pull/9293) (Jefffrey) +- docs: update contributor guide (migration to sqllogictest is done) [#9408](https://github.com/apache/arrow-datafusion/pull/9408) (SteveLauC) +- Move the 
to_timestamp\* functions to datafusion-functions [#9388](https://github.com/apache/arrow-datafusion/pull/9388) (Omega359) +- NEW Logo [#9385](https://github.com/apache/arrow-datafusion/pull/9385) (pinarbayata) +- Minor: docs: rm duplicate words. [#9449](https://github.com/apache/arrow-datafusion/pull/9449) (my-vegetable-has-exploded) +- Update contributor guide with updated scalar function howto [#9438](https://github.com/apache/arrow-datafusion/pull/9438) (Omega359) +- docs: fix extraneous char in array functions table of contents [#9560](https://github.com/apache/arrow-datafusion/pull/9560) (tshauck) +- doc: Add missing doc link [#9631](https://github.com/apache/arrow-datafusion/pull/9631) (Weijun-H) +- chore: remove repetitive word `the the` --> `the` in docs / comments [#9673](https://github.com/apache/arrow-datafusion/pull/9673) (InventiveCoder) +- Update example-usage.md to remove reference to simd and rust nightly. [#9677](https://github.com/apache/arrow-datafusion/pull/9677) (Omega359) +- Minor: Improve documentation for `LogicalPlan::expressions` [#9698](https://github.com/apache/arrow-datafusion/pull/9698) (alamb) +- Add Minimum Supported Rust Version policy to docs [#9681](https://github.com/apache/arrow-datafusion/pull/9681) (alamb) +- doc: Updated known users list and usage dependency description [#9718](https://github.com/apache/arrow-datafusion/pull/9718) (comphead) + +**Merged pull requests:** + +- refactor: Change `SchemaProvider::table` to return `Result` rather than `Option<..>` [#9307](https://github.com/apache/arrow-datafusion/pull/9307) (crepererum) +- fix write_partitioned_parquet_results test case bug [#9360](https://github.com/apache/arrow-datafusion/pull/9360) (guojidan) +- fix: use `JoinSet` to make spawned tasks cancel-safe [#9318](https://github.com/apache/arrow-datafusion/pull/9318) (DDtKey) +- Update nix requirement from 0.27.1 to 0.28.0 [#9344](https://github.com/apache/arrow-datafusion/pull/9344) (dependabot[bot]) +- Replace usages 
of internal_err with exec_err where appropriate [#9241](https://github.com/apache/arrow-datafusion/pull/9241) (Omega359) +- feat : Support for deregistering user defined functions [#9239](https://github.com/apache/arrow-datafusion/pull/9239) (mobley-trent) +- fix: nvl function's return type [#9357](https://github.com/apache/arrow-datafusion/pull/9357) (guojidan) +- refactor: move acos() to function crate [#9297](https://github.com/apache/arrow-datafusion/pull/9297) (SteveLauC) +- docs: put flatten in top fn list [#9376](https://github.com/apache/arrow-datafusion/pull/9376) (SteveLauC) +- Update documentation so list_to_string alias to point to array_to_string [#9374](https://github.com/apache/arrow-datafusion/pull/9374) (monkwire) +- feat: issue_9285: port builtin reg function into datafusion-function-\* (1/3 regexpmatch) [#9329](https://github.com/apache/arrow-datafusion/pull/9329) (Lordworms) +- Add test to verify issue #9161 [#9265](https://github.com/apache/arrow-datafusion/pull/9265) (jonahgao) +- refactor: fix error macros hygiene (always import `DataFusionError`) [#9366](https://github.com/apache/arrow-datafusion/pull/9366) (crepererum) +- feat: support for defining ARRAY columns in `CREATE TABLE` [#9381](https://github.com/apache/arrow-datafusion/pull/9381) (jonahgao) +- fix: panic in isnan() when no args are given [#9377](https://github.com/apache/arrow-datafusion/pull/9377) (SteveLauC) +- feat: support `unnest` in FROM clause [#9355](https://github.com/apache/arrow-datafusion/pull/9355) (jonahgao) +- feat: support nvl2 function [#9364](https://github.com/apache/arrow-datafusion/pull/9364) (guojidan) +- refactor: move asin() to function crate [#9379](https://github.com/apache/arrow-datafusion/pull/9379) (SteveLauC) +- fix: using test data sample for catalog example [#9372](https://github.com/apache/arrow-datafusion/pull/9372) (korowa) +- delete tail space, fix `error: unused import: DataFusionError` 
[#9386](https://github.com/apache/arrow-datafusion/pull/9386) (Tangruilin) +- Run cargo-fmt on `datafusion-functions/core` [#9367](https://github.com/apache/arrow-datafusion/pull/9367) (alamb) +- Cache common plan properties to eliminate recursive calls in physical plan [#9346](https://github.com/apache/arrow-datafusion/pull/9346) (mustafasrepo) +- Run cargo-fmt on all of `datafusion-functions` [#9390](https://github.com/apache/arrow-datafusion/pull/9390) (alamb) +- feat: issue #9224 substitute tlide in table path [#9259](https://github.com/apache/arrow-datafusion/pull/9259) (Lordworms) +- port range function and change gen_series logic [#9352](https://github.com/apache/arrow-datafusion/pull/9352) (Lordworms) +- [MINOR]: Generate physical plan, instead of logical plan in the bench test [#9383](https://github.com/apache/arrow-datafusion/pull/9383) (mustafasrepo) +- Add `to_date` function [#9019](https://github.com/apache/arrow-datafusion/pull/9019) (Tangruilin) +- Minor: clarify performance in docs for `ScalarUDF`, `ScalarUDAF` and `ScalarUDWF` [#9384](https://github.com/apache/arrow-datafusion/pull/9384) (alamb) +- feat: replace std Instant with wasm-compatible wrapper [#9189](https://github.com/apache/arrow-datafusion/pull/9189) (waynexia) +- Uplift keys/dependencies to use more workspace inheritance [#9293](https://github.com/apache/arrow-datafusion/pull/9293) (Jefffrey) +- Improve documentation for ExecutionPlanProperties, use consistent field name [#9389](https://github.com/apache/arrow-datafusion/pull/9389) (alamb) +- Doc: Workaround for Running cargo test locally without signficant memory [#9402](https://github.com/apache/arrow-datafusion/pull/9402) (devinjdangelo) +- feat: support `unnest` with additional columns [#9400](https://github.com/apache/arrow-datafusion/pull/9400) (jonahgao) +- Minor: improve the display name of `unnest` expressions [#9412](https://github.com/apache/arrow-datafusion/pull/9412) (jonahgao) +- Minor: Move function signature check to 
planning stage [#9401](https://github.com/apache/arrow-datafusion/pull/9401) (2010YOUY01) +- chore(deps): update substrait requirement from 0.24.0 to 0.25.1 [#9406](https://github.com/apache/arrow-datafusion/pull/9406) (dependabot[bot]) +- docs: update contributor guide (migration to sqllogictest is done) [#9408](https://github.com/apache/arrow-datafusion/pull/9408) (SteveLauC) +- Move the to_timestamp\* functions to datafusion-functions [#9388](https://github.com/apache/arrow-datafusion/pull/9388) (Omega359) +- Minor: Support LargeList List Range indexing and fix large list handling in ConstEvaluator [#9393](https://github.com/apache/arrow-datafusion/pull/9393) (jayzhan211) +- NEW Logo [#9385](https://github.com/apache/arrow-datafusion/pull/9385) (pinarbayata) +- Handle serde for ScalarUDF [#9395](https://github.com/apache/arrow-datafusion/pull/9395) (yyy1000) +- Minior: Add tests with `sqrt` with negative argument [#9426](https://github.com/apache/arrow-datafusion/pull/9426) (caicancai) +- Move SpawnedTask from datafusion_physical_plan to new `datafusion_common_runtime` crate [#9414](https://github.com/apache/arrow-datafusion/pull/9414) (mustafasrepo) +- Re-export datafusion-functions-array [#9433](https://github.com/apache/arrow-datafusion/pull/9433) (andygrove) +- Minor: Support LargeList for ListIndex [#9424](https://github.com/apache/arrow-datafusion/pull/9424) (PsiACE) +- move ArrayDims, ArrayNdims and Cardinality to datafusion-function-crate [#9425](https://github.com/apache/arrow-datafusion/pull/9425) (Weijun-H) +- refactor: make instr() an alias of strpos() [#9396](https://github.com/apache/arrow-datafusion/pull/9396) (SteveLauC) +- Add test case for invalid tz in timestamp literal [#9429](https://github.com/apache/arrow-datafusion/pull/9429) (MohamedAbdeen21) +- Minor: simplify call [#9434](https://github.com/apache/arrow-datafusion/pull/9434) (alamb) +- Support IGNORE NULLS for LEAD window function 
[#9419](https://github.com/apache/arrow-datafusion/pull/9419) (comphead) +- fix sqllogicaltest result [#9444](https://github.com/apache/arrow-datafusion/pull/9444) (jackwener) +- Minor: docs: rm duplicate words. [#9449](https://github.com/apache/arrow-datafusion/pull/9449) (my-vegetable-has-exploded) +- minor: fix cargo clippy some warning [#9442](https://github.com/apache/arrow-datafusion/pull/9442) (jackwener) +- port regexp_like function and port related tests [#9397](https://github.com/apache/arrow-datafusion/pull/9397) (Lordworms) +- fix: sort_batch function unsupported mixed types with list [#9410](https://github.com/apache/arrow-datafusion/pull/9410) (JasonLi-cn) +- refactor: add `join_unwind` to `SpawnedTask` [#9422](https://github.com/apache/arrow-datafusion/pull/9422) (DDtKey) +- Ignore null LEAD support for small batch sizes. [#9445](https://github.com/apache/arrow-datafusion/pull/9445) (mustafasrepo) +- fix: casting to ARRAY types failed [#9441](https://github.com/apache/arrow-datafusion/pull/9441) (jonahgao) +- fix: reading from partitioned `json` & `arrow` tables [#9431](https://github.com/apache/arrow-datafusion/pull/9431) (korowa) +- feat: Support `EscapedStringLiteral`, update sqlparser to `0.44.0` [#9268](https://github.com/apache/arrow-datafusion/pull/9268) (JasonLi-cn) +- Minor: fix LEAD test description [#9451](https://github.com/apache/arrow-datafusion/pull/9451) (comphead) +- Consolidate `TreeNode` transform and rewrite APIs [#8891](https://github.com/apache/arrow-datafusion/pull/8891) (peter-toth) +- Support `Date32` arguments for `generate_series` [#9420](https://github.com/apache/arrow-datafusion/pull/9420) (Lordworms) +- Minor: change doc for range [#9455](https://github.com/apache/arrow-datafusion/pull/9455) (Lordworms) +- doc: add missing function index in scalar_expression.md [#9462](https://github.com/apache/arrow-datafusion/pull/9462) (Weijun-H) +- build: Update bigdecimal version in `Cargo.toml` 
[#9471](https://github.com/apache/arrow-datafusion/pull/9471) (comphead) +- chore(deps): update base64 requirement from 0.21 to 0.22 [#9446](https://github.com/apache/arrow-datafusion/pull/9446) (dependabot[bot]) +- Port regexp_replace functions and related tests [#9454](https://github.com/apache/arrow-datafusion/pull/9454) (Lordworms) +- Update contributor guide with updated scalar function howto [#9438](https://github.com/apache/arrow-datafusion/pull/9438) (Omega359) +- feat: add support for fixed list wildcard in type signature [#9312](https://github.com/apache/arrow-datafusion/pull/9312) (universalmind303) +- Add a `ScalarUDFImpl::simplfy()` API, move `SimplifyInfo` et al to datafusion_expr [#9304](https://github.com/apache/arrow-datafusion/pull/9304) (jayzhan211) +- Implement IGNORE NULLS for FIRST_VALUE [#9411](https://github.com/apache/arrow-datafusion/pull/9411) (huaxingao) +- Add plugable handler for `CREATE FUNCTION` [#9333](https://github.com/apache/arrow-datafusion/pull/9333) (milenkovicm) +- Enable configurable display of partition sizes in the explain statement [#9474](https://github.com/apache/arrow-datafusion/pull/9474) (jayzhan211) +- Reduce casts for LEAD/LAG [#9468](https://github.com/apache/arrow-datafusion/pull/9468) (comphead) +- [CI build] fix chrono suggestions [#9486](https://github.com/apache/arrow-datafusion/pull/9486) (comphead) +- Make regex dependency optional in datafusion-functions, add CI checks for function packages [#9473](https://github.com/apache/arrow-datafusion/pull/9473) (alamb) +- fix: coalesce function should return correct data type [#9459](https://github.com/apache/arrow-datafusion/pull/9459) (viirya) +- LEAD/LAG calculate default value once [#9485](https://github.com/apache/arrow-datafusion/pull/9485) (comphead) +- chore: simplify the return type of `validate_data_types()` [#9491](https://github.com/apache/arrow-datafusion/pull/9491) (waynexia) +- minor: use arrow-rs casting from Float to Timestamp 
[#9500](https://github.com/apache/arrow-datafusion/pull/9500) (comphead) +- chore(deps): update substrait requirement from 0.25.1 to 0.27.0 [#9502](https://github.com/apache/arrow-datafusion/pull/9502) (dependabot[bot]) +- fix: `generate_series` and `range` panic on edge cases [#9503](https://github.com/apache/arrow-datafusion/pull/9503) (jonahgao) +- Fix undeterministic behaviour of schema nullability of lag window query [#9508](https://github.com/apache/arrow-datafusion/pull/9508) (mustafasrepo) +- Add `to_unixtime` function [#9077](https://github.com/apache/arrow-datafusion/pull/9077) (Tangruilin) +- Minor: fixed transformed state in UDF Simplify [#9484](https://github.com/apache/arrow-datafusion/pull/9484) (alamb) +- test: port strpos test in physical_expr/src/functions to sqllogictest [#9439](https://github.com/apache/arrow-datafusion/pull/9439) (SteveLauC) +- Port ArrayHas family to `functions-array` [#9496](https://github.com/apache/arrow-datafusion/pull/9496) (jayzhan211) +- port array_empty and array_length to datafusion-function-array crate [#9510](https://github.com/apache/arrow-datafusion/pull/9510) (Weijun-H) +- fix: `substr_index` not handling negative occurrence correctly [#9475](https://github.com/apache/arrow-datafusion/pull/9475) (jonahgao) +- [minor] extract collect file statistics method and add doc [#9490](https://github.com/apache/arrow-datafusion/pull/9490) (Ted-Jiang) +- test: sqllogictests for multiple tables join [#9480](https://github.com/apache/arrow-datafusion/pull/9480) (korowa) +- Add support for ignore nulls for LEAD, LAG in WindowAggExec [#9498](https://github.com/apache/arrow-datafusion/pull/9498) (Lordworms) +- Minior: Improve log expr description [#9516](https://github.com/apache/arrow-datafusion/pull/9516) (caicancai) +- port flatten to datafusion-function-array [#9523](https://github.com/apache/arrow-datafusion/pull/9523) (Weijun-H) +- feat: Add projection to HashJoinExec. 
[#9236](https://github.com/apache/arrow-datafusion/pull/9236) (my-vegetable-has-exploded) +- Add example for `FunctionFactory` [#9482](https://github.com/apache/arrow-datafusion/pull/9482) (milenkovicm) +- Move date_part, date_trunc, date_bin functions to datafusion-functions [#9435](https://github.com/apache/arrow-datafusion/pull/9435) (Omega359) +- fix: support two argument TRIM [#9521](https://github.com/apache/arrow-datafusion/pull/9521) (tshauck) +- Remove physical expr of ListIndex and ListRange, convert to `array_element` and `array_slice` functions [#9492](https://github.com/apache/arrow-datafusion/pull/9492) (jayzhan211) +- feat: function name hints for UDFs [#9407](https://github.com/apache/arrow-datafusion/pull/9407) (SteveLauC) +- Minor: Improve documentation for registering `AnalyzerRule` [#9520](https://github.com/apache/arrow-datafusion/pull/9520) (alamb) +- Extend argument types for udf `return_type_from_exprs` [#9522](https://github.com/apache/arrow-datafusion/pull/9522) (jayzhan211) +- move make_array array_append array_prepend array_concat function to datafusion-functions-array crate [#9504](https://github.com/apache/arrow-datafusion/pull/9504) (guojidan) +- Port `StringToArray` to `function-arrays` subcrate [#9543](https://github.com/apache/arrow-datafusion/pull/9543) (erenavsarogullari) +- Minor: remove `..` pattern matching in sql planner [#9531](https://github.com/apache/arrow-datafusion/pull/9531) (alamb) +- Minor: Fix document Interval syntax [#9542](https://github.com/apache/arrow-datafusion/pull/9542) (yyy1000) +- Port `struct` to datafusion-functions [#9546](https://github.com/apache/arrow-datafusion/pull/9546) (yyy1000) +- UDAF and UDWF support aliases [#9489](https://github.com/apache/arrow-datafusion/pull/9489) (lewiszlw) +- docs: fix extraneous char in array functions table of contents [#9560](https://github.com/apache/arrow-datafusion/pull/9560) (tshauck) +- [MINOR]: Fix undeterministic test 
[#9559](https://github.com/apache/arrow-datafusion/pull/9559) (mustafasrepo) +- Port `arrow_typeof` to datafusion-function [#9524](https://github.com/apache/arrow-datafusion/pull/9524) (yyy1000) +- feat: Introduce convert Expr to SQL string API and basic feature [#9517](https://github.com/apache/arrow-datafusion/pull/9517) (backkem) +- Port `ArraySort` to `function-arrays` subcrate [#9551](https://github.com/apache/arrow-datafusion/pull/9551) (erenavsarogullari) +- refactor: unify some plan optimization in CommonSubexprEliminate [#9556](https://github.com/apache/arrow-datafusion/pull/9556) (jackwener) +- Port `ArrayDistinct` to `functions-array` subcrate [#9549](https://github.com/apache/arrow-datafusion/pull/9549) (erenavsarogullari) +- Minor: add a sql_planner benchmarks to reflecte select many field on a huge table [#9536](https://github.com/apache/arrow-datafusion/pull/9536) (haohuaijin) +- Support IGNORE NULLS for FIRST/LAST window function [#9470](https://github.com/apache/arrow-datafusion/pull/9470) (huaxingao) +- Systematic Configuration in 'Create External Table' and 'Copy To' Options [#9382](https://github.com/apache/arrow-datafusion/pull/9382) (metesynnada) +- fix: incorrect null handling in `range` and `generate_series` [#9574](https://github.com/apache/arrow-datafusion/pull/9574) (jonahgao) +- Update README.md [#9572](https://github.com/apache/arrow-datafusion/pull/9572) (Abdullahsab3) +- Port tan, tanh to datafusion-functions [#9535](https://github.com/apache/arrow-datafusion/pull/9535) (ongchi) +- feat(9493): provide access to FileMetaData for files written with ParquetSink [#9548](https://github.com/apache/arrow-datafusion/pull/9548) (wiedld) +- Export datafusion-functions UDFs publically [#9585](https://github.com/apache/arrow-datafusion/pull/9585) (alamb) +- Update the comment and Add a check [#9571](https://github.com/apache/arrow-datafusion/pull/9571) (colommar) +- Port `ArrayRepeat` to `functions-array` subcrate 
[#9568](https://github.com/apache/arrow-datafusion/pull/9568) (erenavsarogullari) +- Fix ApproxPercentileAccumulator on zero values [#9582](https://github.com/apache/arrow-datafusion/pull/9582) (Dandandan) +- Add `FunctionRewrite` API, Move Array specific rewrites to `datafusion_functions_array` [#9583](https://github.com/apache/arrow-datafusion/pull/9583) (alamb) +- Move from_unixtime, now, current_date, current_time functions to datafusion-functions [#9537](https://github.com/apache/arrow-datafusion/pull/9537) (Omega359) +- minor: update Debug trait impl for WindowsFrame [#9587](https://github.com/apache/arrow-datafusion/pull/9587) (comphead) +- Initial support LogicalPlan to SQL String [#9596](https://github.com/apache/arrow-datafusion/pull/9596) (backkem) +- refactor: use a common macro to define math UDFs [#9598](https://github.com/apache/arrow-datafusion/pull/9598) (jonahgao) +- Move all `crypto` related functions to `datafusion-functions` [#9590](https://github.com/apache/arrow-datafusion/pull/9590) (Lordworms) +- Remove physical expr of NamedStructField, convert to `get_field` function call [#9563](https://github.com/apache/arrow-datafusion/pull/9563) (yyy1000) +- Add `/benchmark` github command to comparison benchmark between base and pr commit [#9461](https://github.com/apache/arrow-datafusion/pull/9461) (gruuya) +- support unnest as subexpression [#9592](https://github.com/apache/arrow-datafusion/pull/9592) (YjyJeff) +- feat: implement more expr_to_sql functionality [#9578](https://github.com/apache/arrow-datafusion/pull/9578) (devinjdangelo) +- Port `ArrayResize` to `functions-array` subcrate [#9570](https://github.com/apache/arrow-datafusion/pull/9570) (erenavsarogullari) +- Move make_date, to_char to datafusion-functions [#9601](https://github.com/apache/arrow-datafusion/pull/9601) (Omega359) +- Fix to_timestamp benchmark [#9608](https://github.com/apache/arrow-datafusion/pull/9608) (Omega359) +- feat: implement aggregation and subquery plans to SQL 
[#9606](https://github.com/apache/arrow-datafusion/pull/9606) (devinjdangelo) +- Port ArrayElem/Slice/PopFront/Back into `functions-array` [#9615](https://github.com/apache/arrow-datafusion/pull/9615) (jayzhan211) +- Minor: Remove datafusion-functions-array dependency from datafusion-optimizer [#9621](https://github.com/apache/arrow-datafusion/pull/9621) (alamb) +- Enable TTY during bench data generation [#9626](https://github.com/apache/arrow-datafusion/pull/9626) (gruuya) +- Remove constant expressions from SortExprs in the SortExec [#9618](https://github.com/apache/arrow-datafusion/pull/9618) (mustafasrepo) +- Try fixing missing results name in the benchmark step [#9632](https://github.com/apache/arrow-datafusion/pull/9632) (gruuya) +- feat: track memory usage for recursive CTE, enable recursive CTEs by default [#9619](https://github.com/apache/arrow-datafusion/pull/9619) (jonahgao) +- doc: Add missing doc link [#9631](https://github.com/apache/arrow-datafusion/pull/9631) (Weijun-H) +- Add explicit move of PR bench results if they were placed in HEAD dir [#9636](https://github.com/apache/arrow-datafusion/pull/9636) (gruuya) +- Add `array_reverse` function to datafusion-function-\* crate [#9630](https://github.com/apache/arrow-datafusion/pull/9630) (Weijun-H) +- Move parts of `InListSimplifier` simplify rules to `Simplifier` [#9628](https://github.com/apache/arrow-datafusion/pull/9628) (jayzhan211) +- Port Array Union and Intersect to `functions-array` [#9629](https://github.com/apache/arrow-datafusion/pull/9629) (jayzhan211) +- Port `ArrayPosition` and `ArrayPositions` to `functions-array` subcrate [#9617](https://github.com/apache/arrow-datafusion/pull/9617) (erenavsarogullari) +- Optimize make_date (#9089) [#9600](https://github.com/apache/arrow-datafusion/pull/9600) (vojtechtoman) +- Support AT TIME ZONE clause [#9647](https://github.com/apache/arrow-datafusion/pull/9647) (tinfoil-knight) +- Window Linear Mode use smaller buffers 
[#9597](https://github.com/apache/arrow-datafusion/pull/9597) (mustafasrepo) +- Port `ArrayExcept` to `functions-array` subcrate [#9634](https://github.com/apache/arrow-datafusion/pull/9634) (erenavsarogullari) +- chore: improve array expression doc and clean up array_expression.rs [#9650](https://github.com/apache/arrow-datafusion/pull/9650) (Weijun-H) +- Minor: remove clone in `exprlist_to_fields` [#9657](https://github.com/apache/arrow-datafusion/pull/9657) (jayzhan211) +- Port `ArrayRemove`, `ArrayRemoveN`, `ArrayRemoveAll` to `functions-array` subcrate [#9656](https://github.com/apache/arrow-datafusion/pull/9656) (erenavsarogullari) +- Minor: Remove redundant dependencies from `datafusion-functions/Cargo.toml` [#9622](https://github.com/apache/arrow-datafusion/pull/9622) (alamb) +- Support IGNORE NULLS for NTH_VALUE window function [#9625](https://github.com/apache/arrow-datafusion/pull/9625) (huaxingao) +- Improve Robustness of Unparser Testing and Implementation [#9623](https://github.com/apache/arrow-datafusion/pull/9623) (devinjdangelo) +- Adding Constant Check for FilterExec [#9649](https://github.com/apache/arrow-datafusion/pull/9649) (Lordworms) +- chore(deps-dev): bump follow-redirects from 1.15.4 to 1.15.6 in /datafusion/wasmtest/datafusion-wasm-app [#9609](https://github.com/apache/arrow-datafusion/pull/9609) (dependabot[bot]) +- move array_replace family functions to datafusion-function-array crate [#9651](https://github.com/apache/arrow-datafusion/pull/9651) (Weijun-H) +- chore: remove repetitive word `the the` --> `the` in docs / comments [#9673](https://github.com/apache/arrow-datafusion/pull/9673) (InventiveCoder) +- Update example-usage.md to remove reference to simd and rust nightly. 
[#9677](https://github.com/apache/arrow-datafusion/pull/9677) (Omega359) +- [MINOR]: Remove some `.unwrap`s from nth_value.rs file [#9674](https://github.com/apache/arrow-datafusion/pull/9674) (mustafasrepo) +- minor: Remove deprecated methods [#9627](https://github.com/apache/arrow-datafusion/pull/9627) (comphead) +- Migrate `arrow_cast` to a UDF [#9610](https://github.com/apache/arrow-datafusion/pull/9610) (alamb) +- parquet: Add row*groups_matched*{statistics,bloom_filter} statistics [#9640](https://github.com/apache/arrow-datafusion/pull/9640) (progval) +- Make COPY TO align with CREATE EXTERNAL TABLE [#9604](https://github.com/apache/arrow-datafusion/pull/9604) (metesynnada) +- Support "A column is known to be entirely NULL" in `PruningPredicate` [#9223](https://github.com/apache/arrow-datafusion/pull/9223) (appletreeisyellow) +- Suppress self update for windows CI runner [#9661](https://github.com/apache/arrow-datafusion/pull/9661) (jayzhan211) +- add schema to SQL ast builder [#9624](https://github.com/apache/arrow-datafusion/pull/9624) (sardination) +- core/tests/parquet/row_group_pruning.rs: Add tests for strings [#9642](https://github.com/apache/arrow-datafusion/pull/9642) (progval) +- Fix incorrect results with multiple `COUNT(DISTINCT..)` aggregates on dictionaries [#9679](https://github.com/apache/arrow-datafusion/pull/9679) (alamb) +- parquet: Add support for Bloom filters on binary columns [#9644](https://github.com/apache/arrow-datafusion/pull/9644) (progval) +- Update Arrow/Parquet to `51.0.0`, tonic to `0.11` [#9613](https://github.com/apache/arrow-datafusion/pull/9613) (tustvold) +- Move inlist rule to expr_simplifier [#9692](https://github.com/apache/arrow-datafusion/pull/9692) (jayzhan211) +- Support Serde for ScalarUDF in Physical Expressions [#9436](https://github.com/apache/arrow-datafusion/pull/9436) (yyy1000) +- Support Union types in `ScalarValue` [#9683](https://github.com/apache/arrow-datafusion/pull/9683) (avantgardnerio) +- parquet: 
Add support for row group pruning on FixedSizeBinary [#9646](https://github.com/apache/arrow-datafusion/pull/9646) (progval) +- Minor: Improve documentation for `LogicalPlan::expressions` [#9698](https://github.com/apache/arrow-datafusion/pull/9698) (alamb) +- Make builtin window function output datatype to be derived from schema [#9686](https://github.com/apache/arrow-datafusion/pull/9686) (comphead) +- refactor: Extract `array_to_string` and `string_to_array` from `functions-array` subcrate' s `kernels` and `udf` containers [#9704](https://github.com/apache/arrow-datafusion/pull/9704) (erenavsarogullari) +- Add Minimum Supported Rust Version policy to docs [#9681](https://github.com/apache/arrow-datafusion/pull/9681) (alamb) +- doc: Add DataFusion profiling documentation for MacOS [#9711](https://github.com/apache/arrow-datafusion/pull/9711) (comphead) +- Minor: add ticket reference to commented out test [#9715](https://github.com/apache/arrow-datafusion/pull/9715) (alamb) +- Minor: Rename path from `common_runtime` to `common-runtime` [#9717](https://github.com/apache/arrow-datafusion/pull/9717) (alamb) +- Use object_store:BufWriter to replace put_multipart [#9648](https://github.com/apache/arrow-datafusion/pull/9648) (yyy1000) +- Fix COPY TO failing on passing format options through CLI [#9709](https://github.com/apache/arrow-datafusion/pull/9709) (tinfoil-knight) +- fix: recursive cte hangs on joins [#9687](https://github.com/apache/arrow-datafusion/pull/9687) (jonahgao) +- Move `starts_with`, `to_hex`,` trim`, `upper` to datafusion-functions (and add string_expressions) [#9541](https://github.com/apache/arrow-datafusion/pull/9541) (Tangruilin) +- Support for `extract(x from time)` / `date_part` from time types [#8693](https://github.com/apache/arrow-datafusion/pull/8693) (Jefffrey) +- doc: Updated known users list and usage dependency description [#9718](https://github.com/apache/arrow-datafusion/pull/9718) (comphead) +- Minor: improve documentation for 
`CommonSubexprEliminate` [#9700](https://github.com/apache/arrow-datafusion/pull/9700) (alamb) +- build: modify code to comply with latest clippy requirement [#9725](https://github.com/apache/arrow-datafusion/pull/9725) (comphead) +- Minor: return internal error rather than panic on unexpected error in COUNT DISTINCT [#9712](https://github.com/apache/arrow-datafusion/pull/9712) (alamb) +- fix(9678): short circuiting prevented population of visited stack, for common subexpr elimination optimization [#9685](https://github.com/apache/arrow-datafusion/pull/9685) (wiedld) +- perf: improve to_field performance [#9722](https://github.com/apache/arrow-datafusion/pull/9722) (haohuaijin) +- Minor: Run ScalarValue size test on aarch again [#9728](https://github.com/apache/arrow-datafusion/pull/9728) (alamb) +- Move trim functions (btrim, ltrim, rtrim) to datafusion_functions, make expr_fn API consistent [#9730](https://github.com/apache/arrow-datafusion/pull/9730) (Omega359) +- make format prefix optional for format options in COPY [#9723](https://github.com/apache/arrow-datafusion/pull/9723) (tinfoil-knight) +- refactor: Extract `range` and `gen_series` functions from `functions-array` subcrate' s `kernels` and `udf` containers [#9720](https://github.com/apache/arrow-datafusion/pull/9720) (erenavsarogullari) +- Move ascii function to datafusion_functions [#9740](https://github.com/apache/arrow-datafusion/pull/9740) (PsiACE) +- adding expr to string for IsNotNull IsTrue IsFalse and IsUnkown [#9739](https://github.com/apache/arrow-datafusion/pull/9739) (Lordworms) +- fix: parallel parquet can underflow when max_record_batch_rows < execution.batch_size [#9737](https://github.com/apache/arrow-datafusion/pull/9737) (devinjdangelo) +- support format in options of COPY command [#9744](https://github.com/apache/arrow-datafusion/pull/9744) (tinfoil-knight) +- Move lower, octet_length to datafusion-functions [#9747](https://github.com/apache/arrow-datafusion/pull/9747) (Omega359) +- 
Fixed missing trim() in rust api [#9749](https://github.com/apache/arrow-datafusion/pull/9749) (Omega359) +- refactor: Extract `array_length`, `array_reverse` and `array_sort` functions from `functions-array` subcrate' s `kernels` and `udf` containers [#9751](https://github.com/apache/arrow-datafusion/pull/9751) (erenavsarogullari) +- refactor: Extract `array_empty` and `array_repeat` functions from `functions-array` subcrate' s `kernels` and `udf` containers [#9762](https://github.com/apache/arrow-datafusion/pull/9762) (erenavsarogullari) +- Minor: remove an outdated TODO in `TypeCoercion` [#9752](https://github.com/apache/arrow-datafusion/pull/9752) (jonahgao) +- refactor: Extract `array_resize` and `cardinality` functions from `functions-array` subcrate' s `kernels` and `udf` containers [#9766](https://github.com/apache/arrow-datafusion/pull/9766) (erenavsarogullari) +- fix: change placeholder errors from Internal to Plan [#9745](https://github.com/apache/arrow-datafusion/pull/9745) (erratic-pattern) +- Move levenshtein, uuid, overlay to datafusion-functions [#9760](https://github.com/apache/arrow-datafusion/pull/9760) (Omega359) +- improve null handling for to_char [#9689](https://github.com/apache/arrow-datafusion/pull/9689) (tinfoil-knight) +- Add Expr->String for ScalarFunction and InList [#9759](https://github.com/apache/arrow-datafusion/pull/9759) (yyy1000) +- Move repeat, replace, split_part to datafusion_functions [#9784](https://github.com/apache/arrow-datafusion/pull/9784) (Omega359) +- refactor: Extract `array_dims`, `array_ndims` and `flatten` functions from `functions-array` subcrate' s `kernels` and `udf` containers [#9786](https://github.com/apache/arrow-datafusion/pull/9786) (erenavsarogullari) +- Minor: Improve documentation about `ColumnarValues::values_to_array` [#9774](https://github.com/apache/arrow-datafusion/pull/9774) (alamb) +- Fix panic in `struct` function with mixed scalar/array arguments 
[#9775](https://github.com/apache/arrow-datafusion/pull/9775) (alamb) +- refactor: Apply minor refactorings to `functions-array` crate [#9788](https://github.com/apache/arrow-datafusion/pull/9788) (erenavsarogullari) +- Move bit_length and chr functions to datafusion_functions [#9782](https://github.com/apache/arrow-datafusion/pull/9782) (PsiACE) +- Support tencent cloud COS storage in `datafusion-cli` [#9734](https://github.com/apache/arrow-datafusion/pull/9734) (harveyyue) +- Make it easier to register configuration extension ... [#9781](https://github.com/apache/arrow-datafusion/pull/9781) (milenkovicm) +- Expr to Sql : Case [#9798](https://github.com/apache/arrow-datafusion/pull/9798) (yyy1000) +- feat: Between expr to sql string [#9803](https://github.com/apache/arrow-datafusion/pull/9803) (sebastian2296) +- feat: Expose `array_empty` and `list_empty` functions as alias of `empty` function [#9807](https://github.com/apache/arrow-datafusion/pull/9807) (erenavsarogullari) +- Support Expr `Like` to sql [#9805](https://github.com/apache/arrow-datafusion/pull/9805) (Weijun-H) +- feat: Not expr to string [#9802](https://github.com/apache/arrow-datafusion/pull/9802) (sebastian2296) +- [Minor]: Move some repetitive codes to functions(proto) [#9811](https://github.com/apache/arrow-datafusion/pull/9811) (mustafasrepo) +- Implement IGNORE NULLS for LAST_VALUE [#9801](https://github.com/apache/arrow-datafusion/pull/9801) (huaxingao) +- [MINOR]: Move some repetitive codes to functions [#9810](https://github.com/apache/arrow-datafusion/pull/9810) (mustafasrepo) +- fix: ensure mutual compatibility of the two input schemas from recursive CTEs [#9795](https://github.com/apache/arrow-datafusion/pull/9795) (jonahgao) +- Add support for constant expression evaluation in limit [#9790](https://github.com/apache/arrow-datafusion/pull/9790) (mustafasrepo) +- Projection Pushdown through user defined LogicalPlan nodes. 
[#9690](https://github.com/apache/arrow-datafusion/pull/9690) (mustafasrepo) +- chore(deps): update substrait requirement from 0.27.0 to 0.28.0 [#9809](https://github.com/apache/arrow-datafusion/pull/9809) (dependabot[bot]) +- Run TPC-H SF10 during PR benchmarks [#9822](https://github.com/apache/arrow-datafusion/pull/9822) (gruuya) +- Expose `parser` on DFParser to enable user controlled parsing [#9729](https://github.com/apache/arrow-datafusion/pull/9729) (tshauck) +- Disable parallel reading for gziped ndjson file [#9799](https://github.com/apache/arrow-datafusion/pull/9799) (Lordworms) +- Optimize to_timestamp (with format) (#9090) [#9833](https://github.com/apache/arrow-datafusion/pull/9833) (vojtechtoman) +- Create unicode module in datafusion/functions/src/unicode and unicode_expressions feature flag, move char_length function [#9825](https://github.com/apache/arrow-datafusion/pull/9825) (Omega359) +- [Minor] Update TCPDS tests, remove some #[ignore]d tests [#9829](https://github.com/apache/arrow-datafusion/pull/9829) (Dandandan) +- doc: Adding baseline benchmark example [#9827](https://github.com/apache/arrow-datafusion/pull/9827) (comphead) +- Add name method to execution plan [#9793](https://github.com/apache/arrow-datafusion/pull/9793) (matthewmturner) +- chore(deps-dev): bump express from 4.18.2 to 4.19.2 in /datafusion/wasmtest/datafusion-wasm-app [#9826](https://github.com/apache/arrow-datafusion/pull/9826) (dependabot[bot]) +- feat: pass SessionState not SessionConfig to FunctionFactory::create [#9837](https://github.com/apache/arrow-datafusion/pull/9837) (tshauck) diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index 492be93caf0c..a95f2f802dfb 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -64,7 +64,7 @@ Environment variables are read during `SessionConfig` initialisation so they mus | datafusion.execution.parquet.statistics_enabled | NULL | Sets if statistics are enabled 
for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. If NULL, uses default parquet writer setting | | datafusion.execution.parquet.max_statistics_size | NULL | Sets max statistics size for any column. If NULL, uses default parquet writer setting | | datafusion.execution.parquet.max_row_group_size | 1048576 | Target maximum number of rows in each row group (defaults to 1M rows). Writing larger row groups requires more memory to write, but can get better compression and be faster to read. | -| datafusion.execution.parquet.created_by | datafusion version 36.0.0 | Sets "created by" property | +| datafusion.execution.parquet.created_by | datafusion version 37.0.0 | Sets "created by" property | | datafusion.execution.parquet.column_index_truncate_length | NULL | Sets column index truncate length | | datafusion.execution.parquet.data_page_row_count_limit | 18446744073709551615 | Sets best effort maximum number of rows in data page | | datafusion.execution.parquet.encoding | NULL | Sets default encoding for any column Valid values are: plain, plain_dictionary, rle, bit_packed, delta_binary_packed, delta_length_byte_array, delta_byte_array, rle_dictionary, and byte_stream_split. These values are not case sensitive. If NULL, uses default parquet writer setting | From 09f5a544d25f36ff1d65cc377123aee9b0e8f538 Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Thu, 28 Mar 2024 22:56:15 -0400 Subject: [PATCH 03/15] move Left, Lpad, Reverse, Right, Rpad functions to datafusion_functions (#9841) * Fix to_timestamp benchmark * Remove reference to simd and nightly build as simd is no longer an available feature in DataFusion and building with nightly may not be a good recommendation when getting started. * Fixed missing trim() function. 
* Create unicode module in datafusion/functions/src/unicode and unicode_expressions feature flag, move char_length function * move Left, Lpad, Reverse, Right, Rpad functions to datafusion_functions * Code cleanup from PR review. --- datafusion/expr/src/built_in_function.rs | 50 +- datafusion/expr/src/expr_fn.rs | 21 - datafusion/functions/src/unicode/left.rs | 236 +++++++ datafusion/functions/src/unicode/lpad.rs | 369 +++++++++++ datafusion/functions/src/unicode/mod.rs | 44 +- datafusion/functions/src/unicode/reverse.rs | 149 +++++ datafusion/functions/src/unicode/right.rs | 238 +++++++ datafusion/functions/src/unicode/rpad.rs | 361 +++++++++++ datafusion/physical-expr/src/functions.rs | 606 ------------------ datafusion/physical-expr/src/planner.rs | 4 +- .../physical-expr/src/unicode_expressions.rs | 263 +------- datafusion/proto/proto/datafusion.proto | 10 +- datafusion/proto/src/generated/pbjson.rs | 15 - datafusion/proto/src/generated/prost.rs | 20 +- .../proto/src/logical_plan/from_proto.rs | 53 +- datafusion/proto/src/logical_plan/to_proto.rs | 5 - 16 files changed, 1428 insertions(+), 1016 deletions(-) create mode 100644 datafusion/functions/src/unicode/left.rs create mode 100644 datafusion/functions/src/unicode/lpad.rs create mode 100644 datafusion/functions/src/unicode/reverse.rs create mode 100644 datafusion/functions/src/unicode/right.rs create mode 100644 datafusion/functions/src/unicode/rpad.rs diff --git a/datafusion/expr/src/built_in_function.rs b/datafusion/expr/src/built_in_function.rs index eefbc131a27b..196d278dc70e 100644 --- a/datafusion/expr/src/built_in_function.rs +++ b/datafusion/expr/src/built_in_function.rs @@ -111,18 +111,8 @@ pub enum BuiltinScalarFunction { EndsWith, /// initcap InitCap, - /// left - Left, - /// lpad - Lpad, /// random Random, - /// reverse - Reverse, - /// right - Right, - /// rpad - Rpad, /// strpos Strpos, /// substr @@ -220,12 +210,7 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::ConcatWithSeparator => 
Volatility::Immutable, BuiltinScalarFunction::EndsWith => Volatility::Immutable, BuiltinScalarFunction::InitCap => Volatility::Immutable, - BuiltinScalarFunction::Left => Volatility::Immutable, - BuiltinScalarFunction::Lpad => Volatility::Immutable, BuiltinScalarFunction::Radians => Volatility::Immutable, - BuiltinScalarFunction::Reverse => Volatility::Immutable, - BuiltinScalarFunction::Right => Volatility::Immutable, - BuiltinScalarFunction::Rpad => Volatility::Immutable, BuiltinScalarFunction::Strpos => Volatility::Immutable, BuiltinScalarFunction::Substr => Volatility::Immutable, BuiltinScalarFunction::Translate => Volatility::Immutable, @@ -264,17 +249,8 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::InitCap => { utf8_to_str_type(&input_expr_types[0], "initcap") } - BuiltinScalarFunction::Left => utf8_to_str_type(&input_expr_types[0], "left"), - BuiltinScalarFunction::Lpad => utf8_to_str_type(&input_expr_types[0], "lpad"), BuiltinScalarFunction::Pi => Ok(Float64), BuiltinScalarFunction::Random => Ok(Float64), - BuiltinScalarFunction::Reverse => { - utf8_to_str_type(&input_expr_types[0], "reverse") - } - BuiltinScalarFunction::Right => { - utf8_to_str_type(&input_expr_types[0], "right") - } - BuiltinScalarFunction::Rpad => utf8_to_str_type(&input_expr_types[0], "rpad"), BuiltinScalarFunction::EndsWith => Ok(Boolean), BuiltinScalarFunction::Strpos => { utf8_to_int_type(&input_expr_types[0], "strpos/instr/position") @@ -361,28 +337,9 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::Coalesce => { Signature::variadic_equal(self.volatility()) } - BuiltinScalarFunction::InitCap | BuiltinScalarFunction::Reverse => { + BuiltinScalarFunction::InitCap => { Signature::uniform(1, vec![Utf8, LargeUtf8], self.volatility()) } - BuiltinScalarFunction::Lpad | BuiltinScalarFunction::Rpad => { - Signature::one_of( - vec![ - Exact(vec![Utf8, Int64]), - Exact(vec![LargeUtf8, Int64]), - Exact(vec![Utf8, Int64, Utf8]), - Exact(vec![LargeUtf8, Int64, Utf8]), - 
Exact(vec![Utf8, Int64, LargeUtf8]), - Exact(vec![LargeUtf8, Int64, LargeUtf8]), - ], - self.volatility(), - ) - } - BuiltinScalarFunction::Left | BuiltinScalarFunction::Right => { - Signature::one_of( - vec![Exact(vec![Utf8, Int64]), Exact(vec![LargeUtf8, Int64])], - self.volatility(), - ) - } BuiltinScalarFunction::EndsWith | BuiltinScalarFunction::Strpos => { Signature::one_of( @@ -580,11 +537,6 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::ConcatWithSeparator => &["concat_ws"], BuiltinScalarFunction::EndsWith => &["ends_with"], BuiltinScalarFunction::InitCap => &["initcap"], - BuiltinScalarFunction::Left => &["left"], - BuiltinScalarFunction::Lpad => &["lpad"], - BuiltinScalarFunction::Reverse => &["reverse"], - BuiltinScalarFunction::Right => &["right"], - BuiltinScalarFunction::Rpad => &["rpad"], BuiltinScalarFunction::Strpos => &["strpos", "instr", "position"], BuiltinScalarFunction::Substr => &["substr"], BuiltinScalarFunction::Translate => &["translate"], diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs index 654464798625..21dab72855e5 100644 --- a/datafusion/expr/src/expr_fn.rs +++ b/datafusion/expr/src/expr_fn.rs @@ -578,25 +578,11 @@ scalar_expr!(Atan2, atan2, y x, "inverse tangent of a division given in the argu scalar_expr!(Log, log, base x, "logarithm of a `x` for a particular `base`"); scalar_expr!(InitCap, initcap, string, "converts the first letter of each word in `string` in uppercase and the remaining characters in lowercase"); -scalar_expr!(Left, left, string n, "returns the first `n` characters in the `string`"); -scalar_expr!(Reverse, reverse, string, "reverses the `string`"); -scalar_expr!(Right, right, string n, "returns the last `n` characters in the `string`"); scalar_expr!(EndsWith, ends_with, string suffix, "whether the `string` ends with the `suffix`"); scalar_expr!(Strpos, strpos, string substring, "finds the position from where the `substring` matches the `string`"); scalar_expr!(Substr, substr, 
string position, "substring from the `position` to the end"); scalar_expr!(Substr, substring, string position length, "substring from the `position` with `length` characters"); scalar_expr!(Translate, translate, string from to, "replaces the characters in `from` with the counterpart in `to`"); -//use vec as parameter -nary_scalar_expr!( - Lpad, - lpad, - "fill up a string to the length by prepending the characters" -); -nary_scalar_expr!( - Rpad, - rpad, - "fill up a string to the length by appending the characters" -); nary_scalar_expr!(Coalesce, coalesce, "returns `coalesce(args...)`, which evaluates to the value of the first [Expr] which is not NULL"); //there is a func concat_ws before, so use concat_ws_expr as name.c nary_scalar_expr!( @@ -1028,13 +1014,6 @@ mod test { test_scalar_expr!(Gcd, gcd, arg_1, arg_2); test_scalar_expr!(Lcm, lcm, arg_1, arg_2); test_scalar_expr!(InitCap, initcap, string); - test_scalar_expr!(Left, left, string, count); - test_nary_scalar_expr!(Lpad, lpad, string, count); - test_nary_scalar_expr!(Lpad, lpad, string, count, characters); - test_scalar_expr!(Reverse, reverse, string); - test_scalar_expr!(Right, right, string, count); - test_nary_scalar_expr!(Rpad, rpad, string, count); - test_nary_scalar_expr!(Rpad, rpad, string, count, characters); test_scalar_expr!(EndsWith, ends_with, string, characters); test_scalar_expr!(Strpos, strpos, string, substring); test_scalar_expr!(Substr, substr, string, position); diff --git a/datafusion/functions/src/unicode/left.rs b/datafusion/functions/src/unicode/left.rs new file mode 100644 index 000000000000..473589fdc8aa --- /dev/null +++ b/datafusion/functions/src/unicode/left.rs @@ -0,0 +1,236 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; +use std::cmp::Ordering; +use std::sync::Arc; + +use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait}; +use arrow::datatypes::DataType; + +use datafusion_common::cast::{as_generic_string_array, as_int64_array}; +use datafusion_common::exec_err; +use datafusion_common::Result; +use datafusion_expr::TypeSignature::Exact; +use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; + +use crate::utils::{make_scalar_function, utf8_to_str_type}; + +#[derive(Debug)] +pub(super) struct LeftFunc { + signature: Signature, +} + +impl LeftFunc { + pub fn new() -> Self { + use DataType::*; + Self { + signature: Signature::one_of( + vec![Exact(vec![Utf8, Int64]), Exact(vec![LargeUtf8, Int64])], + Volatility::Immutable, + ), + } + } +} + +impl ScalarUDFImpl for LeftFunc { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "left" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + utf8_to_str_type(&arg_types[0], "left") + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result { + match args[0].data_type() { + DataType::Utf8 => make_scalar_function(left::, vec![])(args), + DataType::LargeUtf8 => make_scalar_function(left::, vec![])(args), + other => exec_err!("Unsupported data type {other:?} for function left"), + } + } +} + +/// Returns first n 
characters in the string, or when n is negative, returns all but last |n| characters. +/// left('abcde', 2) = 'ab' +/// The implementation uses UTF-8 code points as characters +pub fn left(args: &[ArrayRef]) -> Result { + let string_array = as_generic_string_array::(&args[0])?; + let n_array = as_int64_array(&args[1])?; + let result = string_array + .iter() + .zip(n_array.iter()) + .map(|(string, n)| match (string, n) { + (Some(string), Some(n)) => match n.cmp(&0) { + Ordering::Less => { + let len = string.chars().count() as i64; + Some(if n.abs() < len { + string.chars().take((len + n) as usize).collect::() + } else { + "".to_string() + }) + } + Ordering::Equal => Some("".to_string()), + Ordering::Greater => { + Some(string.chars().take(n as usize).collect::()) + } + }, + _ => None, + }) + .collect::>(); + + Ok(Arc::new(result) as ArrayRef) +} + +#[cfg(test)] +mod tests { + use arrow::array::{Array, StringArray}; + use arrow::datatypes::DataType::Utf8; + + use datafusion_common::{Result, ScalarValue}; + use datafusion_expr::{ColumnarValue, ScalarUDFImpl}; + + use crate::unicode::left::LeftFunc; + use crate::utils::test::test_function; + + #[test] + fn test_functions() -> Result<()> { + test_function!( + LeftFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("abcde")), + ColumnarValue::Scalar(ScalarValue::from(2i64)), + ], + Ok(Some("ab")), + &str, + Utf8, + StringArray + ); + test_function!( + LeftFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("abcde")), + ColumnarValue::Scalar(ScalarValue::from(200i64)), + ], + Ok(Some("abcde")), + &str, + Utf8, + StringArray + ); + test_function!( + LeftFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("abcde")), + ColumnarValue::Scalar(ScalarValue::from(-2i64)), + ], + Ok(Some("abc")), + &str, + Utf8, + StringArray + ); + test_function!( + LeftFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("abcde")), + ColumnarValue::Scalar(ScalarValue::from(-200i64)), + ], + Ok(Some("")), + &str, + 
Utf8, + StringArray + ); + test_function!( + LeftFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("abcde")), + ColumnarValue::Scalar(ScalarValue::from(0i64)), + ], + Ok(Some("")), + &str, + Utf8, + StringArray + ); + test_function!( + LeftFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::Utf8(None)), + ColumnarValue::Scalar(ScalarValue::from(2i64)), + ], + Ok(None), + &str, + Utf8, + StringArray + ); + test_function!( + LeftFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("abcde")), + ColumnarValue::Scalar(ScalarValue::Int64(None)), + ], + Ok(None), + &str, + Utf8, + StringArray + ); + test_function!( + LeftFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("joséésoj")), + ColumnarValue::Scalar(ScalarValue::from(5i64)), + ], + Ok(Some("joséé")), + &str, + Utf8, + StringArray + ); + test_function!( + LeftFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("joséésoj")), + ColumnarValue::Scalar(ScalarValue::from(-3i64)), + ], + Ok(Some("joséé")), + &str, + Utf8, + StringArray + ); + #[cfg(not(feature = "unicode_expressions"))] + test_function!( + LeftFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("abcde")), + ColumnarValue::Scalar(ScalarValue::from(2i64)), + ], + internal_err!( + "function left requires compilation with feature flag: unicode_expressions." + ), + &str, + Utf8, + StringArray + ); + + Ok(()) + } +} diff --git a/datafusion/functions/src/unicode/lpad.rs b/datafusion/functions/src/unicode/lpad.rs new file mode 100644 index 000000000000..76a8e68cca25 --- /dev/null +++ b/datafusion/functions/src/unicode/lpad.rs @@ -0,0 +1,369 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; +use std::sync::Arc; + +use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait}; +use arrow::datatypes::DataType; +use datafusion_common::cast::{as_generic_string_array, as_int64_array}; +use unicode_segmentation::UnicodeSegmentation; + +use crate::utils::{make_scalar_function, utf8_to_str_type}; +use datafusion_common::{exec_err, Result}; +use datafusion_expr::TypeSignature::Exact; +use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; + +#[derive(Debug)] +pub(super) struct LPadFunc { + signature: Signature, +} + +impl LPadFunc { + pub fn new() -> Self { + use DataType::*; + Self { + signature: Signature::one_of( + vec![ + Exact(vec![Utf8, Int64]), + Exact(vec![LargeUtf8, Int64]), + Exact(vec![Utf8, Int64, Utf8]), + Exact(vec![LargeUtf8, Int64, Utf8]), + Exact(vec![Utf8, Int64, LargeUtf8]), + Exact(vec![LargeUtf8, Int64, LargeUtf8]), + ], + Volatility::Immutable, + ), + } + } +} + +impl ScalarUDFImpl for LPadFunc { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "lpad" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + utf8_to_str_type(&arg_types[0], "lpad") + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result { + match args[0].data_type() { + DataType::Utf8 => make_scalar_function(lpad::, vec![])(args), + 
DataType::LargeUtf8 => make_scalar_function(lpad::, vec![])(args), + other => exec_err!("Unsupported data type {other:?} for function lpad"), + } + } +} + +/// Extends the string to length 'length' by prepending the characters fill (a space by default). If the string is already longer than length then it is truncated (on the right). +/// lpad('hi', 5, 'xy') = 'xyxhi' +pub fn lpad(args: &[ArrayRef]) -> Result { + match args.len() { + 2 => { + let string_array = as_generic_string_array::(&args[0])?; + let length_array = as_int64_array(&args[1])?; + + let result = string_array + .iter() + .zip(length_array.iter()) + .map(|(string, length)| match (string, length) { + (Some(string), Some(length)) => { + if length > i32::MAX as i64 { + return exec_err!( + "lpad requested length {length} too large" + ); + } + + let length = if length < 0 { 0 } else { length as usize }; + if length == 0 { + Ok(Some("".to_string())) + } else { + let graphemes = string.graphemes(true).collect::>(); + if length < graphemes.len() { + Ok(Some(graphemes[..length].concat())) + } else { + let mut s: String = " ".repeat(length - graphemes.len()); + s.push_str(string); + Ok(Some(s)) + } + } + } + _ => Ok(None), + }) + .collect::>>()?; + + Ok(Arc::new(result) as ArrayRef) + } + 3 => { + let string_array = as_generic_string_array::(&args[0])?; + let length_array = as_int64_array(&args[1])?; + let fill_array = as_generic_string_array::(&args[2])?; + + let result = string_array + .iter() + .zip(length_array.iter()) + .zip(fill_array.iter()) + .map(|((string, length), fill)| match (string, length, fill) { + (Some(string), Some(length), Some(fill)) => { + if length > i32::MAX as i64 { + return exec_err!( + "lpad requested length {length} too large" + ); + } + + let length = if length < 0 { 0 } else { length as usize }; + if length == 0 { + Ok(Some("".to_string())) + } else { + let graphemes = string.graphemes(true).collect::>(); + let fill_chars = fill.chars().collect::>(); + + if length < graphemes.len() 
{ + Ok(Some(graphemes[..length].concat())) + } else if fill_chars.is_empty() { + Ok(Some(string.to_string())) + } else { + let mut s = string.to_string(); + let mut char_vector = + Vec::::with_capacity(length - graphemes.len()); + for l in 0..length - graphemes.len() { + char_vector.push( + *fill_chars.get(l % fill_chars.len()).unwrap(), + ); + } + s.insert_str( + 0, + char_vector.iter().collect::().as_str(), + ); + Ok(Some(s)) + } + } + } + _ => Ok(None), + }) + .collect::>>()?; + + Ok(Arc::new(result) as ArrayRef) + } + other => exec_err!( + "lpad was called with {other} arguments. It requires at least 2 and at most 3." + ), + } +} + +#[cfg(test)] +mod tests { + use arrow::array::{Array, StringArray}; + use arrow::datatypes::DataType::Utf8; + + use datafusion_common::{Result, ScalarValue}; + use datafusion_expr::{ColumnarValue, ScalarUDFImpl}; + + use crate::unicode::lpad::LPadFunc; + use crate::utils::test::test_function; + + #[test] + fn test_functions() -> Result<()> { + test_function!( + LPadFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("josé")), + ColumnarValue::Scalar(ScalarValue::from(5i64)), + ], + Ok(Some(" josé")), + &str, + Utf8, + StringArray + ); + test_function!( + LPadFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("hi")), + ColumnarValue::Scalar(ScalarValue::from(5i64)), + ], + Ok(Some(" hi")), + &str, + Utf8, + StringArray + ); + test_function!( + LPadFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("hi")), + ColumnarValue::Scalar(ScalarValue::from(0i64)), + ], + Ok(Some("")), + &str, + Utf8, + StringArray + ); + test_function!( + LPadFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("hi")), + ColumnarValue::Scalar(ScalarValue::Int64(None)), + ], + Ok(None), + &str, + Utf8, + StringArray + ); + test_function!( + LPadFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::Utf8(None)), + ColumnarValue::Scalar(ScalarValue::from(5i64)), + ], + Ok(None), + &str, + Utf8, + StringArray + ); + test_function!( 
+ LPadFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("hi")), + ColumnarValue::Scalar(ScalarValue::from(5i64)), + ColumnarValue::Scalar(ScalarValue::from("xy")), + ], + Ok(Some("xyxhi")), + &str, + Utf8, + StringArray + ); + test_function!( + LPadFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("hi")), + ColumnarValue::Scalar(ScalarValue::from(21i64)), + ColumnarValue::Scalar(ScalarValue::from("abcdef")), + ], + Ok(Some("abcdefabcdefabcdefahi")), + &str, + Utf8, + StringArray + ); + test_function!( + LPadFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("hi")), + ColumnarValue::Scalar(ScalarValue::from(5i64)), + ColumnarValue::Scalar(ScalarValue::from(" ")), + ], + Ok(Some(" hi")), + &str, + Utf8, + StringArray + ); + test_function!( + LPadFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("hi")), + ColumnarValue::Scalar(ScalarValue::from(5i64)), + ColumnarValue::Scalar(ScalarValue::from("")), + ], + Ok(Some("hi")), + &str, + Utf8, + StringArray + ); + test_function!( + LPadFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::Utf8(None)), + ColumnarValue::Scalar(ScalarValue::from(5i64)), + ColumnarValue::Scalar(ScalarValue::from("xy")), + ], + Ok(None), + &str, + Utf8, + StringArray + ); + test_function!( + LPadFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("hi")), + ColumnarValue::Scalar(ScalarValue::Int64(None)), + ColumnarValue::Scalar(ScalarValue::from("xy")), + ], + Ok(None), + &str, + Utf8, + StringArray + ); + test_function!( + LPadFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("hi")), + ColumnarValue::Scalar(ScalarValue::from(5i64)), + ColumnarValue::Scalar(ScalarValue::Utf8(None)), + ], + Ok(None), + &str, + Utf8, + StringArray + ); + test_function!( + LPadFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("josé")), + ColumnarValue::Scalar(ScalarValue::from(10i64)), + ColumnarValue::Scalar(ScalarValue::from("xy")), + ], + Ok(Some("xyxyxyjosé")), + &str, + Utf8, + StringArray + 
); + test_function!( + LPadFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("josé")), + ColumnarValue::Scalar(ScalarValue::from(10i64)), + ColumnarValue::Scalar(ScalarValue::from("éñ")), + ], + Ok(Some("éñéñéñjosé")), + &str, + Utf8, + StringArray + ); + #[cfg(not(feature = "unicode_expressions"))] + test_function!( + LPadFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("josé")), + ColumnarValue::Scalar(ScalarValue::from(5i64)), + ], + internal_err!( + "function lpad requires compilation with feature flag: unicode_expressions." + ), + &str, + Utf8, + StringArray + ); + Ok(()) + } +} diff --git a/datafusion/functions/src/unicode/mod.rs b/datafusion/functions/src/unicode/mod.rs index 291de3843903..ea4e70a92199 100644 --- a/datafusion/functions/src/unicode/mod.rs +++ b/datafusion/functions/src/unicode/mod.rs @@ -22,6 +22,11 @@ use std::sync::Arc; use datafusion_expr::ScalarUDF; mod character_length; +mod left; +mod lpad; +mod reverse; +mod right; +mod rpad; // create UDFs make_udf_function!( @@ -29,6 +34,11 @@ make_udf_function!( CHARACTER_LENGTH, character_length ); +make_udf_function!(left::LeftFunc, LEFT, left); +make_udf_function!(lpad::LPadFunc, LPAD, lpad); +make_udf_function!(right::RightFunc, RIGHT, right); +make_udf_function!(reverse::ReverseFunc, REVERSE, reverse); +make_udf_function!(rpad::RPadFunc, RPAD, rpad); pub mod expr_fn { use datafusion_expr::Expr; @@ -47,9 +57,41 @@ pub mod expr_fn { pub fn length(string: Expr) -> Expr { character_length(string) } + + #[doc = "returns the first `n` characters in the `string`"] + pub fn left(string: Expr, n: Expr) -> Expr { + super::left().call(vec![string, n]) + } + + #[doc = "fill up a string to the length by prepending the characters"] + pub fn lpad(args: Vec) -> Expr { + super::lpad().call(args) + } + + #[doc = "reverses the `string`"] + pub fn reverse(string: Expr) -> Expr { + super::reverse().call(vec![string]) + } + + #[doc = "returns the last `n` characters in the `string`"] + pub fn 
right(string: Expr, n: Expr) -> Expr { + super::right().call(vec![string, n]) + } + + #[doc = "fill up a string to the length by appending the characters"] + pub fn rpad(args: Vec) -> Expr { + super::rpad().call(args) + } } /// Return a list of all functions in this package pub fn functions() -> Vec> { - vec![character_length()] + vec![ + character_length(), + left(), + lpad(), + reverse(), + right(), + rpad(), + ] } diff --git a/datafusion/functions/src/unicode/reverse.rs b/datafusion/functions/src/unicode/reverse.rs new file mode 100644 index 000000000000..42ca6e0d17c3 --- /dev/null +++ b/datafusion/functions/src/unicode/reverse.rs @@ -0,0 +1,149 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use std::any::Any; +use std::sync::Arc; + +use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait}; +use arrow::datatypes::DataType; + +use datafusion_common::cast::as_generic_string_array; +use datafusion_common::{exec_err, Result}; +use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; + +use crate::utils::{make_scalar_function, utf8_to_str_type}; + +#[derive(Debug)] +pub(super) struct ReverseFunc { + signature: Signature, +} + +impl ReverseFunc { + pub fn new() -> Self { + use DataType::*; + Self { + signature: Signature::uniform( + 1, + vec![Utf8, LargeUtf8], + Volatility::Immutable, + ), + } + } +} + +impl ScalarUDFImpl for ReverseFunc { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "reverse" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + utf8_to_str_type(&arg_types[0], "reverse") + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result { + match args[0].data_type() { + DataType::Utf8 => make_scalar_function(reverse::, vec![])(args), + DataType::LargeUtf8 => make_scalar_function(reverse::, vec![])(args), + other => { + exec_err!("Unsupported data type {other:?} for function reverse") + } + } + } +} + +/// Reverses the order of the characters in the string. 
+/// reverse('abcde') = 'edcba' +/// The implementation uses UTF-8 code points as characters +pub fn reverse(args: &[ArrayRef]) -> Result { + let string_array = as_generic_string_array::(&args[0])?; + + let result = string_array + .iter() + .map(|string| string.map(|string: &str| string.chars().rev().collect::())) + .collect::>(); + + Ok(Arc::new(result) as ArrayRef) +} + +#[cfg(test)] +mod tests { + use arrow::array::{Array, StringArray}; + use arrow::datatypes::DataType::Utf8; + + use datafusion_common::{Result, ScalarValue}; + use datafusion_expr::{ColumnarValue, ScalarUDFImpl}; + + use crate::unicode::reverse::ReverseFunc; + use crate::utils::test::test_function; + + #[test] + fn test_functions() -> Result<()> { + test_function!( + ReverseFunc::new(), + &[ColumnarValue::Scalar(ScalarValue::from("abcde"))], + Ok(Some("edcba")), + &str, + Utf8, + StringArray + ); + test_function!( + ReverseFunc::new(), + &[ColumnarValue::Scalar(ScalarValue::from("loẅks"))], + Ok(Some("sk̈wol")), + &str, + Utf8, + StringArray + ); + test_function!( + ReverseFunc::new(), + &[ColumnarValue::Scalar(ScalarValue::from("loẅks"))], + Ok(Some("sk̈wol")), + &str, + Utf8, + StringArray + ); + test_function!( + ReverseFunc::new(), + &[ColumnarValue::Scalar(ScalarValue::Utf8(None))], + Ok(None), + &str, + Utf8, + StringArray + ); + #[cfg(not(feature = "unicode_expressions"))] + test_function!( + ReverseFunc::new(), + &[ColumnarValue::Scalar(ScalarValue::from("abcde"))], + internal_err!( + "function reverse requires compilation with feature flag: unicode_expressions." + ), + &str, + Utf8, + StringArray + ); + + Ok(()) + } +} diff --git a/datafusion/functions/src/unicode/right.rs b/datafusion/functions/src/unicode/right.rs new file mode 100644 index 000000000000..d1bd976342b2 --- /dev/null +++ b/datafusion/functions/src/unicode/right.rs @@ -0,0 +1,238 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; +use std::cmp::{max, Ordering}; +use std::sync::Arc; + +use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait}; +use arrow::datatypes::DataType; + +use datafusion_common::cast::{as_generic_string_array, as_int64_array}; +use datafusion_common::exec_err; +use datafusion_common::Result; +use datafusion_expr::TypeSignature::Exact; +use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; + +use crate::utils::{make_scalar_function, utf8_to_str_type}; + +#[derive(Debug)] +pub(super) struct RightFunc { + signature: Signature, +} + +impl RightFunc { + pub fn new() -> Self { + use DataType::*; + Self { + signature: Signature::one_of( + vec![Exact(vec![Utf8, Int64]), Exact(vec![LargeUtf8, Int64])], + Volatility::Immutable, + ), + } + } +} + +impl ScalarUDFImpl for RightFunc { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "right" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + utf8_to_str_type(&arg_types[0], "right") + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result { + match args[0].data_type() { + DataType::Utf8 => make_scalar_function(right::, vec![])(args), + DataType::LargeUtf8 => make_scalar_function(right::, 
vec![])(args), + other => exec_err!("Unsupported data type {other:?} for function right"), + } + } +} + +/// Returns last n characters in the string, or when n is negative, returns all but first |n| characters. +/// right('abcde', 2) = 'de' +/// The implementation uses UTF-8 code points as characters +pub fn right(args: &[ArrayRef]) -> Result { + let string_array = as_generic_string_array::(&args[0])?; + let n_array = as_int64_array(&args[1])?; + + let result = string_array + .iter() + .zip(n_array.iter()) + .map(|(string, n)| match (string, n) { + (Some(string), Some(n)) => match n.cmp(&0) { + Ordering::Less => Some( + string + .chars() + .skip(n.unsigned_abs() as usize) + .collect::(), + ), + Ordering::Equal => Some("".to_string()), + Ordering::Greater => Some( + string + .chars() + .skip(max(string.chars().count() as i64 - n, 0) as usize) + .collect::(), + ), + }, + _ => None, + }) + .collect::>(); + + Ok(Arc::new(result) as ArrayRef) +} + +#[cfg(test)] +mod tests { + use arrow::array::{Array, StringArray}; + use arrow::datatypes::DataType::Utf8; + + use datafusion_common::{Result, ScalarValue}; + use datafusion_expr::{ColumnarValue, ScalarUDFImpl}; + + use crate::unicode::right::RightFunc; + use crate::utils::test::test_function; + + #[test] + fn test_functions() -> Result<()> { + test_function!( + RightFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("abcde")), + ColumnarValue::Scalar(ScalarValue::from(2i64)), + ], + Ok(Some("de")), + &str, + Utf8, + StringArray + ); + test_function!( + RightFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("abcde")), + ColumnarValue::Scalar(ScalarValue::from(200i64)), + ], + Ok(Some("abcde")), + &str, + Utf8, + StringArray + ); + test_function!( + RightFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("abcde")), + ColumnarValue::Scalar(ScalarValue::from(-2i64)), + ], + Ok(Some("cde")), + &str, + Utf8, + StringArray + ); + test_function!( + RightFunc::new(), + &[ + 
ColumnarValue::Scalar(ScalarValue::from("abcde")), + ColumnarValue::Scalar(ScalarValue::from(-200i64)), + ], + Ok(Some("")), + &str, + Utf8, + StringArray + ); + test_function!( + RightFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("abcde")), + ColumnarValue::Scalar(ScalarValue::from(0i64)), + ], + Ok(Some("")), + &str, + Utf8, + StringArray + ); + test_function!( + RightFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::Utf8(None)), + ColumnarValue::Scalar(ScalarValue::from(2i64)), + ], + Ok(None), + &str, + Utf8, + StringArray + ); + test_function!( + RightFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("abcde")), + ColumnarValue::Scalar(ScalarValue::Int64(None)), + ], + Ok(None), + &str, + Utf8, + StringArray + ); + test_function!( + RightFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("joséésoj")), + ColumnarValue::Scalar(ScalarValue::from(5i64)), + ], + Ok(Some("éésoj")), + &str, + Utf8, + StringArray + ); + test_function!( + RightFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("joséésoj")), + ColumnarValue::Scalar(ScalarValue::from(-3i64)), + ], + Ok(Some("éésoj")), + &str, + Utf8, + StringArray + ); + #[cfg(not(feature = "unicode_expressions"))] + test_function!( + RightFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("abcde")), + ColumnarValue::Scalar(ScalarValue::from(2i64)), + ], + internal_err!( + "function right requires compilation with feature flag: unicode_expressions." + ), + &str, + Utf8, + StringArray + ); + + Ok(()) + } +} diff --git a/datafusion/functions/src/unicode/rpad.rs b/datafusion/functions/src/unicode/rpad.rs new file mode 100644 index 000000000000..070278c90b2f --- /dev/null +++ b/datafusion/functions/src/unicode/rpad.rs @@ -0,0 +1,361 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; +use std::sync::Arc; + +use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait}; +use arrow::datatypes::DataType; +use datafusion_common::cast::{as_generic_string_array, as_int64_array}; +use unicode_segmentation::UnicodeSegmentation; + +use crate::utils::{make_scalar_function, utf8_to_str_type}; +use datafusion_common::{exec_err, Result}; +use datafusion_expr::TypeSignature::Exact; +use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; + +#[derive(Debug)] +pub(super) struct RPadFunc { + signature: Signature, +} + +impl RPadFunc { + pub fn new() -> Self { + use DataType::*; + Self { + signature: Signature::one_of( + vec![ + Exact(vec![Utf8, Int64]), + Exact(vec![LargeUtf8, Int64]), + Exact(vec![Utf8, Int64, Utf8]), + Exact(vec![LargeUtf8, Int64, Utf8]), + Exact(vec![Utf8, Int64, LargeUtf8]), + Exact(vec![LargeUtf8, Int64, LargeUtf8]), + ], + Volatility::Immutable, + ), + } + } +} + +impl ScalarUDFImpl for RPadFunc { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "rpad" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + utf8_to_str_type(&arg_types[0], "rpad") + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result { + match args[0].data_type() { + DataType::Utf8 => make_scalar_function(rpad::, vec![])(args), + 
DataType::LargeUtf8 => make_scalar_function(rpad::, vec![])(args), + other => exec_err!("Unsupported data type {other:?} for function rpad"), + } + } +} + +/// Extends the string to length 'length' by appending the characters fill (a space by default). If the string is already longer than length then it is truncated. +/// rpad('hi', 5, 'xy') = 'hixyx' +pub fn rpad(args: &[ArrayRef]) -> Result { + match args.len() { + 2 => { + let string_array = as_generic_string_array::(&args[0])?; + let length_array = as_int64_array(&args[1])?; + + let result = string_array + .iter() + .zip(length_array.iter()) + .map(|(string, length)| match (string, length) { + (Some(string), Some(length)) => { + if length > i32::MAX as i64 { + return exec_err!( + "rpad requested length {length} too large" + ); + } + + let length = if length < 0 { 0 } else { length as usize }; + if length == 0 { + Ok(Some("".to_string())) + } else { + let graphemes = string.graphemes(true).collect::>(); + if length < graphemes.len() { + Ok(Some(graphemes[..length].concat())) + } else { + let mut s = string.to_string(); + s.push_str(" ".repeat(length - graphemes.len()).as_str()); + Ok(Some(s)) + } + } + } + _ => Ok(None), + }) + .collect::>>()?; + Ok(Arc::new(result) as ArrayRef) + } + 3 => { + let string_array = as_generic_string_array::(&args[0])?; + let length_array = as_int64_array(&args[1])?; + let fill_array = as_generic_string_array::(&args[2])?; + + let result = string_array + .iter() + .zip(length_array.iter()) + .zip(fill_array.iter()) + .map(|((string, length), fill)| match (string, length, fill) { + (Some(string), Some(length), Some(fill)) => { + if length > i32::MAX as i64 { + return exec_err!( + "rpad requested length {length} too large" + ); + } + + let length = if length < 0 { 0 } else { length as usize }; + let graphemes = string.graphemes(true).collect::>(); + let fill_chars = fill.chars().collect::>(); + + if length < graphemes.len() { + Ok(Some(graphemes[..length].concat())) + } else if 
fill_chars.is_empty() { + Ok(Some(string.to_string())) + } else { + let mut s = string.to_string(); + let mut char_vector = + Vec::::with_capacity(length - graphemes.len()); + for l in 0..length - graphemes.len() { + char_vector + .push(*fill_chars.get(l % fill_chars.len()).unwrap()); + } + s.push_str(char_vector.iter().collect::().as_str()); + Ok(Some(s)) + } + } + _ => Ok(None), + }) + .collect::>>()?; + + Ok(Arc::new(result) as ArrayRef) + } + other => exec_err!( + "rpad was called with {other} arguments. It requires at least 2 and at most 3." + ), + } +} + +#[cfg(test)] +mod tests { + use arrow::array::{Array, StringArray}; + use arrow::datatypes::DataType::Utf8; + + use datafusion_common::{Result, ScalarValue}; + use datafusion_expr::{ColumnarValue, ScalarUDFImpl}; + + use crate::unicode::rpad::RPadFunc; + use crate::utils::test::test_function; + + #[test] + fn test_functions() -> Result<()> { + test_function!( + RPadFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("josé")), + ColumnarValue::Scalar(ScalarValue::from(5i64)), + ], + Ok(Some("josé ")), + &str, + Utf8, + StringArray + ); + test_function!( + RPadFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("hi")), + ColumnarValue::Scalar(ScalarValue::from(5i64)), + ], + Ok(Some("hi ")), + &str, + Utf8, + StringArray + ); + test_function!( + RPadFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("hi")), + ColumnarValue::Scalar(ScalarValue::from(0i64)), + ], + Ok(Some("")), + &str, + Utf8, + StringArray + ); + test_function!( + RPadFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("hi")), + ColumnarValue::Scalar(ScalarValue::Int64(None)), + ], + Ok(None), + &str, + Utf8, + StringArray + ); + test_function!( + RPadFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::Utf8(None)), + ColumnarValue::Scalar(ScalarValue::from(5i64)), + ], + Ok(None), + &str, + Utf8, + StringArray + ); + test_function!( + RPadFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("hi")), + 
ColumnarValue::Scalar(ScalarValue::from(5i64)), + ColumnarValue::Scalar(ScalarValue::from("xy")), + ], + Ok(Some("hixyx")), + &str, + Utf8, + StringArray + ); + test_function!( + RPadFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("hi")), + ColumnarValue::Scalar(ScalarValue::from(21i64)), + ColumnarValue::Scalar(ScalarValue::from("abcdef")), + ], + Ok(Some("hiabcdefabcdefabcdefa")), + &str, + Utf8, + StringArray + ); + test_function!( + RPadFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("hi")), + ColumnarValue::Scalar(ScalarValue::from(5i64)), + ColumnarValue::Scalar(ScalarValue::from(" ")), + ], + Ok(Some("hi ")), + &str, + Utf8, + StringArray + ); + test_function!( + RPadFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("hi")), + ColumnarValue::Scalar(ScalarValue::from(5i64)), + ColumnarValue::Scalar(ScalarValue::from("")), + ], + Ok(Some("hi")), + &str, + Utf8, + StringArray + ); + test_function!( + RPadFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::Utf8(None)), + ColumnarValue::Scalar(ScalarValue::from(5i64)), + ColumnarValue::Scalar(ScalarValue::from("xy")), + ], + Ok(None), + &str, + Utf8, + StringArray + ); + test_function!( + RPadFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("hi")), + ColumnarValue::Scalar(ScalarValue::Int64(None)), + ColumnarValue::Scalar(ScalarValue::from("xy")), + ], + Ok(None), + &str, + Utf8, + StringArray + ); + test_function!( + RPadFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("hi")), + ColumnarValue::Scalar(ScalarValue::from(5i64)), + ColumnarValue::Scalar(ScalarValue::Utf8(None)), + ], + Ok(None), + &str, + Utf8, + StringArray + ); + test_function!( + RPadFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("josé")), + ColumnarValue::Scalar(ScalarValue::from(10i64)), + ColumnarValue::Scalar(ScalarValue::from("xy")), + ], + Ok(Some("joséxyxyxy")), + &str, + Utf8, + StringArray + ); + test_function!( + RPadFunc::new(), + &[ + 
ColumnarValue::Scalar(ScalarValue::from("josé")), + ColumnarValue::Scalar(ScalarValue::from(10i64)), + ColumnarValue::Scalar(ScalarValue::from("éñ")), + ], + Ok(Some("josééñéñéñ")), + &str, + Utf8, + StringArray + ); + #[cfg(not(feature = "unicode_expressions"))] + test_function!( + RPadFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("josé")), + ColumnarValue::Scalar(ScalarValue::from(5i64)), + ], + internal_err!( + "function rpad requires compilation with feature flag: unicode_expressions." + ), + &str, + Utf8, + StringArray + ); + + Ok(()) + } +} diff --git a/datafusion/physical-expr/src/functions.rs b/datafusion/physical-expr/src/functions.rs index 9adc8536341d..c1b4900e399a 100644 --- a/datafusion/physical-expr/src/functions.rs +++ b/datafusion/physical-expr/src/functions.rs @@ -270,67 +270,6 @@ pub fn create_physical_fun( exec_err!("Unsupported data type {other:?} for function initcap") } }), - BuiltinScalarFunction::Left => Arc::new(|args| match args[0].data_type() { - DataType::Utf8 => { - let func = invoke_if_unicode_expressions_feature_flag!(left, i32, "left"); - make_scalar_function_inner(func)(args) - } - DataType::LargeUtf8 => { - let func = invoke_if_unicode_expressions_feature_flag!(left, i64, "left"); - make_scalar_function_inner(func)(args) - } - other => exec_err!("Unsupported data type {other:?} for function left"), - }), - BuiltinScalarFunction::Lpad => Arc::new(|args| match args[0].data_type() { - DataType::Utf8 => { - let func = invoke_if_unicode_expressions_feature_flag!(lpad, i32, "lpad"); - make_scalar_function_inner(func)(args) - } - DataType::LargeUtf8 => { - let func = invoke_if_unicode_expressions_feature_flag!(lpad, i64, "lpad"); - make_scalar_function_inner(func)(args) - } - other => exec_err!("Unsupported data type {other:?} for function lpad"), - }), - BuiltinScalarFunction::Reverse => Arc::new(|args| match args[0].data_type() { - DataType::Utf8 => { - let func = - invoke_if_unicode_expressions_feature_flag!(reverse, i32, 
"reverse"); - make_scalar_function_inner(func)(args) - } - DataType::LargeUtf8 => { - let func = - invoke_if_unicode_expressions_feature_flag!(reverse, i64, "reverse"); - make_scalar_function_inner(func)(args) - } - other => { - exec_err!("Unsupported data type {other:?} for function reverse") - } - }), - BuiltinScalarFunction::Right => Arc::new(|args| match args[0].data_type() { - DataType::Utf8 => { - let func = - invoke_if_unicode_expressions_feature_flag!(right, i32, "right"); - make_scalar_function_inner(func)(args) - } - DataType::LargeUtf8 => { - let func = - invoke_if_unicode_expressions_feature_flag!(right, i64, "right"); - make_scalar_function_inner(func)(args) - } - other => exec_err!("Unsupported data type {other:?} for function right"), - }), - BuiltinScalarFunction::Rpad => Arc::new(|args| match args[0].data_type() { - DataType::Utf8 => { - let func = invoke_if_unicode_expressions_feature_flag!(rpad, i32, "rpad"); - make_scalar_function_inner(func)(args) - } - DataType::LargeUtf8 => { - let func = invoke_if_unicode_expressions_feature_flag!(rpad, i64, "rpad"); - make_scalar_function_inner(func)(args) - } - other => exec_err!("Unsupported data type {other:?} for function rpad"), - }), BuiltinScalarFunction::EndsWith => Arc::new(|args| match args[0].data_type() { DataType::Utf8 => { make_scalar_function_inner(string_expressions::ends_with::)(args) @@ -691,551 +630,6 @@ mod tests { Utf8, StringArray ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Left, - &[lit("abcde"), lit(ScalarValue::Int8(Some(2))),], - Ok(Some("ab")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Left, - &[lit("abcde"), lit(ScalarValue::Int64(Some(200))),], - Ok(Some("abcde")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Left, - &[lit("abcde"), lit(ScalarValue::Int64(Some(-2))),], - Ok(Some("abc")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = 
"unicode_expressions")] - test_function!( - Left, - &[lit("abcde"), lit(ScalarValue::Int64(Some(-200))),], - Ok(Some("")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Left, - &[lit("abcde"), lit(ScalarValue::Int64(Some(0))),], - Ok(Some("")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Left, - &[ - lit(ScalarValue::Utf8(None)), - lit(ScalarValue::Int64(Some(2))), - ], - Ok(None), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Left, - &[lit("abcde"), lit(ScalarValue::Int64(None)),], - Ok(None), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Left, - &[lit("joséésoj"), lit(ScalarValue::Int64(Some(5))),], - Ok(Some("joséé")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Left, - &[lit("joséésoj"), lit(ScalarValue::Int64(Some(-3))),], - Ok(Some("joséé")), - &str, - Utf8, - StringArray - ); - #[cfg(not(feature = "unicode_expressions"))] - test_function!( - Left, - &[ - lit("abcde"), - lit(ScalarValue::Int8(Some(2))), - ], - internal_err!( - "function left requires compilation with feature flag: unicode_expressions." 
- ), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Lpad, - &[lit("josé"), lit(ScalarValue::Int64(Some(5))),], - Ok(Some(" josé")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Lpad, - &[lit("hi"), lit(ScalarValue::Int64(Some(5))),], - Ok(Some(" hi")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Lpad, - &[lit("hi"), lit(ScalarValue::Int64(Some(0))),], - Ok(Some("")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Lpad, - &[lit("hi"), lit(ScalarValue::Int64(None)),], - Ok(None), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Lpad, - &[ - lit(ScalarValue::Utf8(None)), - lit(ScalarValue::Int64(Some(5))), - ], - Ok(None), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Lpad, - &[lit("hi"), lit(ScalarValue::Int64(Some(5))), lit("xy"),], - Ok(Some("xyxhi")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Lpad, - &[lit("hi"), lit(ScalarValue::Int64(Some(21))), lit("abcdef"),], - Ok(Some("abcdefabcdefabcdefahi")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Lpad, - &[lit("hi"), lit(ScalarValue::Int64(Some(5))), lit(" "),], - Ok(Some(" hi")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Lpad, - &[lit("hi"), lit(ScalarValue::Int64(Some(5))), lit(""),], - Ok(Some("hi")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Lpad, - &[ - lit(ScalarValue::Utf8(None)), - lit(ScalarValue::Int64(Some(5))), - lit("xy"), - ], - Ok(None), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Lpad, - &[lit("hi"), lit(ScalarValue::Int64(None)), 
lit("xy"),], - Ok(None), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Lpad, - &[ - lit("hi"), - lit(ScalarValue::Int64(Some(5))), - lit(ScalarValue::Utf8(None)), - ], - Ok(None), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Lpad, - &[lit("josé"), lit(ScalarValue::Int64(Some(10))), lit("xy"),], - Ok(Some("xyxyxyjosé")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Lpad, - &[lit("josé"), lit(ScalarValue::Int64(Some(10))), lit("éñ"),], - Ok(Some("éñéñéñjosé")), - &str, - Utf8, - StringArray - ); - #[cfg(not(feature = "unicode_expressions"))] - test_function!( - Lpad, - &[ - lit("josé"), - lit(ScalarValue::Int64(Some(5))), - ], - internal_err!( - "function lpad requires compilation with feature flag: unicode_expressions." - ), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Reverse, - &[lit("abcde")], - Ok(Some("edcba")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Reverse, - &[lit("loẅks")], - Ok(Some("sk̈wol")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Reverse, - &[lit("loẅks")], - Ok(Some("sk̈wol")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Reverse, - &[lit(ScalarValue::Utf8(None))], - Ok(None), - &str, - Utf8, - StringArray - ); - #[cfg(not(feature = "unicode_expressions"))] - test_function!( - Reverse, - &[lit("abcde")], - internal_err!( - "function reverse requires compilation with feature flag: unicode_expressions." 
- ), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Right, - &[lit("abcde"), lit(ScalarValue::Int8(Some(2))),], - Ok(Some("de")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Right, - &[lit("abcde"), lit(ScalarValue::Int64(Some(200))),], - Ok(Some("abcde")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Right, - &[lit("abcde"), lit(ScalarValue::Int64(Some(-2))),], - Ok(Some("cde")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Right, - &[lit("abcde"), lit(ScalarValue::Int64(Some(-200))),], - Ok(Some("")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Right, - &[lit("abcde"), lit(ScalarValue::Int64(Some(0))),], - Ok(Some("")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Right, - &[ - lit(ScalarValue::Utf8(None)), - lit(ScalarValue::Int64(Some(2))), - ], - Ok(None), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Right, - &[lit("abcde"), lit(ScalarValue::Int64(None)),], - Ok(None), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Right, - &[lit("joséésoj"), lit(ScalarValue::Int64(Some(5))),], - Ok(Some("éésoj")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Right, - &[lit("joséésoj"), lit(ScalarValue::Int64(Some(-3))),], - Ok(Some("éésoj")), - &str, - Utf8, - StringArray - ); - #[cfg(not(feature = "unicode_expressions"))] - test_function!( - Right, - &[ - lit("abcde"), - lit(ScalarValue::Int8(Some(2))), - ], - internal_err!( - "function right requires compilation with feature flag: unicode_expressions." 
- ), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Rpad, - &[lit("josé"), lit(ScalarValue::Int64(Some(5))),], - Ok(Some("josé ")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Rpad, - &[lit("hi"), lit(ScalarValue::Int64(Some(5))),], - Ok(Some("hi ")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Rpad, - &[lit("hi"), lit(ScalarValue::Int64(Some(0))),], - Ok(Some("")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Rpad, - &[lit("hi"), lit(ScalarValue::Int64(None)),], - Ok(None), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Rpad, - &[ - lit(ScalarValue::Utf8(None)), - lit(ScalarValue::Int64(Some(5))), - ], - Ok(None), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Rpad, - &[lit("hi"), lit(ScalarValue::Int64(Some(5))), lit("xy"),], - Ok(Some("hixyx")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Rpad, - &[lit("hi"), lit(ScalarValue::Int64(Some(21))), lit("abcdef"),], - Ok(Some("hiabcdefabcdefabcdefa")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Rpad, - &[lit("hi"), lit(ScalarValue::Int64(Some(5))), lit(" "),], - Ok(Some("hi ")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Rpad, - &[lit("hi"), lit(ScalarValue::Int64(Some(5))), lit(""),], - Ok(Some("hi")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Rpad, - &[ - lit(ScalarValue::Utf8(None)), - lit(ScalarValue::Int64(Some(5))), - lit("xy"), - ], - Ok(None), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Rpad, - &[lit("hi"), lit(ScalarValue::Int64(None)), 
lit("xy"),], - Ok(None), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Rpad, - &[ - lit("hi"), - lit(ScalarValue::Int64(Some(5))), - lit(ScalarValue::Utf8(None)), - ], - Ok(None), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Rpad, - &[lit("josé"), lit(ScalarValue::Int64(Some(10))), lit("xy"),], - Ok(Some("joséxyxyxy")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Rpad, - &[lit("josé"), lit(ScalarValue::Int64(Some(10))), lit("éñ"),], - Ok(Some("josééñéñéñ")), - &str, - Utf8, - StringArray - ); - #[cfg(not(feature = "unicode_expressions"))] - test_function!( - Rpad, - &[ - lit("josé"), - lit(ScalarValue::Int64(Some(5))), - ], - internal_err!( - "function rpad requires compilation with feature flag: unicode_expressions." - ), - &str, - Utf8, - StringArray - ); test_function!( EndsWith, &[lit("alphabet"), lit("alph"),], diff --git a/datafusion/physical-expr/src/planner.rs b/datafusion/physical-expr/src/planner.rs index 319d9ca2269a..0dbea09ffb51 100644 --- a/datafusion/physical-expr/src/planner.rs +++ b/datafusion/physical-expr/src/planner.rs @@ -335,11 +335,11 @@ mod tests { use arrow_array::{ArrayRef, BooleanArray, RecordBatch, StringArray}; use arrow_schema::{DataType, Field, Schema}; use datafusion_common::{DFSchema, Result}; - use datafusion_expr::{col, left, Literal}; + use datafusion_expr::{col, lit}; #[test] fn test_create_physical_expr_scalar_input_output() -> Result<()> { - let expr = col("letter").eq(left("APACHE".lit(), 1i64.lit())); + let expr = col("letter").eq(lit("A")); let schema = Schema::new(vec![Field::new("letter", DataType::Utf8, false)]); let df_schema = DFSchema::try_from_qualified_schema("data", &schema)?; diff --git a/datafusion/physical-expr/src/unicode_expressions.rs b/datafusion/physical-expr/src/unicode_expressions.rs index c7e4b7d7c443..faff21111a61 100644 --- 
a/datafusion/physical-expr/src/unicode_expressions.rs +++ b/datafusion/physical-expr/src/unicode_expressions.rs @@ -21,7 +21,7 @@ //! Unicode expressions -use std::cmp::{max, Ordering}; +use std::cmp::max; use std::sync::Arc; use arrow::{ @@ -36,267 +36,6 @@ use datafusion_common::{ exec_err, Result, }; -/// Returns first n characters in the string, or when n is negative, returns all but last |n| characters. -/// left('abcde', 2) = 'ab' -/// The implementation uses UTF-8 code points as characters -pub fn left(args: &[ArrayRef]) -> Result { - let string_array = as_generic_string_array::(&args[0])?; - let n_array = as_int64_array(&args[1])?; - let result = string_array - .iter() - .zip(n_array.iter()) - .map(|(string, n)| match (string, n) { - (Some(string), Some(n)) => match n.cmp(&0) { - Ordering::Less => { - let len = string.chars().count() as i64; - Some(if n.abs() < len { - string.chars().take((len + n) as usize).collect::() - } else { - "".to_string() - }) - } - Ordering::Equal => Some("".to_string()), - Ordering::Greater => { - Some(string.chars().take(n as usize).collect::()) - } - }, - _ => None, - }) - .collect::>(); - - Ok(Arc::new(result) as ArrayRef) -} - -/// Extends the string to length 'length' by prepending the characters fill (a space by default). If the string is already longer than length then it is truncated (on the right). 
-/// lpad('hi', 5, 'xy') = 'xyxhi' -pub fn lpad(args: &[ArrayRef]) -> Result { - match args.len() { - 2 => { - let string_array = as_generic_string_array::(&args[0])?; - let length_array = as_int64_array(&args[1])?; - - let result = string_array - .iter() - .zip(length_array.iter()) - .map(|(string, length)| match (string, length) { - (Some(string), Some(length)) => { - if length > i32::MAX as i64 { - return exec_err!( - "lpad requested length {length} too large" - ); - } - - let length = if length < 0 { 0 } else { length as usize }; - if length == 0 { - Ok(Some("".to_string())) - } else { - let graphemes = string.graphemes(true).collect::>(); - if length < graphemes.len() { - Ok(Some(graphemes[..length].concat())) - } else { - let mut s: String = " ".repeat(length - graphemes.len()); - s.push_str(string); - Ok(Some(s)) - } - } - } - _ => Ok(None), - }) - .collect::>>()?; - - Ok(Arc::new(result) as ArrayRef) - } - 3 => { - let string_array = as_generic_string_array::(&args[0])?; - let length_array = as_int64_array(&args[1])?; - let fill_array = as_generic_string_array::(&args[2])?; - - let result = string_array - .iter() - .zip(length_array.iter()) - .zip(fill_array.iter()) - .map(|((string, length), fill)| match (string, length, fill) { - (Some(string), Some(length), Some(fill)) => { - if length > i32::MAX as i64 { - return exec_err!( - "lpad requested length {length} too large" - ); - } - - let length = if length < 0 { 0 } else { length as usize }; - if length == 0 { - Ok(Some("".to_string())) - } else { - let graphemes = string.graphemes(true).collect::>(); - let fill_chars = fill.chars().collect::>(); - - if length < graphemes.len() { - Ok(Some(graphemes[..length].concat())) - } else if fill_chars.is_empty() { - Ok(Some(string.to_string())) - } else { - let mut s = string.to_string(); - let mut char_vector = - Vec::::with_capacity(length - graphemes.len()); - for l in 0..length - graphemes.len() { - char_vector.push( - *fill_chars.get(l % 
fill_chars.len()).unwrap(), - ); - } - s.insert_str( - 0, - char_vector.iter().collect::().as_str(), - ); - Ok(Some(s)) - } - } - } - _ => Ok(None), - }) - .collect::>>()?; - - Ok(Arc::new(result) as ArrayRef) - } - other => exec_err!( - "lpad was called with {other} arguments. It requires at least 2 and at most 3." - ), - } -} - -/// Reverses the order of the characters in the string. -/// reverse('abcde') = 'edcba' -/// The implementation uses UTF-8 code points as characters -pub fn reverse(args: &[ArrayRef]) -> Result { - let string_array = as_generic_string_array::(&args[0])?; - - let result = string_array - .iter() - .map(|string| string.map(|string: &str| string.chars().rev().collect::())) - .collect::>(); - - Ok(Arc::new(result) as ArrayRef) -} - -/// Returns last n characters in the string, or when n is negative, returns all but first |n| characters. -/// right('abcde', 2) = 'de' -/// The implementation uses UTF-8 code points as characters -pub fn right(args: &[ArrayRef]) -> Result { - let string_array = as_generic_string_array::(&args[0])?; - let n_array = as_int64_array(&args[1])?; - - let result = string_array - .iter() - .zip(n_array.iter()) - .map(|(string, n)| match (string, n) { - (Some(string), Some(n)) => match n.cmp(&0) { - Ordering::Less => Some( - string - .chars() - .skip(n.unsigned_abs() as usize) - .collect::(), - ), - Ordering::Equal => Some("".to_string()), - Ordering::Greater => Some( - string - .chars() - .skip(max(string.chars().count() as i64 - n, 0) as usize) - .collect::(), - ), - }, - _ => None, - }) - .collect::>(); - - Ok(Arc::new(result) as ArrayRef) -} - -/// Extends the string to length 'length' by appending the characters fill (a space by default). If the string is already longer than length then it is truncated. 
-/// rpad('hi', 5, 'xy') = 'hixyx' -pub fn rpad(args: &[ArrayRef]) -> Result { - match args.len() { - 2 => { - let string_array = as_generic_string_array::(&args[0])?; - let length_array = as_int64_array(&args[1])?; - - let result = string_array - .iter() - .zip(length_array.iter()) - .map(|(string, length)| match (string, length) { - (Some(string), Some(length)) => { - if length > i32::MAX as i64 { - return exec_err!( - "rpad requested length {length} too large" - ); - } - - let length = if length < 0 { 0 } else { length as usize }; - if length == 0 { - Ok(Some("".to_string())) - } else { - let graphemes = string.graphemes(true).collect::>(); - if length < graphemes.len() { - Ok(Some(graphemes[..length].concat())) - } else { - let mut s = string.to_string(); - s.push_str(" ".repeat(length - graphemes.len()).as_str()); - Ok(Some(s)) - } - } - } - _ => Ok(None), - }) - .collect::>>()?; - Ok(Arc::new(result) as ArrayRef) - } - 3 => { - let string_array = as_generic_string_array::(&args[0])?; - let length_array = as_int64_array(&args[1])?; - let fill_array = as_generic_string_array::(&args[2])?; - - let result = string_array - .iter() - .zip(length_array.iter()) - .zip(fill_array.iter()) - .map(|((string, length), fill)| match (string, length, fill) { - (Some(string), Some(length), Some(fill)) => { - if length > i32::MAX as i64 { - return exec_err!( - "rpad requested length {length} too large" - ); - } - - let length = if length < 0 { 0 } else { length as usize }; - let graphemes = string.graphemes(true).collect::>(); - let fill_chars = fill.chars().collect::>(); - - if length < graphemes.len() { - Ok(Some(graphemes[..length].concat())) - } else if fill_chars.is_empty() { - Ok(Some(string.to_string())) - } else { - let mut s = string.to_string(); - let mut char_vector = - Vec::::with_capacity(length - graphemes.len()); - for l in 0..length - graphemes.len() { - char_vector - .push(*fill_chars.get(l % fill_chars.len()).unwrap()); - } - 
s.push_str(char_vector.iter().collect::().as_str()); - Ok(Some(s)) - } - } - _ => Ok(None), - }) - .collect::>>()?; - - Ok(Arc::new(result) as ArrayRef) - } - other => exec_err!( - "rpad was called with {other} arguments. It requires at least 2 and at most 3." - ), - } -} - /// Returns starting index of specified substring within string, or zero if it's not present. (Same as position(substring in string), but note the reversed argument order.) /// strpos('high', 'ig') = 2 /// The implementation uses UTF-8 code points as characters diff --git a/datafusion/proto/proto/datafusion.proto b/datafusion/proto/proto/datafusion.proto index 766ca6633ee1..6319372d98d2 100644 --- a/datafusion/proto/proto/datafusion.proto +++ b/datafusion/proto/proto/datafusion.proto @@ -572,8 +572,8 @@ enum ScalarFunction { // 28 was DatePart // 29 was DateTrunc InitCap = 30; - Left = 31; - Lpad = 32; + // 31 was Left + // 32 was Lpad // 33 was Lower // 34 was Ltrim // 35 was MD5 @@ -583,9 +583,9 @@ enum ScalarFunction { // 39 was RegexpReplace // 40 was Repeat // 41 was Replace - Reverse = 42; - Right = 43; - Rpad = 44; + // 42 was Reverse + // 43 was Right + // 44 was Rpad // 45 was Rtrim // 46 was SHA224 // 47 was SHA256 diff --git a/datafusion/proto/src/generated/pbjson.rs b/datafusion/proto/src/generated/pbjson.rs index f2814956ef1b..7281bc9dc263 100644 --- a/datafusion/proto/src/generated/pbjson.rs +++ b/datafusion/proto/src/generated/pbjson.rs @@ -22931,12 +22931,7 @@ impl serde::Serialize for ScalarFunction { Self::Concat => "Concat", Self::ConcatWithSeparator => "ConcatWithSeparator", Self::InitCap => "InitCap", - Self::Left => "Left", - Self::Lpad => "Lpad", Self::Random => "Random", - Self::Reverse => "Reverse", - Self::Right => "Right", - Self::Rpad => "Rpad", Self::Strpos => "Strpos", Self::Substr => "Substr", Self::Translate => "Translate", @@ -22990,12 +22985,7 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "Concat", "ConcatWithSeparator", "InitCap", - "Left", - 
"Lpad", "Random", - "Reverse", - "Right", - "Rpad", "Strpos", "Substr", "Translate", @@ -23078,12 +23068,7 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "Concat" => Ok(ScalarFunction::Concat), "ConcatWithSeparator" => Ok(ScalarFunction::ConcatWithSeparator), "InitCap" => Ok(ScalarFunction::InitCap), - "Left" => Ok(ScalarFunction::Left), - "Lpad" => Ok(ScalarFunction::Lpad), "Random" => Ok(ScalarFunction::Random), - "Reverse" => Ok(ScalarFunction::Reverse), - "Right" => Ok(ScalarFunction::Right), - "Rpad" => Ok(ScalarFunction::Rpad), "Strpos" => Ok(ScalarFunction::Strpos), "Substr" => Ok(ScalarFunction::Substr), "Translate" => Ok(ScalarFunction::Translate), diff --git a/datafusion/proto/src/generated/prost.rs b/datafusion/proto/src/generated/prost.rs index ecc94fcdaf99..2fe89efb9cea 100644 --- a/datafusion/proto/src/generated/prost.rs +++ b/datafusion/proto/src/generated/prost.rs @@ -2871,8 +2871,8 @@ pub enum ScalarFunction { /// 28 was DatePart /// 29 was DateTrunc InitCap = 30, - Left = 31, - Lpad = 32, + /// 31 was Left + /// 32 was Lpad /// 33 was Lower /// 34 was Ltrim /// 35 was MD5 @@ -2882,9 +2882,9 @@ pub enum ScalarFunction { /// 39 was RegexpReplace /// 40 was Repeat /// 41 was Replace - Reverse = 42, - Right = 43, - Rpad = 44, + /// 42 was Reverse + /// 43 was Right + /// 44 was Rpad /// 45 was Rtrim /// 46 was SHA224 /// 47 was SHA256 @@ -3004,12 +3004,7 @@ impl ScalarFunction { ScalarFunction::Concat => "Concat", ScalarFunction::ConcatWithSeparator => "ConcatWithSeparator", ScalarFunction::InitCap => "InitCap", - ScalarFunction::Left => "Left", - ScalarFunction::Lpad => "Lpad", ScalarFunction::Random => "Random", - ScalarFunction::Reverse => "Reverse", - ScalarFunction::Right => "Right", - ScalarFunction::Rpad => "Rpad", ScalarFunction::Strpos => "Strpos", ScalarFunction::Substr => "Substr", ScalarFunction::Translate => "Translate", @@ -3057,12 +3052,7 @@ impl ScalarFunction { "Concat" => Some(Self::Concat), "ConcatWithSeparator" => 
Some(Self::ConcatWithSeparator), "InitCap" => Some(Self::InitCap), - "Left" => Some(Self::Left), - "Lpad" => Some(Self::Lpad), "Random" => Some(Self::Random), - "Reverse" => Some(Self::Reverse), - "Right" => Some(Self::Right), - "Rpad" => Some(Self::Rpad), "Strpos" => Some(Self::Strpos), "Substr" => Some(Self::Substr), "Translate" => Some(Self::Translate), diff --git a/datafusion/proto/src/logical_plan/from_proto.rs b/datafusion/proto/src/logical_plan/from_proto.rs index 19edd71a3a80..2c6f2e479b24 100644 --- a/datafusion/proto/src/logical_plan/from_proto.rs +++ b/datafusion/proto/src/logical_plan/from_proto.rs @@ -17,18 +17,6 @@ use std::sync::Arc; -use crate::protobuf::{ - self, - plan_type::PlanTypeEnum::{ - AnalyzedLogicalPlan, FinalAnalyzedLogicalPlan, FinalLogicalPlan, - FinalPhysicalPlan, FinalPhysicalPlanWithStats, InitialLogicalPlan, - InitialPhysicalPlan, InitialPhysicalPlanWithStats, OptimizedLogicalPlan, - OptimizedPhysicalPlan, - }, - AnalyzedLogicalPlanType, CubeNode, GroupingSetNode, OptimizedLogicalPlanType, - OptimizedPhysicalPlanType, PlaceholderNode, RollupNode, -}; - use arrow::{ array::AsArray, buffer::Buffer, @@ -38,6 +26,7 @@ use arrow::{ }, ipc::{reader::read_record_batch, root_as_message}, }; + use datafusion::execution::registry::FunctionRegistry; use datafusion_common::{ arrow_datafusion_err, internal_err, plan_datafusion_err, Column, Constraint, @@ -51,17 +40,29 @@ use datafusion_expr::{ acosh, asinh, atan, atan2, atanh, cbrt, ceil, coalesce, concat_expr, concat_ws_expr, cos, cosh, cot, degrees, ends_with, exp, expr::{self, InList, Sort, WindowFunction}, - factorial, find_in_set, floor, gcd, initcap, iszero, lcm, left, ln, log, log10, log2, + factorial, find_in_set, floor, gcd, initcap, iszero, lcm, ln, log, log10, log2, logical_plan::{PlanType, StringifiedPlan}, - lpad, nanvl, pi, power, radians, random, reverse, right, round, rpad, signum, sin, - sinh, sqrt, strpos, substr, substr_index, substring, translate, trunc, - AggregateFunction, 
Between, BinaryExpr, BuiltInWindowFunction, BuiltinScalarFunction, - Case, Cast, Expr, GetFieldAccess, GetIndexedField, GroupingSet, + nanvl, pi, power, radians, random, round, signum, sin, sinh, sqrt, strpos, substr, + substr_index, substring, translate, trunc, AggregateFunction, Between, BinaryExpr, + BuiltInWindowFunction, BuiltinScalarFunction, Case, Cast, Expr, GetFieldAccess, + GetIndexedField, GroupingSet, GroupingSet::GroupingSets, JoinConstraint, JoinType, Like, Operator, TryCast, WindowFrame, WindowFrameBound, WindowFrameUnits, }; +use crate::protobuf::{ + self, + plan_type::PlanTypeEnum::{ + AnalyzedLogicalPlan, FinalAnalyzedLogicalPlan, FinalLogicalPlan, + FinalPhysicalPlan, FinalPhysicalPlanWithStats, InitialLogicalPlan, + InitialPhysicalPlan, InitialPhysicalPlanWithStats, OptimizedLogicalPlan, + OptimizedPhysicalPlan, + }, + AnalyzedLogicalPlanType, CubeNode, GroupingSetNode, OptimizedLogicalPlanType, + OptimizedPhysicalPlanType, PlaceholderNode, RollupNode, +}; + use super::LogicalExtensionCodec; #[derive(Debug)] @@ -453,12 +454,7 @@ impl From<&protobuf::ScalarFunction> for BuiltinScalarFunction { ScalarFunction::ConcatWithSeparator => Self::ConcatWithSeparator, ScalarFunction::EndsWith => Self::EndsWith, ScalarFunction::InitCap => Self::InitCap, - ScalarFunction::Left => Self::Left, - ScalarFunction::Lpad => Self::Lpad, ScalarFunction::Random => Self::Random, - ScalarFunction::Reverse => Self::Reverse, - ScalarFunction::Right => Self::Right, - ScalarFunction::Rpad => Self::Rpad, ScalarFunction::Strpos => Self::Strpos, ScalarFunction::Substr => Self::Substr, ScalarFunction::Translate => Self::Translate, @@ -1382,26 +1378,13 @@ pub fn parse_expr( parse_expr(&args[0], registry, codec)?, parse_expr(&args[1], registry, codec)?, )), - ScalarFunction::Left => Ok(left( - parse_expr(&args[0], registry, codec)?, - parse_expr(&args[1], registry, codec)?, - )), ScalarFunction::Random => Ok(random()), - ScalarFunction::Reverse => { - 
Ok(reverse(parse_expr(&args[0], registry, codec)?)) - } - ScalarFunction::Right => Ok(right( - parse_expr(&args[0], registry, codec)?, - parse_expr(&args[1], registry, codec)?, - )), ScalarFunction::Concat => { Ok(concat_expr(parse_exprs(args, registry, codec)?)) } ScalarFunction::ConcatWithSeparator => { Ok(concat_ws_expr(parse_exprs(args, registry, codec)?)) } - ScalarFunction::Lpad => Ok(lpad(parse_exprs(args, registry, codec)?)), - ScalarFunction::Rpad => Ok(rpad(parse_exprs(args, registry, codec)?)), ScalarFunction::EndsWith => Ok(ends_with( parse_expr(&args[0], registry, codec)?, parse_expr(&args[1], registry, codec)?, diff --git a/datafusion/proto/src/logical_plan/to_proto.rs b/datafusion/proto/src/logical_plan/to_proto.rs index 11fc7362c75d..ea682a5a22f8 100644 --- a/datafusion/proto/src/logical_plan/to_proto.rs +++ b/datafusion/proto/src/logical_plan/to_proto.rs @@ -1445,12 +1445,7 @@ impl TryFrom<&BuiltinScalarFunction> for protobuf::ScalarFunction { BuiltinScalarFunction::ConcatWithSeparator => Self::ConcatWithSeparator, BuiltinScalarFunction::EndsWith => Self::EndsWith, BuiltinScalarFunction::InitCap => Self::InitCap, - BuiltinScalarFunction::Left => Self::Left, - BuiltinScalarFunction::Lpad => Self::Lpad, BuiltinScalarFunction::Random => Self::Random, - BuiltinScalarFunction::Reverse => Self::Reverse, - BuiltinScalarFunction::Right => Self::Right, - BuiltinScalarFunction::Rpad => Self::Rpad, BuiltinScalarFunction::Strpos => Self::Strpos, BuiltinScalarFunction::Substr => Self::Substr, BuiltinScalarFunction::Translate => Self::Translate, From 7f497b3b23d4aa2cb6336671d09b9c9837ed0d82 Mon Sep 17 00:00:00 2001 From: Mustafa Akur <106137913+mustafasrepo@users.noreply.github.com> Date: Fri, 29 Mar 2024 10:34:49 +0300 Subject: [PATCH 04/15] Add non-column expression equality tracking to filter exec (#9819) * Add non-column expression equality tracking to filter exec * Minor changes --- datafusion/physical-plan/src/filter.rs | 47 +++++++++---------- 
datafusion/physical-plan/src/lib.rs | 1 - datafusion/sqllogictest/test_files/select.slt | 21 +++++++++ 3 files changed, 44 insertions(+), 25 deletions(-) diff --git a/datafusion/physical-plan/src/filter.rs b/datafusion/physical-plan/src/filter.rs index 2996152fb924..a9201f435ad8 100644 --- a/datafusion/physical-plan/src/filter.rs +++ b/datafusion/physical-plan/src/filter.rs @@ -29,7 +29,7 @@ use super::{ }; use crate::{ metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet}, - Column, DisplayFormatType, ExecutionPlan, + DisplayFormatType, ExecutionPlan, }; use arrow::compute::filter_record_batch; @@ -192,9 +192,7 @@ impl FilterExec { let mut eq_properties = input.equivalence_properties().clone(); let (equal_pairs, _) = collect_columns_from_predicate(predicate); for (lhs, rhs) in equal_pairs { - let lhs_expr = Arc::new(lhs.clone()) as _; - let rhs_expr = Arc::new(rhs.clone()) as _; - eq_properties.add_equal_conditions(&lhs_expr, &rhs_expr) + eq_properties.add_equal_conditions(lhs, rhs) } // Add the columns that have only one viable value (singleton) after // filtering to constants. 
@@ -405,34 +403,33 @@ impl RecordBatchStream for FilterExecStream { /// Return the equals Column-Pairs and Non-equals Column-Pairs fn collect_columns_from_predicate(predicate: &Arc) -> EqualAndNonEqual { - let mut eq_predicate_columns = Vec::<(&Column, &Column)>::new(); - let mut ne_predicate_columns = Vec::<(&Column, &Column)>::new(); + let mut eq_predicate_columns = Vec::::new(); + let mut ne_predicate_columns = Vec::::new(); let predicates = split_conjunction(predicate); predicates.into_iter().for_each(|p| { if let Some(binary) = p.as_any().downcast_ref::() { - if let (Some(left_column), Some(right_column)) = ( - binary.left().as_any().downcast_ref::(), - binary.right().as_any().downcast_ref::(), - ) { - match binary.op() { - Operator::Eq => { - eq_predicate_columns.push((left_column, right_column)) - } - Operator::NotEq => { - ne_predicate_columns.push((left_column, right_column)) - } - _ => {} + match binary.op() { + Operator::Eq => { + eq_predicate_columns.push((binary.left(), binary.right())) + } + Operator::NotEq => { + ne_predicate_columns.push((binary.left(), binary.right())) } + _ => {} } } }); (eq_predicate_columns, ne_predicate_columns) } + +/// Pair of `Arc`s +pub type PhysicalExprPairRef<'a> = (&'a Arc, &'a Arc); + /// The equals Column-Pairs and Non-equals Column-Pairs in the Predicates pub type EqualAndNonEqual<'a> = - (Vec<(&'a Column, &'a Column)>, Vec<(&'a Column, &'a Column)>); + (Vec>, Vec>); #[cfg(test)] mod tests { @@ -482,14 +479,16 @@ mod tests { )?; let (equal_pairs, ne_pairs) = collect_columns_from_predicate(&predicate); + assert_eq!(2, equal_pairs.len()); + assert!(equal_pairs[0].0.eq(&col("c2", &schema)?)); + assert!(equal_pairs[0].1.eq(&lit(4u32))); - assert_eq!(1, equal_pairs.len()); - assert_eq!(equal_pairs[0].0.name(), "c2"); - assert_eq!(equal_pairs[0].1.name(), "c9"); + assert!(equal_pairs[1].0.eq(&col("c2", &schema)?)); + assert!(equal_pairs[1].1.eq(&col("c9", &schema)?)); assert_eq!(1, ne_pairs.len()); - 
assert_eq!(ne_pairs[0].0.name(), "c1"); - assert_eq!(ne_pairs[0].1.name(), "c13"); + assert!(ne_pairs[0].0.eq(&col("c1", &schema)?)); + assert!(ne_pairs[0].1.eq(&col("c13", &schema)?)); Ok(()) } diff --git a/datafusion/physical-plan/src/lib.rs b/datafusion/physical-plan/src/lib.rs index 4b4b37f8b51b..3e8e439c9a38 100644 --- a/datafusion/physical-plan/src/lib.rs +++ b/datafusion/physical-plan/src/lib.rs @@ -33,7 +33,6 @@ use datafusion_common::config::ConfigOptions; use datafusion_common::utils::DataPtr; use datafusion_common::Result; use datafusion_execution::TaskContext; -use datafusion_physical_expr::expressions::Column; use datafusion_physical_expr::{ EquivalenceProperties, LexOrdering, PhysicalSortExpr, PhysicalSortRequirement, }; diff --git a/datafusion/sqllogictest/test_files/select.slt b/datafusion/sqllogictest/test_files/select.slt index 3a5c6497ebd4..ad4b0df1a546 100644 --- a/datafusion/sqllogictest/test_files/select.slt +++ b/datafusion/sqllogictest/test_files/select.slt @@ -1386,6 +1386,27 @@ AggregateExec: mode=FinalPartitioned, gby=[c2@0 as c2], aggr=[COUNT(*)] --------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 ----------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c2], has_header=true +# FilterExec can track equality of non-column expressions. +# plan below shouldn't have a SortExec because given column 'a' is ordered. +# 'CAST(ROUND(b) as INT)' is also ordered. After filter is applied. 
+query TT +EXPLAIN SELECT * +FROM annotated_data_finite2 +WHERE CAST(ROUND(b) as INT) = a +ORDER BY CAST(ROUND(b) as INT); +---- +logical_plan +Sort: CAST(round(CAST(annotated_data_finite2.b AS Float64)) AS Int32) ASC NULLS LAST +--Filter: CAST(round(CAST(annotated_data_finite2.b AS Float64)) AS Int32) = annotated_data_finite2.a +----TableScan: annotated_data_finite2 projection=[a0, a, b, c, d], partial_filters=[CAST(round(CAST(annotated_data_finite2.b AS Float64)) AS Int32) = annotated_data_finite2.a] +physical_plan +SortPreservingMergeExec: [CAST(round(CAST(b@2 AS Float64)) AS Int32) ASC NULLS LAST] +--CoalesceBatchesExec: target_batch_size=8192 +----FilterExec: CAST(round(CAST(b@2 AS Float64)) AS Int32) = a@1 +------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +--------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC NULLS LAST, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], has_header=true + + statement ok drop table annotated_data_finite2; From d7957636327fb8d89e6428152492107e39d614b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Berkay=20=C5=9Eahin?= <124376117+berkaysynnada@users.noreply.github.com> Date: Fri, 29 Mar 2024 14:28:41 +0300 Subject: [PATCH 05/15] datafusion-cli support for multiple commands in a single line (#9831) * Multiple Create External Table's are supported from CLI * Handle in-quote semicolons * add test --- datafusion-cli/src/exec.rs | 32 +++++++------ datafusion-cli/src/helper.rs | 91 ++++++++++++++++++++++++++++++++---- 2 files changed, 100 insertions(+), 23 deletions(-) diff --git a/datafusion-cli/src/exec.rs b/datafusion-cli/src/exec.rs index 114e3cefa3bf..53375ab4104f 100644 --- a/datafusion-cli/src/exec.rs +++ b/datafusion-cli/src/exec.rs @@ -22,6 +22,7 @@ use std::fs::File; use std::io::prelude::*; use std::io::BufReader; +use crate::helper::split_from_semicolon; use crate::print_format::PrintFormat; use crate::{ 
command::{Command, OutputFormat}, @@ -164,21 +165,24 @@ pub async fn exec_from_repl( } } Ok(line) => { - rl.add_history_entry(line.trim_end())?; - tokio::select! { - res = exec_and_print(ctx, print_options, line) => match res { - Ok(_) => {} - Err(err) => eprintln!("{err}"), - }, - _ = signal::ctrl_c() => { - println!("^C"); - continue - }, + let lines = split_from_semicolon(line); + for line in lines { + rl.add_history_entry(line.trim_end())?; + tokio::select! { + res = exec_and_print(ctx, print_options, line) => match res { + Ok(_) => {} + Err(err) => eprintln!("{err}"), + }, + _ = signal::ctrl_c() => { + println!("^C"); + continue + }, + } + // dialect might have changed + rl.helper_mut().unwrap().set_dialect( + &ctx.task_ctx().session_config().options().sql_parser.dialect, + ); } - // dialect might have changed - rl.helper_mut().unwrap().set_dialect( - &ctx.task_ctx().session_config().options().sql_parser.dialect, - ); } Err(ReadlineError::Interrupted) => { println!("^C"); diff --git a/datafusion-cli/src/helper.rs b/datafusion-cli/src/helper.rs index a8e149b4c5c6..8b196484ee2c 100644 --- a/datafusion-cli/src/helper.rs +++ b/datafusion-cli/src/helper.rs @@ -86,16 +86,23 @@ impl CliHelper { )))) } }; - - match DFParser::parse_sql_with_dialect(&sql, dialect.as_ref()) { - Ok(statements) if statements.is_empty() => Ok(ValidationResult::Invalid( - Some(" 🤔 You entered an empty statement".to_string()), - )), - Ok(_statements) => Ok(ValidationResult::Valid(None)), - Err(err) => Ok(ValidationResult::Invalid(Some(format!( - " 🤔 Invalid statement: {err}", - )))), + let lines = split_from_semicolon(sql); + for line in lines { + match DFParser::parse_sql_with_dialect(&line, dialect.as_ref()) { + Ok(statements) if statements.is_empty() => { + return Ok(ValidationResult::Invalid(Some( + " 🤔 You entered an empty statement".to_string(), + ))); + } + Ok(_statements) => {} + Err(err) => { + return Ok(ValidationResult::Invalid(Some(format!( + " 🤔 Invalid statement: {err}", + )))); 
+ } + } } + Ok(ValidationResult::Valid(None)) } else if input.starts_with('\\') { // command Ok(ValidationResult::Valid(None)) @@ -197,6 +204,37 @@ pub fn unescape_input(input: &str) -> datafusion::error::Result { Ok(result) } +/// Splits a string which consists of multiple queries. +pub(crate) fn split_from_semicolon(sql: String) -> Vec { + let mut commands = Vec::new(); + let mut current_command = String::new(); + let mut in_single_quote = false; + let mut in_double_quote = false; + + for c in sql.chars() { + if c == '\'' && !in_double_quote { + in_single_quote = !in_single_quote; + } else if c == '"' && !in_single_quote { + in_double_quote = !in_double_quote; + } + + if c == ';' && !in_single_quote && !in_double_quote { + if !current_command.trim().is_empty() { + commands.push(format!("{};", current_command.trim())); + current_command.clear(); + } + } else { + current_command.push(c); + } + } + + if !current_command.trim().is_empty() { + commands.push(format!("{};", current_command.trim())); + } + + commands +} + #[cfg(test)] mod tests { use std::io::{BufRead, Cursor}; @@ -292,4 +330,39 @@ mod tests { Ok(()) } + + #[test] + fn test_split_from_semicolon() { + let sql = "SELECT 1; SELECT 2;"; + let expected = vec!["SELECT 1;", "SELECT 2;"]; + assert_eq!(split_from_semicolon(sql.to_string()), expected); + + let sql = r#"SELECT ";";"#; + let expected = vec![r#"SELECT ";";"#]; + assert_eq!(split_from_semicolon(sql.to_string()), expected); + + let sql = "SELECT ';';"; + let expected = vec!["SELECT ';';"]; + assert_eq!(split_from_semicolon(sql.to_string()), expected); + + let sql = r#"SELECT 1; SELECT 'value;value'; SELECT 1 as "text;text";"#; + let expected = vec![ + "SELECT 1;", + "SELECT 'value;value';", + r#"SELECT 1 as "text;text";"#, + ]; + assert_eq!(split_from_semicolon(sql.to_string()), expected); + + let sql = ""; + let expected: Vec = Vec::new(); + assert_eq!(split_from_semicolon(sql.to_string()), expected); + + let sql = "SELECT 1"; + let expected = 
vec!["SELECT 1;"]; + assert_eq!(split_from_semicolon(sql.to_string()), expected); + + let sql = "SELECT 1; "; + let expected = vec!["SELECT 1;"]; + assert_eq!(split_from_semicolon(sql.to_string()), expected); + } } From 230a6b476804c0a8964d559cc16e41328a43efc5 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 29 Mar 2024 08:03:21 -0400 Subject: [PATCH 06/15] Add tests for filtering, grouping, aggregation of ARRAYs (#9695) * Add tests for filtering, grouping, aggregation of ARRAYs * Update output to correct results --- .../sqllogictest/test_files/array_query.slt | 160 ++++++++++++++++++ 1 file changed, 160 insertions(+) create mode 100644 datafusion/sqllogictest/test_files/array_query.slt diff --git a/datafusion/sqllogictest/test_files/array_query.slt b/datafusion/sqllogictest/test_files/array_query.slt new file mode 100644 index 000000000000..24c99fc849b6 --- /dev/null +++ b/datafusion/sqllogictest/test_files/array_query.slt @@ -0,0 +1,160 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +## Tests for basic array queries + +# Make a table with multiple input partitions +statement ok +CREATE TABLE data AS + SELECT * FROM (VALUES + ([1,2,3], [4,5], 1) + ) + UNION ALL + SELECT * FROM (VALUES + ([2,3], [2,3], 1), + ([1,2,3], NULL, 1) + ) +; + +query ??I rowsort +SELECT * FROM data; +---- +[1, 2, 3] NULL 1 +[1, 2, 3] [4, 5] 1 +[2, 3] [2, 3] 1 + +########### +# Filtering +########### + +query error DataFusion error: Arrow error: Invalid argument error: Invalid comparison operation: List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\) == List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\) +SELECT * FROM data WHERE column1 = [1,2,3]; + +query error DataFusion error: Arrow error: Invalid argument error: Invalid comparison operation: List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\) == List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\) +SELECT * FROM data WHERE column1 = column2 + +query error DataFusion error: Arrow error: Invalid argument error: Invalid comparison operation: List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\) != List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\) +SELECT * FROM data WHERE column1 != [1,2,3]; + +query error DataFusion error: Arrow error: Invalid argument error: Invalid comparison operation: List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\) != List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\) +SELECT * FROM data WHERE column1 != column2 + +########### +# Aggregates +########### + +query 
error Internal error: Min/Max accumulator not implemented for type List +SELECT min(column1) FROM data; + +query error Internal error: Min/Max accumulator not implemented for type List +SELECT max(column1) FROM data; + +query I +SELECT count(column1) FROM data; +---- +3 + +# note single count distincts are rewritten to use a group by +query I +SELECT count(distinct column1) FROM data; +---- +2 + +query I +SELECT count(distinct column2) FROM data; +---- +2 + + +# note multiple count distincts are not rewritten +query II +SELECT count(distinct column1), count(distinct column2) FROM data; +---- +2 2 + + +########### +# GROUP BY +########### + + +query I +SELECT count(column1) FROM data GROUP BY column3; +---- +3 + +# note single count distincts are rewritten to use a group by +query I +SELECT count(distinct column1) FROM data GROUP BY column3; +---- +2 + +query I +SELECT count(distinct column2) FROM data GROUP BY column3; +---- +2 + +# note multiple count distincts are not rewritten +query II +SELECT count(distinct column1), count(distinct column2) FROM data GROUP BY column3; +---- +2 2 + + +########### +# ORDER BY +########### + +query ??I +SELECT * FROM data ORDER BY column2; +---- +[2, 3] [2, 3] 1 +[1, 2, 3] [4, 5] 1 +[1, 2, 3] NULL 1 + +query ??I +SELECT * FROM data ORDER BY column2 DESC; +---- +[1, 2, 3] NULL 1 +[1, 2, 3] [4, 5] 1 +[2, 3] [2, 3] 1 + +query ??I +SELECT * FROM data ORDER BY column2 DESC NULLS LAST; +---- +[1, 2, 3] [4, 5] 1 +[2, 3] [2, 3] 1 +[1, 2, 3] NULL 1 + +# multi column +query ??I +SELECT * FROM data ORDER BY column1, column2; +---- +[1, 2, 3] [4, 5] 1 +[1, 2, 3] NULL 1 +[2, 3] [2, 3] 1 + +query ??I +SELECT * FROM data ORDER BY column1, column3, column2; +---- +[1, 2, 3] [4, 5] 1 +[1, 2, 3] NULL 1 +[2, 3] [2, 3] 1 + + +statement ok +drop table data From aaad010e82d51d84c441f11f4359616fab39b960 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 29 Mar 2024 09:25:55 -0400 Subject: [PATCH 07/15] Remove vestigal conbench integration (#9855) 
--- conbench/.flake8 | 2 - conbench/.gitignore | 130 ----------------- conbench/.isort.cfg | 2 - conbench/README.md | 252 --------------------------------- conbench/_criterion.py | 98 ------------- conbench/benchmarks.json | 8 -- conbench/benchmarks.py | 41 ------ conbench/requirements-test.txt | 3 - conbench/requirements.txt | 1 - 9 files changed, 537 deletions(-) delete mode 100644 conbench/.flake8 delete mode 100755 conbench/.gitignore delete mode 100644 conbench/.isort.cfg delete mode 100644 conbench/README.md delete mode 100644 conbench/_criterion.py delete mode 100644 conbench/benchmarks.json delete mode 100644 conbench/benchmarks.py delete mode 100644 conbench/requirements-test.txt delete mode 100644 conbench/requirements.txt diff --git a/conbench/.flake8 b/conbench/.flake8 deleted file mode 100644 index e44b81084185..000000000000 --- a/conbench/.flake8 +++ /dev/null @@ -1,2 +0,0 @@ -[flake8] -ignore = E501 diff --git a/conbench/.gitignore b/conbench/.gitignore deleted file mode 100755 index aa44ee2adbd4..000000000000 --- a/conbench/.gitignore +++ /dev/null @@ -1,130 +0,0 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -pip-wheel-metadata/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. 
-*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -.python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock - -# PEP 582; used by e.g. github.com/David-OConnor/pyflow -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - diff --git a/conbench/.isort.cfg b/conbench/.isort.cfg deleted file mode 100644 index f238bf7ea137..000000000000 --- a/conbench/.isort.cfg +++ /dev/null @@ -1,2 +0,0 @@ -[settings] -profile = black diff --git a/conbench/README.md b/conbench/README.md deleted file mode 100644 index f655ac8bd297..000000000000 --- a/conbench/README.md +++ /dev/null @@ -1,252 +0,0 @@ - - -# DataFusion + Conbench Integration - - -## Quick start - -``` -$ cd ~/arrow-datafusion/conbench/ -$ conda create -y -n conbench python=3.9 -$ conda activate conbench -(conbench) $ pip install -r 
requirements.txt -(conbench) $ conbench datafusion -``` - -## Example output - -``` -{ - "batch_id": "3c82f9d23fce49328b78ba9fd963b254", - "context": { - "benchmark_language": "Rust" - }, - "github": { - "commit": "e8c198b9fac6cd8822b950b9f71898e47965488d", - "repository": "https://github.com/dianaclarke/arrow-datafusion" - }, - "info": {}, - "machine_info": { - "architecture_name": "x86_64", - "cpu_core_count": "8", - "cpu_frequency_max_hz": "2400000000", - "cpu_l1d_cache_bytes": "65536", - "cpu_l1i_cache_bytes": "131072", - "cpu_l2_cache_bytes": "4194304", - "cpu_l3_cache_bytes": "0", - "cpu_model_name": "Apple M1", - "cpu_thread_count": "8", - "gpu_count": "0", - "gpu_product_names": [], - "kernel_name": "20.6.0", - "memory_bytes": "17179869184", - "name": "diana", - "os_name": "macOS", - "os_version": "10.16" - }, - "run_id": "ec2a50b9380c470b96d7eb7d63ab5b77", - "stats": { - "data": [ - "0.001532", - "0.001394", - "0.001333", - "0.001356", - "0.001379", - "0.001361", - "0.001307", - "0.001348", - "0.001436", - "0.001397", - "0.001339", - "0.001523", - "0.001593", - "0.001415", - "0.001344", - "0.001312", - "0.001402", - "0.001362", - "0.001329", - "0.001330", - "0.001447", - "0.001413", - "0.001536", - "0.001330", - "0.001333", - "0.001338", - "0.001333", - "0.001331", - "0.001426", - "0.001575", - "0.001362", - "0.001343", - "0.001334", - "0.001383", - "0.001476", - "0.001356", - "0.001362", - "0.001334", - "0.001390", - "0.001497", - "0.001330", - "0.001347", - "0.001331", - "0.001468", - "0.001377", - "0.001351", - "0.001328", - "0.001509", - "0.001338", - "0.001355", - "0.001332", - "0.001485", - "0.001370", - "0.001366", - "0.001507", - "0.001358", - "0.001331", - "0.001463", - "0.001362", - "0.001336", - "0.001428", - "0.001343", - "0.001359", - "0.001905", - "0.001726", - "0.001411", - "0.001433", - "0.001391", - "0.001453", - "0.001346", - "0.001339", - "0.001420", - "0.001330", - "0.001422", - "0.001683", - "0.001426", - "0.001349", - "0.001342", - 
"0.001430", - "0.001330", - "0.001436", - "0.001331", - "0.001415", - "0.001332", - "0.001408", - "0.001343", - "0.001392", - "0.001371", - "0.001655", - "0.001354", - "0.001438", - "0.001347", - "0.001341", - "0.001374", - "0.001453", - "0.001352", - "0.001358", - "0.001398", - "0.001362", - "0.001454" - ], - "iqr": "0.000088", - "iterations": 100, - "max": "0.001905", - "mean": "0.001401", - "median": "0.001362", - "min": "0.001307", - "q1": "0.001340", - "q3": "0.001428", - "stdev": "0.000095", - "time_unit": "s", - "times": [], - "unit": "s" - }, - "tags": { - "name": "aggregate_query_group_by", - "suite": "aggregate_query_group_by" - }, - "timestamp": "2022-02-09T01:32:55.769468+00:00" -} -``` - -## Debug with test benchmark - -``` -(conbench) $ cd ~/arrow-datafusion/conbench/ -(conbench) $ conbench test --iterations=3 - -Benchmark result: -{ - "batch_id": "41a144761bc24d82b94efa70d6e460b3", - "context": { - "benchmark_language": "Python" - }, - "github": { - "commit": "e8c198b9fac6cd8822b950b9f71898e47965488d", - "repository": "https://github.com/dianaclarke/arrow-datafusion" - }, - "info": { - "benchmark_language_version": "Python 3.9.7" - }, - "machine_info": { - "architecture_name": "x86_64", - "cpu_core_count": "8", - "cpu_frequency_max_hz": "2400000000", - "cpu_l1d_cache_bytes": "65536", - "cpu_l1i_cache_bytes": "131072", - "cpu_l2_cache_bytes": "4194304", - "cpu_l3_cache_bytes": "0", - "cpu_model_name": "Apple M1", - "cpu_thread_count": "8", - "gpu_count": "0", - "gpu_product_names": [], - "kernel_name": "20.6.0", - "memory_bytes": "17179869184", - "name": "diana", - "os_name": "macOS", - "os_version": "10.16" - }, - "run_id": "71f46362db8844afacea82cba119cefc", - "stats": { - "data": [ - "0.000001", - "0.000001", - "0.000000" - ], - "iqr": "0.000000", - "iterations": 3, - "max": "0.000001", - "mean": "0.000001", - "median": "0.000001", - "min": "0.000000", - "q1": "0.000000", - "q3": "0.000001", - "stdev": "0.000001", - "time_unit": "s", - "times": [], 
- "unit": "s" - }, - "tags": { - "name": "test" - }, - "timestamp": "2022-02-09T01:36:45.823615+00:00" -} -``` - diff --git a/conbench/_criterion.py b/conbench/_criterion.py deleted file mode 100644 index 168a1b9b6cb1..000000000000 --- a/conbench/_criterion.py +++ /dev/null @@ -1,98 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import collections -import csv -import os -import pathlib -import subprocess - -import conbench.runner -from conbench.machine_info import github_info - - -def _result_in_seconds(row): - # sample_measured_value - The value of the measurement for this sample. 
- # Note that this is the measured value for the whole sample, not the - # time-per-iteration To calculate the time-per-iteration, use - # sample_measured_value/iteration_count - # -- https://bheisler.github.io/criterion.rs/book/user_guide/csv_output.html - count = int(row["iteration_count"]) - sample = float(row["sample_measured_value"]) - return sample / count / 10**9 - - -def _parse_benchmark_group(row): - parts = row["group"].split(",") - if len(parts) > 1: - suite, name = parts[0], ",".join(parts[1:]) - else: - suite, name = row["group"], row["group"] - return suite.strip(), name.strip() - - -def _read_results(src_dir): - results = collections.defaultdict(lambda: collections.defaultdict(list)) - path = pathlib.Path(os.path.join(src_dir, "target", "criterion")) - for path in list(path.glob("**/new/raw.csv")): - with open(path) as csv_file: - reader = csv.DictReader(csv_file) - for row in reader: - suite, name = _parse_benchmark_group(row) - results[suite][name].append(_result_in_seconds(row)) - return results - - -def _execute_command(command): - try: - print(command) - result = subprocess.run(command, capture_output=True, check=True) - except subprocess.CalledProcessError as e: - print(e.stderr.decode("utf-8")) - raise e - return result.stdout.decode("utf-8"), result.stderr.decode("utf-8") - - -class CriterionBenchmark(conbench.runner.Benchmark): - external = True - - def run(self, **kwargs): - src_dir = os.path.join(os.getcwd(), "..") - self._cargo_bench(src_dir) - results = _read_results(src_dir) - for suite in results: - self.conbench.mark_new_batch() - for name, data in results[suite].items(): - yield self._record_result(suite, name, data, kwargs) - - def _cargo_bench(self, src_dir): - os.chdir(src_dir) - _execute_command(["cargo", "bench"]) - - def _record_result(self, suite, name, data, options): - tags = {"suite": suite} - result = {"data": data, "unit": "s"} - context = {"benchmark_language": "Rust"} - github = github_info() - return 
self.conbench.record( - result, - name, - tags=tags, - context=context, - github=github, - options=options, - ) diff --git a/conbench/benchmarks.json b/conbench/benchmarks.json deleted file mode 100644 index bb7033547722..000000000000 --- a/conbench/benchmarks.json +++ /dev/null @@ -1,8 +0,0 @@ -[ - { - "command": "datafusion", - "flags": { - "language": "Rust" - } - } -] diff --git a/conbench/benchmarks.py b/conbench/benchmarks.py deleted file mode 100644 index f80b3add90f9..000000000000 --- a/conbench/benchmarks.py +++ /dev/null @@ -1,41 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import conbench.runner - -import _criterion - - -@conbench.runner.register_benchmark -class TestBenchmark(conbench.runner.Benchmark): - name = "test" - - def run(self, **kwargs): - yield self.conbench.benchmark( - self._f(), - self.name, - options=kwargs, - ) - - def _f(self): - return lambda: 1 + 1 - - -@conbench.runner.register_benchmark -class CargoBenchmarks(_criterion.CriterionBenchmark): - name = "datafusion" - description = "Run Arrow DataFusion micro benchmarks." 
diff --git a/conbench/requirements-test.txt b/conbench/requirements-test.txt deleted file mode 100644 index 5e5647acd2d6..000000000000 --- a/conbench/requirements-test.txt +++ /dev/null @@ -1,3 +0,0 @@ -black -flake8 -isort diff --git a/conbench/requirements.txt b/conbench/requirements.txt deleted file mode 100644 index a877c7b44e9b..000000000000 --- a/conbench/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -conbench From 2d023299fa2544350cb18b45181cc8aa729eda3f Mon Sep 17 00:00:00 2001 From: Renjie Liu Date: Fri, 29 Mar 2024 21:38:43 +0800 Subject: [PATCH 08/15] feat: Add display_pg_json for LogicalPlan (#9789) * feat: Add display_pg_json for LogicalPlan * Fix lints * Fix comments * Fix format --- datafusion-cli/Cargo.lock | 1 + datafusion/expr/Cargo.toml | 1 + datafusion/expr/src/logical_plan/display.rs | 494 +++++++++++++++++++- datafusion/expr/src/logical_plan/plan.rs | 82 ++++ 4 files changed, 577 insertions(+), 1 deletion(-) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 0277d23f4de0..2bbe89f24bbe 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -1249,6 +1249,7 @@ dependencies = [ "chrono", "datafusion-common", "paste", + "serde_json", "sqlparser", "strum 0.26.2", "strum_macros 0.26.2", diff --git a/datafusion/expr/Cargo.toml b/datafusion/expr/Cargo.toml index 621a320230f2..6f6147d36883 100644 --- a/datafusion/expr/Cargo.toml +++ b/datafusion/expr/Cargo.toml @@ -43,6 +43,7 @@ arrow-array = { workspace = true } chrono = { workspace = true } datafusion-common = { workspace = true, default-features = true } paste = "^1.0" +serde_json = { workspace = true } sqlparser = { workspace = true } strum = { version = "0.26.1", features = ["derive"] } strum_macros = "0.26.0" diff --git a/datafusion/expr/src/logical_plan/display.rs b/datafusion/expr/src/logical_plan/display.rs index e0cb44626e24..edc3afd55d63 100644 --- a/datafusion/expr/src/logical_plan/display.rs +++ b/datafusion/expr/src/logical_plan/display.rs @@ -16,14 
+16,22 @@ // under the License. //! This module provides logic for displaying LogicalPlans in various styles +use std::collections::HashMap; use std::fmt; -use crate::LogicalPlan; +use crate::{ + expr_vec_fmt, Aggregate, DescribeTable, Distinct, DistinctOn, DmlStatement, Expr, + Filter, Join, Limit, LogicalPlan, Partitioning, Prepare, Projection, RecursiveQuery, + Repartition, Sort, Subquery, SubqueryAlias, TableProviderFilterPushDown, TableScan, + Unnest, Values, Window, +}; +use crate::dml::CopyTo; use arrow::datatypes::Schema; use datafusion_common::display::GraphvizBuilder; use datafusion_common::tree_node::{TreeNodeRecursion, TreeNodeVisitor}; use datafusion_common::DataFusionError; +use serde_json::json; /// Formats plans with a single line per node. For example: /// @@ -221,6 +229,490 @@ impl<'a, 'b> TreeNodeVisitor for GraphvizVisitor<'a, 'b> { } } +/// Formats plans to display as postgresql plan json format. +/// +/// There are already many existing visualizer for this format, for example [dalibo](https://explain.dalibo.com/). +/// Unfortunately, there is no formal spec for this format, but it is widely used in the PostgreSQL community. 
+/// +/// Here is an example of the format: +/// +/// ```json +/// [ +/// { +/// "Plan": { +/// "Node Type": "Sort", +/// "Output": [ +/// "question_1.id", +/// "question_1.title", +/// "question_1.text", +/// "question_1.file", +/// "question_1.type", +/// "question_1.source", +/// "question_1.exam_id" +/// ], +/// "Sort Key": [ +/// "question_1.id" +/// ], +/// "Plans": [ +/// { +/// "Node Type": "Seq Scan", +/// "Parent Relationship": "Left", +/// "Relation Name": "question", +/// "Schema": "public", +/// "Alias": "question_1", +/// "Output": [ +/// "question_1.id", +/// "question_1.title", +/// "question_1.text", +/// "question_1.file", +/// "question_1.type", +/// "question_1.source", +/// "question_1.exam_id" +/// ], +/// "Filter": "(question_1.exam_id = 1)" +/// } +/// ] +/// } +/// } +/// ] +/// ``` +pub struct PgJsonVisitor<'a, 'b> { + f: &'a mut fmt::Formatter<'b>, + + /// A mapping from plan node id to the plan node json representation. + objects: HashMap, + + next_id: u32, + + /// If true, includes summarized schema information + with_schema: bool, + + /// Holds the ids (as generated from `graphviz_builder` of all + /// parent nodes + parent_ids: Vec, +} + +impl<'a, 'b> PgJsonVisitor<'a, 'b> { + pub fn new(f: &'a mut fmt::Formatter<'b>) -> Self { + Self { + f, + objects: HashMap::new(), + next_id: 0, + with_schema: false, + parent_ids: Vec::new(), + } + } + + /// Sets a flag which controls if the output schema is displayed + pub fn with_schema(&mut self, with_schema: bool) { + self.with_schema = with_schema; + } + + /// Converts a logical plan node to a json object. + fn to_json_value(node: &LogicalPlan) -> serde_json::Value { + match node { + LogicalPlan::EmptyRelation(_) => { + json!({ + "Node Type": "EmptyRelation", + }) + } + LogicalPlan::RecursiveQuery(RecursiveQuery { is_distinct, .. }) => { + json!({ + "Node Type": "RecursiveQuery", + "Is Distinct": is_distinct, + }) + } + LogicalPlan::Values(Values { ref values, .. 
}) => { + let str_values = values + .iter() + // limit to only 5 values to avoid horrible display + .take(5) + .map(|row| { + let item = row + .iter() + .map(|expr| expr.to_string()) + .collect::>() + .join(", "); + format!("({item})") + }) + .collect::>() + .join(", "); + + let elipse = if values.len() > 5 { "..." } else { "" }; + + let values_str = format!("{}{}", str_values, elipse); + json!({ + "Node Type": "Values", + "Values": values_str + }) + } + LogicalPlan::TableScan(TableScan { + ref source, + ref table_name, + ref filters, + ref fetch, + .. + }) => { + let mut object = json!({ + "Node Type": "TableScan", + "Relation Name": table_name.table(), + }); + + if let Some(s) = table_name.schema() { + object["Schema"] = serde_json::Value::String(s.to_string()); + } + + if let Some(c) = table_name.catalog() { + object["Catalog"] = serde_json::Value::String(c.to_string()); + } + + if !filters.is_empty() { + let mut full_filter = vec![]; + let mut partial_filter = vec![]; + let mut unsupported_filters = vec![]; + let filters: Vec<&Expr> = filters.iter().collect(); + + if let Ok(results) = source.supports_filters_pushdown(&filters) { + filters.iter().zip(results.iter()).for_each( + |(x, res)| match res { + TableProviderFilterPushDown::Exact => full_filter.push(x), + TableProviderFilterPushDown::Inexact => { + partial_filter.push(x) + } + TableProviderFilterPushDown::Unsupported => { + unsupported_filters.push(x) + } + }, + ); + } + + if !full_filter.is_empty() { + object["Full Filters"] = serde_json::Value::String( + expr_vec_fmt!(full_filter).to_string(), + ); + }; + if !partial_filter.is_empty() { + object["Partial Filters"] = serde_json::Value::String( + expr_vec_fmt!(partial_filter).to_string(), + ); + } + if !unsupported_filters.is_empty() { + object["Unsupported Filters"] = serde_json::Value::String( + expr_vec_fmt!(unsupported_filters).to_string(), + ); + } + } + + if let Some(f) = fetch { + object["Fetch"] = serde_json::Value::Number((*f).into()); + } + + 
object + } + LogicalPlan::Projection(Projection { ref expr, .. }) => { + json!({ + "Node Type": "Projection", + "Expressions": expr.iter().map(|e| e.to_string()).collect::>() + }) + } + LogicalPlan::Dml(DmlStatement { table_name, op, .. }) => { + json!({ + "Node Type": "Projection", + "Operation": op.name(), + "Table Name": table_name.table() + }) + } + LogicalPlan::Copy(CopyTo { + input: _, + output_url, + format_options, + partition_by: _, + options, + }) => { + let op_str = options + .iter() + .map(|(k, v)| format!("{}={}", k, v)) + .collect::>() + .join(", "); + json!({ + "Node Type": "CopyTo", + "Output URL": output_url, + "Format Options": format!("{}", format_options), + "Options": op_str + }) + } + LogicalPlan::Ddl(ddl) => { + json!({ + "Node Type": "Ddl", + "Operation": format!("{}", ddl.display()) + }) + } + LogicalPlan::Filter(Filter { + predicate: ref expr, + .. + }) => { + json!({ + "Node Type": "Filter", + "Condition": format!("{}", expr) + }) + } + LogicalPlan::Window(Window { + ref window_expr, .. + }) => { + json!({ + "Node Type": "WindowAggr", + "Expressions": expr_vec_fmt!(window_expr) + }) + } + LogicalPlan::Aggregate(Aggregate { + ref group_expr, + ref aggr_expr, + .. + }) => { + json!({ + "Node Type": "Aggregate", + "Group By": expr_vec_fmt!(group_expr), + "Aggregates": expr_vec_fmt!(aggr_expr) + }) + } + LogicalPlan::Sort(Sort { expr, fetch, .. }) => { + let mut object = json!({ + "Node Type": "Sort", + "Sort Key": expr_vec_fmt!(expr), + }); + + if let Some(fetch) = fetch { + object["Fetch"] = serde_json::Value::Number((*fetch).into()); + } + + object + } + LogicalPlan::Join(Join { + on: ref keys, + filter, + join_constraint, + join_type, + .. 
+ }) => { + let join_expr: Vec = + keys.iter().map(|(l, r)| format!("{l} = {r}")).collect(); + let filter_expr = filter + .as_ref() + .map(|expr| format!(" Filter: {expr}")) + .unwrap_or_else(|| "".to_string()); + json!({ + "Node Type": format!("{} Join", join_type), + "Join Constraint": format!("{:?}", join_constraint), + "Join Keys": join_expr.join(", "), + "Filter": format!("{}", filter_expr) + }) + } + LogicalPlan::CrossJoin(_) => { + json!({ + "Node Type": "Cross Join" + }) + } + LogicalPlan::Repartition(Repartition { + partitioning_scheme, + .. + }) => match partitioning_scheme { + Partitioning::RoundRobinBatch(n) => { + json!({ + "Node Type": "Repartition", + "Partitioning Scheme": "RoundRobinBatch", + "Partition Count": n + }) + } + Partitioning::Hash(expr, n) => { + let hash_expr: Vec = + expr.iter().map(|e| format!("{e}")).collect(); + + json!({ + "Node Type": "Repartition", + "Partitioning Scheme": "Hash", + "Partition Count": n, + "Partitioning Key": hash_expr + }) + } + Partitioning::DistributeBy(expr) => { + let dist_by_expr: Vec = + expr.iter().map(|e| format!("{e}")).collect(); + json!({ + "Node Type": "Repartition", + "Partitioning Scheme": "DistributeBy", + "Partitioning Key": dist_by_expr + }) + } + }, + LogicalPlan::Limit(Limit { + ref skip, + ref fetch, + .. + }) => { + let mut object = serde_json::json!( + { + "Node Type": "Limit", + "Skip": skip, + } + ); + if let Some(f) = fetch { + object["Fetch"] = serde_json::Value::Number((*f).into()); + }; + object + } + LogicalPlan::Subquery(Subquery { .. }) => { + json!({ + "Node Type": "Subquery" + }) + } + LogicalPlan::SubqueryAlias(SubqueryAlias { ref alias, .. 
}) => { + json!({ + "Node Type": "Subquery", + "Alias": alias.table(), + }) + } + LogicalPlan::Statement(statement) => { + json!({ + "Node Type": "Statement", + "Statement": format!("{}", statement.display()) + }) + } + LogicalPlan::Distinct(distinct) => match distinct { + Distinct::All(_) => { + json!({ + "Node Type": "DistinctAll" + }) + } + Distinct::On(DistinctOn { + on_expr, + select_expr, + sort_expr, + .. + }) => { + let mut object = json!({ + "Node Type": "DistinctOn", + "On": expr_vec_fmt!(on_expr), + "Select": expr_vec_fmt!(select_expr), + }); + if let Some(sort_expr) = sort_expr { + object["Sort"] = serde_json::Value::String( + expr_vec_fmt!(sort_expr).to_string(), + ); + } + + object + } + }, + LogicalPlan::Explain { .. } => { + json!({ + "Node Type": "Explain" + }) + } + LogicalPlan::Analyze { .. } => { + json!({ + "Node Type": "Analyze" + }) + } + LogicalPlan::Union(_) => { + json!({ + "Node Type": "Union" + }) + } + LogicalPlan::Extension(e) => { + json!({ + "Node Type": e.node.name(), + "Detail": format!("{:?}", e.node) + }) + } + LogicalPlan::Prepare(Prepare { + name, data_types, .. + }) => { + json!({ + "Node Type": "Prepare", + "Name": name, + "Data Types": format!("{:?}", data_types) + }) + } + LogicalPlan::DescribeTable(DescribeTable { .. }) => { + json!({ + "Node Type": "DescribeTable" + }) + } + LogicalPlan::Unnest(Unnest { column, .. 
}) => { + json!({ + "Node Type": "Unnest", + "Column": format!("{}", column) + }) + } + } + } +} + +impl<'a, 'b> TreeNodeVisitor for PgJsonVisitor<'a, 'b> { + type Node = LogicalPlan; + + fn f_down( + &mut self, + node: &LogicalPlan, + ) -> datafusion_common::Result { + let id = self.next_id; + self.next_id += 1; + let mut object = Self::to_json_value(node); + + object["Plans"] = serde_json::Value::Array(vec![]); + + if self.with_schema { + object["Output"] = serde_json::Value::Array( + node.schema() + .fields() + .iter() + .map(|f| f.name().to_string()) + .map(serde_json::Value::String) + .collect(), + ); + }; + + self.objects.insert(id, object); + self.parent_ids.push(id); + Ok(TreeNodeRecursion::Continue) + } + + fn f_up( + &mut self, + _node: &Self::Node, + ) -> datafusion_common::Result { + let id = self.parent_ids.pop().unwrap(); + + let current_node = self.objects.remove(&id).ok_or_else(|| { + DataFusionError::Internal("Missing current node!".to_string()) + })?; + + if let Some(parent_id) = self.parent_ids.last() { + let parent_node = self + .objects + .get_mut(parent_id) + .expect("Missing parent node!"); + let plans = parent_node + .get_mut("Plans") + .and_then(|p| p.as_array_mut()) + .expect("Plans should be an array"); + + plans.push(current_node); + } else { + // This is the root node + let plan = serde_json::json!([{"Plan": current_node}]); + write!( + self.f, + "{}", + serde_json::to_string_pretty(&plan) + .map_err(|e| DataFusionError::External(Box::new(e)))? 
+ )?; + } + + Ok(TreeNodeRecursion::Continue) + } +} + #[cfg(test)] mod tests { use arrow::datatypes::{DataType, Field}; diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index 05d7ac539458..9f4094d483c9 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -54,6 +54,7 @@ use datafusion_common::{ }; // backwards compatibility +use crate::display::PgJsonVisitor; pub use datafusion_common::display::{PlanType, StringifiedPlan, ToStringifiedPlan}; pub use datafusion_common::{JoinConstraint, JoinType}; @@ -1302,6 +1303,26 @@ impl LogicalPlan { Wrapper(self) } + /// Return a displayable structure that produces plan in postgresql JSON format. + /// + /// Users can use this format to visualize the plan in existing plan visualization tools, for example [dalibo](https://explain.dalibo.com/) + pub fn display_pg_json(&self) -> impl Display + '_ { + // Boilerplate structure to wrap LogicalPlan with something + // that can be formatted + struct Wrapper<'a>(&'a LogicalPlan); + impl<'a> Display for Wrapper<'a> { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + let mut visitor = PgJsonVisitor::new(f); + visitor.with_schema(true); + match self.0.visit(&mut visitor) { + Ok(_) => Ok(()), + Err(_) => Err(fmt::Error), + } + } + } + Wrapper(self) + } + /// Return a `format`able structure that produces lines meant for /// graphical display using the `DOT` language. 
This format can be /// visualized using software from @@ -2781,6 +2802,67 @@ digraph { Ok(()) } + #[test] + fn test_display_pg_json() -> Result<()> { + let plan = display_plan()?; + + let expected_pg_json = r#"[ + { + "Plan": { + "Expressions": [ + "employee_csv.id" + ], + "Node Type": "Projection", + "Output": [ + "id" + ], + "Plans": [ + { + "Condition": "employee_csv.state IN ()", + "Node Type": "Filter", + "Output": [ + "id", + "state" + ], + "Plans": [ + { + "Node Type": "Subquery", + "Output": [ + "state" + ], + "Plans": [ + { + "Node Type": "TableScan", + "Output": [ + "state" + ], + "Plans": [], + "Relation Name": "employee_csv" + } + ] + }, + { + "Node Type": "TableScan", + "Output": [ + "id", + "state" + ], + "Plans": [], + "Relation Name": "employee_csv" + } + ] + } + ] + } + } +]"#; + + let pg_json = format!("{}", plan.display_pg_json()); + + assert_eq!(expected_pg_json, pg_json); + Ok(()) + } + /// Tests for the Visitor trait and walking logical plan nodes #[derive(Debug, Default)] struct OkVisitor { From 2956ec2962d7af94be53243427f8795d29fa90a3 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 29 Mar 2024 09:39:27 -0400 Subject: [PATCH 09/15] Update `COPY` documentation to reflect changes (#9754) --- docs/source/user-guide/sql/dml.md | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/docs/source/user-guide/sql/dml.md b/docs/source/user-guide/sql/dml.md index b9614bb8f929..79c36092fd3d 100644 --- a/docs/source/user-guide/sql/dml.md +++ b/docs/source/user-guide/sql/dml.md @@ -25,11 +25,14 @@ and modifying data in tables. ## COPY Copies the contents of a table or query to file(s). Supported file -formats are `parquet`, `csv`, and `json` and can be inferred based on -filename if writing to a single file. +formats are `parquet`, `csv`, `json`, and `arrow`.
-COPY { table_name | query } TO 'file_name' [ ( option [, ... ] ) ]
+COPY { table_name | query } 
+TO 'file_name'
+[ STORED AS format ]
+[ PARTITIONED BY column_name [, ...] ]
+[ OPTIONS( option [, ... ] ) ]
 
For a detailed list of valid OPTIONS, see [Write Options](write_options). @@ -61,7 +64,7 @@ Copy the contents of `source_table` to multiple directories of hive-style partitioned parquet files: ```sql -> COPY source_table TO 'dir_name' (FORMAT parquet, partition_by 'column1, column2'); +> COPY source_table TO 'dir_name' STORED AS parquet, PARTITIONED BY (column1, column2); +-------+ | count | +-------+ @@ -74,7 +77,7 @@ results (maintaining the order) to a parquet file named `output.parquet` with a maximum parquet row group size of 10MB: ```sql -> COPY (SELECT * from source ORDER BY time) TO 'output.parquet' (ROW_GROUP_LIMIT_BYTES 10000000); +> COPY (SELECT * from source ORDER BY time) TO 'output.parquet' OPTIONS (MAX_ROW_GROUP_SIZE 10000000); +-------+ | count | +-------+ @@ -82,6 +85,12 @@ results (maintaining the order) to a parquet file named +-------+ ``` +The output format is determined by the first match of the following rules: + +1. Value of `STORED AS` +2. Value of the `OPTION (FORMAT ..)` +3. Filename extension (e.g. `foo.parquet` implies `PARQUET` format) + ## INSERT Insert values into a table. 
From f1adc68394ff378382dad5b36e89886139e902fa Mon Sep 17 00:00:00 2001 From: Marko Grujic Date: Fri, 29 Mar 2024 14:41:40 +0100 Subject: [PATCH 10/15] Remove the two cases most likely to cause OOM in CI (#9858) --- .github/workflows/pr_benchmarks.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/pr_benchmarks.yml b/.github/workflows/pr_benchmarks.yml index 29d001783b17..5827c42e85ae 100644 --- a/.github/workflows/pr_benchmarks.yml +++ b/.github/workflows/pr_benchmarks.yml @@ -47,7 +47,6 @@ jobs: ./bench.sh run tpch ./bench.sh run tpch_mem ./bench.sh run tpch10 - ./bench.sh run tpch_mem10 # For some reason this step doesn't seem to propagate the env var down into the script if [ -d "results/HEAD" ]; then @@ -70,7 +69,6 @@ jobs: ./bench.sh run tpch ./bench.sh run tpch_mem ./bench.sh run tpch10 - ./bench.sh run tpch_mem10 echo ${{ github.event.issue.number }} > pr From bf141dd113291615cbe986545969fae6368efa98 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 29 Mar 2024 09:42:37 -0400 Subject: [PATCH 11/15] Minor: make uuid an optional dependency on datafusion-functions (#9771) * Minor: make uuid an optional dependency on datafusion-functions * fix merge --- datafusion/functions/Cargo.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index 0cab0276ff4b..3ae3061012e0 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -52,7 +52,7 @@ math_expressions = [] # enable regular expressions regex_expressions = ["regex"] # enable string functions -string_expressions = [] +string_expressions = ["uuid"] # enable unicode functions unicode_expressions = ["unicode-segmentation"] @@ -79,7 +79,7 @@ md-5 = { version = "^0.10.0", optional = true } regex = { version = "1.8", optional = true } sha2 = { version = "^0.10.1", optional = true } unicode-segmentation = { version = "^1.7.1", optional = true } -uuid = { version = "1.7", features = 
["v4"] } +uuid = { version = "1.7", features = ["v4"], optional = true } [dev-dependencies] criterion = "0.5" From c202965c1740140fa4ff49364c99f3c4c9293182 Mon Sep 17 00:00:00 2001 From: Phillip LeBlanc Date: Fri, 29 Mar 2024 22:43:51 +0900 Subject: [PATCH 12/15] Add `Spice.ai` to Known Users (#9852) --- docs/source/user-guide/introduction.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/user-guide/introduction.md b/docs/source/user-guide/introduction.md index 0e9d731c6e21..708318db4aba 100644 --- a/docs/source/user-guide/introduction.md +++ b/docs/source/user-guide/introduction.md @@ -116,6 +116,7 @@ Here are some active projects using DataFusion: - [Restate](https://github.com/restatedev) Easily build resilient applications using distributed durable async/await - [ROAPI](https://github.com/roapi/roapi) - [Seafowl](https://github.com/splitgraph/seafowl) CDN-friendly analytical database +- [Spice.ai](https://github.com/spiceai/spiceai) Unified SQL query interface & materialization engine - [Synnada](https://synnada.ai/) Streaming-first framework for data products - [VegaFusion](https://vegafusion.io/) Server-side acceleration for the [Vega](https://vega.github.io/) visualization grammar - [ZincObserve](https://github.com/zinclabs/zincobserve) Distributed cloud native observability platform @@ -146,6 +147,7 @@ Here are some less active projects that used DataFusion: [qv]: https://github.com/timvw/qv [roapi]: https://github.com/roapi/roapi [seafowl]: https://github.com/splitgraph/seafowl +[spice.ai]: https://github.com/spiceai/spiceai [synnada]: https://synnada.ai/ [tensorbase]: https://github.com/tensorbase/tensorbase [vegafusion]: https://vegafusion.io/ From 5ab5511db6f1715dea1f123cc40e480490443bca Mon Sep 17 00:00:00 2001 From: comphead Date: Fri, 29 Mar 2024 06:44:40 -0700 Subject: [PATCH 13/15] minor: add a hint how to adjust max rows displayed (#9845) --- datafusion-cli/src/print_options.rs | 24 ++++++++++++++---------- 1 file changed, 14 
insertions(+), 10 deletions(-) diff --git a/datafusion-cli/src/print_options.rs b/datafusion-cli/src/print_options.rs index 02cb0fb9c63e..93630c8d48f8 100644 --- a/datafusion-cli/src/print_options.rs +++ b/datafusion-cli/src/print_options.rs @@ -73,21 +73,22 @@ pub struct PrintOptions { pub color: bool, } -fn get_timing_info_str( +// Returns the query execution details formatted +fn get_execution_details_formatted( row_count: usize, maxrows: MaxRows, query_start_time: Instant, ) -> String { - let row_word = if row_count == 1 { "row" } else { "rows" }; let nrows_shown_msg = match maxrows { - MaxRows::Limited(nrows) if nrows < row_count => format!(" ({} shown)", nrows), + MaxRows::Limited(nrows) if nrows < row_count => { + format!("(First {nrows} displayed. Use --maxrows to adjust)") + } _ => String::new(), }; format!( - "{} {} in set{}. Query took {:.3} seconds.\n", + "{} row(s) fetched. {}\nElapsed {:.3} seconds.\n", row_count, - row_word, nrows_shown_msg, query_start_time.elapsed().as_secs_f64() ) @@ -107,7 +108,7 @@ impl PrintOptions { .print_batches(&mut writer, batches, self.maxrows, true)?; let row_count: usize = batches.iter().map(|b| b.num_rows()).sum(); - let timing_info = get_timing_info_str( + let formatted_exec_details = get_execution_details_formatted( row_count, if self.format == PrintFormat::Table { self.maxrows @@ -118,7 +119,7 @@ impl PrintOptions { ); if !self.quiet { - writeln!(writer, "{timing_info}")?; + writeln!(writer, "{formatted_exec_details}")?; } Ok(()) @@ -154,11 +155,14 @@ impl PrintOptions { with_header = false; } - let timing_info = - get_timing_info_str(row_count, MaxRows::Unlimited, query_start_time); + let formatted_exec_details = get_execution_details_formatted( + row_count, + MaxRows::Unlimited, + query_start_time, + ); if !self.quiet { - writeln!(writer, "{timing_info}")?; + writeln!(writer, "{formatted_exec_details}")?; } Ok(()) From 3eeb108125b35424baac39dd20ba88433b347419 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 29 
Mar 2024 07:47:14 -0600 Subject: [PATCH 14/15] Exclude .github directory from release tarball (#9850) --- .gitattributes | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitattributes b/.gitattributes index 7ff0bbb6d959..bcdeffc09a11 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,2 +1,3 @@ +.github/ export-ignore datafusion/proto/src/generated/prost.rs linguist-generated datafusion/proto/src/generated/pbjson.rs linguist-generated From c2879f510533a01bc04ef75da4f1416d0ddb99f6 Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Fri, 29 Mar 2024 09:53:03 -0400 Subject: [PATCH 15/15] move strpos, substr functions to datafusion_functions (#9849) * Fix to_timestamp benchmark * Remove reference to simd and nightly build as simd is no longer an available feature in DataFusion and building with nightly may not be a good recommendation when getting started. * Fixed missing trim() function. * Create unicode module in datafusion/functions/src/unicode and unicode_expressions feature flag, move char_length function * move Left, Lpad, Reverse, Right, Rpad functions to datafusion_functions * move strpos, substr functions to datafusion_functions * Cleanup tests --- datafusion/expr/src/built_in_function.rs | 36 +- datafusion/expr/src/expr_fn.rs | 6 - datafusion/functions/src/unicode/mod.rs | 31 ++ datafusion/functions/src/unicode/strpos.rs | 121 ++++++ datafusion/functions/src/unicode/substr.rs | 392 ++++++++++++++++++ datafusion/physical-expr/src/functions.rs | 258 +----------- .../physical-expr/src/unicode_expressions.rs | 95 ----- datafusion/proto/Cargo.toml | 1 + datafusion/proto/proto/datafusion.proto | 4 +- datafusion/proto/src/generated/pbjson.rs | 6 - datafusion/proto/src/generated/prost.rs | 8 +- .../proto/src/logical_plan/from_proto.rs | 29 +- datafusion/proto/src/logical_plan/to_proto.rs | 2 - .../tests/cases/roundtrip_logical_plan.rs | 29 +- datafusion/proto/tests/cases/serialize.rs | 5 +- datafusion/sql/src/expr/mod.rs | 9 +- 
datafusion/sql/src/expr/substring.rs | 16 +- datafusion/sqllogictest/test_files/scalar.slt | 2 +- 18 files changed, 598 insertions(+), 452 deletions(-) create mode 100644 datafusion/functions/src/unicode/strpos.rs create mode 100644 datafusion/functions/src/unicode/substr.rs diff --git a/datafusion/expr/src/built_in_function.rs b/datafusion/expr/src/built_in_function.rs index 196d278dc70e..423fc11c1d8c 100644 --- a/datafusion/expr/src/built_in_function.rs +++ b/datafusion/expr/src/built_in_function.rs @@ -113,10 +113,6 @@ pub enum BuiltinScalarFunction { InitCap, /// random Random, - /// strpos - Strpos, - /// substr - Substr, /// translate Translate, /// substr_index @@ -211,8 +207,6 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::EndsWith => Volatility::Immutable, BuiltinScalarFunction::InitCap => Volatility::Immutable, BuiltinScalarFunction::Radians => Volatility::Immutable, - BuiltinScalarFunction::Strpos => Volatility::Immutable, - BuiltinScalarFunction::Substr => Volatility::Immutable, BuiltinScalarFunction::Translate => Volatility::Immutable, BuiltinScalarFunction::SubstrIndex => Volatility::Immutable, BuiltinScalarFunction::FindInSet => Volatility::Immutable, @@ -252,12 +246,6 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::Pi => Ok(Float64), BuiltinScalarFunction::Random => Ok(Float64), BuiltinScalarFunction::EndsWith => Ok(Boolean), - BuiltinScalarFunction::Strpos => { - utf8_to_int_type(&input_expr_types[0], "strpos/instr/position") - } - BuiltinScalarFunction::Substr => { - utf8_to_str_type(&input_expr_types[0], "substr") - } BuiltinScalarFunction::SubstrIndex => { utf8_to_str_type(&input_expr_types[0], "substr_index") } @@ -341,24 +329,12 @@ impl BuiltinScalarFunction { Signature::uniform(1, vec![Utf8, LargeUtf8], self.volatility()) } - BuiltinScalarFunction::EndsWith | BuiltinScalarFunction::Strpos => { - Signature::one_of( - vec![ - Exact(vec![Utf8, Utf8]), - Exact(vec![Utf8, LargeUtf8]), - Exact(vec![LargeUtf8, Utf8]), - 
Exact(vec![LargeUtf8, LargeUtf8]), - ], - self.volatility(), - ) - } - - BuiltinScalarFunction::Substr => Signature::one_of( + BuiltinScalarFunction::EndsWith => Signature::one_of( vec![ - Exact(vec![Utf8, Int64]), - Exact(vec![LargeUtf8, Int64]), - Exact(vec![Utf8, Int64, Int64]), - Exact(vec![LargeUtf8, Int64, Int64]), + Exact(vec![Utf8, Utf8]), + Exact(vec![Utf8, LargeUtf8]), + Exact(vec![LargeUtf8, Utf8]), + Exact(vec![LargeUtf8, LargeUtf8]), ], self.volatility(), ), @@ -537,8 +513,6 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::ConcatWithSeparator => &["concat_ws"], BuiltinScalarFunction::EndsWith => &["ends_with"], BuiltinScalarFunction::InitCap => &["initcap"], - BuiltinScalarFunction::Strpos => &["strpos", "instr", "position"], - BuiltinScalarFunction::Substr => &["substr"], BuiltinScalarFunction::Translate => &["translate"], BuiltinScalarFunction::SubstrIndex => &["substr_index", "substring_index"], BuiltinScalarFunction::FindInSet => &["find_in_set"], diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs index 21dab72855e5..09170ae639ff 100644 --- a/datafusion/expr/src/expr_fn.rs +++ b/datafusion/expr/src/expr_fn.rs @@ -579,9 +579,6 @@ scalar_expr!(Log, log, base x, "logarithm of a `x` for a particular `base`"); scalar_expr!(InitCap, initcap, string, "converts the first letter of each word in `string` in uppercase and the remaining characters in lowercase"); scalar_expr!(EndsWith, ends_with, string suffix, "whether the `string` ends with the `suffix`"); -scalar_expr!(Strpos, strpos, string substring, "finds the position from where the `substring` matches the `string`"); -scalar_expr!(Substr, substr, string position, "substring from the `position` to the end"); -scalar_expr!(Substr, substring, string position length, "substring from the `position` with `length` characters"); scalar_expr!(Translate, translate, string from to, "replaces the characters in `from` with the counterpart in `to`"); nary_scalar_expr!(Coalesce, 
coalesce, "returns `coalesce(args...)`, which evaluates to the value of the first [Expr] which is not NULL"); //there is a func concat_ws before, so use concat_ws_expr as name.c @@ -1015,9 +1012,6 @@ mod test { test_scalar_expr!(Lcm, lcm, arg_1, arg_2); test_scalar_expr!(InitCap, initcap, string); test_scalar_expr!(EndsWith, ends_with, string, characters); - test_scalar_expr!(Strpos, strpos, string, substring); - test_scalar_expr!(Substr, substr, string, position); - test_scalar_expr!(Substr, substring, string, position, count); test_scalar_expr!(Translate, translate, string, from, to); test_scalar_expr!(SubstrIndex, substr_index, string, delimiter, count); test_scalar_expr!(FindInSet, find_in_set, string, stringlist); diff --git a/datafusion/functions/src/unicode/mod.rs b/datafusion/functions/src/unicode/mod.rs index ea4e70a92199..ddab0d1e27c9 100644 --- a/datafusion/functions/src/unicode/mod.rs +++ b/datafusion/functions/src/unicode/mod.rs @@ -27,6 +27,8 @@ mod lpad; mod reverse; mod right; mod rpad; +mod strpos; +mod substr; // create UDFs make_udf_function!( @@ -39,6 +41,8 @@ make_udf_function!(lpad::LPadFunc, LPAD, lpad); make_udf_function!(right::RightFunc, RIGHT, right); make_udf_function!(reverse::ReverseFunc, REVERSE, reverse); make_udf_function!(rpad::RPadFunc, RPAD, rpad); +make_udf_function!(strpos::StrposFunc, STRPOS, strpos); +make_udf_function!(substr::SubstrFunc, SUBSTR, substr); pub mod expr_fn { use datafusion_expr::Expr; @@ -53,6 +57,11 @@ pub mod expr_fn { super::character_length().call(vec![string]) } + #[doc = "finds the position from where the `substring` matches the `string`"] + pub fn instr(string: Expr, substring: Expr) -> Expr { + strpos(string, substring) + } + #[doc = "the number of characters in the `string`"] pub fn length(string: Expr) -> Expr { character_length(string) @@ -68,6 +77,11 @@ pub mod expr_fn { super::lpad().call(args) } + #[doc = "finds the position from where the `substring` matches the `string`"] + pub fn 
position(string: Expr, substring: Expr) -> Expr { + strpos(string, substring) + } + #[doc = "reverses the `string`"] pub fn reverse(string: Expr) -> Expr { super::reverse().call(vec![string]) @@ -82,6 +96,21 @@ pub mod expr_fn { pub fn rpad(args: Vec) -> Expr { super::rpad().call(args) } + + #[doc = "finds the position from where the `substring` matches the `string`"] + pub fn strpos(string: Expr, substring: Expr) -> Expr { + super::strpos().call(vec![string, substring]) + } + + #[doc = "substring from the `position` to the end"] + pub fn substr(string: Expr, position: Expr) -> Expr { + super::substr().call(vec![string, position]) + } + + #[doc = "substring from the `position` with `length` characters"] + pub fn substring(string: Expr, position: Expr, length: Expr) -> Expr { + super::substr().call(vec![string, position, length]) + } } /// Return a list of all functions in this package @@ -93,5 +122,7 @@ pub fn functions() -> Vec> { reverse(), right(), rpad(), + strpos(), + substr(), ] } diff --git a/datafusion/functions/src/unicode/strpos.rs b/datafusion/functions/src/unicode/strpos.rs new file mode 100644 index 000000000000..1e8bfa37d40e --- /dev/null +++ b/datafusion/functions/src/unicode/strpos.rs @@ -0,0 +1,121 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; +use std::sync::Arc; + +use arrow::array::{ + ArrayRef, ArrowPrimitiveType, GenericStringArray, OffsetSizeTrait, PrimitiveArray, +}; +use arrow::datatypes::{ArrowNativeType, DataType, Int32Type, Int64Type}; + +use datafusion_common::cast::as_generic_string_array; +use datafusion_common::{exec_err, Result}; +use datafusion_expr::TypeSignature::Exact; +use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; + +use crate::utils::{make_scalar_function, utf8_to_int_type}; + +#[derive(Debug)] +pub(super) struct StrposFunc { + signature: Signature, + aliases: Vec, +} + +impl StrposFunc { + pub fn new() -> Self { + use DataType::*; + Self { + signature: Signature::one_of( + vec![ + Exact(vec![Utf8, Utf8]), + Exact(vec![Utf8, LargeUtf8]), + Exact(vec![LargeUtf8, Utf8]), + Exact(vec![LargeUtf8, LargeUtf8]), + ], + Volatility::Immutable, + ), + aliases: vec![String::from("instr"), String::from("position")], + } + } +} + +impl ScalarUDFImpl for StrposFunc { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "strpos" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + utf8_to_int_type(&arg_types[0], "strpos/instr/position") + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result { + match args[0].data_type() { + DataType::Utf8 => make_scalar_function(strpos::, vec![])(args), + DataType::LargeUtf8 => { + make_scalar_function(strpos::, vec![])(args) + } + other => exec_err!("Unsupported data type {other:?} for function strpos"), + } + } + + fn aliases(&self) -> &[String] { + &self.aliases + } +} + +/// Returns starting index of specified substring within string, or zero if it's not present. (Same as position(substring in string), but note the reversed argument order.) 
+/// strpos('high', 'ig') = 2 +/// The implementation uses UTF-8 code points as characters +fn strpos(args: &[ArrayRef]) -> Result +where + T::Native: OffsetSizeTrait, +{ + let string_array: &GenericStringArray = + as_generic_string_array::(&args[0])?; + + let substring_array: &GenericStringArray = + as_generic_string_array::(&args[1])?; + + let result = string_array + .iter() + .zip(substring_array.iter()) + .map(|(string, substring)| match (string, substring) { + (Some(string), Some(substring)) => { + // the find method returns the byte index of the substring + // Next, we count the number of the chars until that byte + T::Native::from_usize( + string + .find(substring) + .map(|x| string[..x].chars().count() + 1) + .unwrap_or(0), + ) + } + _ => None, + }) + .collect::>(); + + Ok(Arc::new(result) as ArrayRef) +} diff --git a/datafusion/functions/src/unicode/substr.rs b/datafusion/functions/src/unicode/substr.rs new file mode 100644 index 000000000000..403157e2a85a --- /dev/null +++ b/datafusion/functions/src/unicode/substr.rs @@ -0,0 +1,392 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use std::any::Any; +use std::cmp::max; +use std::sync::Arc; + +use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait}; +use arrow::datatypes::DataType; + +use datafusion_common::cast::{as_generic_string_array, as_int64_array}; +use datafusion_common::{exec_err, Result}; +use datafusion_expr::TypeSignature::Exact; +use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; + +use crate::utils::{make_scalar_function, utf8_to_str_type}; + +#[derive(Debug)] +pub(super) struct SubstrFunc { + signature: Signature, +} + +impl SubstrFunc { + pub fn new() -> Self { + use DataType::*; + Self { + signature: Signature::one_of( + vec![ + Exact(vec![Utf8, Int64]), + Exact(vec![LargeUtf8, Int64]), + Exact(vec![Utf8, Int64, Int64]), + Exact(vec![LargeUtf8, Int64, Int64]), + ], + Volatility::Immutable, + ), + } + } +} + +impl ScalarUDFImpl for SubstrFunc { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "substr" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + utf8_to_str_type(&arg_types[0], "substr") + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result { + match args[0].data_type() { + DataType::Utf8 => make_scalar_function(substr::, vec![])(args), + DataType::LargeUtf8 => make_scalar_function(substr::, vec![])(args), + other => exec_err!("Unsupported data type {other:?} for function substr"), + } + } +} + +/// Extracts the substring of string starting at the start'th character, and extending for count characters if that is specified. (Same as substring(string from start for count).) 
+/// substr('alphabet', 3) = 'phabet' +/// substr('alphabet', 3, 2) = 'ph' +/// The implementation uses UTF-8 code points as characters +pub fn substr(args: &[ArrayRef]) -> Result { + match args.len() { + 2 => { + let string_array = as_generic_string_array::(&args[0])?; + let start_array = as_int64_array(&args[1])?; + + let result = string_array + .iter() + .zip(start_array.iter()) + .map(|(string, start)| match (string, start) { + (Some(string), Some(start)) => { + if start <= 0 { + Some(string.to_string()) + } else { + Some(string.chars().skip(start as usize - 1).collect()) + } + } + _ => None, + }) + .collect::>(); + + Ok(Arc::new(result) as ArrayRef) + } + 3 => { + let string_array = as_generic_string_array::(&args[0])?; + let start_array = as_int64_array(&args[1])?; + let count_array = as_int64_array(&args[2])?; + + let result = string_array + .iter() + .zip(start_array.iter()) + .zip(count_array.iter()) + .map(|((string, start), count)| match (string, start, count) { + (Some(string), Some(start), Some(count)) => { + if count < 0 { + exec_err!( + "negative substring length not allowed: substr(, {start}, {count})" + ) + } else { + let skip = max(0, start - 1); + let count = max(0, count + (if start < 1 {start - 1} else {0})); + Ok(Some(string.chars().skip(skip as usize).take(count as usize).collect::())) + } + } + _ => Ok(None), + }) + .collect::>>()?; + + Ok(Arc::new(result) as ArrayRef) + } + other => { + exec_err!("substr was called with {other} arguments. 
It requires 2 or 3.") + } + } +} + +#[cfg(test)] +mod tests { + use arrow::array::{Array, StringArray}; + use arrow::datatypes::DataType::Utf8; + + use datafusion_common::{exec_err, Result, ScalarValue}; + use datafusion_expr::{ColumnarValue, ScalarUDFImpl}; + + use crate::unicode::substr::SubstrFunc; + use crate::utils::test::test_function; + + #[test] + fn test_functions() -> Result<()> { + test_function!( + SubstrFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("alphabet")), + ColumnarValue::Scalar(ScalarValue::from(0i64)), + ], + Ok(Some("alphabet")), + &str, + Utf8, + StringArray + ); + test_function!( + SubstrFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("joséésoj")), + ColumnarValue::Scalar(ScalarValue::from(5i64)), + ], + Ok(Some("ésoj")), + &str, + Utf8, + StringArray + ); + test_function!( + SubstrFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("joséésoj")), + ColumnarValue::Scalar(ScalarValue::from(-5i64)), + ], + Ok(Some("joséésoj")), + &str, + Utf8, + StringArray + ); + test_function!( + SubstrFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("alphabet")), + ColumnarValue::Scalar(ScalarValue::from(1i64)), + ], + Ok(Some("alphabet")), + &str, + Utf8, + StringArray + ); + test_function!( + SubstrFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("alphabet")), + ColumnarValue::Scalar(ScalarValue::from(2i64)), + ], + Ok(Some("lphabet")), + &str, + Utf8, + StringArray + ); + test_function!( + SubstrFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("alphabet")), + ColumnarValue::Scalar(ScalarValue::from(3i64)), + ], + Ok(Some("phabet")), + &str, + Utf8, + StringArray + ); + test_function!( + SubstrFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("alphabet")), + ColumnarValue::Scalar(ScalarValue::from(-3i64)), + ], + Ok(Some("alphabet")), + &str, + Utf8, + StringArray + ); + test_function!( + SubstrFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("alphabet")), + 
ColumnarValue::Scalar(ScalarValue::from(30i64)), + ], + Ok(Some("")), + &str, + Utf8, + StringArray + ); + test_function!( + SubstrFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("alphabet")), + ColumnarValue::Scalar(ScalarValue::Int64(None)), + ], + Ok(None), + &str, + Utf8, + StringArray + ); + test_function!( + SubstrFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("alphabet")), + ColumnarValue::Scalar(ScalarValue::from(3i64)), + ColumnarValue::Scalar(ScalarValue::from(2i64)), + ], + Ok(Some("ph")), + &str, + Utf8, + StringArray + ); + test_function!( + SubstrFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("alphabet")), + ColumnarValue::Scalar(ScalarValue::from(3i64)), + ColumnarValue::Scalar(ScalarValue::from(20i64)), + ], + Ok(Some("phabet")), + &str, + Utf8, + StringArray + ); + test_function!( + SubstrFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("alphabet")), + ColumnarValue::Scalar(ScalarValue::from(0i64)), + ColumnarValue::Scalar(ScalarValue::from(5i64)), + ], + Ok(Some("alph")), + &str, + Utf8, + StringArray + ); + // starting from 5 (10 + -5) + test_function!( + SubstrFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("alphabet")), + ColumnarValue::Scalar(ScalarValue::from(-5i64)), + ColumnarValue::Scalar(ScalarValue::from(10i64)), + ], + Ok(Some("alph")), + &str, + Utf8, + StringArray + ); + // starting from -1 (4 + -5) + test_function!( + SubstrFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("alphabet")), + ColumnarValue::Scalar(ScalarValue::from(-5i64)), + ColumnarValue::Scalar(ScalarValue::from(4i64)), + ], + Ok(Some("")), + &str, + Utf8, + StringArray + ); + // starting from 0 (5 + -5) + test_function!( + SubstrFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("alphabet")), + ColumnarValue::Scalar(ScalarValue::from(-5i64)), + ColumnarValue::Scalar(ScalarValue::from(5i64)), + ], + Ok(Some("")), + &str, + Utf8, + StringArray + ); + test_function!( + SubstrFunc::new(), + 
&[ + ColumnarValue::Scalar(ScalarValue::from("alphabet")), + ColumnarValue::Scalar(ScalarValue::Int64(None)), + ColumnarValue::Scalar(ScalarValue::from(20i64)), + ], + Ok(None), + &str, + Utf8, + StringArray + ); + test_function!( + SubstrFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("alphabet")), + ColumnarValue::Scalar(ScalarValue::from(3i64)), + ColumnarValue::Scalar(ScalarValue::Int64(None)), + ], + Ok(None), + &str, + Utf8, + StringArray + ); + test_function!( + SubstrFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("alphabet")), + ColumnarValue::Scalar(ScalarValue::from(1i64)), + ColumnarValue::Scalar(ScalarValue::from(-1i64)), + ], + exec_err!("negative substring length not allowed: substr(, 1, -1)"), + &str, + Utf8, + StringArray + ); + test_function!( + SubstrFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("joséésoj")), + ColumnarValue::Scalar(ScalarValue::from(5i64)), + ColumnarValue::Scalar(ScalarValue::from(2i64)), + ], + Ok(Some("és")), + &str, + Utf8, + StringArray + ); + #[cfg(not(feature = "unicode_expressions"))] + test_function!( + SubstrFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("alphabet")), + ColumnarValue::Scalar(ScalarValue::from(0i64)), + ], + internal_err!( + "function substr requires compilation with feature flag: unicode_expressions." 
+ ), + &str, + Utf8, + StringArray + ); + + Ok(()) + } +} diff --git a/datafusion/physical-expr/src/functions.rs b/datafusion/physical-expr/src/functions.rs index c1b4900e399a..513dd71d4074 100644 --- a/datafusion/physical-expr/src/functions.rs +++ b/datafusion/physical-expr/src/functions.rs @@ -281,34 +281,6 @@ pub fn create_physical_fun( exec_err!("Unsupported data type {other:?} for function ends_with") } }), - BuiltinScalarFunction::Strpos => Arc::new(|args| match args[0].data_type() { - DataType::Utf8 => { - let func = invoke_if_unicode_expressions_feature_flag!( - strpos, Int32Type, "strpos" - ); - make_scalar_function_inner(func)(args) - } - DataType::LargeUtf8 => { - let func = invoke_if_unicode_expressions_feature_flag!( - strpos, Int64Type, "strpos" - ); - make_scalar_function_inner(func)(args) - } - other => exec_err!("Unsupported data type {other:?} for function strpos"), - }), - BuiltinScalarFunction::Substr => Arc::new(|args| match args[0].data_type() { - DataType::Utf8 => { - let func = - invoke_if_unicode_expressions_feature_flag!(substr, i32, "substr"); - make_scalar_function_inner(func)(args) - } - DataType::LargeUtf8 => { - let func = - invoke_if_unicode_expressions_feature_flag!(substr, i64, "substr"); - make_scalar_function_inner(func)(args) - } - other => exec_err!("Unsupported data type {other:?} for function substr"), - }), BuiltinScalarFunction::Translate => Arc::new(|args| match args[0].data_type() { DataType::Utf8 => { let func = invoke_if_unicode_expressions_feature_flag!( @@ -450,7 +422,7 @@ mod tests { }; use datafusion_common::cast::as_uint64_array; - use datafusion_common::{exec_err, internal_err, plan_err}; + use datafusion_common::{internal_err, plan_err}; use datafusion_common::{DataFusionError, Result, ScalarValue}; use datafusion_expr::type_coercion::functions::data_types; use datafusion_expr::Signature; @@ -663,234 +635,6 @@ mod tests { BooleanArray ); #[cfg(feature = "unicode_expressions")] - test_function!( - Substr, - 
&[lit("alphabet"), lit(ScalarValue::Int64(Some(0))),], - Ok(Some("alphabet")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Substr, - &[lit("joséésoj"), lit(ScalarValue::Int64(Some(5))),], - Ok(Some("ésoj")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Substr, - &[lit("joséésoj"), lit(ScalarValue::Int64(Some(-5))),], - Ok(Some("joséésoj")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Substr, - &[lit("alphabet"), lit(ScalarValue::Int64(Some(1))),], - Ok(Some("alphabet")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Substr, - &[lit("alphabet"), lit(ScalarValue::Int64(Some(2))),], - Ok(Some("lphabet")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Substr, - &[lit("alphabet"), lit(ScalarValue::Int64(Some(3))),], - Ok(Some("phabet")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Substr, - &[lit("alphabet"), lit(ScalarValue::Int64(Some(-3))),], - Ok(Some("alphabet")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Substr, - &[lit("alphabet"), lit(ScalarValue::Int64(Some(30))),], - Ok(Some("")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Substr, - &[lit("alphabet"), lit(ScalarValue::Int64(None)),], - Ok(None), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Substr, - &[ - lit("alphabet"), - lit(ScalarValue::Int64(Some(3))), - lit(ScalarValue::Int64(Some(2))), - ], - Ok(Some("ph")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Substr, - &[ - lit("alphabet"), - lit(ScalarValue::Int64(Some(3))), - lit(ScalarValue::Int64(Some(20))), - ], - Ok(Some("phabet")), - 
&str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Substr, - &[ - lit("alphabet"), - lit(ScalarValue::Int64(Some(0))), - lit(ScalarValue::Int64(Some(5))), - ], - Ok(Some("alph")), - &str, - Utf8, - StringArray - ); - // starting from 5 (10 + -5) - #[cfg(feature = "unicode_expressions")] - test_function!( - Substr, - &[ - lit("alphabet"), - lit(ScalarValue::Int64(Some(-5))), - lit(ScalarValue::Int64(Some(10))), - ], - Ok(Some("alph")), - &str, - Utf8, - StringArray - ); - // starting from -1 (4 + -5) - #[cfg(feature = "unicode_expressions")] - test_function!( - Substr, - &[ - lit("alphabet"), - lit(ScalarValue::Int64(Some(-5))), - lit(ScalarValue::Int64(Some(4))), - ], - Ok(Some("")), - &str, - Utf8, - StringArray - ); - // starting from 0 (5 + -5) - #[cfg(feature = "unicode_expressions")] - test_function!( - Substr, - &[ - lit("alphabet"), - lit(ScalarValue::Int64(Some(-5))), - lit(ScalarValue::Int64(Some(5))), - ], - Ok(Some("")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Substr, - &[ - lit("alphabet"), - lit(ScalarValue::Int64(None)), - lit(ScalarValue::Int64(Some(20))), - ], - Ok(None), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Substr, - &[ - lit("alphabet"), - lit(ScalarValue::Int64(Some(3))), - lit(ScalarValue::Int64(None)), - ], - Ok(None), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Substr, - &[ - lit("alphabet"), - lit(ScalarValue::Int64(Some(1))), - lit(ScalarValue::Int64(Some(-1))), - ], - exec_err!("negative substring length not allowed: substr(, 1, -1)"), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Substr, - &[ - lit("joséésoj"), - lit(ScalarValue::Int64(Some(5))), - lit(ScalarValue::Int64(Some(2))), - ], - Ok(Some("és")), - &str, - Utf8, - StringArray - ); - #[cfg(not(feature = 
"unicode_expressions"))] - test_function!( - Substr, - &[ - lit("alphabet"), - lit(ScalarValue::Int64(Some(0))), - ], - internal_err!( - "function substr requires compilation with feature flag: unicode_expressions." - ), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] test_function!( Translate, &[lit("12345"), lit("143"), lit("ax"),], diff --git a/datafusion/physical-expr/src/unicode_expressions.rs b/datafusion/physical-expr/src/unicode_expressions.rs index faff21111a61..ecbd1ea320d4 100644 --- a/datafusion/physical-expr/src/unicode_expressions.rs +++ b/datafusion/physical-expr/src/unicode_expressions.rs @@ -21,7 +21,6 @@ //! Unicode expressions -use std::cmp::max; use std::sync::Arc; use arrow::{ @@ -36,100 +35,6 @@ use datafusion_common::{ exec_err, Result, }; -/// Returns starting index of specified substring within string, or zero if it's not present. (Same as position(substring in string), but note the reversed argument order.) -/// strpos('high', 'ig') = 2 -/// The implementation uses UTF-8 code points as characters -pub fn strpos(args: &[ArrayRef]) -> Result -where - T::Native: OffsetSizeTrait, -{ - let string_array: &GenericStringArray = - as_generic_string_array::(&args[0])?; - - let substring_array: &GenericStringArray = - as_generic_string_array::(&args[1])?; - - let result = string_array - .iter() - .zip(substring_array.iter()) - .map(|(string, substring)| match (string, substring) { - (Some(string), Some(substring)) => { - // the find method returns the byte index of the substring - // Next, we count the number of the chars until that byte - T::Native::from_usize( - string - .find(substring) - .map(|x| string[..x].chars().count() + 1) - .unwrap_or(0), - ) - } - _ => None, - }) - .collect::>(); - - Ok(Arc::new(result) as ArrayRef) -} - -/// Extracts the substring of string starting at the start'th character, and extending for count characters if that is specified. (Same as substring(string from start for count).) 
-/// substr('alphabet', 3) = 'phabet' -/// substr('alphabet', 3, 2) = 'ph' -/// The implementation uses UTF-8 code points as characters -pub fn substr(args: &[ArrayRef]) -> Result { - match args.len() { - 2 => { - let string_array = as_generic_string_array::(&args[0])?; - let start_array = as_int64_array(&args[1])?; - - let result = string_array - .iter() - .zip(start_array.iter()) - .map(|(string, start)| match (string, start) { - (Some(string), Some(start)) => { - if start <= 0 { - Some(string.to_string()) - } else { - Some(string.chars().skip(start as usize - 1).collect()) - } - } - _ => None, - }) - .collect::>(); - - Ok(Arc::new(result) as ArrayRef) - } - 3 => { - let string_array = as_generic_string_array::(&args[0])?; - let start_array = as_int64_array(&args[1])?; - let count_array = as_int64_array(&args[2])?; - - let result = string_array - .iter() - .zip(start_array.iter()) - .zip(count_array.iter()) - .map(|((string, start), count)| match (string, start, count) { - (Some(string), Some(start), Some(count)) => { - if count < 0 { - exec_err!( - "negative substring length not allowed: substr(, {start}, {count})" - ) - } else { - let skip = max(0, start - 1); - let count = max(0, count + (if start < 1 {start - 1} else {0})); - Ok(Some(string.chars().skip(skip as usize).take(count as usize).collect::())) - } - } - _ => Ok(None), - }) - .collect::>>()?; - - Ok(Arc::new(result) as ArrayRef) - } - other => { - exec_err!("substr was called with {other} arguments. It requires 2 or 3.") - } - } -} - /// Replaces each character in string that matches a character in the from set with the corresponding character in the to set. If from is longer than to, occurrences of the extra characters in from are deleted. 
/// translate('12345', '143', 'ax') = 'a2x5' pub fn translate(args: &[ArrayRef]) -> Result { diff --git a/datafusion/proto/Cargo.toml b/datafusion/proto/Cargo.toml index f5297aefcd1c..bec2b8c53a7a 100644 --- a/datafusion/proto/Cargo.toml +++ b/datafusion/proto/Cargo.toml @@ -54,6 +54,7 @@ serde = { version = "1.0", optional = true } serde_json = { workspace = true, optional = true } [dev-dependencies] +datafusion-functions = { workspace = true, default-features = true } doc-comment = { workspace = true } strum = { version = "0.26.1", features = ["derive"] } tokio = { workspace = true, features = ["rt-multi-thread"] } diff --git a/datafusion/proto/proto/datafusion.proto b/datafusion/proto/proto/datafusion.proto index 6319372d98d2..3a187eabe836 100644 --- a/datafusion/proto/proto/datafusion.proto +++ b/datafusion/proto/proto/datafusion.proto @@ -593,8 +593,8 @@ enum ScalarFunction { // 49 was SHA512 // 50 was SplitPart // StartsWith = 51; - Strpos = 52; - Substr = 53; + // 52 was Strpos + // 53 was Substr // ToHex = 54; // 55 was ToTimestamp // 56 was ToTimestampMillis diff --git a/datafusion/proto/src/generated/pbjson.rs b/datafusion/proto/src/generated/pbjson.rs index 7281bc9dc263..07b91b26d60b 100644 --- a/datafusion/proto/src/generated/pbjson.rs +++ b/datafusion/proto/src/generated/pbjson.rs @@ -22932,8 +22932,6 @@ impl serde::Serialize for ScalarFunction { Self::ConcatWithSeparator => "ConcatWithSeparator", Self::InitCap => "InitCap", Self::Random => "Random", - Self::Strpos => "Strpos", - Self::Substr => "Substr", Self::Translate => "Translate", Self::Coalesce => "Coalesce", Self::Power => "Power", @@ -22986,8 +22984,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "ConcatWithSeparator", "InitCap", "Random", - "Strpos", - "Substr", "Translate", "Coalesce", "Power", @@ -23069,8 +23065,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "ConcatWithSeparator" => Ok(ScalarFunction::ConcatWithSeparator), "InitCap" => Ok(ScalarFunction::InitCap), 
"Random" => Ok(ScalarFunction::Random), - "Strpos" => Ok(ScalarFunction::Strpos), - "Substr" => Ok(ScalarFunction::Substr), "Translate" => Ok(ScalarFunction::Translate), "Coalesce" => Ok(ScalarFunction::Coalesce), "Power" => Ok(ScalarFunction::Power), diff --git a/datafusion/proto/src/generated/prost.rs b/datafusion/proto/src/generated/prost.rs index 2fe89efb9cea..babeccec595f 100644 --- a/datafusion/proto/src/generated/prost.rs +++ b/datafusion/proto/src/generated/prost.rs @@ -2892,8 +2892,8 @@ pub enum ScalarFunction { /// 49 was SHA512 /// 50 was SplitPart /// StartsWith = 51; - Strpos = 52, - Substr = 53, + /// 52 was Strpos + /// 53 was Substr /// ToHex = 54; /// 55 was ToTimestamp /// 56 was ToTimestampMillis @@ -3005,8 +3005,6 @@ impl ScalarFunction { ScalarFunction::ConcatWithSeparator => "ConcatWithSeparator", ScalarFunction::InitCap => "InitCap", ScalarFunction::Random => "Random", - ScalarFunction::Strpos => "Strpos", - ScalarFunction::Substr => "Substr", ScalarFunction::Translate => "Translate", ScalarFunction::Coalesce => "Coalesce", ScalarFunction::Power => "Power", @@ -3053,8 +3051,6 @@ impl ScalarFunction { "ConcatWithSeparator" => Some(Self::ConcatWithSeparator), "InitCap" => Some(Self::InitCap), "Random" => Some(Self::Random), - "Strpos" => Some(Self::Strpos), - "Substr" => Some(Self::Substr), "Translate" => Some(Self::Translate), "Coalesce" => Some(Self::Coalesce), "Power" => Some(Self::Power), diff --git a/datafusion/proto/src/logical_plan/from_proto.rs b/datafusion/proto/src/logical_plan/from_proto.rs index 2c6f2e479b24..ff3d6773d512 100644 --- a/datafusion/proto/src/logical_plan/from_proto.rs +++ b/datafusion/proto/src/logical_plan/from_proto.rs @@ -42,10 +42,10 @@ use datafusion_expr::{ expr::{self, InList, Sort, WindowFunction}, factorial, find_in_set, floor, gcd, initcap, iszero, lcm, ln, log, log10, log2, logical_plan::{PlanType, StringifiedPlan}, - nanvl, pi, power, radians, random, round, signum, sin, sinh, sqrt, strpos, substr, - 
substr_index, substring, translate, trunc, AggregateFunction, Between, BinaryExpr, - BuiltInWindowFunction, BuiltinScalarFunction, Case, Cast, Expr, GetFieldAccess, - GetIndexedField, GroupingSet, + nanvl, pi, power, radians, random, round, signum, sin, sinh, sqrt, substr_index, + translate, trunc, AggregateFunction, Between, BinaryExpr, BuiltInWindowFunction, + BuiltinScalarFunction, Case, Cast, Expr, GetFieldAccess, GetIndexedField, + GroupingSet, GroupingSet::GroupingSets, JoinConstraint, JoinType, Like, Operator, TryCast, WindowFrame, WindowFrameBound, WindowFrameUnits, @@ -455,8 +455,6 @@ impl From<&protobuf::ScalarFunction> for BuiltinScalarFunction { ScalarFunction::EndsWith => Self::EndsWith, ScalarFunction::InitCap => Self::InitCap, ScalarFunction::Random => Self::Random, - ScalarFunction::Strpos => Self::Strpos, - ScalarFunction::Substr => Self::Substr, ScalarFunction::Translate => Self::Translate, ScalarFunction::Coalesce => Self::Coalesce, ScalarFunction::Pi => Self::Pi, @@ -1389,25 +1387,6 @@ pub fn parse_expr( parse_expr(&args[0], registry, codec)?, parse_expr(&args[1], registry, codec)?, )), - ScalarFunction::Strpos => Ok(strpos( - parse_expr(&args[0], registry, codec)?, - parse_expr(&args[1], registry, codec)?, - )), - ScalarFunction::Substr => { - if args.len() > 2 { - assert_eq!(args.len(), 3); - Ok(substring( - parse_expr(&args[0], registry, codec)?, - parse_expr(&args[1], registry, codec)?, - parse_expr(&args[2], registry, codec)?, - )) - } else { - Ok(substr( - parse_expr(&args[0], registry, codec)?, - parse_expr(&args[1], registry, codec)?, - )) - } - } ScalarFunction::Translate => Ok(translate( parse_expr(&args[0], registry, codec)?, parse_expr(&args[1], registry, codec)?, diff --git a/datafusion/proto/src/logical_plan/to_proto.rs b/datafusion/proto/src/logical_plan/to_proto.rs index ea682a5a22f8..89d49c5658a2 100644 --- a/datafusion/proto/src/logical_plan/to_proto.rs +++ b/datafusion/proto/src/logical_plan/to_proto.rs @@ -1446,8 +1446,6 @@ 
impl TryFrom<&BuiltinScalarFunction> for protobuf::ScalarFunction { BuiltinScalarFunction::EndsWith => Self::EndsWith, BuiltinScalarFunction::InitCap => Self::InitCap, BuiltinScalarFunction::Random => Self::Random, - BuiltinScalarFunction::Strpos => Self::Strpos, - BuiltinScalarFunction::Substr => Self::Substr, BuiltinScalarFunction::Translate => Self::Translate, BuiltinScalarFunction::Coalesce => Self::Coalesce, BuiltinScalarFunction::Pi => Self::Pi, diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs index 3c43f100750f..3a47f556c0f3 100644 --- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs @@ -34,8 +34,8 @@ use datafusion::test_util::{TestTableFactory, TestTableProvider}; use datafusion_common::config::{FormatOptions, TableOptions}; use datafusion_common::scalar::ScalarStructBuilder; use datafusion_common::{ - internal_err, not_impl_err, plan_err, DFField, DFSchema, DFSchemaRef, - DataFusionError, FileType, Result, ScalarValue, + internal_datafusion_err, internal_err, not_impl_err, plan_err, DFField, DFSchema, + DFSchemaRef, DataFusionError, FileType, Result, ScalarValue, }; use datafusion_expr::dml::CopyTo; use datafusion_expr::expr::{ @@ -44,8 +44,7 @@ use datafusion_expr::expr::{ }; use datafusion_expr::logical_plan::{Extension, UserDefinedLogicalNodeCore}; use datafusion_expr::{ - col, create_udaf, lit, Accumulator, AggregateFunction, - BuiltinScalarFunction::{Sqrt, Substr}, + col, create_udaf, lit, Accumulator, AggregateFunction, BuiltinScalarFunction::Sqrt, ColumnarValue, Expr, ExprSchemable, LogicalPlan, Operator, PartitionEvaluator, ScalarUDF, ScalarUDFImpl, Signature, TryCast, Volatility, WindowFrame, WindowFrameBound, WindowFrameUnits, WindowFunctionDefinition, WindowUDF, @@ -60,6 +59,7 @@ use datafusion_proto::logical_plan::LogicalExtensionCodec; use datafusion_proto::logical_plan::{from_proto, 
DefaultLogicalExtensionCodec}; use datafusion_proto::protobuf; +use datafusion::execution::FunctionRegistry; use prost::Message; #[cfg(feature = "json")] @@ -1863,17 +1863,28 @@ fn roundtrip_cube() { #[test] fn roundtrip_substr() { + let ctx = SessionContext::new(); + + let fun = ctx + .state() + .udf("substr") + .map_err(|e| { + internal_datafusion_err!("Unable to find expected 'substr' function: {e:?}") + }) + .unwrap(); + // substr(string, position) - let test_expr = - Expr::ScalarFunction(ScalarFunction::new(Substr, vec![col("col"), lit(1_i64)])); + let test_expr = Expr::ScalarFunction(ScalarFunction::new_udf( + fun.clone(), + vec![col("col"), lit(1_i64)], + )); // substr(string, position, count) - let test_expr_with_count = Expr::ScalarFunction(ScalarFunction::new( - Substr, + let test_expr_with_count = Expr::ScalarFunction(ScalarFunction::new_udf( + fun, vec![col("col"), lit(1_i64), lit(1_i64)], )); - let ctx = SessionContext::new(); roundtrip_expr_test(test_expr, ctx.clone()); roundtrip_expr_test(test_expr_with_count, ctx); } diff --git a/datafusion/proto/tests/cases/serialize.rs b/datafusion/proto/tests/cases/serialize.rs index d4a1ab44a6ea..972382b841d5 100644 --- a/datafusion/proto/tests/cases/serialize.rs +++ b/datafusion/proto/tests/cases/serialize.rs @@ -260,10 +260,7 @@ fn test_expression_serialization_roundtrip() { let lit = Expr::Literal(ScalarValue::Utf8(None)); for builtin_fun in BuiltinScalarFunction::iter() { // default to 4 args (though some exprs like substr have error checking) - let num_args = match builtin_fun { - BuiltinScalarFunction::Substr => 3, - _ => 4, - }; + let num_args = 4; let args: Vec<_> = std::iter::repeat(&lit).take(num_args).cloned().collect(); let expr = Expr::ScalarFunction(ScalarFunction::new(builtin_fun, args)); diff --git a/datafusion/sql/src/expr/mod.rs b/datafusion/sql/src/expr/mod.rs index d1fc03194997..43bf2d871564 100644 --- a/datafusion/sql/src/expr/mod.rs +++ b/datafusion/sql/src/expr/mod.rs @@ -823,12 +823,17 @@ 
impl<'a, S: ContextProvider> SqlToRel<'a, S> { schema: &DFSchema, planner_context: &mut PlannerContext, ) -> Result { - let fun = BuiltinScalarFunction::Strpos; + let fun = self + .context_provider + .get_function_meta("strpos") + .ok_or_else(|| { + internal_datafusion_err!("Unable to find expected 'strpos' function") + })?; let substr = self.sql_expr_to_logical_expr(substr_expr, schema, planner_context)?; let fullstr = self.sql_expr_to_logical_expr(str_expr, schema, planner_context)?; let args = vec![fullstr, substr]; - Ok(Expr::ScalarFunction(ScalarFunction::new(fun, args))) + Ok(Expr::ScalarFunction(ScalarFunction::new_udf(fun, args))) } fn sql_agg_with_filter_to_expr( &self, diff --git a/datafusion/sql/src/expr/substring.rs b/datafusion/sql/src/expr/substring.rs index a5d1abf0f265..f58c6f3b94d0 100644 --- a/datafusion/sql/src/expr/substring.rs +++ b/datafusion/sql/src/expr/substring.rs @@ -16,10 +16,10 @@ // under the License. use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; -use datafusion_common::plan_err; +use datafusion_common::{internal_datafusion_err, plan_err}; use datafusion_common::{DFSchema, Result, ScalarValue}; use datafusion_expr::expr::ScalarFunction; -use datafusion_expr::{BuiltinScalarFunction, Expr}; +use datafusion_expr::Expr; use sqlparser::ast::Expr as SQLExpr; impl<'a, S: ContextProvider> SqlToRel<'a, S> { @@ -68,9 +68,13 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { } }; - Ok(Expr::ScalarFunction(ScalarFunction::new( - BuiltinScalarFunction::Substr, - args, - ))) + let fun = self + .context_provider + .get_function_meta("substr") + .ok_or_else(|| { + internal_datafusion_err!("Unable to find expected 'substr' function") + })?; + + Ok(Expr::ScalarFunction(ScalarFunction::new_udf(fun, args))) } } diff --git a/datafusion/sqllogictest/test_files/scalar.slt b/datafusion/sqllogictest/test_files/scalar.slt index a77a2bf4059c..20c8b3d25fdd 100644 --- a/datafusion/sqllogictest/test_files/scalar.slt +++ 
b/datafusion/sqllogictest/test_files/scalar.slt @@ -2087,7 +2087,7 @@ select position('' in '') 1 -query error DataFusion error: Error during planning: The STRPOS/INSTR/POSITION function can only accept strings, but got Int64. +query error DataFusion error: Execution error: The STRPOS/INSTR/POSITION function can only accept strings, but got Int64. select position(1 in 1)