diff --git a/.github/workflows/dev_pr/labeler.yml b/.github/workflows/dev_pr/labeler.yml index 308abd1688a6..4e44e47f5968 100644 --- a/.github/workflows/dev_pr/labeler.yml +++ b/.github/workflows/dev_pr/labeler.yml @@ -33,16 +33,37 @@ logical-expr: physical-expr: - changed-files: - - any-glob-to-any-file: ['datafusion/physical-expr/**/*'] + - any-glob-to-any-file: ['datafusion/physical-expr/**/*', 'datafusion/physical-expr-common/**/*', 'datafusion/physical-expr-aggregate/**/*', 'datafusion/physical-plan/**/*'] + +catalog: + - changed-files: + - any-glob-to-any-file: ['datafusion/catalog/**/*'] + +common: + - changed-files: + - any-glob-to-any-file: ['datafusion/common/**/*', 'datafusion/common-runtime/**/*'] + +execution: + - changed-files: + - any-glob-to-any-file: ['datafusion/execution/**/*'] + +functions: + - changed-files: + - any-glob-to-any-file: ['datafusion/functions/**/*', 'datafusion/functions-aggregate/**/*', 'datafusion/functions-aggregate-common', 'datafusion/functions-nested'] + optimizer: - changed-files: - - any-glob-to-any-file: ['datafusion/optimizer/**/*'] + - any-glob-to-any-file: ['datafusion/optimizer/**/*', 'datafusion/physical-optimizer/**/*'] core: - changed-files: - any-glob-to-any-file: ['datafusion/core/**/*'] +proto: + - changed-files: + - any-glob-to-any-file: ['datafusion/proto/**/*', 'datafusion/proto-common/**/*'] + substrait: - changed-files: - any-glob-to-any-file: ['datafusion/substrait/**/*'] diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 134cde8976d6..90995c1d116a 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -347,13 +347,14 @@ dependencies = [ [[package]] name = "assert_cmd" -version = "2.0.15" +version = "2.0.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc65048dd435533bb1baf2ed9956b9a278fbfdcf90301b39ee117f06c0199d37" +checksum = "dc1835b7f27878de8525dc71410b5a31cdcc5f230aed5ba5df968e09c201b23d" dependencies = [ "anstyle", "bstr", "doc-comment", + "libc", "predicates", "predicates-core", "predicates-tree", @@ -386,7 +387,7 @@ checksum = "6e0c28dcc82d7c8ead5cb13beb15405b57b8546e93215673ff8ca0349a028107" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -874,9 +875,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.1.8" +version = "1.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "504bdec147f2cc13c8b57ed9401fd8a147cc66b67ad5cb241394244f2c947549" +checksum = "e9e8aabfac534be767c909e0690571677d49f41bd8465ae876fe043d52ba5292" dependencies = [ "jobserver", "libc", @@ -1022,9 +1023,9 @@ dependencies = [ [[package]] name = "core-foundation-sys" -version = "0.8.6" +version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" +checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" [[package]] name = "core2" @@ -1037,9 +1038,9 @@ dependencies = [ [[package]] name = "cpufeatures" -version = "0.2.12" +version = "0.2.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53fe5e26ff1b7aef8bca9c6080520cfb8d9333c7568e1829cef191a9723e5504" +checksum = "51e852e6dc9a5bed1fae92dd2375037bf2b768725bf3be87811edee3249d09ad" dependencies = [ "libc", ] @@ -1103,7 +1104,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "edb49164822f3ee45b17acd4a208cfc1251410cf0cad9a833234c9890774dd9f" dependencies = [ "quote", - "syn 
2.0.72", + "syn 2.0.74", ] [[package]] @@ -1240,6 +1241,7 @@ dependencies = [ "num_cpus", "object_store", "parquet", + "paste", "sqlparser", ] @@ -1762,7 +1764,7 @@ checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -2441,9 +2443,9 @@ dependencies = [ [[package]] name = "mio" -version = "1.0.1" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4569e456d394deccd22ce1c1913e6ea0e54519f577285001215d33557431afe4" +checksum = "80e04d1dcff3aae0704555fe5fee3bcfaf3d1fdf8a7e521d5b9d2b42acb52cec" dependencies = [ "hermit-abi 0.3.9", "libc", @@ -2785,7 +2787,7 @@ checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -3386,29 +3388,29 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" [[package]] name = "serde" -version = "1.0.205" +version = "1.0.207" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e33aedb1a7135da52b7c21791455563facbbcc43d0f0f66165b42c21b3dfb150" +checksum = "5665e14a49a4ea1b91029ba7d3bca9f299e1f7cfa194388ccc20f14743e784f2" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.205" +version = "1.0.207" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "692d6f5ac90220161d6774db30c662202721e64aed9058d2c394f451261420c1" +checksum = "6aea2634c86b0e8ef2cfdc0c340baede54ec27b1e46febd7f80dffb2aa44a00e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] name = "serde_json" -version = "1.0.122" +version = "1.0.124" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "784b6203951c57ff748476b126ccb5e8e2959a5c19e5c617ab1956be3dbc68da" +checksum = "66ad62847a56b3dba58cc891acd13884b9c61138d330c0d7b6181713d4fce38d" dependencies = [ "itoa", "memchr", @@ -3537,7 +3539,7 @@ checksum = "01b2e185515564f15375f593fb966b5718bc624ba77fe49fa4616ad619690554" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -3583,7 +3585,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -3596,7 +3598,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -3618,9 +3620,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.72" +version = "2.0.74" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc4b9b9bf2add8093d3f2c0204471e951b2285580335de42f9d2534f3ae7a8af" +checksum = "1fceb41e3d546d0bd83421d3409b1460cc7444cd389341a4c880fe7a042cb3d7" dependencies = [ "proc-macro2", "quote", @@ -3684,7 +3686,7 @@ checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -3778,7 +3780,7 @@ checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -3875,7 +3877,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -3920,7 +3922,7 @@ checksum = "f03ca4cb38206e2bef0700092660bb74d696f808514dae47fa1467cbfe26e96e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 
2.0.74", ] [[package]] @@ -4074,7 +4076,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", "wasm-bindgen-shared", ] @@ -4108,7 +4110,7 @@ checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -4383,7 +4385,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] diff --git a/datafusion/catalog/Cargo.toml b/datafusion/catalog/Cargo.toml index 2ebca511c5c8..ff28d8e0c64a 100644 --- a/datafusion/catalog/Cargo.toml +++ b/datafusion/catalog/Cargo.toml @@ -17,6 +17,7 @@ [package] name = "datafusion-catalog" +description = "datafusion-catalog" authors.workspace = true edition.workspace = true homepage.workspace = true diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index 85dfb2e8f73a..8435d0632576 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -60,6 +60,7 @@ libc = "0.2.140" num_cpus = { workspace = true } object_store = { workspace = true, optional = true } parquet = { workspace = true, optional = true, default-features = true } +paste = "1.0.15" pyo3 = { version = "0.21.0", optional = true } sqlparser = { workspace = true } diff --git a/datafusion/common/src/error.rs b/datafusion/common/src/error.rs index f62acaf0493b..27a25d0c9dd5 100644 --- a/datafusion/common/src/error.rs +++ b/datafusion/common/src/error.rs @@ -481,13 +481,6 @@ macro_rules! unwrap_or_internal_err { }; } -macro_rules! with_dollar_sign { - ($($body:tt)*) => { - macro_rules! __with_dollar_sign { $($body)* } - __with_dollar_sign!($); - } -} - /// Add a macros for concise DataFusionError::* errors declaration /// supports placeholders the same way as `format!` /// Examples: @@ -501,37 +494,41 @@ macro_rules! with_dollar_sign { /// `NAME_DF_ERR` - macro name for wrapping DataFusionError::*. Needed to keep backtrace opportunity /// in construction where DataFusionError::* used directly, like `map_err`, `ok_or_else`, etc macro_rules! make_error { - ($NAME_ERR:ident, $NAME_DF_ERR: ident, $ERR:ident) => { - with_dollar_sign! { - ($d:tt) => { - /// Macro wraps `$ERR` to add backtrace feature - #[macro_export] - macro_rules! $NAME_DF_ERR { - ($d($d args:expr),*) => { - $crate::DataFusionError::$ERR( - format!( - "{}{}", - format!($d($d args),*), - $crate::DataFusionError::get_back_trace(), - ).into() - ) - } + ($NAME_ERR:ident, $NAME_DF_ERR: ident, $ERR:ident) => { make_error!(@inner ($), $NAME_ERR, $NAME_DF_ERR, $ERR); }; + (@inner ($d:tt), $NAME_ERR:ident, $NAME_DF_ERR:ident, $ERR:ident) => { + ::paste::paste!{ + /// Macro wraps `$ERR` to add backtrace feature + #[macro_export] + macro_rules! $NAME_DF_ERR { + ($d($d args:expr),*) => { + $crate::DataFusionError::$ERR( + ::std::format!( + "{}{}", + ::std::format!($d($d args),*), + $crate::DataFusionError::get_back_trace(), + ).into() + ) } + } - /// Macro wraps Err(`$ERR`) to add backtrace feature - #[macro_export] - macro_rules! $NAME_ERR { - ($d($d args:expr),*) => { - Err($crate::DataFusionError::$ERR( - format!( - "{}{}", - format!($d($d args),*), - $crate::DataFusionError::get_back_trace(), - ).into() - )) - } + /// Macro wraps Err(`$ERR`) to add backtrace feature + #[macro_export] + macro_rules! 
$NAME_ERR { + ($d($d args:expr),*) => { + Err($crate::[<_ $NAME_DF_ERR>]!($d($d args),*)) } } + + + // Note: Certain macros are used in this crate, but not all. + // This macro generates a `use` for all of them in case they are needed + // so we allow unused code to avoid warnings when they are not used + #[doc(hidden)] + #[allow(unused)] + pub use $NAME_ERR as [<_ $NAME_ERR>]; + #[doc(hidden)] + #[allow(unused)] + pub use $NAME_DF_ERR as [<_ $NAME_DF_ERR>]; } }; } @@ -613,12 +610,6 @@ macro_rules! schema_err { // To avoid compiler error when using macro in the same crate: // macros from the current crate cannot be referred to by absolute paths -pub use config_err as _config_err; -pub use internal_datafusion_err as _internal_datafusion_err; -pub use internal_err as _internal_err; -pub use not_impl_err as _not_impl_err; -pub use plan_datafusion_err as _plan_datafusion_err; -pub use plan_err as _plan_err; pub use schema_err as _schema_err; /// Create a "field not found" DataFusion::SchemaError diff --git a/datafusion/common/src/lib.rs b/datafusion/common/src/lib.rs index 8cd64e7d16a2..19af889e426a 100644 --- a/datafusion/common/src/lib.rs +++ b/datafusion/common/src/lib.rs @@ -73,6 +73,18 @@ pub use table_reference::{ResolvedTableReference, TableReference}; pub use unnest::UnnestOptions; pub use utils::project_schema; +// These are hidden from docs purely to avoid polluting the public view of what this crate exports. +// These are just re-exports of macros by the same name, which gets around the 'cannot refer to +// macro-expanded macro_export macros by their full path' error. +// The design to get around this comes from this comment: +// https://github.com/rust-lang/rust/pull/52234#issuecomment-976702997 +#[doc(hidden)] +pub use error::{ + _config_datafusion_err, _exec_datafusion_err, _internal_datafusion_err, + _not_impl_datafusion_err, _plan_datafusion_err, _resources_datafusion_err, + _substrait_datafusion_err, +}; + /// Downcast an Arrow Array to a concrete type, return a `DataFusionError::Internal` if the cast is /// not possible. In normal usage of DataFusion the downcast should always succeed. /// diff --git a/datafusion/common/src/stats.rs b/datafusion/common/src/stats.rs index d6b5310581d7..50ae4e3ca71f 100644 --- a/datafusion/common/src/stats.rs +++ b/datafusion/common/src/stats.rs @@ -25,7 +25,7 @@ use arrow_schema::{Schema, SchemaRef}; /// Represents a value with a degree of certainty. `Precision` is used to /// propagate information about the precision of statistical values. 
-#[derive(Clone, PartialEq, Eq, Default)] +#[derive(Clone, PartialEq, Eq, Default, Copy)] pub enum Precision<T: Debug + Clone + PartialEq + Eq + PartialOrd> { /// The exact value is known Exact(T), @@ -503,9 +503,9 @@ mod tests { let inexact_precision = Precision::Inexact(42); let absent_precision = Precision::<i64>::Absent; - assert_eq!(exact_precision.clone().to_inexact(), inexact_precision); - assert_eq!(inexact_precision.clone().to_inexact(), inexact_precision); - assert_eq!(absent_precision.clone().to_inexact(), absent_precision); + assert_eq!(exact_precision.to_inexact(), inexact_precision); + assert_eq!(inexact_precision.to_inexact(), inexact_precision); + assert_eq!(absent_precision.to_inexact(), absent_precision); } #[test] @@ -545,4 +545,19 @@ assert_eq!(precision2.multiply(&precision3), Precision::Inexact(15)); assert_eq!(precision1.multiply(&absent_precision), Precision::Absent); } + + #[test] + fn test_precision_cloning() { + // Precision<usize> is copy + let precision: Precision<usize> = Precision::Exact(42); + let p2 = precision; + assert_eq!(precision, p2); + + // Precision<ScalarValue> is not copy (requires .clone()) + let precision: Precision<ScalarValue> = + Precision::Exact(ScalarValue::Int64(Some(42))); + // Clippy would complain about this if it were Copy + let p2 = precision.clone(); + assert_eq!(precision, p2); + } } diff --git a/datafusion/common/src/utils/mod.rs b/datafusion/common/src/utils/mod.rs index 58dc8f40b577..bf506c0551eb 100644 --- a/datafusion/common/src/utils/mod.rs +++ b/datafusion/common/src/utils/mod.rs @@ -29,15 +29,17 @@ use arrow::compute; use arrow::compute::{partition, SortColumn, SortOptions}; use arrow::datatypes::{Field, SchemaRef, UInt32Type}; use arrow::record_batch::RecordBatch; +use arrow_array::cast::AsArray; use arrow_array::{ - Array, FixedSizeListArray, LargeListArray, ListArray, RecordBatchOptions, + Array, FixedSizeListArray, LargeListArray, ListArray, OffsetSizeTrait, + RecordBatchOptions, }; use arrow_schema::DataType; use sqlparser::ast::Ident; use sqlparser::dialect::GenericDialect; use sqlparser::parser::Parser; use std::borrow::{Borrow, Cow}; -use std::cmp::Ordering; +use std::cmp::{min, Ordering}; use std::collections::HashSet; use std::ops::Range; use std::sync::Arc; @@ -440,6 +442,11 @@ pub fn arrays_into_list_array( )) } +/// Helper function to convert a ListArray into a vector of ArrayRefs. +pub fn list_to_arrays<O: OffsetSizeTrait>(a: ArrayRef) -> Vec<ArrayRef> { + a.as_list::<O>().iter().flatten().collect::<Vec<_>>() +} + /// Get the base type of a data type. /// /// Example @@ -683,6 +690,69 @@ pub fn transpose<T>(original: Vec<Vec<T>>) -> Vec<Vec<T>> { } } +/// Computes the `skip` and `fetch` parameters of a single limit that would be +/// equivalent to two consecutive limits with the given `skip`/`fetch` parameters. +/// +/// There are multiple cases to consider: +/// +/// # Case 0: Parent and child are disjoint (`child_fetch <= skip`). +/// +/// Before merging: +/// ```text +/// |........skip........|---fetch-->| Parent limit +/// |...child_skip...|---child_fetch-->| Child limit +/// ``` +/// +/// After merging: +/// ```text +/// |.........(child_skip + skip).........| +/// ``` +/// +/// # Case 1: Parent is beyond child's range (`skip < child_fetch <= skip + fetch`). +/// +/// Before merging: +/// ```text +/// |...skip...|------------fetch------------>| Parent limit +/// |...child_skip...|-------------child_fetch------------>| Child limit +/// ``` +/// +/// After merging: +/// ```text +/// |....(child_skip + skip)....|---(child_fetch - skip)-->| +/// ``` +/// +/// # Case 2: Parent is within child's range (`skip + fetch < child_fetch`). 
+/// +/// Before merging: +/// ```text +/// |...skip...|---fetch-->| Parent limit +/// |...child_skip...|-------------child_fetch------------>| Child limit +/// ``` +/// +/// After merging: +/// ```text +/// |....(child_skip + skip)....|---fetch-->| +/// ``` +pub fn combine_limit( + parent_skip: usize, + parent_fetch: Option, + child_skip: usize, + child_fetch: Option, +) -> (usize, Option) { + let combined_skip = child_skip.saturating_add(parent_skip); + + let combined_fetch = match (parent_fetch, child_fetch) { + (Some(parent_fetch), Some(child_fetch)) => { + Some(min(parent_fetch, child_fetch.saturating_sub(parent_skip))) + } + (Some(parent_fetch), None) => Some(parent_fetch), + (None, Some(child_fetch)) => Some(child_fetch.saturating_sub(parent_skip)), + (None, None) => None, + }; + + (combined_skip, combined_fetch) +} + #[cfg(test)] mod tests { use crate::ScalarValue::Null; diff --git a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs index 17850ea7585a..34fb6226c1a2 100644 --- a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs +++ b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs @@ -244,7 +244,7 @@ impl FileScanConfig { } let table_stats = Statistics { - num_rows: self.statistics.num_rows.clone(), + num_rows: self.statistics.num_rows, // TODO correct byte size? total_byte_size: Precision::Absent, column_statistics: table_cols_stats, diff --git a/datafusion/core/src/datasource/physical_plan/parquet/row_filter.rs b/datafusion/core/src/datasource/physical_plan/parquet/row_filter.rs index f9cce5f783ff..9de132169389 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/row_filter.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/row_filter.rs @@ -341,14 +341,9 @@ pub fn build_row_filter( let mut candidates: Vec = predicates .into_iter() .flat_map(|expr| { - if let Ok(candidate) = - FilterCandidateBuilder::new(expr.clone(), file_schema, table_schema) - .build(metadata) - { - candidate - } else { - None - } + FilterCandidateBuilder::new(expr.clone(), file_schema, table_schema) + .build(metadata) + .unwrap_or_default() }) .collect(); diff --git a/datafusion/core/src/datasource/statistics.rs b/datafusion/core/src/datasource/statistics.rs index 669755877680..6f89657defd3 100644 --- a/datafusion/core/src/datasource/statistics.rs +++ b/datafusion/core/src/datasource/statistics.rs @@ -18,17 +18,18 @@ use std::mem; use std::sync::Arc; -use super::listing::PartitionedFile; -use crate::arrow::datatypes::{Schema, SchemaRef}; -use crate::error::Result; -use crate::functions_aggregate::min_max::{MaxAccumulator, MinAccumulator}; -use crate::physical_plan::{Accumulator, ColumnStatistics, Statistics}; use arrow_schema::DataType; +use futures::{Stream, StreamExt}; use datafusion_common::stats::Precision; use datafusion_common::ScalarValue; -use futures::{Stream, StreamExt}; +use crate::arrow::datatypes::{Schema, SchemaRef}; +use crate::error::Result; +use crate::functions_aggregate::min_max::{MaxAccumulator, MinAccumulator}; +use crate::physical_plan::{Accumulator, ColumnStatistics, Statistics}; + +use super::listing::PartitionedFile; /// Get all files as well as the file level summary statistics (no statistic for partition columns). /// If the optional `limit` is provided, includes only sufficient files. Needed to read up to @@ -62,8 +63,8 @@ pub async fn get_statistics_with_limit( result_files.push(file); // First file, we set them directly from the file statistics. 
- num_rows = file_stats.num_rows.clone(); - total_byte_size = file_stats.total_byte_size.clone(); + num_rows = file_stats.num_rows; + total_byte_size = file_stats.total_byte_size; for (index, file_column) in file_stats.column_statistics.clone().into_iter().enumerate() { @@ -93,10 +94,10 @@ pub async fn get_statistics_with_limit( // counts across all the files in question. If any file does not // provide any information or provides an inexact value, we demote // the statistic precision to inexact. - num_rows = add_row_stats(file_stats.num_rows.clone(), num_rows); + num_rows = add_row_stats(file_stats.num_rows, num_rows); total_byte_size = - add_row_stats(file_stats.total_byte_size.clone(), total_byte_size); + add_row_stats(file_stats.total_byte_size, total_byte_size); for (file_col_stats, col_stats) in file_stats .column_statistics @@ -110,8 +111,7 @@ pub async fn get_statistics_with_limit( distinct_count: _, } = file_col_stats; - col_stats.null_count = - add_row_stats(file_nc.clone(), col_stats.null_count.clone()); + col_stats.null_count = add_row_stats(*file_nc, col_stats.null_count); set_max_if_greater(file_max, &mut col_stats.max_value); set_min_if_lesser(file_min, &mut col_stats.min_value) } @@ -192,7 +192,7 @@ pub(crate) fn get_col_stats( None => None, }; ColumnStatistics { - null_count: null_counts[i].clone(), + null_count: null_counts[i], max_value: max_value.map(Precision::Exact).unwrap_or(Precision::Absent), min_value: min_value.map(Precision::Exact).unwrap_or(Precision::Absent), distinct_count: Precision::Absent, diff --git a/datafusion/core/src/datasource/view.rs b/datafusion/core/src/datasource/view.rs index 98d118c027b7..a81942bf769e 100644 --- a/datafusion/core/src/datasource/view.rs +++ b/datafusion/core/src/datasource/view.rs @@ -19,17 +19,19 @@ use std::{any::Any, sync::Arc}; -use arrow::datatypes::SchemaRef; -use async_trait::async_trait; -use datafusion_catalog::Session; -use datafusion_common::Column; -use datafusion_expr::{LogicalPlanBuilder, TableProviderFilterPushDown}; - use crate::{ error::Result, logical_expr::{Expr, LogicalPlan}, physical_plan::ExecutionPlan, }; +use arrow::datatypes::SchemaRef; +use async_trait::async_trait; +use datafusion_catalog::Session; +use datafusion_common::config::ConfigOptions; +use datafusion_common::Column; +use datafusion_expr::{LogicalPlanBuilder, TableProviderFilterPushDown}; +use datafusion_optimizer::analyzer::expand_wildcard_rule::ExpandWildcardRule; +use datafusion_optimizer::Analyzer; use crate::datasource::{TableProvider, TableType}; @@ -50,6 +52,7 @@ impl ViewTable { logical_plan: LogicalPlan, definition: Option, ) -> Result { + let logical_plan = Self::apply_required_rule(logical_plan)?; let table_schema = logical_plan.schema().as_ref().to_owned().into(); let view = Self { @@ -61,6 +64,15 @@ impl ViewTable { Ok(view) } + fn apply_required_rule(logical_plan: LogicalPlan) -> Result { + let options = ConfigOptions::default(); + Analyzer::with_rules(vec![Arc::new(ExpandWildcardRule::new())]).execute_and_check( + logical_plan, + &options, + |_, _| {}, + ) + } + /// Get definition ref pub fn definition(&self) -> Option<&String> { self.definition.as_ref() @@ -232,6 +244,26 @@ mod tests { assert_batches_eq!(expected, &results); + let view_sql = + "CREATE VIEW replace_xyz AS SELECT * REPLACE (column1*2 as column1) FROM xyz"; + session_ctx.sql(view_sql).await?.collect().await?; + + let results = session_ctx + .sql("SELECT * FROM replace_xyz") + .await? 
+ .collect() + .await?; + + let expected = [ + "+---------+---------+---------+", + "| column1 | column2 | column3 |", + "+---------+---------+---------+", + "| 2 | 2 | 3 |", + "| 8 | 5 | 6 |", + "+---------+---------+---------+", + ]; + + assert_batches_eq!(expected, &results); Ok(()) } diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs index c63ffddd81b3..972a6f643733 100644 --- a/datafusion/core/src/execution/context/mod.rs +++ b/datafusion/core/src/execution/context/mod.rs @@ -718,7 +718,6 @@ impl SessionContext { } (_, Err(_)) => { let table = Arc::new(ViewTable::try_new((*input).clone(), definition)?); - self.register_table(name, table)?; self.return_empty_dataframe() } diff --git a/datafusion/core/src/physical_optimizer/limit_pushdown.rs b/datafusion/core/src/physical_optimizer/limit_pushdown.rs deleted file mode 100644 index d02737ff0959..000000000000 --- a/datafusion/core/src/physical_optimizer/limit_pushdown.rs +++ /dev/null @@ -1,661 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! [`LimitPushdown`] pushes `LIMIT` down through `ExecutionPlan`s to reduce -//! data transfer as much as possible. - -use std::fmt::Debug; -use std::sync::Arc; - -use crate::error::Result; -use crate::physical_optimizer::PhysicalOptimizerRule; -use crate::physical_plan::ExecutionPlan; - -use datafusion_common::config::ConfigOptions; -use datafusion_common::plan_datafusion_err; -use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; -use datafusion_optimizer::push_down_limit::combine_limit; -use datafusion_physical_plan::coalesce_partitions::CoalescePartitionsExec; -use datafusion_physical_plan::limit::{GlobalLimitExec, LocalLimitExec}; -use datafusion_physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; - -/// This rule inspects [`ExecutionPlan`]'s and pushes down the fetch limit from -/// the parent to the child if applicable. -#[derive(Default)] -pub struct LimitPushdown {} - -impl LimitPushdown { - #[allow(missing_docs)] - pub fn new() -> Self { - Self {} - } -} - -impl PhysicalOptimizerRule for LimitPushdown { - fn optimize( - &self, - plan: Arc, - _config: &ConfigOptions, - ) -> Result> { - plan.transform_down(push_down_limits).data() - } - - fn name(&self) -> &str { - "LimitPushdown" - } - - fn schema_check(&self) -> bool { - true - } -} - -/// This enumeration makes `skip` and `fetch` calculations easier by providing -/// a single API for both local and global limit operators. 
-#[derive(Debug)] -enum LimitExec { - Global(GlobalLimitExec), - Local(LocalLimitExec), -} - -impl LimitExec { - fn input(&self) -> &Arc { - match self { - Self::Global(global) => global.input(), - Self::Local(local) => local.input(), - } - } - - fn fetch(&self) -> Option { - match self { - Self::Global(global) => global.fetch(), - Self::Local(local) => Some(local.fetch()), - } - } - - fn skip(&self) -> usize { - match self { - Self::Global(global) => global.skip(), - Self::Local(_) => 0, - } - } - - fn with_child(&self, child: Arc) -> Self { - match self { - Self::Global(global) => { - Self::Global(GlobalLimitExec::new(child, global.skip(), global.fetch())) - } - Self::Local(local) => Self::Local(LocalLimitExec::new(child, local.fetch())), - } - } -} - -impl From for Arc { - fn from(limit_exec: LimitExec) -> Self { - match limit_exec { - LimitExec::Global(global) => Arc::new(global), - LimitExec::Local(local) => Arc::new(local), - } - } -} - -/// Pushes down the limit through the plan. -pub fn push_down_limits( - plan: Arc, -) -> Result>> { - let maybe_modified = if let Some(limit_exec) = extract_limit(&plan) { - let child = limit_exec.input(); - if let Some(child_limit) = extract_limit(child) { - let merged = merge_limits(&limit_exec, &child_limit); - // Revisit current node in case of consecutive pushdowns - Some(push_down_limits(merged)?.data) - } else if child.supports_limit_pushdown() { - try_push_down_limit(&limit_exec, child.clone())? - } else { - add_fetch_to_child(&limit_exec, child.clone()) - } - } else { - None - }; - - Ok(maybe_modified.map_or(Transformed::no(plan), Transformed::yes)) -} - -/// Transforms the [`ExecutionPlan`] into a [`LimitExec`] if it is a -/// [`GlobalLimitExec`] or a [`LocalLimitExec`]. -fn extract_limit(plan: &Arc) -> Option { - if let Some(global_limit) = plan.as_any().downcast_ref::() { - Some(LimitExec::Global(GlobalLimitExec::new( - global_limit.input().clone(), - global_limit.skip(), - global_limit.fetch(), - ))) - } else { - plan.as_any() - .downcast_ref::() - .map(|local_limit| { - LimitExec::Local(LocalLimitExec::new( - local_limit.input().clone(), - local_limit.fetch(), - )) - }) - } -} - -/// Merge the limits of the parent and the child. If at least one of them is a -/// [`GlobalLimitExec`], the result is also a [`GlobalLimitExec`]. Otherwise, -/// the result is a [`LocalLimitExec`]. -fn merge_limits( - parent_limit_exec: &LimitExec, - child_limit_exec: &LimitExec, -) -> Arc { - // We can use the logic in `combine_limit` from the logical optimizer: - let (skip, fetch) = combine_limit( - parent_limit_exec.skip(), - parent_limit_exec.fetch(), - child_limit_exec.skip(), - child_limit_exec.fetch(), - ); - match (parent_limit_exec, child_limit_exec) { - (LimitExec::Local(_), LimitExec::Local(_)) => { - // The fetch is present in this case, can unwrap. - Arc::new(LocalLimitExec::new( - child_limit_exec.input().clone(), - fetch.unwrap(), - )) - } - _ => Arc::new(GlobalLimitExec::new( - child_limit_exec.input().clone(), - skip, - fetch, - )), - } -} - -/// Pushes down the limit through the child. If the child has a single input -/// partition, simply swaps the parent and the child. Otherwise, adds a -/// [`LocalLimitExec`] after in between in addition to swapping, because of -/// multiple input partitions. 
-fn try_push_down_limit( - limit_exec: &LimitExec, - child: Arc, -) -> Result>> { - let grandchildren = child.children(); - if let Some(&grandchild) = grandchildren.first() { - // GlobalLimitExec and LocalLimitExec must have an input after pushdown - if combines_input_partitions(&child) { - // We still need a LocalLimitExec after the child - if let Some(fetch) = limit_exec.fetch() { - let new_local_limit = Arc::new(LocalLimitExec::new( - grandchild.clone(), - fetch + limit_exec.skip(), - )); - let new_child = child.clone().with_new_children(vec![new_local_limit])?; - Ok(Some(limit_exec.with_child(new_child).into())) - } else { - Ok(None) - } - } else { - // Swap current with child - let new_limit = limit_exec.with_child(grandchild.clone()); - let new_child = child.clone().with_new_children(vec![new_limit.into()])?; - Ok(Some(new_child)) - } - } else { - // Operators supporting limit push down must have a child. - Err(plan_datafusion_err!( - "{:#?} must have a child to push down limit", - child - )) - } -} - -fn combines_input_partitions(exec: &Arc) -> bool { - let exec = exec.as_any(); - exec.is::() || exec.is::() -} - -/// Transforms child to the fetching version if supported. Removes the parent if -/// skip is zero. Otherwise, keeps the parent. -fn add_fetch_to_child( - limit_exec: &LimitExec, - child: Arc, -) -> Option> { - let fetch = limit_exec.fetch(); - let skip = limit_exec.skip(); - - let child_fetch = fetch.map(|f| f + skip); - - if let Some(child_with_fetch) = child.with_fetch(child_fetch) { - if skip > 0 { - Some(limit_exec.with_child(child_with_fetch).into()) - } else { - Some(child_with_fetch) - } - } else { - None - } -} - -#[cfg(test)] -mod tests { - use super::*; - - use arrow_schema::{DataType, Field, Schema, SchemaRef}; - use datafusion_execution::{SendableRecordBatchStream, TaskContext}; - use datafusion_expr::Operator; - use datafusion_physical_expr::expressions::BinaryExpr; - use datafusion_physical_expr::expressions::{col, lit}; - use datafusion_physical_expr::Partitioning; - use datafusion_physical_plan::coalesce_batches::CoalesceBatchesExec; - use datafusion_physical_plan::coalesce_partitions::CoalescePartitionsExec; - use datafusion_physical_plan::empty::EmptyExec; - use datafusion_physical_plan::filter::FilterExec; - use datafusion_physical_plan::get_plan_string; - use datafusion_physical_plan::limit::{GlobalLimitExec, LocalLimitExec}; - use datafusion_physical_plan::projection::ProjectionExec; - use datafusion_physical_plan::repartition::RepartitionExec; - use datafusion_physical_plan::streaming::{PartitionStream, StreamingTableExec}; - - struct DummyStreamPartition { - schema: SchemaRef, - } - impl PartitionStream for DummyStreamPartition { - fn schema(&self) -> &SchemaRef { - &self.schema - } - fn execute(&self, _ctx: Arc) -> SendableRecordBatchStream { - unreachable!() - } - } - - #[test] - fn transforms_streaming_table_exec_into_fetching_version_when_skip_is_zero( - ) -> Result<()> { - let schema = create_schema(); - let streaming_table = streaming_table_exec(schema.clone())?; - let global_limit = global_limit_exec(streaming_table, 0, Some(5)); - - let initial = get_plan_string(&global_limit); - let expected_initial = [ - "GlobalLimitExec: skip=0, fetch=5", - " StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true" - ]; - assert_eq!(initial, expected_initial); - - let after_optimize = - LimitPushdown::new().optimize(global_limit, &ConfigOptions::new())?; - - let expected = [ - "StreamingTableExec: partition_sizes=1, 
projection=[c1, c2, c3], infinite_source=true, fetch=5" - ]; - assert_eq!(get_plan_string(&after_optimize), expected); - - Ok(()) - } - - #[test] - fn transforms_streaming_table_exec_into_fetching_version_and_keeps_the_global_limit_when_skip_is_nonzero( - ) -> Result<()> { - let schema = create_schema(); - let streaming_table = streaming_table_exec(schema.clone())?; - let global_limit = global_limit_exec(streaming_table, 2, Some(5)); - - let initial = get_plan_string(&global_limit); - let expected_initial = [ - "GlobalLimitExec: skip=2, fetch=5", - " StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true" - ]; - assert_eq!(initial, expected_initial); - - let after_optimize = - LimitPushdown::new().optimize(global_limit, &ConfigOptions::new())?; - - let expected = [ - "GlobalLimitExec: skip=2, fetch=5", - " StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true, fetch=7" - ]; - assert_eq!(get_plan_string(&after_optimize), expected); - - Ok(()) - } - - #[test] - fn transforms_coalesce_batches_exec_into_fetching_version_and_removes_local_limit( - ) -> Result<()> { - let schema = create_schema(); - let streaming_table = streaming_table_exec(schema.clone())?; - let repartition = repartition_exec(streaming_table)?; - let filter = filter_exec(schema.clone(), repartition)?; - let coalesce_batches = coalesce_batches_exec(filter); - let local_limit = local_limit_exec(coalesce_batches, 5); - let coalesce_partitions = coalesce_partitions_exec(local_limit); - let global_limit = global_limit_exec(coalesce_partitions, 0, Some(5)); - - let initial = get_plan_string(&global_limit); - let expected_initial = [ - "GlobalLimitExec: skip=0, fetch=5", - " CoalescePartitionsExec", - " LocalLimitExec: fetch=5", - " CoalesceBatchesExec: target_batch_size=8192", - " FilterExec: c3@2 > 0", - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", - " StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true" - ]; - assert_eq!(initial, expected_initial); - - let after_optimize = - LimitPushdown::new().optimize(global_limit, &ConfigOptions::new())?; - - let expected = [ - "GlobalLimitExec: skip=0, fetch=5", - " CoalescePartitionsExec", - " CoalesceBatchesExec: target_batch_size=8192, fetch=5", - " FilterExec: c3@2 > 0", - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", - " StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true" - ]; - assert_eq!(get_plan_string(&after_optimize), expected); - - Ok(()) - } - - #[test] - fn pushes_global_limit_exec_through_projection_exec() -> Result<()> { - let schema = create_schema(); - let streaming_table = streaming_table_exec(schema.clone())?; - let filter = filter_exec(schema.clone(), streaming_table)?; - let projection = projection_exec(schema.clone(), filter)?; - let global_limit = global_limit_exec(projection, 0, Some(5)); - - let initial = get_plan_string(&global_limit); - let expected_initial = [ - "GlobalLimitExec: skip=0, fetch=5", - " ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, c3@2 as c3]", - " FilterExec: c3@2 > 0", - " StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true" - ]; - assert_eq!(initial, expected_initial); - - let after_optimize = - LimitPushdown::new().optimize(global_limit, &ConfigOptions::new())?; - - let expected = [ - "ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, c3@2 as c3]", - " GlobalLimitExec: skip=0, fetch=5", - " FilterExec: c3@2 > 0", - " StreamingTableExec: 
partition_sizes=1, projection=[c1, c2, c3], infinite_source=true" - ]; - assert_eq!(get_plan_string(&after_optimize), expected); - - Ok(()) - } - - #[test] - fn pushes_global_limit_exec_through_projection_exec_and_transforms_coalesce_batches_exec_into_fetching_version( - ) -> Result<()> { - let schema = create_schema(); - let streaming_table = streaming_table_exec(schema.clone()).unwrap(); - let coalesce_batches = coalesce_batches_exec(streaming_table); - let projection = projection_exec(schema.clone(), coalesce_batches)?; - let global_limit = global_limit_exec(projection, 0, Some(5)); - - let initial = get_plan_string(&global_limit); - let expected_initial = [ - "GlobalLimitExec: skip=0, fetch=5", - " ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, c3@2 as c3]", - " CoalesceBatchesExec: target_batch_size=8192", - " StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true" - ]; - - assert_eq!(initial, expected_initial); - - let after_optimize = - LimitPushdown::new().optimize(global_limit, &ConfigOptions::new())?; - - let expected = [ - "ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, c3@2 as c3]", - " CoalesceBatchesExec: target_batch_size=8192, fetch=5", - " StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true" - ]; - assert_eq!(get_plan_string(&after_optimize), expected); - - Ok(()) - } - - #[test] - fn keeps_pushed_local_limit_exec_when_there_are_multiple_input_partitions( - ) -> Result<()> { - let schema = create_schema(); - let streaming_table = streaming_table_exec(schema.clone())?; - let repartition = repartition_exec(streaming_table)?; - let filter = filter_exec(schema.clone(), repartition)?; - let coalesce_partitions = coalesce_partitions_exec(filter); - let global_limit = global_limit_exec(coalesce_partitions, 0, Some(5)); - - let initial = get_plan_string(&global_limit); - let expected_initial = [ - "GlobalLimitExec: skip=0, fetch=5", - " CoalescePartitionsExec", - " FilterExec: c3@2 > 0", - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", - " StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true" - ]; - assert_eq!(initial, expected_initial); - - let after_optimize = - LimitPushdown::new().optimize(global_limit, &ConfigOptions::new())?; - - let expected = [ - "GlobalLimitExec: skip=0, fetch=5", - " CoalescePartitionsExec", - " LocalLimitExec: fetch=5", - " FilterExec: c3@2 > 0", - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", - " StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true" - ]; - assert_eq!(get_plan_string(&after_optimize), expected); - - Ok(()) - } - - #[test] - fn merges_local_limit_with_local_limit() -> Result<()> { - let schema = create_schema(); - let empty_exec = empty_exec(schema); - let child_local_limit = local_limit_exec(empty_exec, 10); - let parent_local_limit = local_limit_exec(child_local_limit, 20); - - let initial = get_plan_string(&parent_local_limit); - let expected_initial = [ - "LocalLimitExec: fetch=20", - " LocalLimitExec: fetch=10", - " EmptyExec", - ]; - - assert_eq!(initial, expected_initial); - - let after_optimize = - LimitPushdown::new().optimize(parent_local_limit, &ConfigOptions::new())?; - - let expected = ["LocalLimitExec: fetch=10", " EmptyExec"]; - assert_eq!(get_plan_string(&after_optimize), expected); - - Ok(()) - } - - #[test] - fn merges_global_limit_with_global_limit() -> Result<()> { - let schema = create_schema(); - let empty_exec = empty_exec(schema); - let 
child_global_limit = global_limit_exec(empty_exec, 10, Some(30)); - let parent_global_limit = global_limit_exec(child_global_limit, 10, Some(20)); - - let initial = get_plan_string(&parent_global_limit); - let expected_initial = [ - "GlobalLimitExec: skip=10, fetch=20", - " GlobalLimitExec: skip=10, fetch=30", - " EmptyExec", - ]; - - assert_eq!(initial, expected_initial); - - let after_optimize = - LimitPushdown::new().optimize(parent_global_limit, &ConfigOptions::new())?; - - let expected = ["GlobalLimitExec: skip=20, fetch=20", " EmptyExec"]; - assert_eq!(get_plan_string(&after_optimize), expected); - - Ok(()) - } - - #[test] - fn merges_global_limit_with_local_limit() -> Result<()> { - let schema = create_schema(); - let empty_exec = empty_exec(schema); - let local_limit = local_limit_exec(empty_exec, 40); - let global_limit = global_limit_exec(local_limit, 20, Some(30)); - - let initial = get_plan_string(&global_limit); - let expected_initial = [ - "GlobalLimitExec: skip=20, fetch=30", - " LocalLimitExec: fetch=40", - " EmptyExec", - ]; - - assert_eq!(initial, expected_initial); - - let after_optimize = - LimitPushdown::new().optimize(global_limit, &ConfigOptions::new())?; - - let expected = ["GlobalLimitExec: skip=20, fetch=20", " EmptyExec"]; - assert_eq!(get_plan_string(&after_optimize), expected); - - Ok(()) - } - - #[test] - fn merges_local_limit_with_global_limit() -> Result<()> { - let schema = create_schema(); - let empty_exec = empty_exec(schema); - let global_limit = global_limit_exec(empty_exec, 20, Some(30)); - let local_limit = local_limit_exec(global_limit, 20); - - let initial = get_plan_string(&local_limit); - let expected_initial = [ - "LocalLimitExec: fetch=20", - " GlobalLimitExec: skip=20, fetch=30", - " EmptyExec", - ]; - - assert_eq!(initial, expected_initial); - - let after_optimize = - LimitPushdown::new().optimize(local_limit, &ConfigOptions::new())?; - - let expected = ["GlobalLimitExec: skip=20, fetch=20", " EmptyExec"]; - assert_eq!(get_plan_string(&after_optimize), expected); - - Ok(()) - } - - fn create_schema() -> SchemaRef { - Arc::new(Schema::new(vec![ - Field::new("c1", DataType::Int32, true), - Field::new("c2", DataType::Int32, true), - Field::new("c3", DataType::Int32, true), - ])) - } - - fn streaming_table_exec(schema: SchemaRef) -> Result> { - Ok(Arc::new(StreamingTableExec::try_new( - schema.clone(), - vec![Arc::new(DummyStreamPartition { - schema: schema.clone(), - }) as _], - None, - None, - true, - None, - )?)) - } - - fn global_limit_exec( - input: Arc, - skip: usize, - fetch: Option, - ) -> Arc { - Arc::new(GlobalLimitExec::new(input, skip, fetch)) - } - - fn local_limit_exec( - input: Arc, - fetch: usize, - ) -> Arc { - Arc::new(LocalLimitExec::new(input, fetch)) - } - - fn projection_exec( - schema: SchemaRef, - input: Arc, - ) -> Result> { - Ok(Arc::new(ProjectionExec::try_new( - vec![ - (col("c1", schema.as_ref()).unwrap(), "c1".to_string()), - (col("c2", schema.as_ref()).unwrap(), "c2".to_string()), - (col("c3", schema.as_ref()).unwrap(), "c3".to_string()), - ], - input, - )?)) - } - - fn filter_exec( - schema: SchemaRef, - input: Arc, - ) -> Result> { - Ok(Arc::new(FilterExec::try_new( - Arc::new(BinaryExpr::new( - col("c3", schema.as_ref()).unwrap(), - Operator::Gt, - lit(0), - )), - input, - )?)) - } - - fn coalesce_batches_exec(input: Arc) -> Arc { - Arc::new(CoalesceBatchesExec::new(input, 8192)) - } - - fn coalesce_partitions_exec( - local_limit: Arc, - ) -> Arc { - Arc::new(CoalescePartitionsExec::new(local_limit)) - } - - fn 
repartition_exec( - streaming_table: Arc, - ) -> Result> { - Ok(Arc::new(RepartitionExec::try_new( - streaming_table, - Partitioning::RoundRobinBatch(8), - )?)) - } - - fn empty_exec(schema: SchemaRef) -> Arc { - Arc::new(EmptyExec::new(schema)) - } -} diff --git a/datafusion/core/src/physical_optimizer/mod.rs b/datafusion/core/src/physical_optimizer/mod.rs index 9291d0b84865..0e68a05d855c 100644 --- a/datafusion/core/src/physical_optimizer/mod.rs +++ b/datafusion/core/src/physical_optimizer/mod.rs @@ -26,7 +26,6 @@ pub mod combine_partial_final_agg; pub mod enforce_distribution; pub mod enforce_sorting; pub mod join_selection; -pub mod limit_pushdown; pub mod limited_distinct_aggregation; pub mod optimizer; pub mod projection_pushdown; diff --git a/datafusion/core/src/physical_optimizer/test_utils.rs b/datafusion/core/src/physical_optimizer/test_utils.rs index 55a0fa814552..90853c347672 100644 --- a/datafusion/core/src/physical_optimizer/test_utils.rs +++ b/datafusion/core/src/physical_optimizer/test_utils.rs @@ -251,7 +251,6 @@ pub fn bounded_window_exec( "count".to_owned(), &[col(col_name, &schema).unwrap()], &[], - &[], &sort_exprs, Arc::new(WindowFrame::new(Some(false))), schema.as_ref(), diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index 7eb468f56eeb..9cc2f253f8da 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -1510,7 +1510,6 @@ pub fn create_window_expr_with_name( fun, name, &physical_args, - args, &partition_by, &order_by, window_frame, diff --git a/datafusion/core/tests/fuzz_cases/window_fuzz.rs b/datafusion/core/tests/fuzz_cases/window_fuzz.rs index 813862c4cc2f..d75d8e43370d 100644 --- a/datafusion/core/tests/fuzz_cases/window_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/window_fuzz.rs @@ -253,7 +253,6 @@ async fn bounded_window_causal_non_causal() -> Result<()> { let partitionby_exprs = vec![]; let orderby_exprs = vec![]; - let logical_exprs = vec![]; // Window frame starts with "UNBOUNDED PRECEDING": let start_bound = WindowFrameBound::Preceding(ScalarValue::UInt64(None)); @@ -285,7 +284,6 @@ async fn bounded_window_causal_non_causal() -> Result<()> { &window_fn, fn_name.to_string(), &args, - &logical_exprs, &partitionby_exprs, &orderby_exprs, Arc::new(window_frame), @@ -674,7 +672,6 @@ async fn run_window_test( &window_fn, fn_name.clone(), &args, - &[], &partitionby_exprs, &orderby_exprs, Arc::new(window_frame.clone()), @@ -693,7 +690,6 @@ async fn run_window_test( &window_fn, fn_name, &args, - &[], &partitionby_exprs, &orderby_exprs, Arc::new(window_frame.clone()), diff --git a/datafusion/core/tests/physical_optimizer/limit_pushdown.rs b/datafusion/core/tests/physical_optimizer/limit_pushdown.rs new file mode 100644 index 000000000000..8f3a47c95e9d --- /dev/null +++ b/datafusion/core/tests/physical_optimizer/limit_pushdown.rs @@ -0,0 +1,427 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_schema::{DataType, Field, Schema, SchemaRef}; +use datafusion::physical_optimizer::limit_pushdown::LimitPushdown; +use datafusion_common::config::ConfigOptions; +use datafusion_execution::{SendableRecordBatchStream, TaskContext}; +use datafusion_expr::Operator; +use datafusion_physical_expr::expressions::BinaryExpr; +use datafusion_physical_expr::expressions::{col, lit}; +use datafusion_physical_expr::Partitioning; +use datafusion_physical_optimizer::PhysicalOptimizerRule; +use datafusion_physical_plan::coalesce_batches::CoalesceBatchesExec; +use datafusion_physical_plan::coalesce_partitions::CoalescePartitionsExec; +use datafusion_physical_plan::empty::EmptyExec; +use datafusion_physical_plan::filter::FilterExec; +use datafusion_physical_plan::limit::{GlobalLimitExec, LocalLimitExec}; +use datafusion_physical_plan::projection::ProjectionExec; +use datafusion_physical_plan::repartition::RepartitionExec; +use datafusion_physical_plan::streaming::{PartitionStream, StreamingTableExec}; +use datafusion_physical_plan::{get_plan_string, ExecutionPlan}; +use std::sync::Arc; + +struct DummyStreamPartition { + schema: SchemaRef, +} +impl PartitionStream for DummyStreamPartition { + fn schema(&self) -> &SchemaRef { + &self.schema + } + fn execute(&self, _ctx: Arc) -> SendableRecordBatchStream { + unreachable!() + } +} + +#[test] +fn transforms_streaming_table_exec_into_fetching_version_when_skip_is_zero( +) -> datafusion_common::Result<()> { + let schema = create_schema(); + let streaming_table = streaming_table_exec(schema.clone())?; + let global_limit = global_limit_exec(streaming_table, 0, Some(5)); + + let initial = get_plan_string(&global_limit); + let expected_initial = [ + "GlobalLimitExec: skip=0, fetch=5", + " StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true" + ]; + assert_eq!(initial, expected_initial); + + let after_optimize = + LimitPushdown::new().optimize(global_limit, &ConfigOptions::new())?; + + let expected = [ + "StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true, fetch=5" + ]; + assert_eq!(get_plan_string(&after_optimize), expected); + + Ok(()) +} + +#[test] +fn transforms_streaming_table_exec_into_fetching_version_and_keeps_the_global_limit_when_skip_is_nonzero( +) -> datafusion_common::Result<()> { + let schema = create_schema(); + let streaming_table = streaming_table_exec(schema.clone())?; + let global_limit = global_limit_exec(streaming_table, 2, Some(5)); + + let initial = get_plan_string(&global_limit); + let expected_initial = [ + "GlobalLimitExec: skip=2, fetch=5", + " StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true" + ]; + assert_eq!(initial, expected_initial); + + let after_optimize = + LimitPushdown::new().optimize(global_limit, &ConfigOptions::new())?; + + let expected = [ + "GlobalLimitExec: skip=2, fetch=5", + " StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true, fetch=7" + ]; + assert_eq!(get_plan_string(&after_optimize), expected); + + Ok(()) +} + +#[test] +fn 
transforms_coalesce_batches_exec_into_fetching_version_and_removes_local_limit( +) -> datafusion_common::Result<()> { + let schema = create_schema(); + let streaming_table = streaming_table_exec(schema.clone())?; + let repartition = repartition_exec(streaming_table)?; + let filter = filter_exec(schema.clone(), repartition)?; + let coalesce_batches = coalesce_batches_exec(filter); + let local_limit = local_limit_exec(coalesce_batches, 5); + let coalesce_partitions = coalesce_partitions_exec(local_limit); + let global_limit = global_limit_exec(coalesce_partitions, 0, Some(5)); + + let initial = get_plan_string(&global_limit); + let expected_initial = [ + "GlobalLimitExec: skip=0, fetch=5", + " CoalescePartitionsExec", + " LocalLimitExec: fetch=5", + " CoalesceBatchesExec: target_batch_size=8192", + " FilterExec: c3@2 > 0", + " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", + " StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true" + ]; + assert_eq!(initial, expected_initial); + + let after_optimize = + LimitPushdown::new().optimize(global_limit, &ConfigOptions::new())?; + + let expected = [ + "GlobalLimitExec: skip=0, fetch=5", + " CoalescePartitionsExec", + " CoalesceBatchesExec: target_batch_size=8192, fetch=5", + " FilterExec: c3@2 > 0", + " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", + " StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true" + ]; + assert_eq!(get_plan_string(&after_optimize), expected); + + Ok(()) +} + +#[test] +fn pushes_global_limit_exec_through_projection_exec() -> datafusion_common::Result<()> { + let schema = create_schema(); + let streaming_table = streaming_table_exec(schema.clone())?; + let filter = filter_exec(schema.clone(), streaming_table)?; + let projection = projection_exec(schema.clone(), filter)?; + let global_limit = global_limit_exec(projection, 0, Some(5)); + + let initial = get_plan_string(&global_limit); + let expected_initial = [ + "GlobalLimitExec: skip=0, fetch=5", + " ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, c3@2 as c3]", + " FilterExec: c3@2 > 0", + " StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true" + ]; + assert_eq!(initial, expected_initial); + + let after_optimize = + LimitPushdown::new().optimize(global_limit, &ConfigOptions::new())?; + + let expected = [ + "ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, c3@2 as c3]", + " GlobalLimitExec: skip=0, fetch=5", + " FilterExec: c3@2 > 0", + " StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true" + ]; + assert_eq!(get_plan_string(&after_optimize), expected); + + Ok(()) +} + +#[test] +fn pushes_global_limit_exec_through_projection_exec_and_transforms_coalesce_batches_exec_into_fetching_version( +) -> datafusion_common::Result<()> { + let schema = create_schema(); + let streaming_table = streaming_table_exec(schema.clone()).unwrap(); + let coalesce_batches = coalesce_batches_exec(streaming_table); + let projection = projection_exec(schema.clone(), coalesce_batches)?; + let global_limit = global_limit_exec(projection, 0, Some(5)); + + let initial = get_plan_string(&global_limit); + let expected_initial = [ + "GlobalLimitExec: skip=0, fetch=5", + " ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, c3@2 as c3]", + " CoalesceBatchesExec: target_batch_size=8192", + " StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true" + ]; + + assert_eq!(initial, expected_initial); + + let after_optimize = + 
LimitPushdown::new().optimize(global_limit, &ConfigOptions::new())?; + + let expected = [ + "ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, c3@2 as c3]", + " CoalesceBatchesExec: target_batch_size=8192, fetch=5", + " StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true" + ]; + assert_eq!(get_plan_string(&after_optimize), expected); + + Ok(()) +} + +#[test] +fn keeps_pushed_local_limit_exec_when_there_are_multiple_input_partitions( +) -> datafusion_common::Result<()> { + let schema = create_schema(); + let streaming_table = streaming_table_exec(schema.clone())?; + let repartition = repartition_exec(streaming_table)?; + let filter = filter_exec(schema.clone(), repartition)?; + let coalesce_partitions = coalesce_partitions_exec(filter); + let global_limit = global_limit_exec(coalesce_partitions, 0, Some(5)); + + let initial = get_plan_string(&global_limit); + let expected_initial = [ + "GlobalLimitExec: skip=0, fetch=5", + " CoalescePartitionsExec", + " FilterExec: c3@2 > 0", + " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", + " StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true" + ]; + assert_eq!(initial, expected_initial); + + let after_optimize = + LimitPushdown::new().optimize(global_limit, &ConfigOptions::new())?; + + let expected = [ + "GlobalLimitExec: skip=0, fetch=5", + " CoalescePartitionsExec", + " LocalLimitExec: fetch=5", + " FilterExec: c3@2 > 0", + " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", + " StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true" + ]; + assert_eq!(get_plan_string(&after_optimize), expected); + + Ok(()) +} + +#[test] +fn merges_local_limit_with_local_limit() -> datafusion_common::Result<()> { + let schema = create_schema(); + let empty_exec = empty_exec(schema); + let child_local_limit = local_limit_exec(empty_exec, 10); + let parent_local_limit = local_limit_exec(child_local_limit, 20); + + let initial = get_plan_string(&parent_local_limit); + let expected_initial = [ + "LocalLimitExec: fetch=20", + " LocalLimitExec: fetch=10", + " EmptyExec", + ]; + + assert_eq!(initial, expected_initial); + + let after_optimize = + LimitPushdown::new().optimize(parent_local_limit, &ConfigOptions::new())?; + + let expected = ["LocalLimitExec: fetch=10", " EmptyExec"]; + assert_eq!(get_plan_string(&after_optimize), expected); + + Ok(()) +} + +#[test] +fn merges_global_limit_with_global_limit() -> datafusion_common::Result<()> { + let schema = create_schema(); + let empty_exec = empty_exec(schema); + let child_global_limit = global_limit_exec(empty_exec, 10, Some(30)); + let parent_global_limit = global_limit_exec(child_global_limit, 10, Some(20)); + + let initial = get_plan_string(&parent_global_limit); + let expected_initial = [ + "GlobalLimitExec: skip=10, fetch=20", + " GlobalLimitExec: skip=10, fetch=30", + " EmptyExec", + ]; + + assert_eq!(initial, expected_initial); + + let after_optimize = + LimitPushdown::new().optimize(parent_global_limit, &ConfigOptions::new())?; + + let expected = ["GlobalLimitExec: skip=20, fetch=20", " EmptyExec"]; + assert_eq!(get_plan_string(&after_optimize), expected); + + Ok(()) +} + +#[test] +fn merges_global_limit_with_local_limit() -> datafusion_common::Result<()> { + let schema = create_schema(); + let empty_exec = empty_exec(schema); + let local_limit = local_limit_exec(empty_exec, 40); + let global_limit = global_limit_exec(local_limit, 20, Some(30)); + + let initial = 
get_plan_string(&global_limit); + let expected_initial = [ + "GlobalLimitExec: skip=20, fetch=30", + " LocalLimitExec: fetch=40", + " EmptyExec", + ]; + + assert_eq!(initial, expected_initial); + + let after_optimize = + LimitPushdown::new().optimize(global_limit, &ConfigOptions::new())?; + + let expected = ["GlobalLimitExec: skip=20, fetch=20", " EmptyExec"]; + assert_eq!(get_plan_string(&after_optimize), expected); + + Ok(()) +} + +#[test] +fn merges_local_limit_with_global_limit() -> datafusion_common::Result<()> { + let schema = create_schema(); + let empty_exec = empty_exec(schema); + let global_limit = global_limit_exec(empty_exec, 20, Some(30)); + let local_limit = local_limit_exec(global_limit, 20); + + let initial = get_plan_string(&local_limit); + let expected_initial = [ + "LocalLimitExec: fetch=20", + " GlobalLimitExec: skip=20, fetch=30", + " EmptyExec", + ]; + + assert_eq!(initial, expected_initial); + + let after_optimize = + LimitPushdown::new().optimize(local_limit, &ConfigOptions::new())?; + + let expected = ["GlobalLimitExec: skip=20, fetch=20", " EmptyExec"]; + assert_eq!(get_plan_string(&after_optimize), expected); + + Ok(()) +} + +fn create_schema() -> SchemaRef { + Arc::new(Schema::new(vec![ + Field::new("c1", DataType::Int32, true), + Field::new("c2", DataType::Int32, true), + Field::new("c3", DataType::Int32, true), + ])) +} + +fn streaming_table_exec( + schema: SchemaRef, +) -> datafusion_common::Result<Arc<dyn ExecutionPlan>> { + Ok(Arc::new(StreamingTableExec::try_new( + schema.clone(), + vec![Arc::new(DummyStreamPartition { + schema: schema.clone(), + }) as _], + None, + None, + true, + None, + )?)) +} + +fn global_limit_exec( + input: Arc<dyn ExecutionPlan>, + skip: usize, + fetch: Option<usize>, +) -> Arc<dyn ExecutionPlan> { + Arc::new(GlobalLimitExec::new(input, skip, fetch)) +} + +fn local_limit_exec( + input: Arc<dyn ExecutionPlan>, + fetch: usize, +) -> Arc<dyn ExecutionPlan> { + Arc::new(LocalLimitExec::new(input, fetch)) +} + +fn projection_exec( + schema: SchemaRef, + input: Arc<dyn ExecutionPlan>, +) -> datafusion_common::Result<Arc<dyn ExecutionPlan>> { + Ok(Arc::new(ProjectionExec::try_new( + vec![ + (col("c1", schema.as_ref()).unwrap(), "c1".to_string()), + (col("c2", schema.as_ref()).unwrap(), "c2".to_string()), + (col("c3", schema.as_ref()).unwrap(), "c3".to_string()), + ], + input, + )?)) +} + +fn filter_exec( + schema: SchemaRef, + input: Arc<dyn ExecutionPlan>, +) -> datafusion_common::Result<Arc<dyn ExecutionPlan>> { + Ok(Arc::new(FilterExec::try_new( + Arc::new(BinaryExpr::new( + col("c3", schema.as_ref()).unwrap(), + Operator::Gt, + lit(0), + )), + input, + )?)) +} + +fn coalesce_batches_exec(input: Arc<dyn ExecutionPlan>) -> Arc<dyn ExecutionPlan> { + Arc::new(CoalesceBatchesExec::new(input, 8192)) +} + +fn coalesce_partitions_exec( + local_limit: Arc<dyn ExecutionPlan>, +) -> Arc<dyn ExecutionPlan> { + Arc::new(CoalescePartitionsExec::new(local_limit)) +} + +fn repartition_exec( + streaming_table: Arc<dyn ExecutionPlan>, +) -> datafusion_common::Result<Arc<dyn ExecutionPlan>> { + Ok(Arc::new(RepartitionExec::try_new( + streaming_table, + Partitioning::RoundRobinBatch(8), + )?)) +} + +fn empty_exec(schema: SchemaRef) -> Arc<dyn ExecutionPlan> { + Arc::new(EmptyExec::new(schema)) +} diff --git a/datafusion/core/tests/physical_optimizer/mod.rs b/datafusion/core/tests/physical_optimizer/mod.rs index 0ee89a3d213c..904a8b9fbb38 100644 --- a/datafusion/core/tests/physical_optimizer/mod.rs +++ b/datafusion/core/tests/physical_optimizer/mod.rs @@ -16,3 +16,4 @@ // under the License.
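The helpers above wrap a `DummyStreamPartition` whose definition is not shown in this hunk. A minimal stand-in, assuming the `PartitionStream` trait from `datafusion_physical_plan::streaming`, could look like the sketch below; the empty stream and the exact field layout are illustrative rather than the PR's code.

```rust
use std::sync::Arc;

use arrow_schema::SchemaRef;
use datafusion_execution::{SendableRecordBatchStream, TaskContext};
use datafusion_physical_plan::stream::RecordBatchStreamAdapter;
use datafusion_physical_plan::streaming::PartitionStream;

/// Stand-in partition: it only has to report a schema and hand back a stream.
#[derive(Debug)]
struct DummyStreamPartition {
    schema: SchemaRef,
}

impl PartitionStream for DummyStreamPartition {
    fn schema(&self) -> &SchemaRef {
        &self.schema
    }

    fn execute(&self, _ctx: Arc<TaskContext>) -> SendableRecordBatchStream {
        // An empty stream is enough for plan-shape tests; only the schema matters here.
        Box::pin(RecordBatchStreamAdapter::new(
            Arc::clone(&self.schema),
            futures::stream::empty::<datafusion_common::Result<arrow_array::RecordBatch>>(),
        ))
    }
}
```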
mod aggregate_statistics; +mod limit_pushdown; diff --git a/datafusion/expr-common/src/type_coercion/binary.rs b/datafusion/expr-common/src/type_coercion/binary.rs index 05e365a0b988..251ac6cb8c0e 100644 --- a/datafusion/expr-common/src/type_coercion/binary.rs +++ b/datafusion/expr-common/src/type_coercion/binary.rs @@ -25,7 +25,7 @@ use crate::operator::Operator; use arrow::array::{new_empty_array, Array}; use arrow::compute::can_cast_types; use arrow::datatypes::{ - DataType, Field, TimeUnit, DECIMAL128_MAX_PRECISION, DECIMAL128_MAX_SCALE, + DataType, Field, FieldRef, TimeUnit, DECIMAL128_MAX_PRECISION, DECIMAL128_MAX_SCALE, DECIMAL256_MAX_PRECISION, DECIMAL256_MAX_SCALE, }; use datafusion_common::{exec_datafusion_err, plan_datafusion_err, plan_err, Result}; @@ -498,6 +498,7 @@ pub fn comparison_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> { +fn struct_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> { + use arrow::datatypes::DataType::*; + match (lhs_type, rhs_type) { + (Struct(lhs_fields), Struct(rhs_fields)) => { + if lhs_fields.len() != rhs_fields.len() { + return None; + } + + let types = std::iter::zip(lhs_fields.iter(), rhs_fields.iter()) + .map(|(lhs, rhs)| comparison_coercion(lhs.data_type(), rhs.data_type())) + .collect::<Option<Vec<_>>>()?; + + let fields = types + .into_iter() + .enumerate() + .map(|(i, datatype)| { + Arc::new(Field::new(format!("c{i}"), datatype, true)) + }) + .collect::<Vec<FieldRef>>(); + Some(Struct(fields.into())) + } + _ => None, + } +} + /// Returns the output type of applying mathematics operations such as /// `+` to arguments of `lhs_type` and `rhs_type`. fn mathematics_numerical_coercion( diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index 5030a95d3c8a..b4d489cc7c1e 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -41,7 +41,10 @@ use datafusion_common::{ internal_err, not_impl_err, plan_err, Column, DFSchema, Result, ScalarValue, TableReference, }; -use sqlparser::ast::NullTreatment; +use sqlparser::ast::{ + display_comma_separated, ExceptSelectItem, ExcludeSelectItem, IlikeSelectItem, + NullTreatment, RenameSelectItem, ReplaceSelectElement, +}; /// Represents logical expressions such as `A + 1`, or `CAST(c1 AS int)`. /// @@ -315,7 +318,10 @@ pub enum Expr { /// /// This expr has to be resolved to a list of columns before translating logical /// plan into physical plan. - Wildcard { qualifier: Option<TableReference> }, + Wildcard { + qualifier: Option<TableReference>, + options: WildcardOptions, + }, /// List of grouping set expressions. Only valid in the context of an aggregate /// GROUP BY expression list GroupingSet(GroupingSet), @@ -970,6 +976,89 @@ impl GroupingSet { } } +/// Additional options for wildcards, e.g. Snowflake `EXCLUDE`/`RENAME` and Bigquery `EXCEPT`. +#[derive(Clone, PartialEq, Eq, Hash, Debug, Default)] +pub struct WildcardOptions { + /// `[ILIKE...]`. + /// Snowflake syntax: + pub ilike: Option<IlikeSelectItem>, + /// `[EXCLUDE...]`. + /// Snowflake syntax: + pub exclude: Option<ExcludeSelectItem>, + /// `[EXCEPT...]`. + /// BigQuery syntax: + /// Clickhouse syntax: + pub except: Option<ExceptSelectItem>, + /// `[REPLACE]` + /// BigQuery syntax: + /// Clickhouse syntax: + /// Snowflake syntax: + pub replace: Option<PlannedReplaceSelectItem>, + /// `[RENAME ...]`.
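Before the rest of the `expr.rs` changes, a quick illustration of the struct coercion rule added to `binary.rs` above: structs of equal arity are coerced field by field, and the common type uses generated `c0`, `c1`, ... field names. The sketch below goes through the public `comparison_coercion` entry point and assumes the new rule is wired into its fallback chain, as the hunk indicates; the crate path is taken from the file layout above.

```rust
use arrow::datatypes::{DataType, Field, Fields};
use datafusion_expr_common::type_coercion::binary::comparison_coercion;

fn main() {
    // Same arity, different field names and integer widths.
    let lhs = DataType::Struct(Fields::from(vec![
        Field::new("a", DataType::Int32, true),
        Field::new("b", DataType::Utf8, true),
    ]));
    let rhs = DataType::Struct(Fields::from(vec![
        Field::new("x", DataType::Int64, true),
        Field::new("y", DataType::Utf8, true),
    ]));

    // Field types are coerced pairwise (Int32 vs Int64 -> Int64) and the result
    // is a struct with generated, nullable fields c0, c1, ...
    let coerced = comparison_coercion(&lhs, &rhs);
    assert_eq!(
        coerced,
        Some(DataType::Struct(Fields::from(vec![
            Field::new("c0", DataType::Int64, true),
            Field::new("c1", DataType::Utf8, true),
        ])))
    );
}
```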
+ /// Snowflake syntax: + pub rename: Option<RenameSelectItem>, +} + +impl WildcardOptions { + pub fn with_replace(self, replace: PlannedReplaceSelectItem) -> Self { + WildcardOptions { + ilike: self.ilike, + exclude: self.exclude, + except: self.except, + replace: Some(replace), + rename: self.rename, + } + } +} + +impl Display for WildcardOptions { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + if let Some(ilike) = &self.ilike { + write!(f, " {ilike}")?; + } + if let Some(exclude) = &self.exclude { + write!(f, " {exclude}")?; + } + if let Some(except) = &self.except { + write!(f, " {except}")?; + } + if let Some(replace) = &self.replace { + write!(f, " {replace}")?; + } + if let Some(rename) = &self.rename { + write!(f, " {rename}")?; + } + Ok(()) + } +} + +/// The planned expressions for `REPLACE` +#[derive(Clone, PartialEq, Eq, Hash, Debug, Default)] +pub struct PlannedReplaceSelectItem { + /// The original ast nodes + pub items: Vec<ReplaceSelectElement>, + /// The expression planned from the ast nodes. They will be used when expanding the wildcard. + pub planned_expressions: Vec<Expr>, +} + +impl Display for PlannedReplaceSelectItem { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + write!(f, "REPLACE")?; + write!(f, " ({})", display_comma_separated(&self.items))?; + Ok(()) + } +} + +impl PlannedReplaceSelectItem { + pub fn items(&self) -> &[ReplaceSelectElement] { + &self.items + } + + pub fn expressions(&self) -> &[Expr] { + &self.planned_expressions + } +} + /// Fixed seed for the hashing so that Ords are consistent across runs const SEED: ahash::RandomState = ahash::RandomState::with_seeds(0, 0, 0, 0); @@ -1720,8 +1809,9 @@ impl Expr { Expr::ScalarSubquery(subquery) => { subquery.hash(hasher); } - Expr::Wildcard { qualifier } => { + Expr::Wildcard { qualifier, options } => { qualifier.hash(hasher); + options.hash(hasher); } Expr::GroupingSet(grouping_set) => { mem::discriminant(grouping_set).hash(hasher); @@ -2242,9 +2332,9 @@ impl fmt::Display for Expr { write!(f, "{expr} IN ([{}])", expr_vec_fmt!(list)) } } - Expr::Wildcard { qualifier } => match qualifier { - Some(qualifier) => write!(f, "{qualifier}.*"), - None => write!(f, "*"), + Expr::Wildcard { qualifier, options } => match qualifier { + Some(qualifier) => write!(f, "{qualifier}.*{options}"), + None => write!(f, "*{options}"), }, Expr::GroupingSet(grouping_sets) => match grouping_sets { GroupingSet::Rollup(exprs) => { @@ -2543,9 +2633,10 @@ fn create_physical_name(e: &Expr, is_first_expr: bool) -> Result<String> { Expr::Sort { .. } => { internal_err!("Create physical name does not support sort expression") } - Expr::Wildcard { ..
} => { - internal_err!("Create physical name does not support wildcard") - } + Expr::Wildcard { qualifier, options } => match qualifier { + Some(qualifier) => Ok(format!("{}.*{}", qualifier, options)), + None => Ok(format!("*{}", options)), + }, Expr::Placeholder(_) => { internal_err!("Create physical name does not support placeholder") } @@ -2558,7 +2649,12 @@ fn create_physical_name(e: &Expr, is_first_expr: bool) -> Result { #[cfg(test)] mod test { use crate::expr_fn::col; - use crate::{case, lit, ColumnarValue, ScalarUDF, ScalarUDFImpl, Volatility}; + use crate::{ + case, lit, qualified_wildcard, wildcard, wildcard_with_options, ColumnarValue, + ScalarUDF, ScalarUDFImpl, Volatility, + }; + use sqlparser::ast; + use sqlparser::ast::{Ident, IdentWithAlias}; use std::any::Any; #[test] @@ -2859,4 +2955,109 @@ mod test { ); assert_eq!(find_df_window_func("not_exist"), None) } + + #[test] + fn test_display_wildcard() { + assert_eq!(format!("{}", wildcard()), "*"); + assert_eq!(format!("{}", qualified_wildcard("t1")), "t1.*"); + assert_eq!( + format!( + "{}", + wildcard_with_options(wildcard_options( + Some(IlikeSelectItem { + pattern: "c1".to_string() + }), + None, + None, + None, + None + )) + ), + "* ILIKE 'c1'" + ); + assert_eq!( + format!( + "{}", + wildcard_with_options(wildcard_options( + None, + Some(ExcludeSelectItem::Multiple(vec![ + Ident::from("c1"), + Ident::from("c2") + ])), + None, + None, + None + )) + ), + "* EXCLUDE (c1, c2)" + ); + assert_eq!( + format!( + "{}", + wildcard_with_options(wildcard_options( + None, + None, + Some(ExceptSelectItem { + first_element: Ident::from("c1"), + additional_elements: vec![Ident::from("c2")] + }), + None, + None + )) + ), + "* EXCEPT (c1, c2)" + ); + assert_eq!( + format!( + "{}", + wildcard_with_options(wildcard_options( + None, + None, + None, + Some(PlannedReplaceSelectItem { + items: vec![ReplaceSelectElement { + expr: ast::Expr::Identifier(Ident::from("c1")), + column_name: Ident::from("a1"), + as_keyword: false + }], + planned_expressions: vec![] + }), + None + )) + ), + "* REPLACE (c1 a1)" + ); + assert_eq!( + format!( + "{}", + wildcard_with_options(wildcard_options( + None, + None, + None, + None, + Some(RenameSelectItem::Multiple(vec![IdentWithAlias { + ident: Ident::from("c1"), + alias: Ident::from("a1") + }])) + )) + ), + "* RENAME (c1 AS a1)" + ) + } + + fn wildcard_options( + opt_ilike: Option, + opt_exclude: Option, + opt_except: Option, + opt_replace: Option, + opt_rename: Option, + ) -> WildcardOptions { + WildcardOptions { + ilike: opt_ilike, + exclude: opt_exclude, + except: opt_except, + replace: opt_replace, + rename: opt_rename, + } + } } diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs index e9c5485656c8..4e6022399653 100644 --- a/datafusion/expr/src/expr_fn.rs +++ b/datafusion/expr/src/expr_fn.rs @@ -19,7 +19,7 @@ use crate::expr::{ AggregateFunction, BinaryExpr, Cast, Exists, GroupingSet, InList, InSubquery, - Placeholder, TryCast, Unnest, WindowFunction, + Placeholder, TryCast, Unnest, WildcardOptions, WindowFunction, }; use crate::function::{ AccumulatorArgs, AccumulatorFactoryFunction, PartitionEvaluatorFactory, @@ -37,7 +37,7 @@ use arrow::compute::kernels::cast_utils::{ parse_interval_day_time, parse_interval_month_day_nano, parse_interval_year_month, }; use arrow::datatypes::{DataType, Field}; -use datafusion_common::{plan_err, Column, Result, ScalarValue}; +use datafusion_common::{plan_err, Column, Result, ScalarValue, TableReference}; use sqlparser::ast::NullTreatment; use 
std::any::Any; use std::fmt::Debug; @@ -119,7 +119,46 @@ pub fn placeholder(id: impl Into) -> Expr { /// assert_eq!(p.to_string(), "*") /// ``` pub fn wildcard() -> Expr { - Expr::Wildcard { qualifier: None } + Expr::Wildcard { + qualifier: None, + options: WildcardOptions::default(), + } +} + +/// Create an '*' [`Expr::Wildcard`] expression with the wildcard options +pub fn wildcard_with_options(options: WildcardOptions) -> Expr { + Expr::Wildcard { + qualifier: None, + options, + } +} + +/// Create an 't.*' [`Expr::Wildcard`] expression that matches all columns from a specific table +/// +/// # Example +/// +/// ```rust +/// # use datafusion_common::TableReference; +/// # use datafusion_expr::{qualified_wildcard}; +/// let p = qualified_wildcard(TableReference::bare("t")); +/// assert_eq!(p.to_string(), "t.*") +/// ``` +pub fn qualified_wildcard(qualifier: impl Into) -> Expr { + Expr::Wildcard { + qualifier: Some(qualifier.into()), + options: WildcardOptions::default(), + } +} + +/// Create an 't.*' [`Expr::Wildcard`] expression with the wildcard options +pub fn qualified_wildcard_with_options( + qualifier: impl Into, + options: WildcardOptions, +) -> Expr { + Expr::Wildcard { + qualifier: Some(qualifier.into()), + options, + } } /// Return a new expression `left right` diff --git a/datafusion/expr/src/expr_rewriter/mod.rs b/datafusion/expr/src/expr_rewriter/mod.rs index 0dc41d4a9ac1..32e621350ee2 100644 --- a/datafusion/expr/src/expr_rewriter/mod.rs +++ b/datafusion/expr/src/expr_rewriter/mod.rs @@ -248,6 +248,7 @@ fn coerce_exprs_for_schema( Expr::Alias(Alias { expr, name, .. }) => { Ok(expr.cast_to(new_type, src_schema)?.alias(name)) } + Expr::Wildcard { .. } => Ok(expr), _ => expr.cast_to(new_type, src_schema), } } else { diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs index 73123819ba99..af35b9a9910d 100644 --- a/datafusion/expr/src/expr_schema.rs +++ b/datafusion/expr/src/expr_schema.rs @@ -28,8 +28,8 @@ use crate::{utils, LogicalPlan, Projection, Subquery, WindowFunctionDefinition}; use arrow::compute::can_cast_types; use arrow::datatypes::{DataType, Field}; use datafusion_common::{ - internal_err, not_impl_err, plan_datafusion_err, plan_err, Column, ExprSchema, - Result, TableReference, + not_impl_err, plan_datafusion_err, plan_err, Column, ExprSchema, Result, + TableReference, }; use std::collections::HashMap; use std::sync::Arc; @@ -244,13 +244,7 @@ impl ExprSchemable for Expr { ) }) } - Expr::Wildcard { qualifier } => { - // Wildcard do not really have a type and do not appear in projections - match qualifier { - Some(_) => internal_err!("QualifiedWildcard expressions are not valid in a logical query plan"), - None => Ok(DataType::Null) - } - } + Expr::Wildcard { .. } => Ok(DataType::Null), Expr::GroupingSet(_) => { // grouping sets do not really have a type and do not appear in projections Ok(DataType::Null) @@ -362,12 +356,7 @@ impl ExprSchemable for Expr { | Expr::SimilarTo(Like { expr, pattern, .. }) => { Ok(expr.nullable(input_schema)? || pattern.nullable(input_schema)?) } - Expr::Wildcard { qualifier } => match qualifier { - Some(_) => internal_err!( - "QualifiedWildcard expressions are not valid in a logical query plan" - ), - None => Ok(false), - }, + Expr::Wildcard { .. 
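Taken together, the new builder helpers and `WildcardOptions` let a wildcard carry its SQL modifiers on the expression itself. A small sketch of building `t1.* EXCLUDE c1` and checking its display form; import paths are assumed from the files shown above.

```rust
use datafusion_expr::expr::WildcardOptions;
use datafusion_expr::expr_fn::qualified_wildcard_with_options;
use sqlparser::ast::{ExcludeSelectItem, Ident};

fn main() {
    // Carry an `EXCLUDE c1` clause on a qualified wildcard
    let options = WildcardOptions {
        exclude: Some(ExcludeSelectItem::Single(Ident::new("c1"))),
        ..Default::default()
    };
    let expr = qualified_wildcard_with_options("t1", options);

    // The display form mirrors the SQL the wildcard was planned from
    assert_eq!(expr.to_string(), "t1.* EXCLUDE c1");
}
```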
} => Ok(false), Expr::GroupingSet(_) => { // grouping sets do not really have the concept of nullable and do not appear // in projections @@ -548,7 +537,7 @@ mod tests { use super::*; use crate::{col, lit}; - use datafusion_common::{DFSchema, ScalarValue}; + use datafusion_common::{internal_err, DFSchema, ScalarValue}; macro_rules! test_is_expr_nullable { ($EXPR_TYPE:ident) => {{ diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index 4ef346656ff4..2e53a682854c 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -20,7 +20,6 @@ use std::any::Any; use std::cmp::Ordering; use std::collections::{HashMap, HashSet}; -use std::iter::zip; use std::sync::Arc; use crate::dml::CopyTo; @@ -36,11 +35,10 @@ use crate::logical_plan::{ Projection, Repartition, Sort, SubqueryAlias, TableScan, Union, Unnest, Values, Window, }; -use crate::type_coercion::binary::{comparison_coercion, values_coercion}; +use crate::type_coercion::binary::values_coercion; use crate::utils::{ - can_hash, columnize_expr, compare_sort_expr, expand_qualified_wildcard, - expand_wildcard, expr_to_columns, find_valid_equijoin_key_pair, - group_window_expr_by_sort_keys, + can_hash, columnize_expr, compare_sort_expr, expr_to_columns, + find_valid_equijoin_key_pair, group_window_expr_by_sort_keys, }; use crate::{ and, binary_expr, logical_plan::tree_node::unwrap_arc, DmlStatement, Expr, @@ -1316,7 +1314,7 @@ fn add_group_by_exprs_from_dependencies( Ok(group_expr) } /// Errors if one or more expressions have equal names. -pub(crate) fn validate_unique_names<'a>( +pub fn validate_unique_names<'a>( node_name: &str, expressions: impl IntoIterator, ) -> Result<()> { @@ -1339,95 +1337,14 @@ pub(crate) fn validate_unique_names<'a>( }) } -pub fn project_with_column_index( - expr: Vec, - input: Arc, - schema: DFSchemaRef, -) -> Result { - let alias_expr = expr - .into_iter() - .enumerate() - .map(|(i, e)| match e { - Expr::Alias(Alias { ref name, .. }) if name != schema.field(i).name() => { - e.unalias().alias(schema.field(i).name()) - } - Expr::Column(Column { - relation: _, - ref name, - }) if name != schema.field(i).name() => e.alias(schema.field(i).name()), - Expr::Alias { .. } | Expr::Column { .. } => e, - _ => e.alias(schema.field(i).name()), - }) - .collect::>(); - - Projection::try_new_with_schema(alias_expr, input, schema) - .map(LogicalPlan::Projection) -} - /// Union two logical plans. pub fn union(left_plan: LogicalPlan, right_plan: LogicalPlan) -> Result { - let left_col_num = left_plan.schema().fields().len(); - - // check union plan length same. 
- let right_col_num = right_plan.schema().fields().len(); - if right_col_num != left_col_num { - return plan_err!( - "Union queries must have the same number of columns, (left is {left_col_num}, right is {right_col_num})"); - } - - // create union schema - let union_qualified_fields = - zip(left_plan.schema().iter(), right_plan.schema().iter()) - .map( - |((left_qualifier, left_field), (_right_qualifier, right_field))| { - let nullable = left_field.is_nullable() || right_field.is_nullable(); - let data_type = comparison_coercion( - left_field.data_type(), - right_field.data_type(), - ) - .ok_or_else(|| { - plan_datafusion_err!( - "UNION Column {} (type: {}) is not compatible with column {} (type: {})", - right_field.name(), - right_field.data_type(), - left_field.name(), - left_field.data_type() - ) - })?; - Ok(( - left_qualifier.cloned(), - Arc::new(Field::new(left_field.name(), data_type, nullable)), - )) - }, - ) - .collect::>>()?; - let union_schema = - DFSchema::new_with_metadata(union_qualified_fields, HashMap::new())?; - - let inputs = vec![left_plan, right_plan] - .into_iter() - .map(|p| { - let plan = coerce_plan_expr_for_schema(&p, &union_schema)?; - match plan { - LogicalPlan::Projection(Projection { expr, input, .. }) => { - Ok(Arc::new(project_with_column_index( - expr, - input, - Arc::new(union_schema.clone()), - )?)) - } - other_plan => Ok(Arc::new(other_plan)), - } - }) - .collect::>>()?; - - if inputs.is_empty() { - return plan_err!("Empty UNION"); - } - + // Temporarily use the schema from the left input and later rely on the analyzer to + // coerce the two schemas into a common one. + let schema = Arc::clone(left_plan.schema()); Ok(LogicalPlan::Union(Union { - inputs, - schema: Arc::new(union_schema), + inputs: vec![Arc::new(left_plan), Arc::new(right_plan)], + schema, })) } @@ -1440,22 +1357,11 @@ pub fn project( plan: LogicalPlan, expr: impl IntoIterator>, ) -> Result { - // TODO: move it into analyzer - let input_schema = plan.schema(); let mut projected_expr = vec![]; for e in expr { let e = e.into(); match e { - Expr::Wildcard { qualifier: None } => { - projected_expr.extend(expand_wildcard(input_schema, &plan, None)?) - } - Expr::Wildcard { - qualifier: Some(qualifier), - } => projected_expr.extend(expand_qualified_wildcard( - &qualifier, - input_schema, - None, - )?), + Expr::Wildcard { .. } => projected_expr.push(e), _ => projected_expr.push(columnize_expr(normalize_col(e, &plan)?, &plan)?), } } @@ -1807,26 +1713,6 @@ mod tests { Ok(()) } - #[test] - fn plan_using_join_wildcard_projection() -> Result<()> { - let t2 = table_scan(Some("t2"), &employee_schema(), None)?.build()?; - - let plan = table_scan(Some("t1"), &employee_schema(), None)? - .join_using(t2, JoinType::Inner, vec!["id"])? - .project(vec![Expr::Wildcard { qualifier: None }])? 
- .build()?; - - // id column should only show up once in projection - let expected = "Projection: t1.id, t1.first_name, t1.last_name, t1.state, t1.salary, t2.first_name, t2.last_name, t2.state, t2.salary\ - \n Inner Join: Using t1.id = t2.id\ - \n TableScan: t1\ - \n TableScan: t2"; - - assert_eq!(expected, format!("{plan}")); - - Ok(()) - } - #[test] fn plan_builder_union() -> Result<()> { let plan = @@ -1881,23 +1767,6 @@ mod tests { Ok(()) } - #[test] - fn plan_builder_union_different_num_columns_error() -> Result<()> { - let plan1 = - table_scan(TableReference::none(), &employee_schema(), Some(vec![3]))?; - let plan2 = - table_scan(TableReference::none(), &employee_schema(), Some(vec![3, 4]))?; - - let expected = "Error during planning: Union queries must have the same number of columns, (left is 1, right is 2)"; - let err_msg1 = plan1.clone().union(plan2.clone().build()?).unwrap_err(); - let err_msg2 = plan1.union_distinct(plan2.build()?).unwrap_err(); - - assert_eq!(err_msg1.strip_backtrace(), expected); - assert_eq!(err_msg2.strip_backtrace(), expected); - - Ok(()) - } - #[test] fn plan_builder_simple_distinct() -> Result<()> { let plan = diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index c5538d8880a7..2bab6d516a73 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -31,8 +31,9 @@ use crate::logical_plan::display::{GraphvizVisitor, IndentVisitor}; use crate::logical_plan::extension::UserDefinedLogicalNode; use crate::logical_plan::{DmlStatement, Statement}; use crate::utils::{ - enumerate_grouping_sets, exprlist_to_fields, find_out_reference_exprs, - grouping_set_expr_count, grouping_set_to_exprlist, split_conjunction, + enumerate_grouping_sets, exprlist_len, exprlist_to_fields, find_base_plan, + find_out_reference_exprs, grouping_set_expr_count, grouping_set_to_exprlist, + split_conjunction, }; use crate::{ build_join_schema, expr_vec_fmt, BinaryExpr, BuiltInWindowFunction, @@ -1977,7 +1978,9 @@ impl Projection { input: Arc, schema: DFSchemaRef, ) -> Result { - if expr.len() != schema.fields().len() { + if !expr.iter().any(|e| matches!(e, Expr::Wildcard { .. })) + && expr.len() != schema.fields().len() + { return plan_err!("Projection has mismatch between number of expressions ({}) and number of fields in schema ({})", expr.len(), schema.fields().len()); } Ok(Self { @@ -2763,20 +2766,48 @@ fn calc_func_dependencies_for_project( // Calculate expression indices (if present) in the input schema. 
let proj_indices = exprs .iter() - .filter_map(|expr| { - let expr_name = match expr { - Expr::Alias(alias) => { - format!("{}", alias.expr) - } - _ => format!("{}", expr), - }; - input_fields.iter().position(|item| *item == expr_name) + .map(|expr| match expr { + Expr::Wildcard { qualifier, options } => { + let wildcard_fields = exprlist_to_fields( + vec![&Expr::Wildcard { + qualifier: qualifier.clone(), + options: options.clone(), + }], + input, + )?; + Ok::<_, DataFusionError>( + wildcard_fields + .into_iter() + .filter_map(|(qualifier, f)| { + let flat_name = qualifier + .map(|t| format!("{}.{}", t, f.name())) + .unwrap_or(f.name().clone()); + input_fields.iter().position(|item| *item == flat_name) + }) + .collect::>(), + ) + } + Expr::Alias(alias) => Ok(input_fields + .iter() + .position(|item| *item == format!("{}", alias.expr)) + .map(|i| vec![i]) + .unwrap_or(vec![])), + _ => Ok(input_fields + .iter() + .position(|item| *item == format!("{}", expr)) + .map(|i| vec![i]) + .unwrap_or(vec![])), }) + .collect::>>()? + .into_iter() + .flatten() .collect::>(); + + let len = exprlist_len(exprs, input.schema(), Some(find_base_plan(input).schema()))?; Ok(input .schema() .functional_dependencies() - .project_functional_dependencies(&proj_indices, exprs.len())) + .project_functional_dependencies(&proj_indices, len)) } /// Sorts its input according to a list of sort expressions. diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs index 7b650d1ab448..4db5061e8fe7 100644 --- a/datafusion/expr/src/utils.rs +++ b/datafusion/expr/src/utils.rs @@ -21,7 +21,7 @@ use std::cmp::Ordering; use std::collections::{HashMap, HashSet}; use std::sync::Arc; -use crate::expr::{Alias, Sort, WindowFunction}; +use crate::expr::{Alias, Sort, WildcardOptions, WindowFunction}; use crate::expr_rewriter::strip_outer_reference; use crate::{ and, BinaryExpr, Expr, ExprSchemable, Filter, GroupingSet, LogicalPlan, Operator, @@ -34,11 +34,11 @@ use datafusion_common::tree_node::{ }; use datafusion_common::utils::get_at_indices; use datafusion_common::{ - internal_err, plan_datafusion_err, plan_err, Column, DFSchema, DFSchemaRef, Result, - TableReference, + internal_err, plan_datafusion_err, plan_err, Column, DFSchema, DFSchemaRef, + DataFusionError, Result, TableReference, }; -use sqlparser::ast::{ExceptSelectItem, ExcludeSelectItem, WildcardAdditionalOptions}; +use sqlparser::ast::{ExceptSelectItem, ExcludeSelectItem}; pub use datafusion_functions_aggregate_common::order::AggregateOrderSensitivity; @@ -377,7 +377,7 @@ fn get_exprs_except_skipped( pub fn expand_wildcard( schema: &DFSchema, plan: &LogicalPlan, - wildcard_options: Option<&WildcardAdditionalOptions>, + wildcard_options: Option<&WildcardOptions>, ) -> Result> { let using_columns = plan.using_columns()?; let mut columns_to_skip = using_columns @@ -401,9 +401,9 @@ pub fn expand_wildcard( .collect::>() }) .collect::>(); - let excluded_columns = if let Some(WildcardAdditionalOptions { - opt_exclude, - opt_except, + let excluded_columns = if let Some(WildcardOptions { + exclude: opt_exclude, + except: opt_except, .. 
}) = wildcard_options { @@ -420,7 +420,7 @@ pub fn expand_wildcard( pub fn expand_qualified_wildcard( qualifier: &TableReference, schema: &DFSchema, - wildcard_options: Option<&WildcardAdditionalOptions>, + wildcard_options: Option<&WildcardOptions>, ) -> Result> { let qualified_indices = schema.fields_indices_with_qualified(qualifier); let projected_func_dependencies = schema @@ -435,9 +435,9 @@ pub fn expand_qualified_wildcard( let qualified_dfschema = DFSchema::try_from_qualified_schema(qualifier.clone(), &qualified_schema)? .with_functional_dependencies(projected_func_dependencies)?; - let excluded_columns = if let Some(WildcardAdditionalOptions { - opt_exclude, - opt_except, + let excluded_columns = if let Some(WildcardOptions { + exclude: opt_exclude, + except: opt_except, .. }) = wildcard_options { @@ -731,11 +731,129 @@ pub fn exprlist_to_fields<'a>( plan: &LogicalPlan, ) -> Result, Arc)>> { // look for exact match in plan's output schema - let input_schema = &plan.schema(); - exprs + let wildcard_schema = find_base_plan(plan).schema(); + let input_schema = plan.schema(); + let result = exprs .into_iter() - .map(|e| e.to_field(input_schema)) - .collect() + .map(|e| match e { + Expr::Wildcard { qualifier, options } => match qualifier { + None => { + let excluded: Vec = get_excluded_columns( + options.exclude.as_ref(), + options.except.as_ref(), + wildcard_schema, + None, + )? + .into_iter() + .map(|c| c.flat_name()) + .collect(); + Ok::<_, DataFusionError>( + wildcard_schema + .field_names() + .iter() + .enumerate() + .filter(|(_, s)| !excluded.contains(s)) + .map(|(i, _)| wildcard_schema.qualified_field(i)) + .map(|(qualifier, f)| { + (qualifier.cloned(), Arc::new(f.to_owned())) + }) + .collect::>(), + ) + } + Some(qualifier) => { + let excluded: Vec = get_excluded_columns( + options.exclude.as_ref(), + options.except.as_ref(), + wildcard_schema, + Some(qualifier), + )? + .into_iter() + .map(|c| c.flat_name()) + .collect(); + Ok(wildcard_schema + .fields_with_qualified(qualifier) + .into_iter() + .filter_map(|field| { + let flat_name = format!("{}.{}", qualifier, field.name()); + if excluded.contains(&flat_name) { + None + } else { + Some(( + Some(qualifier.clone()), + Arc::new(field.to_owned()), + )) + } + }) + .collect::>()) + } + }, + _ => Ok(vec![e.to_field(input_schema)?]), + }) + .collect::>>()? + .into_iter() + .flatten() + .collect(); + Ok(result) +} + +/// Find the suitable base plan to expand the wildcard expression recursively. +/// When planning [LogicalPlan::Window] and [LogicalPlan::Aggregate], we will generate +/// an intermediate plan based on the relation plan (e.g. [LogicalPlan::TableScan], [LogicalPlan::Subquery], ...). +/// If we expand a wildcard expression basing the intermediate plan, we could get some duplicate fields. +pub fn find_base_plan(input: &LogicalPlan) -> &LogicalPlan { + match input { + LogicalPlan::Window(window) => find_base_plan(&window.input), + LogicalPlan::Aggregate(agg) => find_base_plan(&agg.input), + _ => input, + } +} + +/// Count the number of real fields. We should expand the wildcard expression to get the actual number. +pub fn exprlist_len( + exprs: &[Expr], + schema: &DFSchemaRef, + wildcard_schema: Option<&DFSchemaRef>, +) -> Result { + exprs + .iter() + .map(|e| match e { + Expr::Wildcard { + qualifier: None, + options, + } => { + let excluded = get_excluded_columns( + options.exclude.as_ref(), + options.except.as_ref(), + wildcard_schema.unwrap_or(schema), + None, + )? 
+ .into_iter() + .collect::>(); + Ok( + get_exprs_except_skipped(wildcard_schema.unwrap_or(schema), excluded) + .len(), + ) + } + Expr::Wildcard { + qualifier: Some(qualifier), + options, + } => { + let excluded = get_excluded_columns( + options.exclude.as_ref(), + options.except.as_ref(), + wildcard_schema.unwrap_or(schema), + Some(qualifier), + )? + .into_iter() + .collect::>(); + Ok( + get_exprs_except_skipped(wildcard_schema.unwrap_or(schema), excluded) + .len(), + ) + } + _ => Ok(1), + }) + .sum() } /// Convert an expression into Column expression if it's already provided as input plan. diff --git a/datafusion/functions-aggregate-common/src/aggregate/groups_accumulator.rs b/datafusion/functions-aggregate-common/src/aggregate/groups_accumulator.rs index 644221edd04d..3984b02c5fbb 100644 --- a/datafusion/functions-aggregate-common/src/aggregate/groups_accumulator.rs +++ b/datafusion/functions-aggregate-common/src/aggregate/groups_accumulator.rs @@ -20,6 +20,7 @@ pub mod accumulate; pub mod bool_op; +pub mod nulls; pub mod prim_op; use arrow::{ diff --git a/datafusion/functions-aggregate-common/src/aggregate/groups_accumulator/nulls.rs b/datafusion/functions-aggregate-common/src/aggregate/groups_accumulator/nulls.rs new file mode 100644 index 000000000000..25212f7f0f5f --- /dev/null +++ b/datafusion/functions-aggregate-common/src/aggregate/groups_accumulator/nulls.rs @@ -0,0 +1,93 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! [`set_nulls`], and [`filtered_null_mask`], utilities for working with nulls + +use arrow::array::{Array, ArrowNumericType, BooleanArray, PrimitiveArray}; +use arrow::buffer::NullBuffer; + +/// Sets the validity mask for a `PrimitiveArray` to `nulls` +/// replacing any existing null mask +pub fn set_nulls( + array: PrimitiveArray, + nulls: Option, +) -> PrimitiveArray { + let (dt, values, _old_nulls) = array.into_parts(); + PrimitiveArray::::new(values, nulls).with_data_type(dt) +} + +/// Converts a `BooleanBuffer` representing a filter to a `NullBuffer. +/// +/// The `NullBuffer` is +/// * `true` (representing valid) for values that were `true` in filter +/// * `false` (representing null) for values that were `false` or `null` in filter +fn filter_to_nulls(filter: &BooleanArray) -> Option { + let (filter_bools, filter_nulls) = filter.clone().into_parts(); + let filter_bools = NullBuffer::from(filter_bools); + NullBuffer::union(Some(&filter_bools), filter_nulls.as_ref()) +} + +/// Compute an output validity mask for an array that has been filtered +/// +/// This can be used to compute nulls for the output of +/// [`GroupsAccumulator::convert_to_state`], which quickly applies an optional +/// filter to the input rows by setting any filtered rows to NULL in the output. 
+/// Subsequent applications of aggregate functions that ignore NULLs (most of +/// them) will thus ignore the filtered rows as well. +/// +/// # Output element is `true` (and thus output is non-null) +/// +/// A `true` in the output represents non null output for all values that were *both*: +/// +/// * `true` in any `opt_filter` (aka values that passed the filter) +/// +/// * `non null` in `input` +/// +/// # Output element is `false` (and thus output is null) +/// +/// A `false` in the output represents an input that was *either*: +/// +/// * `null` +/// +/// * filtered (aka the value was `false` or `null` in the filter) +/// +/// # Example +/// +/// ```text +/// ┌─────┐ ┌─────┐ ┌─────┐ +/// │true │ │NULL │ │false│ +/// │true │ │ │true │ │true │ +/// │true │ ───┼─── │false│ ────────▶ │false│ filtered_nulls +/// │false│ │ │NULL │ │false│ +/// │false│ │true │ │false│ +/// └─────┘ └─────┘ └─────┘ +/// array opt_filter output +/// .nulls() +/// +/// false = NULL true = pass false = NULL Meanings +/// true = valid false = filter true = valid +/// NULL = filter +/// ``` +/// +/// [`GroupsAccumulator::convert_to_state`]: datafusion_expr_common::groups_accumulator::GroupsAccumulator +pub fn filtered_null_mask( + opt_filter: Option<&BooleanArray>, + input: &dyn Array, +) -> Option { + let opt_filter = opt_filter.and_then(filter_to_nulls); + NullBuffer::union(opt_filter.as_ref(), input.nulls()) +} diff --git a/datafusion/functions-aggregate/src/average.rs b/datafusion/functions-aggregate/src/average.rs index 1be3cd6b0714..ddad76a8734b 100644 --- a/datafusion/functions-aggregate/src/average.rs +++ b/datafusion/functions-aggregate/src/average.rs @@ -19,8 +19,9 @@ use arrow::array::{ self, Array, ArrayRef, ArrowNativeTypeOp, ArrowNumericType, ArrowPrimitiveType, - AsArray, PrimitiveArray, PrimitiveBuilder, UInt64Array, + AsArray, BooleanArray, PrimitiveArray, PrimitiveBuilder, UInt64Array, }; + use arrow::compute::sum; use arrow::datatypes::{ i256, ArrowNativeType, DataType, Decimal128Type, Decimal256Type, DecimalType, Field, @@ -34,7 +35,12 @@ use datafusion_expr::Volatility::Immutable; use datafusion_expr::{ Accumulator, AggregateUDFImpl, EmitTo, GroupsAccumulator, ReversedUDAF, Signature, }; + use datafusion_functions_aggregate_common::aggregate::groups_accumulator::accumulate::NullState; +use datafusion_functions_aggregate_common::aggregate::groups_accumulator::nulls::{ + filtered_null_mask, set_nulls, +}; + use datafusion_functions_aggregate_common::utils::DecimalAverager; use log::debug; use std::any::Any; @@ -551,6 +557,30 @@ where Ok(()) } + fn convert_to_state( + &self, + values: &[ArrayRef], + opt_filter: Option<&BooleanArray>, + ) -> Result> { + let sums = values[0] + .as_primitive::() + .clone() + .with_data_type(self.sum_data_type.clone()); + let counts = UInt64Array::from_value(1, sums.len()); + + let nulls = filtered_null_mask(opt_filter, &sums); + + // set nulls on the arrays + let counts = set_nulls(counts, nulls.clone()); + let sums = set_nulls(sums, nulls); + + Ok(vec![Arc::new(counts) as ArrayRef, Arc::new(sums)]) + } + + fn supports_convert_to_state(&self) -> bool { + true + } + fn size(&self) -> usize { self.counts.capacity() * std::mem::size_of::() + self.sums.capacity() * std::mem::size_of::() diff --git a/datafusion/functions-nested/src/map.rs b/datafusion/functions-nested/src/map.rs index e218b501dcf1..b6068fdff0d5 100644 --- a/datafusion/functions-nested/src/map.rs +++ b/datafusion/functions-nested/src/map.rs @@ -15,17 +15,20 @@ // specific language governing permissions 
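As a usage sketch for the two helpers in the new `nulls.rs` module above (module path taken from the new file; the re-exports are assumed): the optional filter and the input's own validity are folded into a single mask, which is then stamped onto freshly built state columns, the same pattern the `avg` `convert_to_state` above follows.

```rust
use arrow::array::{Array, BooleanArray, Int64Array, UInt64Array};
use datafusion_functions_aggregate_common::aggregate::groups_accumulator::nulls::{
    filtered_null_mask, set_nulls,
};

fn main() {
    // Input values: [1, NULL, 3]
    let input = Int64Array::from(vec![Some(1), None, Some(3)]);
    // Filter: rows 0 and 1 pass, row 2 is filtered out
    let filter = BooleanArray::from(vec![Some(true), Some(true), Some(false)]);

    // Rows that are filtered out, or already NULL in the input, become NULL in the mask
    let nulls = filtered_null_mask(Some(&filter), &input);

    // Stamp the same mask onto a per-row state column, e.g. counts of 1
    let counts = set_nulls(UInt64Array::from_value(1, input.len()), nulls);
    assert_eq!(counts.null_count(), 2); // row 1 (NULL input) and row 2 (filtered)
}
```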
and limitations // under the License. -use crate::make_array::make_array; +use std::any::Any; +use std::collections::VecDeque; +use std::sync::Arc; + use arrow::array::ArrayData; -use arrow_array::{Array, ArrayRef, MapArray, StructArray}; +use arrow_array::{Array, ArrayRef, MapArray, OffsetSizeTrait, StructArray}; use arrow_buffer::{Buffer, ToByteSlice}; use arrow_schema::{DataType, Field, SchemaBuilder}; + use datafusion_common::{exec_err, ScalarValue}; use datafusion_expr::expr::ScalarFunction; use datafusion_expr::{ColumnarValue, Expr, ScalarUDFImpl, Signature, Volatility}; -use std::any::Any; -use std::collections::VecDeque; -use std::sync::Arc; + +use crate::make_array::make_array; /// Returns a map created from a key list and a value list pub fn map(keys: Vec, values: Vec) -> Expr { @@ -56,11 +59,11 @@ fn make_map_batch(args: &[ColumnarValue]) -> datafusion_common::Result Ok(array.value(0)), _ => exec_err!("Expected array, got {:?}", value), }, - ColumnarValue::Array(array) => exec_err!("Expected scalar, got {:?}", array), + ColumnarValue::Array(array) => Ok(array.to_owned()), } } @@ -81,6 +84,7 @@ fn make_map_batch_internal( keys: ArrayRef, values: ArrayRef, can_evaluate_to_const: bool, + data_type: DataType, ) -> datafusion_common::Result { if keys.null_count() > 0 { return exec_err!("map key cannot be null"); @@ -90,6 +94,14 @@ fn make_map_batch_internal( return exec_err!("map requires key and value lists to have the same length"); } + if !can_evaluate_to_const { + return if let DataType::LargeList(..) = data_type { + make_map_array_internal::(keys, values) + } else { + make_map_array_internal::(keys, values) + }; + } + let key_field = Arc::new(Field::new("key", keys.data_type().clone(), false)); let value_field = Arc::new(Field::new("value", values.data_type().clone(), true)); let mut entry_struct_buffer: VecDeque<(Arc, ArrayRef)> = VecDeque::new(); @@ -190,7 +202,6 @@ impl ScalarUDFImpl for MapFunc { make_map_batch(args) } } - fn get_element_type(data_type: &DataType) -> datafusion_common::Result<&DataType> { match data_type { DataType::List(element) => Ok(element.data_type()), @@ -202,3 +213,115 @@ fn get_element_type(data_type: &DataType) -> datafusion_common::Result<&DataType ), } } + +/// Helper function to create MapArray from array of values to support arrays for Map scalar function +/// +/// ``` text +/// Format of input KEYS and VALUES column +/// keys values +/// +---------------------+ +---------------------+ +/// | +-----------------+ | | +-----------------+ | +/// | | [k11, k12, k13] | | | | [v11, v12, v13] | | +/// | +-----------------+ | | +-----------------+ | +/// | | | | +/// | +-----------------+ | | +-----------------+ | +/// | | [k21, k22, k23] | | | | [v21, v22, v23] | | +/// | +-----------------+ | | +-----------------+ | +/// | | | | +/// | +-----------------+ | | +-----------------+ | +/// | |[k31, k32, k33] | | | |[v31, v32, v33] | | +/// | +-----------------+ | | +-----------------+ | +/// +---------------------+ +---------------------+ +/// ``` +/// Flattened keys and values array to user create `StructArray`, +/// which serves as inner child for `MapArray` +/// +/// ``` text +/// Flattened Flattened +/// Keys Values +/// +-----------+ +-----------+ +/// | +-------+ | | +-------+ | +/// | | k11 | | | | v11 | | +/// | +-------+ | | +-------+ | +/// | +-------+ | | +-------+ | +/// | | k12 | | | | v12 | | +/// | +-------+ | | +-------+ | +/// | +-------+ | | +-------+ | +/// | | k13 | | | | v13 | | +/// | +-------+ | | +-------+ | +/// | +-------+ | | 
+-------+ | +/// | | k21 | | | | v21 | | +/// | +-------+ | | +-------+ | +/// | +-------+ | | +-------+ | +/// | | k22 | | | | v22 | | +/// | +-------+ | | +-------+ | +/// | +-------+ | | +-------+ | +/// | | k23 | | | | v23 | | +/// | +-------+ | | +-------+ | +/// | +-------+ | | +-------+ | +/// | | k31 | | | | v31 | | +/// | +-------+ | | +-------+ | +/// | +-------+ | | +-------+ | +/// | | k32 | | | | v32 | | +/// | +-------+ | | +-------+ | +/// | +-------+ | | +-------+ | +/// | | k33 | | | | v33 | | +/// | +-------+ | | +-------+ | +/// +-----------+ +-----------+ +/// ```text + +fn make_map_array_internal( + keys: ArrayRef, + values: ArrayRef, +) -> datafusion_common::Result { + let mut offset_buffer = vec![O::zero()]; + let mut running_offset = O::zero(); + + let keys = datafusion_common::utils::list_to_arrays::(keys); + let values = datafusion_common::utils::list_to_arrays::(values); + + let mut key_array_vec = vec![]; + let mut value_array_vec = vec![]; + for (k, v) in keys.iter().zip(values.iter()) { + running_offset = running_offset.add(O::usize_as(k.len())); + offset_buffer.push(running_offset); + key_array_vec.push(k.as_ref()); + value_array_vec.push(v.as_ref()); + } + + // concatenate all the arrays + let flattened_keys = arrow::compute::concat(key_array_vec.as_ref())?; + if flattened_keys.null_count() > 0 { + return exec_err!("keys cannot be null"); + } + let flattened_values = arrow::compute::concat(value_array_vec.as_ref())?; + + let fields = vec![ + Arc::new(Field::new("key", flattened_keys.data_type().clone(), false)), + Arc::new(Field::new( + "value", + flattened_values.data_type().clone(), + true, + )), + ]; + + let struct_data = ArrayData::builder(DataType::Struct(fields.into())) + .len(flattened_keys.len()) + .add_child_data(flattened_keys.to_data()) + .add_child_data(flattened_values.to_data()) + .build()?; + + let map_data = ArrayData::builder(DataType::Map( + Arc::new(Field::new( + "entries", + struct_data.data_type().clone(), + false, + )), + false, + )) + .len(keys.len()) + .add_child_data(struct_data) + .add_buffer(Buffer::from_slice_ref(offset_buffer.as_slice())) + .build()?; + Ok(ColumnarValue::Array(Arc::new(MapArray::from(map_data)))) +} diff --git a/datafusion/functions/src/core/mod.rs b/datafusion/functions/src/core/mod.rs index 8c5121397284..062a4a104d54 100644 --- a/datafusion/functions/src/core/mod.rs +++ b/datafusion/functions/src/core/mod.rs @@ -94,6 +94,7 @@ pub fn functions() -> Vec> { nvl2(), arrow_typeof(), named_struct(), + get_field(), coalesce(), ] } diff --git a/datafusion/functions/src/datetime/common.rs b/datafusion/functions/src/datetime/common.rs index 4f48ab188403..6048eeeaa554 100644 --- a/datafusion/functions/src/datetime/common.rs +++ b/datafusion/functions/src/datetime/common.rs @@ -28,7 +28,9 @@ use chrono::{DateTime, TimeZone, Utc}; use itertools::Either; use datafusion_common::cast::as_generic_string_array; -use datafusion_common::{exec_err, DataFusionError, Result, ScalarType, ScalarValue}; +use datafusion_common::{ + exec_err, unwrap_or_internal_err, DataFusionError, Result, ScalarType, ScalarValue, +}; use datafusion_expr::ColumnarValue; /// Error message if nanosecond conversion request beyond supported interval @@ -227,46 +229,34 @@ where // if the first argument is a scalar utf8 all arguments are expected to be scalar utf8 ColumnarValue::Scalar(scalar) => match scalar { ScalarValue::Utf8(a) | ScalarValue::LargeUtf8(a) => { - let mut val: Option> = None; - let mut err: Option = None; + let a = a.as_ref(); + // ASK: Why 
do we trust `a` to be non-null at this point? + let a = unwrap_or_internal_err!(a); - match a { - Some(a) => { - // enumerate all the values finding the first one that returns an Ok result - for (pos, v) in args.iter().enumerate().skip(1) { - if let ColumnarValue::Scalar(s) = v { - if let ScalarValue::Utf8(x) | ScalarValue::LargeUtf8(x) = - s - { - if let Some(s) = x { - match op(a.as_str(), s.as_str()) { - Ok(r) => { - val = Some(Ok(ColumnarValue::Scalar( - S::scalar(Some(op2(r))), - ))); - break; - } - Err(e) => { - err = Some(e); - } - } - } - } else { - return exec_err!("Unsupported data type {s:?} for function {name}, arg # {pos}"); - } - } else { - return exec_err!("Unsupported data type {v:?} for function {name}, arg # {pos}"); + let mut ret = None; + + for (pos, v) in args.iter().enumerate().skip(1) { + let ColumnarValue::Scalar( + ScalarValue::Utf8(x) | ScalarValue::LargeUtf8(x), + ) = v + else { + return exec_err!("Unsupported data type {v:?} for function {name}, arg # {pos}"); + }; + + if let Some(s) = x { + match op(a.as_str(), s.as_str()) { + Ok(r) => { + ret = Some(Ok(ColumnarValue::Scalar(S::scalar(Some( + op2(r), + ))))); + break; } + Err(e) => ret = Some(Err(e)), } } - None => (), } - if let Some(v) = val { - v - } else { - Err(err.unwrap()) - } + unwrap_or_internal_err!(ret) } other => { exec_err!("Unsupported data type {other:?} for function {name}") diff --git a/datafusion/functions/src/datetime/to_date.rs b/datafusion/functions/src/datetime/to_date.rs index e491c0b55508..cc5ffa73c8f1 100644 --- a/datafusion/functions/src/datetime/to_date.rs +++ b/datafusion/functions/src/datetime/to_date.rs @@ -58,7 +58,7 @@ impl ToDateFunc { }, "to_date", ), - n if n >= 2 => handle_multiple::( + 2.. => handle_multiple::( args, |s, format| { string_to_timestamp_nanos_formatted(s, format) @@ -72,7 +72,7 @@ impl ToDateFunc { |n| n, "to_date", ), - _ => exec_err!("Unsupported 0 argument count for function to_date"), + 0 => exec_err!("Unsupported 0 argument count for function to_date"), } } } diff --git a/datafusion/functions/src/string/btrim.rs b/datafusion/functions/src/string/btrim.rs index 349928d09664..371a11c82c54 100644 --- a/datafusion/functions/src/string/btrim.rs +++ b/datafusion/functions/src/string/btrim.rs @@ -16,9 +16,8 @@ // under the License. use arrow::array::{ArrayRef, OffsetSizeTrait}; -use std::any::Any; - use arrow::datatypes::DataType; +use std::any::Any; use datafusion_common::{exec_err, Result}; use datafusion_expr::function::Hint; @@ -32,7 +31,8 @@ use crate::utils::{make_scalar_function, utf8_to_str_type}; /// Returns the longest string with leading and trailing characters removed. If the characters are not specified, whitespace is removed. /// btrim('xyxtrimyyx', 'xyz') = 'trim' fn btrim(args: &[ArrayRef]) -> Result { - general_trim::(args, TrimType::Both) + let use_string_view = args[0].data_type() == &DataType::Utf8View; + general_trim::(args, TrimType::Both, use_string_view) } #[derive(Debug)] @@ -52,7 +52,15 @@ impl BTrimFunc { use DataType::*; Self { signature: Signature::one_of( - vec![Exact(vec![Utf8]), Exact(vec![Utf8, Utf8])], + vec![ + // Planner attempts coercion to the target type starting with the most preferred candidate. + // For example, given input `(Utf8View, Utf8)`, it first tries coercing to `(Utf8View, Utf8View)`. + // If that fails, it proceeds to `(Utf8, Utf8)`. 
+ Exact(vec![Utf8View, Utf8View]), + Exact(vec![Utf8, Utf8]), + Exact(vec![Utf8View]), + Exact(vec![Utf8]), + ], Volatility::Immutable, ), aliases: vec![String::from("trim")], @@ -79,7 +87,7 @@ impl ScalarUDFImpl for BTrimFunc { fn invoke(&self, args: &[ColumnarValue]) -> Result { match args[0].data_type() { - DataType::Utf8 => make_scalar_function( + DataType::Utf8 | DataType::Utf8View => make_scalar_function( btrim::, vec![Hint::Pad, Hint::AcceptsSingular], )(args), @@ -87,7 +95,10 @@ impl ScalarUDFImpl for BTrimFunc { btrim::, vec![Hint::Pad, Hint::AcceptsSingular], )(args), - other => exec_err!("Unsupported data type {other:?} for function btrim"), + other => exec_err!( + "Unsupported data type {other:?} for function btrim,\ + expected Utf8, LargeUtf8 or Utf8View." + ), } } diff --git a/datafusion/functions/src/string/common.rs b/datafusion/functions/src/string/common.rs index d36bd5cecc47..7037c1d1c3c3 100644 --- a/datafusion/functions/src/string/common.rs +++ b/datafusion/functions/src/string/common.rs @@ -25,7 +25,7 @@ use arrow::array::{ use arrow::buffer::{Buffer, MutableBuffer, NullBuffer}; use arrow::datatypes::DataType; -use datafusion_common::cast::as_generic_string_array; +use datafusion_common::cast::{as_generic_string_array, as_string_view_array}; use datafusion_common::Result; use datafusion_common::{exec_err, ScalarValue}; use datafusion_expr::ColumnarValue; @@ -49,6 +49,7 @@ impl Display for TrimType { pub(crate) fn general_trim( args: &[ArrayRef], trim_type: TrimType, + use_string_view: bool, ) -> Result { let func = match trim_type { TrimType::Left => |input, pattern: &str| { @@ -68,6 +69,74 @@ pub(crate) fn general_trim( }, }; + if use_string_view { + string_view_trim::(trim_type, func, args) + } else { + string_trim::(trim_type, func, args) + } +} + +// removing 'a will cause compiler complaining lifetime of `func` +fn string_view_trim<'a, T: OffsetSizeTrait>( + trim_type: TrimType, + func: fn(&'a str, &'a str) -> &'a str, + args: &'a [ArrayRef], +) -> Result { + let string_array = as_string_view_array(&args[0])?; + + match args.len() { + 1 => { + let result = string_array + .iter() + .map(|string| string.map(|string: &str| func(string, " "))) + .collect::>(); + + Ok(Arc::new(result) as ArrayRef) + } + 2 => { + let characters_array = as_string_view_array(&args[1])?; + + if characters_array.len() == 1 { + if characters_array.is_null(0) { + return Ok(new_null_array( + // The schema is expecting utf8 as null + &DataType::Utf8, + string_array.len(), + )); + } + + let characters = characters_array.value(0); + let result = string_array + .iter() + .map(|item| item.map(|string| func(string, characters))) + .collect::>(); + return Ok(Arc::new(result) as ArrayRef); + } + + let result = string_array + .iter() + .zip(characters_array.iter()) + .map(|(string, characters)| match (string, characters) { + (Some(string), Some(characters)) => Some(func(string, characters)), + _ => None, + }) + .collect::>(); + + Ok(Arc::new(result) as ArrayRef) + } + other => { + exec_err!( + "{trim_type} was called with {other} arguments. It requires at least 1 and at most 2." 
+ ) + } + } +} + +fn string_trim<'a, T: OffsetSizeTrait>( + trim_type: TrimType, + func: fn(&'a str, &'a str) -> &'a str, + args: &'a [ArrayRef], +) -> Result { let string_array = as_generic_string_array::(&args[0])?; match args.len() { @@ -84,7 +153,10 @@ pub(crate) fn general_trim( if characters_array.len() == 1 { if characters_array.is_null(0) { - return Ok(new_null_array(args[0].data_type(), args[0].len())); + return Ok(new_null_array( + string_array.data_type(), + string_array.len(), + )); } let characters = characters_array.value(0); @@ -109,7 +181,7 @@ pub(crate) fn general_trim( other => { exec_err!( "{trim_type} was called with {other} arguments. It requires at least 1 and at most 2." - ) + ) } } } diff --git a/datafusion/functions/src/string/ends_with.rs b/datafusion/functions/src/string/ends_with.rs index b72cf0f66fa6..03a1795954d0 100644 --- a/datafusion/functions/src/string/ends_with.rs +++ b/datafusion/functions/src/string/ends_with.rs @@ -18,12 +18,10 @@ use std::any::Any; use std::sync::Arc; -use arrow::array::{ArrayRef, OffsetSizeTrait}; +use arrow::array::ArrayRef; use arrow::datatypes::DataType; -use arrow::datatypes::DataType::Boolean; -use datafusion_common::cast::as_generic_string_array; -use datafusion_common::{exec_err, Result}; +use datafusion_common::{internal_err, Result}; use datafusion_expr::TypeSignature::*; use datafusion_expr::{ColumnarValue, Volatility}; use datafusion_expr::{ScalarUDFImpl, Signature}; @@ -43,14 +41,15 @@ impl Default for EndsWithFunc { impl EndsWithFunc { pub fn new() -> Self { - use DataType::*; Self { signature: Signature::one_of( vec![ - Exact(vec![Utf8, Utf8]), - Exact(vec![Utf8, LargeUtf8]), - Exact(vec![LargeUtf8, Utf8]), - Exact(vec![LargeUtf8, LargeUtf8]), + // Planner attempts coercion to the target type starting with the most preferred candidate. + // For example, given input `(Utf8View, Utf8)`, it first tries coercing to `(Utf8View, Utf8View)`. + // If that fails, it proceeds to `(Utf8, Utf8)`. + Exact(vec![DataType::Utf8View, DataType::Utf8View]), + Exact(vec![DataType::Utf8, DataType::Utf8]), + Exact(vec![DataType::LargeUtf8, DataType::LargeUtf8]), ], Volatility::Immutable, ), @@ -72,15 +71,16 @@ impl ScalarUDFImpl for EndsWithFunc { } fn return_type(&self, _arg_types: &[DataType]) -> Result { - Ok(Boolean) + Ok(DataType::Boolean) } fn invoke(&self, args: &[ColumnarValue]) -> Result { match args[0].data_type() { - DataType::Utf8 => make_scalar_function(ends_with::, vec![])(args), - DataType::LargeUtf8 => make_scalar_function(ends_with::, vec![])(args), + DataType::Utf8View | DataType::Utf8 | DataType::LargeUtf8 => { + make_scalar_function(ends_with, vec![])(args) + } other => { - exec_err!("Unsupported data type {other:?} for function ends_with") + internal_err!("Unsupported data type {other:?} for function ends_with. Expected Utf8, LargeUtf8 or Utf8View")? } } } @@ -88,11 +88,8 @@ impl ScalarUDFImpl for EndsWithFunc { /// Returns true if string ends with suffix. 
/// ends_with('alphabet', 'abet') = 't' -pub fn ends_with(args: &[ArrayRef]) -> Result { - let left = as_generic_string_array::(&args[0])?; - let right = as_generic_string_array::(&args[1])?; - - let result = arrow::compute::kernels::comparison::ends_with(left, right)?; +pub fn ends_with(args: &[ArrayRef]) -> Result { + let result = arrow::compute::kernels::comparison::ends_with(&args[0], &args[1])?; Ok(Arc::new(result) as ArrayRef) } diff --git a/datafusion/functions/src/string/initcap.rs b/datafusion/functions/src/string/initcap.rs index 864179d130fd..4e1eb213ef57 100644 --- a/datafusion/functions/src/string/initcap.rs +++ b/datafusion/functions/src/string/initcap.rs @@ -18,10 +18,10 @@ use std::any::Any; use std::sync::Arc; -use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait}; +use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait, StringArray}; use arrow::datatypes::DataType; -use datafusion_common::cast::as_generic_string_array; +use datafusion_common::cast::{as_generic_string_array, as_string_view_array}; use datafusion_common::{exec_err, Result}; use datafusion_expr::{ColumnarValue, Volatility}; use datafusion_expr::{ScalarUDFImpl, Signature}; @@ -45,7 +45,7 @@ impl InitcapFunc { Self { signature: Signature::uniform( 1, - vec![Utf8, LargeUtf8], + vec![Utf8, LargeUtf8, Utf8View], Volatility::Immutable, ), } @@ -73,6 +73,7 @@ impl ScalarUDFImpl for InitcapFunc { match args[0].data_type() { DataType::Utf8 => make_scalar_function(initcap::, vec![])(args), DataType::LargeUtf8 => make_scalar_function(initcap::, vec![])(args), + DataType::Utf8View => make_scalar_function(initcap_utf8view, vec![])(args), other => { exec_err!("Unsupported data type {other:?} for function initcap") } @@ -88,28 +89,41 @@ fn initcap(args: &[ArrayRef]) -> Result { // first map is the iterator, second is for the `Option<_>` let result = string_array .iter() - .map(|string| { - string.map(|string: &str| { - let mut char_vector = Vec::::new(); - let mut previous_character_letter_or_number = false; - for c in string.chars() { - if previous_character_letter_or_number { - char_vector.push(c.to_ascii_lowercase()); - } else { - char_vector.push(c.to_ascii_uppercase()); - } - previous_character_letter_or_number = c.is_ascii_uppercase() - || c.is_ascii_lowercase() - || c.is_ascii_digit(); - } - char_vector.iter().collect::() - }) - }) + .map(initcap_string) .collect::>(); Ok(Arc::new(result) as ArrayRef) } +fn initcap_utf8view(args: &[ArrayRef]) -> Result { + let string_view_array = as_string_view_array(&args[0])?; + + let result = string_view_array + .iter() + .map(initcap_string) + .collect::(); + + Ok(Arc::new(result) as ArrayRef) +} + +fn initcap_string(string: Option<&str>) -> Option { + let mut char_vector = Vec::::new(); + string.map(|string: &str| { + char_vector.clear(); + let mut previous_character_letter_or_number = false; + for c in string.chars() { + if previous_character_letter_or_number { + char_vector.push(c.to_ascii_lowercase()); + } else { + char_vector.push(c.to_ascii_uppercase()); + } + previous_character_letter_or_number = + c.is_ascii_uppercase() || c.is_ascii_lowercase() || c.is_ascii_digit(); + } + char_vector.iter().collect::() + }) +} + #[cfg(test)] mod tests { use crate::string::initcap::InitcapFunc; @@ -153,6 +167,44 @@ mod tests { Utf8, StringArray ); + test_function!( + InitcapFunc::new(), + &[ColumnarValue::Scalar(ScalarValue::Utf8View(Some( + "hi THOMAS".to_string() + )))], + Ok(Some("Hi Thomas")), + &str, + Utf8, + StringArray + ); + test_function!( + 
InitcapFunc::new(), + &[ColumnarValue::Scalar(ScalarValue::Utf8View(Some( + "hi THOMAS wIth M0re ThAN 12 ChaRs".to_string() + )))], + Ok(Some("Hi Thomas With M0re Than 12 Chars")), + &str, + Utf8, + StringArray + ); + test_function!( + InitcapFunc::new(), + &[ColumnarValue::Scalar(ScalarValue::Utf8View(Some( + "".to_string() + )))], + Ok(Some("")), + &str, + Utf8, + StringArray + ); + test_function!( + InitcapFunc::new(), + &[ColumnarValue::Scalar(ScalarValue::Utf8View(None))], + Ok(None), + &str, + Utf8, + StringArray + ); Ok(()) } diff --git a/datafusion/functions/src/string/levenshtein.rs b/datafusion/functions/src/string/levenshtein.rs index 3edf6de8c863..430c402a50c5 100644 --- a/datafusion/functions/src/string/levenshtein.rs +++ b/datafusion/functions/src/string/levenshtein.rs @@ -22,7 +22,7 @@ use arrow::array::{ArrayRef, Int32Array, Int64Array, OffsetSizeTrait}; use arrow::datatypes::DataType; use crate::utils::{make_scalar_function, utf8_to_int_type}; -use datafusion_common::cast::as_generic_string_array; +use datafusion_common::cast::{as_generic_string_array, as_string_view_array}; use datafusion_common::utils::datafusion_strsim; use datafusion_common::{exec_err, Result}; use datafusion_expr::ColumnarValue; @@ -42,10 +42,13 @@ impl Default for LevenshteinFunc { impl LevenshteinFunc { pub fn new() -> Self { - use DataType::*; Self { signature: Signature::one_of( - vec![Exact(vec![Utf8, Utf8]), Exact(vec![LargeUtf8, LargeUtf8])], + vec![ + Exact(vec![DataType::Utf8View, DataType::Utf8View]), + Exact(vec![DataType::Utf8, DataType::Utf8]), + Exact(vec![DataType::LargeUtf8, DataType::LargeUtf8]), + ], Volatility::Immutable, ), } @@ -71,7 +74,9 @@ impl ScalarUDFImpl for LevenshteinFunc { fn invoke(&self, args: &[ColumnarValue]) -> Result { match args[0].data_type() { - DataType::Utf8 => make_scalar_function(levenshtein::, vec![])(args), + DataType::Utf8View | DataType::Utf8 => { + make_scalar_function(levenshtein::, vec![])(args) + } DataType::LargeUtf8 => make_scalar_function(levenshtein::, vec![])(args), other => { exec_err!("Unsupported data type {other:?} for function levenshtein") @@ -89,10 +94,26 @@ pub fn levenshtein(args: &[ArrayRef]) -> Result { args.len() ); } - let str1_array = as_generic_string_array::(&args[0])?; - let str2_array = as_generic_string_array::(&args[1])?; + match args[0].data_type() { + DataType::Utf8View => { + let str1_array = as_string_view_array(&args[0])?; + let str2_array = as_string_view_array(&args[1])?; + let result = str1_array + .iter() + .zip(str2_array.iter()) + .map(|(string1, string2)| match (string1, string2) { + (Some(string1), Some(string2)) => { + Some(datafusion_strsim::levenshtein(string1, string2) as i32) + } + _ => None, + }) + .collect::(); + Ok(Arc::new(result) as ArrayRef) + } DataType::Utf8 => { + let str1_array = as_generic_string_array::(&args[0])?; + let str2_array = as_generic_string_array::(&args[1])?; let result = str1_array .iter() .zip(str2_array.iter()) @@ -106,6 +127,8 @@ pub fn levenshtein(args: &[ArrayRef]) -> Result { Ok(Arc::new(result) as ArrayRef) } DataType::LargeUtf8 => { + let str1_array = as_generic_string_array::(&args[0])?; + let str2_array = as_generic_string_array::(&args[1])?; let result = str1_array .iter() .zip(str2_array.iter()) @@ -120,7 +143,7 @@ pub fn levenshtein(args: &[ArrayRef]) -> Result { } other => { exec_err!( - "levenshtein was called with {other} datatype arguments. It requires Utf8 or LargeUtf8." + "levenshtein was called with {other} datatype arguments. 
It requires Utf8View, Utf8 or LargeUtf8." ) } } diff --git a/datafusion/functions/src/string/ltrim.rs b/datafusion/functions/src/string/ltrim.rs index de14bbaa2bcf..b7b27afcee1f 100644 --- a/datafusion/functions/src/string/ltrim.rs +++ b/datafusion/functions/src/string/ltrim.rs @@ -32,7 +32,8 @@ use crate::utils::{make_scalar_function, utf8_to_str_type}; /// Returns the longest string with leading characters removed. If the characters are not specified, whitespace is removed. /// ltrim('zzzytest', 'xyz') = 'test' fn ltrim(args: &[ArrayRef]) -> Result { - general_trim::(args, TrimType::Left) + let use_string_view = args[0].data_type() == &DataType::Utf8View; + general_trim::(args, TrimType::Left, use_string_view) } #[derive(Debug)] @@ -51,7 +52,15 @@ impl LtrimFunc { use DataType::*; Self { signature: Signature::one_of( - vec![Exact(vec![Utf8]), Exact(vec![Utf8, Utf8])], + vec![ + // Planner attempts coercion to the target type starting with the most preferred candidate. + // For example, given input `(Utf8View, Utf8)`, it first tries coercing to `(Utf8View, Utf8View)`. + // If that fails, it proceeds to `(Utf8, Utf8)`. + Exact(vec![Utf8View, Utf8View]), + Exact(vec![Utf8, Utf8]), + Exact(vec![Utf8View]), + Exact(vec![Utf8]), + ], Volatility::Immutable, ), } @@ -77,7 +86,7 @@ impl ScalarUDFImpl for LtrimFunc { fn invoke(&self, args: &[ColumnarValue]) -> Result { match args[0].data_type() { - DataType::Utf8 => make_scalar_function( + DataType::Utf8 | DataType::Utf8View => make_scalar_function( ltrim::, vec![Hint::Pad, Hint::AcceptsSingular], )(args), @@ -85,7 +94,10 @@ impl ScalarUDFImpl for LtrimFunc { ltrim::, vec![Hint::Pad, Hint::AcceptsSingular], )(args), - other => exec_err!("Unsupported data type {other:?} for function ltrim"), + other => exec_err!( + "Unsupported data type {other:?} for function ltrim,\ + expected Utf8, LargeUtf8 or Utf8View." 
+ ), } } } diff --git a/datafusion/functions/src/string/octet_length.rs b/datafusion/functions/src/string/octet_length.rs index 12980fab1f11..f792914d862e 100644 --- a/datafusion/functions/src/string/octet_length.rs +++ b/datafusion/functions/src/string/octet_length.rs @@ -43,7 +43,7 @@ impl OctetLengthFunc { Self { signature: Signature::uniform( 1, - vec![Utf8, LargeUtf8], + vec![Utf8, LargeUtf8, Utf8View], Volatility::Immutable, ), } @@ -84,6 +84,9 @@ impl ScalarUDFImpl for OctetLengthFunc { ScalarValue::LargeUtf8(v) => Ok(ColumnarValue::Scalar( ScalarValue::Int64(v.as_ref().map(|x| x.len() as i64)), )), + ScalarValue::Utf8View(v) => Ok(ColumnarValue::Scalar( + ScalarValue::Int32(v.as_ref().map(|x| x.len() as i32)), + )), _ => unreachable!(), }, } @@ -176,6 +179,36 @@ mod tests { Int32, Int32Array ); + test_function!( + OctetLengthFunc::new(), + &[ColumnarValue::Scalar(ScalarValue::Utf8View(Some( + String::from("joséjoséjoséjosé") + )))], + Ok(Some(20)), + i32, + Int32, + Int32Array + ); + test_function!( + OctetLengthFunc::new(), + &[ColumnarValue::Scalar(ScalarValue::Utf8View(Some( + String::from("josé") + )))], + Ok(Some(5)), + i32, + Int32, + Int32Array + ); + test_function!( + OctetLengthFunc::new(), + &[ColumnarValue::Scalar(ScalarValue::Utf8View(Some( + String::from("") + )))], + Ok(Some(0)), + i32, + Int32, + Int32Array + ); Ok(()) } diff --git a/datafusion/functions/src/string/rtrim.rs b/datafusion/functions/src/string/rtrim.rs index 2d29b50cb173..ec53f3ed7430 100644 --- a/datafusion/functions/src/string/rtrim.rs +++ b/datafusion/functions/src/string/rtrim.rs @@ -32,7 +32,8 @@ use crate::utils::{make_scalar_function, utf8_to_str_type}; /// Returns the longest string with trailing characters removed. If the characters are not specified, whitespace is removed. /// rtrim('testxxzx', 'xyz') = 'test' fn rtrim(args: &[ArrayRef]) -> Result { - general_trim::(args, TrimType::Right) + let use_string_view = args[0].data_type() == &DataType::Utf8View; + general_trim::(args, TrimType::Right, use_string_view) } #[derive(Debug)] @@ -51,7 +52,15 @@ impl RtrimFunc { use DataType::*; Self { signature: Signature::one_of( - vec![Exact(vec![Utf8]), Exact(vec![Utf8, Utf8])], + vec![ + // Planner attempts coercion to the target type starting with the most preferred candidate. + // For example, given input `(Utf8View, Utf8)`, it first tries coercing to `(Utf8View, Utf8View)`. + // If that fails, it proceeds to `(Utf8, Utf8)`. + Exact(vec![Utf8View, Utf8View]), + Exact(vec![Utf8, Utf8]), + Exact(vec![Utf8View]), + Exact(vec![Utf8]), + ], Volatility::Immutable, ), } @@ -77,7 +86,7 @@ impl ScalarUDFImpl for RtrimFunc { fn invoke(&self, args: &[ColumnarValue]) -> Result { match args[0].data_type() { - DataType::Utf8 => make_scalar_function( + DataType::Utf8 | DataType::Utf8View => make_scalar_function( rtrim::, vec![Hint::Pad, Hint::AcceptsSingular], )(args), @@ -85,7 +94,10 @@ impl ScalarUDFImpl for RtrimFunc { rtrim::, vec![Hint::Pad, Hint::AcceptsSingular], )(args), - other => exec_err!("Unsupported data type {other:?} for function rtrim"), + other => exec_err!( + "Unsupported data type {other:?} for function rtrim,\ + expected Utf8, LargeUtf8 or Utf8View." 
+ ), } } } diff --git a/datafusion/functions/src/unicode/find_in_set.rs b/datafusion/functions/src/unicode/find_in_set.rs index 7c864bc191d7..41a2b9d9e72d 100644 --- a/datafusion/functions/src/unicode/find_in_set.rs +++ b/datafusion/functions/src/unicode/find_in_set.rs @@ -19,11 +19,11 @@ use std::any::Any; use std::sync::Arc; use arrow::array::{ - ArrayRef, ArrowPrimitiveType, GenericStringArray, OffsetSizeTrait, PrimitiveArray, + ArrayAccessor, ArrayIter, ArrayRef, ArrowPrimitiveType, AsArray, OffsetSizeTrait, + PrimitiveArray, }; use arrow::datatypes::{ArrowNativeType, DataType, Int32Type, Int64Type}; -use datafusion_common::cast::as_generic_string_array; use datafusion_common::{exec_err, Result}; use datafusion_expr::TypeSignature::Exact; use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; @@ -46,7 +46,11 @@ impl FindInSetFunc { use DataType::*; Self { signature: Signature::one_of( - vec![Exact(vec![Utf8, Utf8]), Exact(vec![LargeUtf8, LargeUtf8])], + vec![ + Exact(vec![Utf8View, Utf8View]), + Exact(vec![Utf8, Utf8]), + Exact(vec![LargeUtf8, LargeUtf8]), + ], Volatility::Immutable, ), } @@ -71,41 +75,52 @@ impl ScalarUDFImpl for FindInSetFunc { } fn invoke(&self, args: &[ColumnarValue]) -> Result { - match args[0].data_type() { - DataType::Utf8 => { - make_scalar_function(find_in_set::, vec![])(args) - } - DataType::LargeUtf8 => { - make_scalar_function(find_in_set::, vec![])(args) - } - other => { - exec_err!("Unsupported data type {other:?} for function find_in_set") - } - } + make_scalar_function(find_in_set, vec![])(args) } } ///Returns a value in the range of 1 to N if the string str is in the string list strlist consisting of N substrings ///A string list is a string composed of substrings separated by , characters. -pub fn find_in_set(args: &[ArrayRef]) -> Result -where - T::Native: OffsetSizeTrait, -{ +fn find_in_set(args: &[ArrayRef]) -> Result { if args.len() != 2 { return exec_err!( "find_in_set was called with {} arguments. 
It requires 2.", args.len() ); } + match args[0].data_type() { + DataType::Utf8 => { + let string_array = args[0].as_string::(); + let str_list_array = args[1].as_string::(); + find_in_set_general::(string_array, str_list_array) + } + DataType::LargeUtf8 => { + let string_array = args[0].as_string::(); + let str_list_array = args[1].as_string::(); + find_in_set_general::(string_array, str_list_array) + } + DataType::Utf8View => { + let string_array = args[0].as_string_view(); + let str_list_array = args[1].as_string_view(); + find_in_set_general::(string_array, str_list_array) + } + other => { + exec_err!("Unsupported data type {other:?} for function find_in_set") + } + } +} - let str_array: &GenericStringArray = - as_generic_string_array::(&args[0])?; - let str_list_array: &GenericStringArray = - as_generic_string_array::(&args[1])?; - - let result = str_array - .iter() - .zip(str_list_array.iter()) +pub fn find_in_set_general<'a, T: ArrowPrimitiveType, V: ArrayAccessor>( + string_array: V, + str_list_array: V, +) -> Result +where + T::Native: OffsetSizeTrait, +{ + let string_iter = ArrayIter::new(string_array); + let str_list_iter = ArrayIter::new(str_list_array); + let result = string_iter + .zip(str_list_iter) .map(|(string, str_list)| match (string, str_list) { (Some(string), Some(str_list)) => { let mut res = 0; diff --git a/datafusion/functions/src/unicode/right.rs b/datafusion/functions/src/unicode/right.rs index 20cbbe020ff1..9d542bb2c006 100644 --- a/datafusion/functions/src/unicode/right.rs +++ b/datafusion/functions/src/unicode/right.rs @@ -19,17 +19,21 @@ use std::any::Any; use std::cmp::{max, Ordering}; use std::sync::Arc; -use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait}; +use arrow::array::{ + Array, ArrayAccessor, ArrayIter, ArrayRef, GenericStringArray, Int64Array, + OffsetSizeTrait, +}; use arrow::datatypes::DataType; -use datafusion_common::cast::{as_generic_string_array, as_int64_array}; +use crate::utils::{make_scalar_function, utf8_to_str_type}; +use datafusion_common::cast::{ + as_generic_string_array, as_int64_array, as_string_view_array, +}; use datafusion_common::exec_err; use datafusion_common::Result; use datafusion_expr::TypeSignature::Exact; use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; -use crate::utils::{make_scalar_function, utf8_to_str_type}; - #[derive(Debug)] pub struct RightFunc { signature: Signature, @@ -46,7 +50,11 @@ impl RightFunc { use DataType::*; Self { signature: Signature::one_of( - vec![Exact(vec![Utf8, Int64]), Exact(vec![LargeUtf8, Int64])], + vec![ + Exact(vec![Utf8View, Int64]), + Exact(vec![Utf8, Int64]), + Exact(vec![LargeUtf8, Int64]), + ], Volatility::Immutable, ), } @@ -72,9 +80,14 @@ impl ScalarUDFImpl for RightFunc { fn invoke(&self, args: &[ColumnarValue]) -> Result { match args[0].data_type() { - DataType::Utf8 => make_scalar_function(right::, vec![])(args), + DataType::Utf8 | DataType::Utf8View => { + make_scalar_function(right::, vec![])(args) + } DataType::LargeUtf8 => make_scalar_function(right::, vec![])(args), - other => exec_err!("Unsupported data type {other:?} for function right"), + other => exec_err!( + "Unsupported data type {other:?} for function right,\ + expected Utf8View, Utf8 or LargeUtf8." 
+ ), } } } @@ -83,11 +96,26 @@ impl ScalarUDFImpl for RightFunc { /// right('abcde', 2) = 'de' /// The implementation uses UTF-8 code points as characters pub fn right(args: &[ArrayRef]) -> Result { - let string_array = as_generic_string_array::(&args[0])?; let n_array = as_int64_array(&args[1])?; + if args[0].data_type() == &DataType::Utf8View { + // string_view_right(args) + let string_array = as_string_view_array(&args[0])?; + right_impl::(&mut string_array.iter(), n_array) + } else { + // string_right::(args) + let string_array = &as_generic_string_array::(&args[0])?; + right_impl::(&mut string_array.iter(), n_array) + } +} - let result = string_array - .iter() +// Currently the return type can only be Utf8 or LargeUtf8, to reach fully support, we need +// to edit the `get_optimal_return_type` in utils.rs to make the udfs be able to return Utf8View +// See https://github.com/apache/datafusion/issues/11790#issuecomment-2283777166 +fn right_impl<'a, T: OffsetSizeTrait, V: ArrayAccessor>( + string_array_iter: &mut ArrayIter, + n_array: &Int64Array, +) -> Result { + let result = string_array_iter .zip(n_array.iter()) .map(|(string, n)| match (string, n) { (Some(string), Some(n)) => match n.cmp(&0) { diff --git a/datafusion/optimizer/src/analyzer/count_wildcard_rule.rs b/datafusion/optimizer/src/analyzer/count_wildcard_rule.rs index 8ff00917dcb1..593dab2bc9a2 100644 --- a/datafusion/optimizer/src/analyzer/count_wildcard_rule.rs +++ b/datafusion/optimizer/src/analyzer/count_wildcard_rule.rs @@ -48,7 +48,13 @@ impl AnalyzerRule for CountWildcardRule { } fn is_wildcard(expr: &Expr) -> bool { - matches!(expr, Expr::Wildcard { qualifier: None }) + matches!( + expr, + Expr::Wildcard { + qualifier: None, + .. + } + ) } fn is_count_star_aggregate(aggregate_function: &AggregateFunction) -> bool { diff --git a/datafusion/optimizer/src/analyzer/expand_wildcard_rule.rs b/datafusion/optimizer/src/analyzer/expand_wildcard_rule.rs new file mode 100644 index 000000000000..53ba3042f522 --- /dev/null +++ b/datafusion/optimizer/src/analyzer/expand_wildcard_rule.rs @@ -0,0 +1,304 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use std::sync::Arc; + +use crate::AnalyzerRule; +use datafusion_common::config::ConfigOptions; +use datafusion_common::tree_node::{Transformed, TransformedResult}; +use datafusion_common::{Column, Result}; +use datafusion_expr::builder::validate_unique_names; +use datafusion_expr::expr::PlannedReplaceSelectItem; +use datafusion_expr::utils::{ + expand_qualified_wildcard, expand_wildcard, find_base_plan, +}; +use datafusion_expr::{Expr, LogicalPlan, Projection, SubqueryAlias}; + +#[derive(Default)] +pub struct ExpandWildcardRule {} + +impl ExpandWildcardRule { + pub fn new() -> Self { + Self {} + } +} + +impl AnalyzerRule for ExpandWildcardRule { + fn analyze(&self, plan: LogicalPlan, _: &ConfigOptions) -> Result { + // Because the wildcard expansion is based on the schema of the input plan, + // we use `transform_up_with_subqueries` here. + plan.transform_up_with_subqueries(expand_internal).data() + } + + fn name(&self) -> &str { + "expand_wildcard_rule" + } +} + +fn expand_internal(plan: LogicalPlan) -> Result> { + match plan { + LogicalPlan::Projection(Projection { expr, input, .. }) => { + let projected_expr = expand_exprlist(&input, expr)?; + validate_unique_names("Projections", projected_expr.iter())?; + Ok(Transformed::yes( + Projection::try_new(projected_expr, Arc::clone(&input)) + .map(LogicalPlan::Projection)?, + )) + } + // The schema of the plan should also be updated if the child plan is transformed. + LogicalPlan::SubqueryAlias(SubqueryAlias { input, alias, .. }) => { + Ok(Transformed::yes( + SubqueryAlias::try_new(input, alias).map(LogicalPlan::SubqueryAlias)?, + )) + } + _ => Ok(Transformed::no(plan)), + } +} + +fn expand_exprlist(input: &LogicalPlan, expr: Vec) -> Result> { + let mut projected_expr = vec![]; + let input = find_base_plan(input); + for e in expr { + match e { + Expr::Wildcard { qualifier, options } => { + if let Some(qualifier) = qualifier { + let expanded = expand_qualified_wildcard( + &qualifier, + input.schema(), + Some(&options), + )?; + // If there is a REPLACE statement, replace that column with the given + // replace expression. Column name remains the same. + let replaced = if let Some(replace) = options.replace { + replace_columns(expanded, replace)? + } else { + expanded + }; + projected_expr.extend(replaced); + } else { + let expanded = + expand_wildcard(input.schema(), input, Some(&options))?; + // If there is a REPLACE statement, replace that column with the given + // replace expression. Column name remains the same. + let replaced = if let Some(replace) = options.replace { + replace_columns(expanded, replace)? + } else { + expanded + }; + projected_expr.extend(replaced); + } + } + // A workaround to handle the case when the column name is "*". + // We transform the expression to an Expr::Column through [Column::from_name] in many places. + // It would also convert the wildcard expression to a column expression with name "*".
+ Expr::Column(Column { + ref relation, + ref name, + }) => { + if name.eq("*") { + if let Some(qualifier) = relation { + projected_expr.extend(expand_qualified_wildcard( + qualifier, + input.schema(), + None, + )?); + } else { + projected_expr.extend(expand_wildcard( + input.schema(), + input, + None, + )?); + } + } else { + projected_expr.push(e.clone()); + } + } + _ => projected_expr.push(e), + } + } + Ok(projected_expr) +} + +/// If there is a REPLACE statement in the projected expression in the form of +/// "REPLACE (some_column_within_an_expr AS some_column)", this function replaces +/// that column with the given replace expression. Column name remains the same. +/// Multiple REPLACEs are also possible with comma separations. +fn replace_columns( + mut exprs: Vec, + replace: PlannedReplaceSelectItem, +) -> Result> { + for expr in exprs.iter_mut() { + if let Expr::Column(Column { name, .. }) = expr { + if let Some((_, new_expr)) = replace + .items() + .iter() + .zip(replace.expressions().iter()) + .find(|(item, _)| item.column_name.value == *name) + { + *expr = new_expr.clone().alias(name.clone()) + } + } + } + Ok(exprs) +} + +#[cfg(test)] +mod tests { + use arrow::datatypes::{DataType, Field, Schema}; + + use datafusion_common::{JoinType, TableReference}; + use datafusion_expr::{ + col, in_subquery, qualified_wildcard, table_scan, wildcard, LogicalPlanBuilder, + }; + + use crate::test::{assert_analyzed_plan_eq_display_indent, test_table_scan}; + use crate::Analyzer; + + use super::*; + + fn assert_plan_eq(plan: LogicalPlan, expected: &str) -> Result<()> { + assert_analyzed_plan_eq_display_indent( + Arc::new(ExpandWildcardRule::new()), + plan, + expected, + ) + } + + #[test] + fn test_expand_wildcard() -> Result<()> { + let table_scan = test_table_scan()?; + let plan = LogicalPlanBuilder::from(table_scan) + .project(vec![wildcard()])? + .build()?; + let expected = + "Projection: test.a, test.b, test.c [a:UInt32, b:UInt32, c:UInt32]\ + \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]"; + assert_plan_eq(plan, expected) + } + + #[test] + fn test_expand_qualified_wildcard() -> Result<()> { + let table_scan = test_table_scan()?; + let plan = LogicalPlanBuilder::from(table_scan) + .project(vec![qualified_wildcard(TableReference::bare("test"))])? + .build()?; + let expected = + "Projection: test.a, test.b, test.c [a:UInt32, b:UInt32, c:UInt32]\ + \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]"; + assert_plan_eq(plan, expected) + } + + #[test] + fn test_expand_qualified_wildcard_in_subquery() -> Result<()> { + let table_scan = test_table_scan()?; + let plan = LogicalPlanBuilder::from(table_scan) + .project(vec![qualified_wildcard(TableReference::bare("test"))])? + .build()?; + let plan = LogicalPlanBuilder::from(plan) + .project(vec![wildcard()])? + .build()?; + let expected = + "Projection: test.a, test.b, test.c [a:UInt32, b:UInt32, c:UInt32]\ + \n Projection: test.a, test.b, test.c [a:UInt32, b:UInt32, c:UInt32]\ + \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]"; + assert_plan_eq(plan, expected) + } + + #[test] + fn test_expand_wildcard_in_subquery() -> Result<()> { + let projection_a = LogicalPlanBuilder::from(test_table_scan()?) + .project(vec![col("a")])? + .build()?; + let subquery = LogicalPlanBuilder::from(projection_a) + .project(vec![wildcard()])? + .build()?; + let plan = LogicalPlanBuilder::from(test_table_scan()?) + .filter(in_subquery(col("a"), Arc::new(subquery)))? + .project(vec![wildcard()])? 
+ .build()?; + let expected = "\ + Projection: test.a, test.b, test.c [a:UInt32, b:UInt32, c:UInt32]\ + \n Filter: test.a IN () [a:UInt32, b:UInt32, c:UInt32]\ + \n Subquery: [a:UInt32]\ + \n Projection: test.a [a:UInt32]\ + \n Projection: test.a [a:UInt32]\ + \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ + \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]"; + assert_plan_eq(plan, expected) + } + + #[test] + fn test_subquery_schema() -> Result<()> { + let analyzer = Analyzer::with_rules(vec![Arc::new(ExpandWildcardRule::new())]); + let options = ConfigOptions::default(); + let subquery = LogicalPlanBuilder::from(test_table_scan()?) + .project(vec![wildcard()])? + .build()?; + let plan = LogicalPlanBuilder::from(subquery) + .alias("sub")? + .project(vec![wildcard()])? + .build()?; + let analyzed_plan = analyzer.execute_and_check(plan, &options, |_, _| {})?; + for x in analyzed_plan.inputs() { + for field in x.schema().fields() { + assert_ne!(field.name(), "*"); + } + } + Ok(()) + } + + fn employee_schema() -> Schema { + Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("first_name", DataType::Utf8, false), + Field::new("last_name", DataType::Utf8, false), + Field::new("state", DataType::Utf8, false), + Field::new("salary", DataType::Int32, false), + ]) + } + + #[test] + fn plan_using_join_wildcard_projection() -> Result<()> { + let t2 = table_scan(Some("t2"), &employee_schema(), None)?.build()?; + + let plan = table_scan(Some("t1"), &employee_schema(), None)? + .join_using(t2, JoinType::Inner, vec!["id"])? + .project(vec![wildcard()])? + .build()?; + + let expected = "Projection: *\ + \n Inner Join: Using t1.id = t2.id\ + \n TableScan: t1\ + \n TableScan: t2"; + + assert_eq!(expected, format!("{plan}")); + + let analyzer = Analyzer::with_rules(vec![Arc::new(ExpandWildcardRule::new())]); + let options = ConfigOptions::default(); + + let analyzed_plan = analyzer.execute_and_check(plan, &options, |_, _| {})?; + + // id column should only show up once in projection + let expected = "Projection: t1.id, t1.first_name, t1.last_name, t1.state, t1.salary, t2.first_name, t2.last_name, t2.state, t2.salary\ + \n Inner Join: Using t1.id = t2.id\ + \n TableScan: t1\ + \n TableScan: t2"; + assert_eq!(expected, format!("{analyzed_plan}")); + + Ok(()) + } +} diff --git a/datafusion/optimizer/src/analyzer/inline_table_scan.rs b/datafusion/optimizer/src/analyzer/inline_table_scan.rs index 73ab37cb11d8..b69b8410da49 100644 --- a/datafusion/optimizer/src/analyzer/inline_table_scan.rs +++ b/datafusion/optimizer/src/analyzer/inline_table_scan.rs @@ -23,6 +23,7 @@ use crate::analyzer::AnalyzerRule; use datafusion_common::config::ConfigOptions; use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; use datafusion_common::{Column, Result}; +use datafusion_expr::expr::WildcardOptions; use datafusion_expr::{logical_plan::LogicalPlan, Expr, LogicalPlanBuilder, TableScan}; /// Analyzed rule that inlines TableScan that provide a [`LogicalPlan`] @@ -93,7 +94,10 @@ fn generate_projection_expr( ))); } } else { - exprs.push(Expr::Wildcard { qualifier: None }); + exprs.push(Expr::Wildcard { + qualifier: None, + options: WildcardOptions::default(), + }); } Ok(exprs) } @@ -178,7 +182,7 @@ mod tests { let plan = scan.filter(col("x.a").eq(lit(1)))?.build()?; let expected = "Filter: x.a = Int32(1)\ \n SubqueryAlias: x\ - \n Projection: y.a, y.b\ + \n Projection: *\ \n TableScan: y"; assert_analyzed_plan_eq(Arc::new(InlineTableScan::new()), plan, expected) diff --git 
a/datafusion/optimizer/src/analyzer/mod.rs b/datafusion/optimizer/src/analyzer/mod.rs index 91ee8a9e1033..6e2afeca88c9 100644 --- a/datafusion/optimizer/src/analyzer/mod.rs +++ b/datafusion/optimizer/src/analyzer/mod.rs @@ -30,6 +30,7 @@ use datafusion_expr::expr_rewriter::FunctionRewrite; use datafusion_expr::{Expr, LogicalPlan}; use crate::analyzer::count_wildcard_rule::CountWildcardRule; +use crate::analyzer::expand_wildcard_rule::ExpandWildcardRule; use crate::analyzer::inline_table_scan::InlineTableScan; use crate::analyzer::subquery::check_subquery_expr; use crate::analyzer::type_coercion::TypeCoercion; @@ -38,6 +39,7 @@ use crate::utils::log_plan; use self::function_rewrite::ApplyFunctionRewrites; pub mod count_wildcard_rule; +pub mod expand_wildcard_rule; pub mod function_rewrite; pub mod inline_table_scan; pub mod subquery; @@ -89,6 +91,9 @@ impl Analyzer { pub fn new() -> Self { let rules: Vec> = vec![ Arc::new(InlineTableScan::new()), + // Every rule that will generate [Expr::Wildcard] should be placed in front of [ExpandWildcardRule]. + Arc::new(ExpandWildcardRule::new()), + // [Expr::Wildcard] should be expanded before [TypeCoercion] Arc::new(TypeCoercion::new()), Arc::new(CountWildcardRule::new()), ]; diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index 2bb859d84ad7..40efbba6de7a 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -17,20 +17,26 @@ //! Optimizer rule for type validation and coercion +use std::collections::HashMap; use std::sync::Arc; -use arrow::datatypes::{DataType, IntervalUnit}; +use itertools::izip; +use arrow::datatypes::{DataType, Field, IntervalUnit}; + +use crate::analyzer::AnalyzerRule; +use crate::utils::NamePreserver; use datafusion_common::config::ConfigOptions; use datafusion_common::tree_node::{Transformed, TreeNode, TreeNodeRewriter}; use datafusion_common::{ - exec_err, internal_err, not_impl_err, plan_datafusion_err, plan_err, DFSchema, - DataFusionError, Result, ScalarValue, + exec_err, internal_err, not_impl_err, plan_datafusion_err, plan_err, Column, + DFSchema, DFSchemaRef, DataFusionError, Result, ScalarValue, }; use datafusion_expr::expr::{ - self, Between, BinaryExpr, Case, Exists, InList, InSubquery, Like, ScalarFunction, - WindowFunction, + self, Alias, Between, BinaryExpr, Case, Exists, InList, InSubquery, Like, + ScalarFunction, WindowFunction, }; +use datafusion_expr::expr_rewriter::coerce_plan_expr_for_schema; use datafusion_expr::expr_schema::cast_subquery; use datafusion_expr::logical_plan::tree_node::unwrap_arc; use datafusion_expr::logical_plan::Subquery; @@ -47,13 +53,10 @@ use datafusion_expr::type_coercion::{is_datetime, is_utf8_or_large_utf8}; use datafusion_expr::utils::merge_schema; use datafusion_expr::{ is_false, is_not_false, is_not_true, is_not_unknown, is_true, is_unknown, not, - AggregateUDF, Expr, ExprFunctionExt, ExprSchemable, LogicalPlan, Operator, ScalarUDF, - WindowFrame, WindowFrameBound, WindowFrameUnits, + AggregateUDF, Expr, ExprFunctionExt, ExprSchemable, Join, LogicalPlan, Operator, + Projection, ScalarUDF, Union, WindowFrame, WindowFrameBound, WindowFrameUnits, }; -use crate::analyzer::AnalyzerRule; -use crate::utils::NamePreserver; - #[derive(Default)] pub struct TypeCoercion {} @@ -120,8 +123,8 @@ fn analyze_internal( expr.rewrite(&mut expr_rewrite)? .map_data(|expr| original_name.restore(expr)) })? 
- // coerce join expressions specially - .map_data(|plan| expr_rewrite.coerce_joins(plan))? + // some plans need extra coercion after their expressions are coerced + .map_data(|plan| expr_rewrite.coerce_plan(plan))? // recompute the schema after the expressions have been rewritten as the types may have changed .map_data(|plan| plan.recompute_schema()) } @@ -135,6 +138,14 @@ impl<'a> TypeCoercionRewriter<'a> { Self { schema } } + fn coerce_plan(&mut self, plan: LogicalPlan) -> Result { + match plan { + LogicalPlan::Join(join) => self.coerce_join(join), + LogicalPlan::Union(union) => Self::coerce_union(union), + _ => Ok(plan), + } + } + /// Coerce join equality expressions and join filter /// /// Joins must be treated specially as their equality expressions are stored @@ -143,11 +154,7 @@ impl<'a> TypeCoercionRewriter<'a> { /// /// For example, on_exprs like `t1.a = t2.b AND t1.x = t2.y` will be stored /// as a list of `(t1.a, t2.b), (t1.x, t2.y)` - fn coerce_joins(&mut self, plan: LogicalPlan) -> Result { - let LogicalPlan::Join(mut join) = plan else { - return Ok(plan); - }; - + fn coerce_join(&mut self, mut join: Join) -> Result { join.on = join .on .into_iter() @@ -168,6 +175,33 @@ impl<'a> TypeCoercionRewriter<'a> { Ok(LogicalPlan::Join(join)) } + /// Coerce the union’s inputs to a common schema compatible with all inputs. + /// This occurs after wildcard expansion and the coercion of the input expressions. + fn coerce_union(union_plan: Union) -> Result { + let union_schema = Arc::new(coerce_union_schema(&union_plan.inputs)?); + let new_inputs = union_plan + .inputs + .iter() + .map(|p| { + let plan = coerce_plan_expr_for_schema(p, &union_schema)?; + match plan { + LogicalPlan::Projection(Projection { expr, input, .. }) => { + Ok(Arc::new(project_with_column_index( + expr, + input, + Arc::clone(&union_schema), + )?)) + } + other_plan => Ok(Arc::new(other_plan)), + } + }) + .collect::>>()?; + Ok(LogicalPlan::Union(Union { + inputs: new_inputs, + schema: union_schema, + })) + } + fn coerce_join_filter(&self, expr: Expr) -> Result { let expr_type = expr.get_type(self.schema)?; match expr_type { @@ -774,6 +808,92 @@ fn coerce_case_expression(case: Case, schema: &DFSchema) -> Result { Ok(Case::new(case_expr, when_then, else_expr)) } +/// Get a common schema that is compatible with all inputs of UNION. 
+fn coerce_union_schema(inputs: &[Arc]) -> Result { + let base_schema = inputs[0].schema(); + let mut union_datatypes = base_schema + .fields() + .iter() + .map(|f| f.data_type().clone()) + .collect::>(); + let mut union_nullabilities = base_schema + .fields() + .iter() + .map(|f| f.is_nullable()) + .collect::>(); + + for (i, plan) in inputs.iter().enumerate().skip(1) { + let plan_schema = plan.schema(); + if plan_schema.fields().len() != base_schema.fields().len() { + return plan_err!( + "Union schemas have different number of fields: \ + query 1 has {} fields whereas query {} has {} fields", + base_schema.fields().len(), + i + 1, + plan_schema.fields().len() + ); + } + // coerce data type and nullablity for each field + for (union_datatype, union_nullable, plan_field) in izip!( + union_datatypes.iter_mut(), + union_nullabilities.iter_mut(), + plan_schema.fields() + ) { + let coerced_type = + comparison_coercion(union_datatype, plan_field.data_type()).ok_or_else( + || { + plan_datafusion_err!( + "Incompatible inputs for Union: Previous inputs were \ + of type {}, but got incompatible type {} on column '{}'", + union_datatype, + plan_field.data_type(), + plan_field.name() + ) + }, + )?; + *union_datatype = coerced_type; + *union_nullable = *union_nullable || plan_field.is_nullable(); + } + } + let union_qualified_fields = izip!( + base_schema.iter(), + union_datatypes.into_iter(), + union_nullabilities + ) + .map(|((qualifier, field), datatype, nullable)| { + let field = Arc::new(Field::new(field.name().clone(), datatype, nullable)); + (qualifier.cloned(), field) + }) + .collect::>(); + DFSchema::new_with_metadata(union_qualified_fields, HashMap::new()) +} + +/// See `` +fn project_with_column_index( + expr: Vec, + input: Arc, + schema: DFSchemaRef, +) -> Result { + let alias_expr = expr + .into_iter() + .enumerate() + .map(|(i, e)| match e { + Expr::Alias(Alias { ref name, .. }) if name != schema.field(i).name() => { + e.unalias().alias(schema.field(i).name()) + } + Expr::Column(Column { + relation: _, + ref name, + }) if name != schema.field(i).name() => e.alias(schema.field(i).name()), + Expr::Alias { .. } | Expr::Column { .. 
} => e, + _ => e.alias(schema.field(i).name()), + }) + .collect::>(); + + Projection::try_new_with_schema(alias_expr, input, schema) + .map(LogicalPlan::Projection) +} + #[cfg(test)] mod test { use std::any::Any; @@ -1286,7 +1406,6 @@ mod test { .eq(cast(lit("1998-03-18"), DataType::Date32)); let empty = empty(); let plan = LogicalPlan::Projection(Projection::try_new(vec![expr], empty)?); - dbg!(&plan); let expected = "Projection: CAST(Utf8(\"1998-03-18\") AS Timestamp(Nanosecond, None)) = CAST(CAST(Utf8(\"1998-03-18\") AS Date32) AS Timestamp(Nanosecond, None))\n EmptyRelation"; assert_analyzed_plan_eq(Arc::new(TypeCoercion::new()), plan, expected)?; @@ -1473,7 +1592,6 @@ mod test { )); let empty = empty(); let plan = LogicalPlan::Projection(Projection::try_new(vec![expr], empty)?); - dbg!(&plan); let expected = "Projection: CAST(Utf8(\"1998-03-18\") AS Timestamp(Nanosecond, None)) - CAST(Utf8(\"1998-03-18\") AS Timestamp(Nanosecond, None))\n EmptyRelation"; assert_analyzed_plan_eq(Arc::new(TypeCoercion::new()), plan, expected)?; diff --git a/datafusion/optimizer/src/eliminate_nested_union.rs b/datafusion/optimizer/src/eliminate_nested_union.rs index cc8cf1f56c18..5f41e4f137b1 100644 --- a/datafusion/optimizer/src/eliminate_nested_union.rs +++ b/datafusion/optimizer/src/eliminate_nested_union.rs @@ -114,8 +114,11 @@ fn extract_plan_from_distinct(plan: Arc) -> Arc { #[cfg(test)] mod tests { use super::*; + use crate::analyzer::type_coercion::TypeCoercion; + use crate::analyzer::Analyzer; use crate::test::*; use arrow::datatypes::{DataType, Field, Schema}; + use datafusion_common::config::ConfigOptions; use datafusion_expr::{col, logical_plan::table_scan}; fn schema() -> Schema { @@ -127,7 +130,14 @@ mod tests { } fn assert_optimized_plan_equal(plan: LogicalPlan, expected: &str) -> Result<()> { - assert_optimized_plan_eq(Arc::new(EliminateNestedUnion::new()), plan, expected) + let options = ConfigOptions::default(); + let analyzed_plan = Analyzer::with_rules(vec![Arc::new(TypeCoercion::new())]) + .execute_and_check(plan, &options, |_, _| {})?; + assert_optimized_plan_eq( + Arc::new(EliminateNestedUnion::new()), + analyzed_plan, + expected, + ) } #[test] diff --git a/datafusion/optimizer/src/push_down_limit.rs b/datafusion/optimizer/src/push_down_limit.rs index 612aac1d152d..4d8f1dbdb955 100644 --- a/datafusion/optimizer/src/push_down_limit.rs +++ b/datafusion/optimizer/src/push_down_limit.rs @@ -24,6 +24,7 @@ use crate::optimizer::ApplyOrder; use crate::{OptimizerConfig, OptimizerRule}; use datafusion_common::tree_node::Transformed; +use datafusion_common::utils::combine_limit; use datafusion_common::Result; use datafusion_expr::logical_plan::tree_node::unwrap_arc; use datafusion_expr::logical_plan::{Join, JoinType, Limit, LogicalPlan}; @@ -217,69 +218,6 @@ fn transformed_limit( }))) } -/// Computes the `skip` and `fetch` parameters of a single limit that would be -/// equivalent to two consecutive limits with the given `skip`/`fetch` parameters. -/// -/// There are multiple cases to consider: -/// -/// # Case 0: Parent and child are disjoint (`child_fetch <= skip`). -/// -/// ```text -/// Before merging: -/// |........skip........|---fetch-->| Parent limit -/// |...child_skip...|---child_fetch-->| Child limit -/// ``` -/// -/// After merging: -/// ```text -/// |.........(child_skip + skip).........| -/// ``` -/// -/// # Case 1: Parent is beyond child's range (`skip < child_fetch <= skip + fetch`). 
-/// -/// Before merging: -/// ```text -/// |...skip...|------------fetch------------>| Parent limit -/// |...child_skip...|-------------child_fetch------------>| Child limit -/// ``` -/// -/// After merging: -/// ```text -/// |....(child_skip + skip)....|---(child_fetch - skip)-->| -/// ``` -/// -/// # Case 2: Parent is within child's range (`skip + fetch < child_fetch`). -/// -/// Before merging: -/// ```text -/// |...skip...|---fetch-->| Parent limit -/// |...child_skip...|-------------child_fetch------------>| Child limit -/// ``` -/// -/// After merging: -/// ```text -/// |....(child_skip + skip)....|---fetch-->| -/// ``` -pub fn combine_limit( - parent_skip: usize, - parent_fetch: Option, - child_skip: usize, - child_fetch: Option, -) -> (usize, Option) { - let combined_skip = child_skip.saturating_add(parent_skip); - - let combined_fetch = match (parent_fetch, child_fetch) { - (Some(parent_fetch), Some(child_fetch)) => { - Some(min(parent_fetch, child_fetch.saturating_sub(parent_skip))) - } - (Some(parent_fetch), None) => Some(parent_fetch), - (None, Some(child_fetch)) => Some(child_fetch.saturating_sub(parent_skip)), - (None, None) => None, - }; - - (combined_skip, combined_fetch) -} - /// Adds a limit to the inputs of a join, if possible fn push_down_join(mut join: Join, limit: usize) -> Transformed { use JoinType::*; diff --git a/datafusion/optimizer/tests/optimizer_integration.rs b/datafusion/optimizer/tests/optimizer_integration.rs index aaa5eec3955c..93dd49b17492 100644 --- a/datafusion/optimizer/tests/optimizer_integration.rs +++ b/datafusion/optimizer/tests/optimizer_integration.rs @@ -335,6 +335,27 @@ fn test_propagate_empty_relation_inner_join_and_unions() { assert_eq!(expected, format!("{plan}")); } +#[test] +fn select_wildcard_with_repeated_column() { + let sql = "SELECT *, col_int32 FROM test"; + let err = test_sql(sql).expect_err("query should have failed"); + assert_eq!( + "expand_wildcard_rule\ncaused by\nError during planning: Projections require unique expression names but the expression \"test.col_int32\" at position 0 and \"test.col_int32\" at position 7 have the same name. Consider aliasing (\"AS\") one of them.", + err.strip_backtrace() + ); +} + +#[test] +fn select_wildcard_with_repeated_column_but_is_aliased() { + let sql = "SELECT *, col_int32 as col_32 FROM test"; + + let plan = test_sql(sql).unwrap(); + let expected = "Projection: test.col_int32, test.col_uint32, test.col_utf8, test.col_date32, test.col_date64, test.col_ts_nano_none, test.col_ts_nano_utc, test.col_int32 AS col_32\ + \n TableScan: test projection=[col_int32, col_uint32, col_utf8, col_date32, col_date64, col_ts_nano_none, col_ts_nano_utc]"; + + assert_eq!(expected, format!("{plan}")); +} + fn test_sql(sql: &str) -> Result { // parse the SQL let dialect = GenericDialect {}; // or AnsiDialect, or your own dialect ... 
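The push_down_limit.rs hunk above drops the local `combine_limit` helper in favor of the shared one in `datafusion_common::utils`, which the new physical `LimitPushdown` rule further below also reuses. As a rough, standalone sketch of the skip/fetch arithmetic described by the removed doc comment (the function name and the `main` driver here are illustrative only, not the shared helper itself):

fn combine_limit_sketch(
    parent_skip: usize,
    parent_fetch: Option<usize>,
    child_skip: usize,
    child_fetch: Option<usize>,
) -> (usize, Option<usize>) {
    // Skips always add up (saturating to avoid overflow).
    let combined_skip = child_skip.saturating_add(parent_skip);
    // The combined fetch is bounded by the parent's fetch and by whatever is
    // left of the child's fetch once the parent's skip has been consumed.
    let combined_fetch = match (parent_fetch, child_fetch) {
        (Some(parent_fetch), Some(child_fetch)) => {
            Some(parent_fetch.min(child_fetch.saturating_sub(parent_skip)))
        }
        (Some(parent_fetch), None) => Some(parent_fetch),
        (None, Some(child_fetch)) => Some(child_fetch.saturating_sub(parent_skip)),
        (None, None) => None,
    };
    (combined_skip, combined_fetch)
}

fn main() {
    // Case 0 (disjoint): the parent skips past everything the child fetches.
    assert_eq!(combine_limit_sketch(10, Some(5), 0, Some(8)), (10, Some(0)));
    // Case 2 (parent window inside the child's range): the parent fetch survives.
    assert_eq!(combine_limit_sketch(2, Some(3), 1, Some(100)), (3, Some(3)));
}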
diff --git a/datafusion/physical-expr/benches/case_when.rs b/datafusion/physical-expr/benches/case_when.rs index 8a34f34a82db..9eda1277c263 100644 --- a/datafusion/physical-expr/benches/case_when.rs +++ b/datafusion/physical-expr/benches/case_when.rs @@ -44,12 +44,12 @@ fn criterion_benchmark(c: &mut Criterion) { if i % 7 == 0 { c2.append_null(); } else { - c2.append_value(&format!("string {i}")); + c2.append_value(format!("string {i}")); } if i % 9 == 0 { c3.append_null(); } else { - c3.append_value(&format!("other string {i}")); + c3.append_value(format!("other string {i}")); } } let c1 = Arc::new(c1.finish()); diff --git a/datafusion/physical-expr/src/analysis.rs b/datafusion/physical-expr/src/analysis.rs index bcf1c8e510b1..3eac62a4df08 100644 --- a/datafusion/physical-expr/src/analysis.rs +++ b/datafusion/physical-expr/src/analysis.rs @@ -119,7 +119,7 @@ impl ExprBoundaries { Ok(ExprBoundaries { column, interval, - distinct_count: col_stats.distinct_count.clone(), + distinct_count: col_stats.distinct_count, }) } diff --git a/datafusion/physical-expr/src/expressions/case.rs b/datafusion/physical-expr/src/expressions/case.rs index 583a4ef32542..c6afb5c05985 100644 --- a/datafusion/physical-expr/src/expressions/case.rs +++ b/datafusion/physical-expr/src/expressions/case.rs @@ -1146,7 +1146,7 @@ mod tests { if i % 7 == 0 { c2.append_null(); } else { - c2.append_value(&format!("string {i}")); + c2.append_value(format!("string {i}")); } } let c1 = Arc::new(c1.finish()); diff --git a/datafusion/physical-expr/src/expressions/in_list.rs b/datafusion/physical-expr/src/expressions/in_list.rs index 8a3885030b9d..dfc70551ccf6 100644 --- a/datafusion/physical-expr/src/expressions/in_list.rs +++ b/datafusion/physical-expr/src/expressions/in_list.rs @@ -28,7 +28,6 @@ use crate::PhysicalExpr; use arrow::array::*; use arrow::buffer::BooleanBuffer; use arrow::compute::kernels::boolean::{not, or_kleene}; -use arrow::compute::kernels::cmp::eq; use arrow::compute::take; use arrow::datatypes::*; use arrow::util::bit_iterator::BitIndexIterator; @@ -41,7 +40,8 @@ use datafusion_common::hash_utils::HashValue; use datafusion_common::{ exec_err, internal_err, not_impl_err, DFSchema, Result, ScalarValue, }; -use datafusion_expr::ColumnarValue; +use datafusion_expr::{ColumnarValue, Operator}; +use datafusion_physical_expr_common::datum::compare_op_for_nested; use ahash::RandomState; use hashbrown::hash_map::RawEntryMut; @@ -361,7 +361,11 @@ impl PhysicalExpr for InListExpr { |result, expr| -> Result { Ok(or_kleene( &result, - &eq(&value, &expr?.into_array(num_rows)?)?, + &compare_op_for_nested( + Operator::Eq, + &value, + &expr?.into_array(num_rows)?, + )?, )?) }, )?; diff --git a/datafusion/physical-optimizer/src/lib.rs b/datafusion/physical-optimizer/src/lib.rs index 8108493a0d3b..d54e6dbcab8f 100644 --- a/datafusion/physical-optimizer/src/lib.rs +++ b/datafusion/physical-optimizer/src/lib.rs @@ -18,6 +18,7 @@ #![deny(clippy::clone_on_ref_ptr)] pub mod aggregate_statistics; +pub mod limit_pushdown; mod optimizer; pub mod output_requirements; diff --git a/datafusion/physical-optimizer/src/limit_pushdown.rs b/datafusion/physical-optimizer/src/limit_pushdown.rs new file mode 100644 index 000000000000..2b787980585a --- /dev/null +++ b/datafusion/physical-optimizer/src/limit_pushdown.rs @@ -0,0 +1,253 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! [`LimitPushdown`] pushes `LIMIT` down through `ExecutionPlan`s to reduce +//! data transfer as much as possible. + +use std::fmt::Debug; +use std::sync::Arc; + +use crate::PhysicalOptimizerRule; +use datafusion_common::config::ConfigOptions; +use datafusion_common::plan_datafusion_err; +use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; +use datafusion_common::utils::combine_limit; +use datafusion_common::Result; +use datafusion_physical_plan::coalesce_partitions::CoalescePartitionsExec; +use datafusion_physical_plan::limit::{GlobalLimitExec, LocalLimitExec}; +use datafusion_physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; +use datafusion_physical_plan::ExecutionPlan; + +/// This rule inspects [`ExecutionPlan`]'s and pushes down the fetch limit from +/// the parent to the child if applicable. +#[derive(Default)] +pub struct LimitPushdown {} + +impl LimitPushdown { + #[allow(missing_docs)] + pub fn new() -> Self { + Self {} + } +} + +impl PhysicalOptimizerRule for LimitPushdown { + fn optimize( + &self, + plan: Arc, + _config: &ConfigOptions, + ) -> Result> { + plan.transform_down(push_down_limits).data() + } + + fn name(&self) -> &str { + "LimitPushdown" + } + + fn schema_check(&self) -> bool { + true + } +} + +/// This enumeration makes `skip` and `fetch` calculations easier by providing +/// a single API for both local and global limit operators. +#[derive(Debug)] +enum LimitExec { + Global(GlobalLimitExec), + Local(LocalLimitExec), +} + +impl LimitExec { + fn input(&self) -> &Arc { + match self { + Self::Global(global) => global.input(), + Self::Local(local) => local.input(), + } + } + + fn fetch(&self) -> Option { + match self { + Self::Global(global) => global.fetch(), + Self::Local(local) => Some(local.fetch()), + } + } + + fn skip(&self) -> usize { + match self { + Self::Global(global) => global.skip(), + Self::Local(_) => 0, + } + } + + fn with_child(&self, child: Arc) -> Self { + match self { + Self::Global(global) => { + Self::Global(GlobalLimitExec::new(child, global.skip(), global.fetch())) + } + Self::Local(local) => Self::Local(LocalLimitExec::new(child, local.fetch())), + } + } +} + +impl From for Arc { + fn from(limit_exec: LimitExec) -> Self { + match limit_exec { + LimitExec::Global(global) => Arc::new(global), + LimitExec::Local(local) => Arc::new(local), + } + } +} + +/// Pushes down the limit through the plan. 
+pub fn push_down_limits( + plan: Arc, +) -> Result>> { + let maybe_modified = if let Some(limit_exec) = extract_limit(&plan) { + let child = limit_exec.input(); + if let Some(child_limit) = extract_limit(child) { + let merged = merge_limits(&limit_exec, &child_limit); + // Revisit current node in case of consecutive pushdowns + Some(push_down_limits(merged)?.data) + } else if child.supports_limit_pushdown() { + try_push_down_limit(&limit_exec, Arc::clone(child))? + } else { + add_fetch_to_child(&limit_exec, Arc::clone(child)) + } + } else { + None + }; + + Ok(maybe_modified.map_or(Transformed::no(plan), Transformed::yes)) +} + +/// Transforms the [`ExecutionPlan`] into a [`LimitExec`] if it is a +/// [`GlobalLimitExec`] or a [`LocalLimitExec`]. +fn extract_limit(plan: &Arc) -> Option { + if let Some(global_limit) = plan.as_any().downcast_ref::() { + Some(LimitExec::Global(GlobalLimitExec::new( + Arc::clone(global_limit.input()), + global_limit.skip(), + global_limit.fetch(), + ))) + } else { + plan.as_any() + .downcast_ref::() + .map(|local_limit| { + LimitExec::Local(LocalLimitExec::new( + Arc::clone(local_limit.input()), + local_limit.fetch(), + )) + }) + } +} + +/// Merge the limits of the parent and the child. If at least one of them is a +/// [`GlobalLimitExec`], the result is also a [`GlobalLimitExec`]. Otherwise, +/// the result is a [`LocalLimitExec`]. +fn merge_limits( + parent_limit_exec: &LimitExec, + child_limit_exec: &LimitExec, +) -> Arc { + // We can use the logic in `combine_limit` from the logical optimizer: + let (skip, fetch) = combine_limit( + parent_limit_exec.skip(), + parent_limit_exec.fetch(), + child_limit_exec.skip(), + child_limit_exec.fetch(), + ); + match (parent_limit_exec, child_limit_exec) { + (LimitExec::Local(_), LimitExec::Local(_)) => { + // The fetch is present in this case, can unwrap. + Arc::new(LocalLimitExec::new( + Arc::clone(child_limit_exec.input()), + fetch.unwrap(), + )) + } + _ => Arc::new(GlobalLimitExec::new( + Arc::clone(child_limit_exec.input()), + skip, + fetch, + )), + } +} + +/// Pushes down the limit through the child. If the child has a single input +/// partition, simply swaps the parent and the child. Otherwise, adds a +/// [`LocalLimitExec`] after in between in addition to swapping, because of +/// multiple input partitions. +fn try_push_down_limit( + limit_exec: &LimitExec, + child: Arc, +) -> Result>> { + let grandchildren = child.children(); + if let Some(&grandchild) = grandchildren.first() { + // GlobalLimitExec and LocalLimitExec must have an input after pushdown + if combines_input_partitions(&child) { + // We still need a LocalLimitExec after the child + if let Some(fetch) = limit_exec.fetch() { + let new_local_limit = Arc::new(LocalLimitExec::new( + Arc::clone(grandchild), + fetch + limit_exec.skip(), + )); + let new_child = + Arc::clone(&child).with_new_children(vec![new_local_limit])?; + Ok(Some(limit_exec.with_child(new_child).into())) + } else { + Ok(None) + } + } else { + // Swap current with child + let new_limit = limit_exec.with_child(Arc::clone(grandchild)); + let new_child = child.with_new_children(vec![new_limit.into()])?; + Ok(Some(new_child)) + } + } else { + // Operators supporting limit push down must have a child. + Err(plan_datafusion_err!( + "{:#?} must have a child to push down limit", + child + )) + } +} + +fn combines_input_partitions(exec: &Arc) -> bool { + let exec = exec.as_any(); + exec.is::() || exec.is::() +} + +/// Transforms child to the fetching version if supported. 
Removes the parent if +/// skip is zero. Otherwise, keeps the parent. +fn add_fetch_to_child( + limit_exec: &LimitExec, + child: Arc, +) -> Option> { + let fetch = limit_exec.fetch(); + let skip = limit_exec.skip(); + + let child_fetch = fetch.map(|f| f + skip); + + if let Some(child_with_fetch) = child.with_fetch(child_fetch) { + if skip > 0 { + Some(limit_exec.with_child(child_with_fetch).into()) + } else { + Some(child_with_fetch) + } + } else { + None + } +} + +// See tests in datafusion/core/tests/physical_optimizer diff --git a/datafusion/physical-optimizer/src/output_requirements.rs b/datafusion/physical-optimizer/src/output_requirements.rs index f971d8f1f0aa..fdfdd349e36e 100644 --- a/datafusion/physical-optimizer/src/output_requirements.rs +++ b/datafusion/physical-optimizer/src/output_requirements.rs @@ -286,3 +286,5 @@ fn require_top_ordering_helper( Ok((plan, false)) } } + +// See tests in datafusion/core/tests/physical_optimizer diff --git a/datafusion/physical-plan/src/aggregates/row_hash.rs b/datafusion/physical-plan/src/aggregates/row_hash.rs index ed3d6d49f9f3..b3221752d034 100644 --- a/datafusion/physical-plan/src/aggregates/row_hash.rs +++ b/datafusion/physical-plan/src/aggregates/row_hash.rs @@ -100,22 +100,24 @@ struct SpillState { /// /// See "partial aggregation" discussion on [`GroupedHashAggregateStream`] struct SkipAggregationProbe { - /// Number of processed input rows + /// Number of processed input rows (updated during probing) input_rows: usize, - /// Number of total group values for `input_rows` + /// Number of total group values for `input_rows` (updated during probing) num_groups: usize, - /// Aggregation ratio check should be performed only when the - /// number of input rows exceeds this threshold + /// Aggregation ratio check performed when the number of input rows exceeds + /// this threshold (from `SessionConfig`) probe_rows_threshold: usize, - /// Maximum allowed value of `input_rows` / `num_groups` to - /// continue aggregation + /// Maximum ratio of `num_groups` to `input_rows` for continuing aggregation + /// (from `SessionConfig`). If the ratio exceeds this value, aggregation + /// is skipped and input rows are directly converted to output probe_ratio_threshold: f64, - /// Flag indicating that further data aggregation mey be skipped + /// Flag indicating further data aggregation may be skipped (decision made + /// when probing complete) should_skip: bool, - /// Flag indicating that further updates of `SkipAggregationProbe` - /// state won't make any effect + /// Flag indicating further updates of `SkipAggregationProbe` state won't + /// make any effect (set either while probing or on probing completion) is_locked: bool, /// Number of rows where state was output without aggregation. 
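The reworded `SkipAggregationProbe` comments above describe a probing phase: row and group counts accumulate until `probe_rows_threshold` rows have been seen, then the groups-to-rows ratio is compared against `probe_ratio_threshold` and the decision is locked. A minimal, self-contained sketch of that decision, assuming hypothetical names rather than the actual `row_hash.rs` code:

struct ProbeSketch {
    input_rows: usize,
    num_groups: usize,
    probe_rows_threshold: usize,
    probe_ratio_threshold: f64,
    should_skip: bool,
    is_locked: bool,
}

impl ProbeSketch {
    // `total_groups` is the number of distinct group values seen so far.
    fn update(&mut self, batch_rows: usize, total_groups: usize) {
        if self.is_locked {
            return; // further updates have no effect once the decision is made
        }
        self.input_rows += batch_rows;
        self.num_groups = total_groups;
        if self.input_rows >= self.probe_rows_threshold {
            // Nearly one group per input row means partial aggregation is not
            // reducing the data, so it should be skipped.
            self.should_skip = self.num_groups as f64 / self.input_rows as f64
                >= self.probe_ratio_threshold;
            self.is_locked = true;
        }
    }
}

fn main() {
    let mut probe = ProbeSketch {
        input_rows: 0,
        num_groups: 0,
        probe_rows_threshold: 100_000,
        probe_ratio_threshold: 0.8,
        should_skip: false,
        is_locked: false,
    };
    probe.update(200_000, 190_000); // ~0.95 groups per row: high cardinality
    assert!(probe.should_skip);
}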
diff --git a/datafusion/physical-plan/src/coalesce_batches.rs b/datafusion/physical-plan/src/coalesce_batches.rs index 13c10c535c08..5589027694fe 100644 --- a/datafusion/physical-plan/src/coalesce_batches.rs +++ b/datafusion/physical-plan/src/coalesce_batches.rs @@ -20,24 +20,24 @@ use std::any::Any; use std::pin::Pin; use std::sync::Arc; -use std::task::{ready, Context, Poll}; +use std::task::{Context, Poll}; + +use super::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet}; +use super::{DisplayAs, ExecutionPlanProperties, PlanProperties, Statistics}; +use crate::{ + DisplayFormatType, ExecutionPlan, RecordBatchStream, SendableRecordBatchStream, +}; use arrow::array::{AsArray, StringViewBuilder}; use arrow::compute::concat_batches; use arrow::datatypes::SchemaRef; use arrow::record_batch::RecordBatch; use arrow_array::{Array, ArrayRef}; -use futures::stream::{Stream, StreamExt}; - use datafusion_common::Result; use datafusion_execution::TaskContext; -use crate::{ - DisplayFormatType, ExecutionPlan, RecordBatchStream, SendableRecordBatchStream, -}; - -use super::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet}; -use super::{DisplayAs, ExecutionPlanProperties, PlanProperties, Statistics}; +use futures::ready; +use futures::stream::{Stream, StreamExt}; /// `CoalesceBatchesExec` combines small batches into larger batches for more /// efficient use of vectorized processing by later operators. @@ -202,8 +202,9 @@ impl ExecutionPlan for CoalesceBatchesExec { self.target_batch_size, self.fetch, ), - is_closed: false, baseline_metrics: BaselineMetrics::new(&self.metrics, partition), + // Start by pulling data + inner_state: CoalesceBatchesStreamState::Pull, })) } @@ -236,10 +237,11 @@ struct CoalesceBatchesStream { input: SendableRecordBatchStream, /// Buffer for combining batches coalescer: BatchCoalescer, - /// Whether the stream has finished returning all of its data or not - is_closed: bool, /// Execution metrics baseline_metrics: BaselineMetrics, + /// The current inner state of the stream. This state dictates the current + /// action or operation to be performed in the streaming process. + inner_state: CoalesceBatchesStreamState, } impl Stream for CoalesceBatchesStream { @@ -259,45 +261,98 @@ impl Stream for CoalesceBatchesStream { } } +/// Enumeration of possible states for `CoalesceBatchesStream`. +/// It represents different stages in the lifecycle of a stream of record batches. +/// +/// An example of state transition: +/// Notation: +/// `[3000]`: A batch with size 3000 +/// `{[2000], [3000]}`: `CoalesceBatchesStream`'s internal buffer with 2 batches buffered +/// Input of `CoalesceBatchesStream` will generate three batches `[2000], [3000], [4000]` +/// The coalescing procedure will go through the following steps with a 4096-row coalescing threshold: +/// 1. Read the first batch and get it buffered. +/// - initial state: `Pull` +/// - initial buffer: `{}` +/// - updated buffer: `{[2000]}` +/// - next state: `Pull` +/// 2. Read the second batch; the coalescing target is reached since 2000 + 3000 > 4096. +/// - initial state: `Pull` +/// - initial buffer: `{[2000]}` +/// - updated buffer: `{[2000], [3000]}` +/// - next state: `ReturnBuffer` +/// 3. The two batches in the buffer get merged and consumed by the upstream operator. +/// - initial state: `ReturnBuffer` +/// - initial buffer: `{[2000], [3000]}` +/// - updated buffer: `{}` +/// - next state: `Pull` +/// 4. Read the third input batch.
+/// - initial state: `Pull` +/// - initial buffer: `{}` +/// - updated buffer: `{[4000]}` +/// - next state: `Pull` +/// 5. The input is ended now. Jump to exhaustion state preparing the finalized data. +/// - initial state: `Pull` +/// - initial buffer: `{[4000]}` +/// - updated buffer: `{[4000]}` +/// - next state: `Exhausted` +#[derive(Debug, Clone, Eq, PartialEq)] +enum CoalesceBatchesStreamState { + /// State to pull a new batch from the input stream. + Pull, + /// State to return a buffered batch. + ReturnBuffer, + /// State indicating that the stream is exhausted. + Exhausted, +} + impl CoalesceBatchesStream { fn poll_next_inner( self: &mut Pin<&mut Self>, cx: &mut Context<'_>, ) -> Poll>> { - // Get a clone (uses same underlying atomic) as self gets borrowed below let cloned_time = self.baseline_metrics.elapsed_compute().clone(); - - if self.is_closed { - return Poll::Ready(None); - } loop { - let input_batch = self.input.poll_next_unpin(cx); - // records time on drop - let _timer = cloned_time.timer(); - match ready!(input_batch) { - Some(result) => { - let Ok(input_batch) = result else { - return Poll::Ready(Some(result)); // pass back error - }; - // Buffer the batch and either get more input if not enough - // rows yet or output - match self.coalescer.push_batch(input_batch) { - Ok(None) => continue, - res => { - if self.coalescer.limit_reached() { - self.is_closed = true; + match &self.inner_state { + CoalesceBatchesStreamState::Pull => { + // Attempt to pull the next batch from the input stream. + let input_batch = ready!(self.input.poll_next_unpin(cx)); + // Start timing the operation. The timer records time upon being dropped. + let _timer = cloned_time.timer(); + + match input_batch { + Some(Ok(batch)) => match self.coalescer.push_batch(batch) { + CoalescerState::Continue => {} + CoalescerState::LimitReached => { + self.inner_state = CoalesceBatchesStreamState::Exhausted; } - return Poll::Ready(res.transpose()); + CoalescerState::TargetReached => { + self.inner_state = + CoalesceBatchesStreamState::ReturnBuffer; + } + }, + None => { + // End of input stream, but buffered batches might still be present. + self.inner_state = CoalesceBatchesStreamState::Exhausted; } + other => return Poll::Ready(other), } } - None => { - self.is_closed = true; - // we have reached the end of the input stream but there could still - // be buffered batches - return match self.coalescer.finish() { - Ok(None) => Poll::Ready(None), - res => Poll::Ready(res.transpose()), + CoalesceBatchesStreamState::ReturnBuffer => { + // Combine buffered batches into one batch and return it. + let batch = self.coalescer.finish_batch()?; + // Set to pull state for the next iteration. + self.inner_state = CoalesceBatchesStreamState::Pull; + return Poll::Ready(Some(Ok(batch))); + } + CoalesceBatchesStreamState::Exhausted => { + // Handle the end of the input stream. + return if self.coalescer.buffer.is_empty() { + // If buffer is empty, return None indicating the stream is fully consumed. + Poll::Ready(None) + } else { + // If the buffer still contains batches, prepare to return them. 
+ let batch = self.coalescer.finish_batch()?; + Poll::Ready(Some(Ok(batch))) }; } } @@ -364,90 +419,72 @@ impl BatchCoalescer { Arc::clone(&self.schema) } - /// Add a batch, returning a batch if the target batch size or limit is reached - fn push_batch(&mut self, batch: RecordBatch) -> Result> { - // discard empty batches - if batch.num_rows() == 0 { - return Ok(None); - } - - // past limit - if self.limit_reached() { - return Ok(None); - } - + /// Given a batch, it updates the buffer of [`BatchCoalescer`]. It returns + /// a variant of [`CoalescerState`] indicating the final state of the buffer. + fn push_batch(&mut self, batch: RecordBatch) -> CoalescerState { let batch = gc_string_view_batch(&batch); + if self.limit_reached(&batch) { + CoalescerState::LimitReached + } else if self.target_reached(batch) { + CoalescerState::TargetReached + } else { + CoalescerState::Continue + } + } - // Handle fetch limit: - if let Some(fetch) = self.fetch { - if self.total_rows + batch.num_rows() >= fetch { - // We have reached the fetch limit. + /// The function checks if the buffer can reach the specified limit after getting `batch`. + /// If it does, it slices the received batch as needed, updates the buffer with it, and + /// finally returns `true`. Otherwise; the function does nothing and returns `false`. + fn limit_reached(&mut self, batch: &RecordBatch) -> bool { + match self.fetch { + Some(fetch) if self.total_rows + batch.num_rows() >= fetch => { + // Limit is reached let remaining_rows = fetch - self.total_rows; debug_assert!(remaining_rows > 0); - self.total_rows = fetch; - // Trim the batch and add to buffered batches: + let batch = batch.slice(0, remaining_rows); self.buffered_rows += batch.num_rows(); + self.total_rows = fetch; self.buffer.push(batch); - // Combine buffered batches: - let batch = concat_batches(&self.schema, &self.buffer)?; - // Reset the buffer state and return final batch: - self.buffer.clear(); - self.buffered_rows = 0; - return Ok(Some(batch)); + true } + _ => false, } - self.total_rows += batch.num_rows(); - - // batch itself is already big enough and we have no buffered rows so - // return it directly - if batch.num_rows() >= self.target_batch_size && self.buffer.is_empty() { - return Ok(Some(batch)); - } - // add to the buffered batches - self.buffered_rows += batch.num_rows(); - self.buffer.push(batch); - // check to see if we have enough batches yet - let batch = if self.buffered_rows >= self.target_batch_size { - // combine the batches and return - let batch = concat_batches(&self.schema, &self.buffer)?; - // reset buffer state - self.buffer.clear(); - self.buffered_rows = 0; - // return batch - Some(batch) - } else { - None - }; - Ok(batch) } - /// Finish the coalescing process, returning all buffered data as a final, - /// single batch, if any - fn finish(&mut self) -> Result> { - if self.buffer.is_empty() { - Ok(None) + /// Updates the buffer with the given batch. If the target batch size is reached, + /// the function returns `true`. Otherwise, it returns `false`. 
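A rough, dependency-free model of the push/finish protocol introduced in this hunk may help: batches are reduced to their row counts so the control flow (`Continue` / `TargetReached` / `LimitReached`, followed by `finish_batch`) can be shown without Arrow. It mirrors the new `BatchCoalescer`/`CoalescerState` shape only loosely; the names and the exact slicing behaviour at the fetch limit are assumptions for illustration.

```rust
enum PushState {
    Continue,
    TargetReached,
    LimitReached,
}

struct Coalescer {
    target_batch_size: usize,
    fetch: Option<usize>,
    total_rows: usize,
    buffered_rows: usize,
    buffer: Vec<usize>, // row counts of buffered batches
}

impl Coalescer {
    fn push_batch(&mut self, rows: usize) -> PushState {
        if let Some(fetch) = self.fetch {
            if self.total_rows + rows >= fetch {
                // Keep only the rows needed to satisfy the limit.
                let remaining = fetch - self.total_rows;
                self.buffer.push(remaining);
                self.buffered_rows += remaining;
                self.total_rows = fetch;
                return PushState::LimitReached;
            }
        }
        if rows == 0 {
            return PushState::Continue;
        }
        self.total_rows += rows;
        self.buffered_rows += rows;
        self.buffer.push(rows);
        if self.buffered_rows >= self.target_batch_size {
            PushState::TargetReached
        } else {
            PushState::Continue
        }
    }

    /// Concatenate (here: sum) the buffered batches and clear the buffer.
    fn finish_batch(&mut self) -> usize {
        let rows: usize = self.buffer.drain(..).sum();
        self.buffered_rows = 0;
        rows
    }
}

fn main() {
    let mut c = Coalescer {
        target_batch_size: 4096,
        fetch: None,
        total_rows: 0,
        buffered_rows: 0,
        buffer: Vec::new(),
    };
    assert!(matches!(c.push_batch(2000), PushState::Continue));
    // 2000 + 3000 >= 4096, so the buffered batches are ready to be returned.
    assert!(matches!(c.push_batch(3000), PushState::TargetReached));
    assert_eq!(c.finish_batch(), 5000);
}
```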
+ fn target_reached(&mut self, batch: RecordBatch) -> bool { + if batch.num_rows() == 0 { + false } else { - // combine the batches and return - let batch = concat_batches(&self.schema, &self.buffer)?; - // reset buffer state - self.buffer.clear(); - self.buffered_rows = 0; - // return batch - Ok(Some(batch)) + self.total_rows += batch.num_rows(); + self.buffered_rows += batch.num_rows(); + self.buffer.push(batch); + self.buffered_rows >= self.target_batch_size } } - /// returns true if there is a limit and it has been reached - pub fn limit_reached(&self) -> bool { - if let Some(fetch) = self.fetch { - self.total_rows >= fetch - } else { - false - } + /// Concatenates and returns all buffered batches, and clears the buffer. + fn finish_batch(&mut self) -> Result { + let batch = concat_batches(&self.schema, &self.buffer)?; + self.buffer.clear(); + self.buffered_rows = 0; + Ok(batch) } } +/// This enumeration acts as a status indicator for the [`BatchCoalescer`] after a +/// [`BatchCoalescer::push_batch()`] operation. +enum CoalescerState { + /// Neither the limit nor the target batch size is reached. + Continue, + /// The sufficient row count to produce a complete query result is reached. + LimitReached, + /// The specified minimum number of rows a batch should have is reached. + TargetReached, +} + /// Heuristically compact `StringViewArray`s to reduce memory usage, if needed /// /// This function decides when to consolidate the StringView into a new buffer @@ -521,11 +558,13 @@ fn gc_string_view_batch(batch: &RecordBatch) -> RecordBatch { #[cfg(test)] mod tests { + use std::ops::Range; + use super::*; + use arrow::datatypes::{DataType, Field, Schema}; use arrow_array::builder::ArrayBuilder; use arrow_array::{StringViewArray, UInt32Array}; - use std::ops::Range; #[test] fn test_coalesce() { @@ -670,16 +709,25 @@ mod tests { // create a single large input batch for output comparison let single_input_batch = concat_batches(&schema, &input_batches).unwrap(); - let mut coalescer = BatchCoalescer::new(schema, target_batch_size, fetch); + let mut coalescer = + BatchCoalescer::new(Arc::clone(&schema), target_batch_size, fetch); let mut output_batches = vec![]; for batch in input_batches { - if let Some(batch) = coalescer.push_batch(batch).unwrap() { - output_batches.push(batch); + match coalescer.push_batch(batch) { + CoalescerState::Continue => {} + CoalescerState::LimitReached => { + output_batches.push(coalescer.finish_batch().unwrap()); + break; + } + CoalescerState::TargetReached => { + coalescer.buffered_rows = 0; + output_batches.push(coalescer.finish_batch().unwrap()); + } } } - if let Some(batch) = coalescer.finish().unwrap() { - output_batches.push(batch); + if coalescer.buffered_rows != 0 { + output_batches.extend(coalescer.buffer); } // make sure we got the expected number of output batches and content diff --git a/datafusion/physical-plan/src/filter.rs b/datafusion/physical-plan/src/filter.rs index fa9108057cfe..568987b14798 100644 --- a/datafusion/physical-plan/src/filter.rs +++ b/datafusion/physical-plan/src/filter.rs @@ -324,7 +324,7 @@ fn collect_new_statistics( (Precision::Inexact(lower), Precision::Inexact(upper)) }; ColumnStatistics { - null_count: input_column_stats[idx].null_count.clone().to_inexact(), + null_count: input_column_stats[idx].null_count.to_inexact(), max_value, min_value, distinct_count: distinct_count.to_inexact(), diff --git a/datafusion/physical-plan/src/joins/utils.rs b/datafusion/physical-plan/src/joins/utils.rs index b8a58e4d0d30..80d8815bdebc 100644 --- 
a/datafusion/physical-plan/src/joins/utils.rs +++ b/datafusion/physical-plan/src/joins/utils.rs @@ -827,12 +827,12 @@ fn estimate_join_cardinality( JoinType::Inner | JoinType::Left | JoinType::Right | JoinType::Full => { let ij_cardinality = estimate_inner_join_cardinality( Statistics { - num_rows: left_stats.num_rows.clone(), + num_rows: left_stats.num_rows, total_byte_size: Precision::Absent, column_statistics: left_col_stats, }, Statistics { - num_rows: right_stats.num_rows.clone(), + num_rows: right_stats.num_rows, total_byte_size: Precision::Absent, column_statistics: right_col_stats, }, @@ -1024,7 +1024,7 @@ fn max_distinct_count( stats: &ColumnStatistics, ) -> Precision { match &stats.distinct_count { - dc @ (Precision::Exact(_) | Precision::Inexact(_)) => dc.clone(), + &dc @ (Precision::Exact(_) | Precision::Inexact(_)) => dc, _ => { // The number can never be greater than the number of rows we have // minus the nulls (since they don't count as distinct values). @@ -2054,9 +2054,7 @@ mod tests { ); assert_eq!( partial_join_stats.map(|s| s.column_statistics), - expected_cardinality - .clone() - .map(|_| [left_col_stats, right_col_stats].concat()) + expected_cardinality.map(|_| [left_col_stats, right_col_stats].concat()) ); } Ok(()) diff --git a/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs b/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs index 6311107f7b58..29ead35895fe 100644 --- a/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs +++ b/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs @@ -1196,7 +1196,7 @@ mod tests { RecordBatchStream, SendableRecordBatchStream, TaskContext, }; use datafusion_expr::{ - Expr, WindowFrame, WindowFrameBound, WindowFrameUnits, WindowFunctionDefinition, + WindowFrame, WindowFrameBound, WindowFrameUnits, WindowFunctionDefinition, }; use datafusion_functions_aggregate::count::count_udaf; use datafusion_physical_expr::expressions::{col, Column, NthValue}; @@ -1303,10 +1303,7 @@ mod tests { let window_fn = WindowFunctionDefinition::AggregateUDF(count_udaf()); let col_expr = Arc::new(Column::new(schema.fields[0].name(), 0)) as Arc; - let log_expr = - Expr::Column(datafusion_common::Column::from(schema.fields[0].name())); let args = vec![col_expr]; - let log_args = vec![log_expr]; let partitionby_exprs = vec![col(hash, &schema)?]; let orderby_exprs = vec![PhysicalSortExpr { expr: col(order_by, &schema)?, @@ -1327,7 +1324,6 @@ mod tests { &window_fn, fn_name, &args, - &log_args, &partitionby_exprs, &orderby_exprs, Arc::new(window_frame.clone()), diff --git a/datafusion/physical-plan/src/windows/mod.rs b/datafusion/physical-plan/src/windows/mod.rs index 2e6ad4e1a14f..1fd0ca36b1eb 100644 --- a/datafusion/physical-plan/src/windows/mod.rs +++ b/datafusion/physical-plan/src/windows/mod.rs @@ -32,8 +32,8 @@ use arrow::datatypes::Schema; use arrow_schema::{DataType, Field, SchemaRef}; use datafusion_common::{exec_err, DataFusionError, Result, ScalarValue}; use datafusion_expr::{ - BuiltInWindowFunction, Expr, PartitionEvaluator, WindowFrame, - WindowFunctionDefinition, WindowUDF, + BuiltInWindowFunction, PartitionEvaluator, WindowFrame, WindowFunctionDefinition, + WindowUDF, }; use datafusion_physical_expr::equivalence::collapse_lex_req; use datafusion_physical_expr::{ @@ -94,7 +94,6 @@ pub fn create_window_expr( fun: &WindowFunctionDefinition, name: String, args: &[Arc], - _logical_args: &[Expr], partition_by: &[Arc], order_by: &[PhysicalSortExpr], window_frame: Arc, @@ -746,7 +745,6 @@ mod tests { 
&[col("a", &schema)?], &[], &[], - &[], Arc::new(WindowFrame::new(None)), schema.as_ref(), false, diff --git a/datafusion/proto/src/logical_plan/from_proto.rs b/datafusion/proto/src/logical_plan/from_proto.rs index 6c4c07428bd3..6cbea5f0cfcc 100644 --- a/datafusion/proto/src/logical_plan/from_proto.rs +++ b/datafusion/proto/src/logical_plan/from_proto.rs @@ -22,8 +22,8 @@ use datafusion_common::{ exec_datafusion_err, internal_err, plan_datafusion_err, Result, ScalarValue, TableReference, UnnestOptions, }; -use datafusion_expr::expr::Unnest; use datafusion_expr::expr::{Alias, Placeholder}; +use datafusion_expr::expr::{Unnest, WildcardOptions}; use datafusion_expr::ExprFunctionExt; use datafusion_expr::{ expr::{self, InList, Sort, WindowFunction}, @@ -556,7 +556,10 @@ pub fn parse_expr( ))), ExprType::Wildcard(protobuf::Wildcard { qualifier }) => { let qualifier = qualifier.to_owned().map(|x| x.try_into()).transpose()?; - Ok(Expr::Wildcard { qualifier }) + Ok(Expr::Wildcard { + qualifier, + options: WildcardOptions::default(), + }) } ExprType::ScalarUdfExpr(protobuf::ScalarUdfExprNode { fun_name, diff --git a/datafusion/proto/src/logical_plan/to_proto.rs b/datafusion/proto/src/logical_plan/to_proto.rs index ab81ce8af9cb..c7361c89c328 100644 --- a/datafusion/proto/src/logical_plan/to_proto.rs +++ b/datafusion/proto/src/logical_plan/to_proto.rs @@ -582,7 +582,7 @@ pub fn serialize_expr( expr_type: Some(ExprType::InList(expr)), } } - Expr::Wildcard { qualifier } => protobuf::LogicalExprNode { + Expr::Wildcard { qualifier, .. } => protobuf::LogicalExprNode { expr_type: Some(ExprType::Wildcard(protobuf::Wildcard { qualifier: qualifier.to_owned().map(|x| x.into()), })), diff --git a/datafusion/proto/src/physical_plan/from_proto.rs b/datafusion/proto/src/physical_plan/from_proto.rs index bc0a19336bae..b2f92f4b2ee4 100644 --- a/datafusion/proto/src/physical_plan/from_proto.rs +++ b/datafusion/proto/src/physical_plan/from_proto.rs @@ -169,13 +169,10 @@ pub fn parse_physical_window_expr( // TODO: Remove extended_schema if functions are all UDAF let extended_schema = schema_add_window_field(&window_node_expr, input_schema, &fun, &name)?; - // approx_percentile_cont and approx_percentile_cont_weight are not supported for UDAF from protobuf yet. - let logical_exprs = &[]; create_window_expr( &fun, name, &window_node_expr, - logical_exprs, &partition_by, &order_by, Arc::new(window_frame), diff --git a/datafusion/proto/src/physical_plan/mod.rs b/datafusion/proto/src/physical_plan/mod.rs index b5d28f40a68f..0f6722dd375b 100644 --- a/datafusion/proto/src/physical_plan/mod.rs +++ b/datafusion/proto/src/physical_plan/mod.rs @@ -477,7 +477,7 @@ impl AsExecutionPlan for protobuf::PhysicalPlanNode { ExprType::AggregateExpr(agg_node) => { let input_phy_expr: Vec> = agg_node.expr.iter() .map(|e| parse_physical_expr(e, registry, &physical_schema, extension_codec)).collect::>>()?; - let _ordering_req: Vec = agg_node.ordering_req.iter() + let ordering_req: Vec = agg_node.ordering_req.iter() .map(|e| parse_physical_sort_expr(e, registry, &physical_schema, extension_codec)).collect::>>()?; agg_node.aggregate_function.as_ref().map(|func| { match func { @@ -487,14 +487,12 @@ impl AsExecutionPlan for protobuf::PhysicalPlanNode { None => registry.udaf(udaf_name)? }; - // TODO: approx_percentile_cont and approx_percentile_cont_weight are not supported for UDAF from protobuf yet. 
- // TODO: `order by` is not supported for UDAF yet - // https://github.com/apache/datafusion/issues/11804 AggregateExprBuilder::new(agg_udf, input_phy_expr) .schema(Arc::clone(&physical_schema)) .alias(name) .with_ignore_nulls(agg_node.ignore_nulls) .with_distinct(agg_node.distinct) + .order_by(ordering_req) .build() } } diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs index a18fa03b2d15..eb7cc5c4b9c5 100644 --- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs @@ -58,7 +58,7 @@ use datafusion_common::{ use datafusion_expr::dml::CopyTo; use datafusion_expr::expr::{ self, Between, BinaryExpr, Case, Cast, GroupingSet, InList, Like, ScalarFunction, - Sort, Unnest, + Sort, Unnest, WildcardOptions, }; use datafusion_expr::logical_plan::{Extension, UserDefinedLogicalNodeCore}; use datafusion_expr::{ @@ -1977,7 +1977,10 @@ fn roundtrip_unnest() { #[test] fn roundtrip_wildcard() { - let test_expr = Expr::Wildcard { qualifier: None }; + let test_expr = Expr::Wildcard { + qualifier: None, + options: WildcardOptions::default(), + }; let ctx = SessionContext::new(); roundtrip_expr_test(test_expr, ctx); @@ -1987,6 +1990,7 @@ fn roundtrip_wildcard() { fn roundtrip_qualified_wildcard() { let test_expr = Expr::Wildcard { qualifier: Some("foo".into()), + options: WildcardOptions::default(), }; let ctx = SessionContext::new(); diff --git a/datafusion/proto/tests/cases/roundtrip_physical_plan.rs b/datafusion/proto/tests/cases/roundtrip_physical_plan.rs index 1a9c6d40ebe6..6766468ef443 100644 --- a/datafusion/proto/tests/cases/roundtrip_physical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_physical_plan.rs @@ -25,6 +25,8 @@ use std::vec; use arrow::array::RecordBatch; use arrow::csv::WriterBuilder; use datafusion::physical_expr_functions_aggregate::aggregate::AggregateExprBuilder; +use datafusion_functions_aggregate::approx_percentile_cont::approx_percentile_cont_udaf; +use datafusion_functions_aggregate::array_agg::array_agg_udaf; use datafusion_functions_aggregate::min_max::max_udaf; use prost::Message; @@ -412,6 +414,70 @@ fn rountrip_aggregate_with_limit() -> Result<()> { roundtrip_test(Arc::new(agg)) } +#[test] +fn rountrip_aggregate_with_approx_pencentile_cont() -> Result<()> { + let field_a = Field::new("a", DataType::Int64, false); + let field_b = Field::new("b", DataType::Int64, false); + let schema = Arc::new(Schema::new(vec![field_a, field_b])); + + let groups: Vec<(Arc, String)> = + vec![(col("a", &schema)?, "unused".to_string())]; + + let aggregates: Vec> = vec![AggregateExprBuilder::new( + approx_percentile_cont_udaf(), + vec![col("b", &schema)?, lit(0.5)], + ) + .schema(Arc::clone(&schema)) + .alias("APPROX_PERCENTILE_CONT(b, 0.5)") + .build()?]; + + let agg = AggregateExec::try_new( + AggregateMode::Final, + PhysicalGroupBy::new_single(groups.clone()), + aggregates.clone(), + vec![None], + Arc::new(EmptyExec::new(schema.clone())), + schema, + )?; + roundtrip_test(Arc::new(agg)) +} + +#[test] +fn rountrip_aggregate_with_sort() -> Result<()> { + let field_a = Field::new("a", DataType::Int64, false); + let field_b = Field::new("b", DataType::Int64, false); + let schema = Arc::new(Schema::new(vec![field_a, field_b])); + + let groups: Vec<(Arc, String)> = + vec![(col("a", &schema)?, "unused".to_string())]; + let sort_exprs = vec![PhysicalSortExpr { + expr: col("b", &schema)?, + options: SortOptions { + descending: false, + nulls_first: true, + }, + 
}]; + + let aggregates: Vec> = + vec![ + AggregateExprBuilder::new(array_agg_udaf(), vec![col("b", &schema)?]) + .schema(Arc::clone(&schema)) + .alias("ARRAY_AGG(b)") + .order_by(sort_exprs) + .build()?, + ]; + + let agg = AggregateExec::try_new( + AggregateMode::Final, + PhysicalGroupBy::new_single(groups.clone()), + aggregates.clone(), + vec![None], + Arc::new(EmptyExec::new(schema.clone())), + schema, + )?; + roundtrip_test(Arc::new(agg)) +} + #[test] fn roundtrip_aggregate_udaf() -> Result<()> { let field_a = Field::new("a", DataType::Int64, false); diff --git a/datafusion/sql/src/expr/function.rs b/datafusion/sql/src/expr/function.rs index d16d08b041ae..b95414a8cafd 100644 --- a/datafusion/sql/src/expr/function.rs +++ b/datafusion/sql/src/expr/function.rs @@ -22,6 +22,7 @@ use datafusion_common::{ internal_datafusion_err, not_impl_err, plan_datafusion_err, plan_err, DFSchema, Dependency, Result, }; +use datafusion_expr::expr::WildcardOptions; use datafusion_expr::planner::PlannerResult; use datafusion_expr::{ expr, Expr, ExprFunctionExt, ExprSchemable, WindowFrame, WindowFunctionDefinition, @@ -420,13 +421,17 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { name: _, arg: FunctionArgExpr::Wildcard, operator: _, - } => Ok(Expr::Wildcard { qualifier: None }), + } => Ok(Expr::Wildcard { + qualifier: None, + options: WildcardOptions::default(), + }), FunctionArg::Unnamed(FunctionArgExpr::Expr(arg)) => { self.sql_expr_to_logical_expr(arg, schema, planner_context) } - FunctionArg::Unnamed(FunctionArgExpr::Wildcard) => { - Ok(Expr::Wildcard { qualifier: None }) - } + FunctionArg::Unnamed(FunctionArgExpr::Wildcard) => Ok(Expr::Wildcard { + qualifier: None, + options: WildcardOptions::default(), + }), _ => not_impl_err!("Unsupported qualified wildcard argument: {sql:?}"), } } diff --git a/datafusion/sql/src/expr/mod.rs b/datafusion/sql/src/expr/mod.rs index edb0002842a8..7c94e5ead5c3 100644 --- a/datafusion/sql/src/expr/mod.rs +++ b/datafusion/sql/src/expr/mod.rs @@ -27,10 +27,10 @@ use sqlparser::ast::{ use datafusion_common::{ internal_datafusion_err, internal_err, not_impl_err, plan_err, DFSchema, Result, - ScalarValue, + ScalarValue, TableReference, }; -use datafusion_expr::expr::InList; use datafusion_expr::expr::ScalarFunction; +use datafusion_expr::expr::{InList, WildcardOptions}; use datafusion_expr::{ lit, Between, BinaryExpr, Cast, Expr, ExprSchemable, GetFieldAccess, Like, Literal, Operator, TryCast, @@ -661,6 +661,15 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { } not_impl_err!("AnyOp not supported by ExprPlanner: {binary_expr:?}") } + SQLExpr::Wildcard => Ok(Expr::Wildcard { + qualifier: None, + options: WildcardOptions::default(), + }), + SQLExpr::QualifiedWildcard(object_name) => Ok(Expr::Wildcard { + qualifier: Some(TableReference::from(object_name.to_string())), + options: WildcardOptions::default(), + }), + SQLExpr::Tuple(values) => self.parse_tuple(schema, planner_context, values), _ => not_impl_err!("Unsupported ast node in sqltorel: {sql:?}"), } } @@ -670,7 +679,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { &self, schema: &DFSchema, planner_context: &mut PlannerContext, - values: Vec, + values: Vec, fields: Vec, ) -> Result { if !fields.is_empty() { @@ -695,6 +704,23 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { not_impl_err!("Struct not supported by ExprPlanner: {create_struct_args:?}") } + fn parse_tuple( + &self, + schema: &DFSchema, + planner_context: &mut PlannerContext, + values: Vec, + ) -> Result { + match values.first() { + 
Some(SQLExpr::Identifier(_)) | Some(SQLExpr::Value(_)) => { + self.parse_struct(schema, planner_context, values, vec![]) + } + None => not_impl_err!("Empty tuple not supported yet"), + _ => { + not_impl_err!("Only identifiers and literals are supported in tuples") + } + } + } + fn sql_position_to_expr( &self, substr_expr: SQLExpr, diff --git a/datafusion/sql/src/parser.rs b/datafusion/sql/src/parser.rs index 40dd368f9e80..2df8d89c59bc 100644 --- a/datafusion/sql/src/parser.rs +++ b/datafusion/sql/src/parser.rs @@ -519,14 +519,7 @@ impl<'a> DFParser<'a> { Token::SingleQuotedString(s) => Ok(Value::SingleQuotedString(s)), Token::DoubleQuotedString(s) => Ok(Value::DoubleQuotedString(s)), Token::EscapedStringLiteral(s) => Ok(Value::EscapedStringLiteral(s)), - Token::Number(ref n, l) => match n.parse() { - Ok(n) => Ok(Value::Number(n, l)), - // The tokenizer should have ensured `n` is an integer - // so this should not be possible - Err(e) => parser_err!(format!( - "Unexpected error: could not parse '{n}' as number: {e}" - )), - }, + Token::Number(n, l) => Ok(Value::Number(n, l)), _ => self.parser.expected("string or numeric value", next_token), } } diff --git a/datafusion/sql/src/select.rs b/datafusion/sql/src/select.rs index 95a44dace31a..339234d9965c 100644 --- a/datafusion/sql/src/select.rs +++ b/datafusion/sql/src/select.rs @@ -27,23 +27,23 @@ use crate::utils::{ }; use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion}; +use datafusion_common::UnnestOptions; use datafusion_common::{not_impl_err, plan_err, DataFusionError, Result}; -use datafusion_common::{Column, UnnestOptions}; -use datafusion_expr::expr::Alias; +use datafusion_expr::expr::{Alias, PlannedReplaceSelectItem, WildcardOptions}; use datafusion_expr::expr_rewriter::{ normalize_col, normalize_col_with_schemas_and_ambiguity_check, normalize_cols, }; use datafusion_expr::logical_plan::tree_node::unwrap_arc; use datafusion_expr::utils::{ - expand_qualified_wildcard, expand_wildcard, expr_as_column_expr, expr_to_columns, - find_aggregate_exprs, find_window_exprs, + expr_as_column_expr, expr_to_columns, find_aggregate_exprs, find_window_exprs, }; use datafusion_expr::{ - Aggregate, Expr, Filter, GroupingSet, LogicalPlan, LogicalPlanBuilder, Partitioning, + qualified_wildcard_with_options, wildcard_with_options, Aggregate, Expr, Filter, + GroupingSet, LogicalPlan, LogicalPlanBuilder, Partitioning, }; use sqlparser::ast::{ Distinct, Expr as SQLExpr, GroupByExpr, NamedWindowExpr, OrderByExpr, - ReplaceSelectItem, WildcardAdditionalOptions, WindowType, + WildcardAdditionalOptions, WindowType, }; use sqlparser::ast::{NamedWindowDefinition, Select, SelectItem, TableWithJoins}; @@ -82,7 +82,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // handle named windows before processing the projection expression check_conflicting_windows(&select.named_window)?; match_window_definitions(&mut select.projection, &select.named_window)?; - // process the SELECT expressions, with wildcards expanded. + // process the SELECT expressions let select_exprs = self.prepare_select_exprs( &base_plan, select.projection, @@ -515,8 +515,6 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { } /// Returns the `Expr`'s corresponding to a SQL query's SELECT expressions. - /// - /// Wildcards are expanded into the concrete list of columns. 
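Around this point the planner changes stop expanding `*` eagerly during SQL planning and instead emit a single wildcard expression that carries its modifiers, leaving expansion to a later analysis step. The toy types below sketch that idea under simplified assumptions (only an EXCLUDE list is modelled; the real `WildcardOptions` also covers ILIKE/EXCEPT/REPLACE/RENAME), and they are not the DataFusion definitions.

```rust
#[derive(Debug, Clone, Default)]
struct WildcardOptions {
    exclude: Vec<String>,
}

#[derive(Debug, Clone)]
enum Expr {
    Column(String),
    Wildcard {
        qualifier: Option<String>,
        options: WildcardOptions,
    },
}

/// Late expansion against a known schema, honoring the EXCLUDE list.
fn expand(expr: &Expr, schema: &[&str]) -> Vec<Expr> {
    match expr {
        Expr::Column(_) => vec![expr.clone()],
        Expr::Wildcard { qualifier, options } => schema
            .iter()
            .copied()
            .filter(|col| !options.exclude.iter().any(|e| e.as_str() == *col))
            .map(|col| {
                let name = match qualifier {
                    Some(q) => format!("{q}.{col}"),
                    None => col.to_string(),
                };
                Expr::Column(name)
            })
            .collect(),
    }
}

fn main() {
    // `person.*` with `salary` excluded stays a single expression until
    // expansion time, when the schema is known.
    let star = Expr::Wildcard {
        qualifier: Some("person".into()),
        options: WildcardOptions {
            exclude: vec!["salary".into()],
        },
    };
    let cols = expand(&star, &["id", "first_name", "salary"]);
    println!("{cols:?}"); // [Column("person.id"), Column("person.first_name")]
}
```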
fn prepare_select_exprs( &self, plan: &LogicalPlan, @@ -570,49 +568,30 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { } SelectItem::Wildcard(options) => { Self::check_wildcard_options(&options)?; - if empty_from { return plan_err!("SELECT * with no tables specified is not valid"); } - // do not expand from outer schema - let expanded_exprs = - expand_wildcard(plan.schema().as_ref(), plan, Some(&options))?; - // If there is a REPLACE statement, replace that column with the given - // replace expression. Column name remains the same. - if let Some(replace) = options.opt_replace { - self.replace_columns( - plan, - empty_from, - planner_context, - expanded_exprs, - replace, - ) - } else { - Ok(expanded_exprs) - } + let planned_options = self.plan_wildcard_options( + plan, + empty_from, + planner_context, + options, + )?; + Ok(vec![wildcard_with_options(planned_options)]) } SelectItem::QualifiedWildcard(object_name, options) => { Self::check_wildcard_options(&options)?; let qualifier = idents_to_table_reference(object_name.0, false)?; - // do not expand from outer schema - let expanded_exprs = expand_qualified_wildcard( - &qualifier, - plan.schema().as_ref(), - Some(&options), + let planned_options = self.plan_wildcard_options( + plan, + empty_from, + planner_context, + options, )?; - // If there is a REPLACE statement, replace that column with the given - // replace expression. Column name remains the same. - if let Some(replace) = options.opt_replace { - self.replace_columns( - plan, - empty_from, - planner_context, - expanded_exprs, - replace, - ) - } else { - Ok(expanded_exprs) - } + Ok(vec![qualified_wildcard_with_options( + qualifier, + planned_options, + )]) } } } @@ -637,40 +616,44 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { } /// If there is a REPLACE statement in the projected expression in the form of - /// "REPLACE (some_column_within_an_expr AS some_column)", this function replaces - /// that column with the given replace expression. Column name remains the same. - /// Multiple REPLACEs are also possible with comma separations. - fn replace_columns( + /// "REPLACE (some_column_within_an_expr AS some_column)", we should plan the + /// replace expressions first. + fn plan_wildcard_options( &self, plan: &LogicalPlan, empty_from: bool, planner_context: &mut PlannerContext, - mut exprs: Vec, - replace: ReplaceSelectItem, - ) -> Result> { - for expr in exprs.iter_mut() { - if let Expr::Column(Column { name, .. 
}) = expr { - if let Some(item) = replace - .items - .iter() - .find(|item| item.column_name.value == *name) - { - let new_expr = self.sql_select_to_rex( + options: WildcardAdditionalOptions, + ) -> Result { + let planned_option = WildcardOptions { + ilike: options.opt_ilike, + exclude: options.opt_exclude, + except: options.opt_except, + replace: None, + rename: options.opt_rename, + }; + if let Some(replace) = options.opt_replace { + let replace_expr = replace + .items + .iter() + .map(|item| { + Ok(self.sql_select_to_rex( SelectItem::UnnamedExpr(item.expr.clone()), plan, empty_from, planner_context, )?[0] - .clone(); - *expr = Expr::Alias(Alias { - expr: Box::new(new_expr), - relation: None, - name: name.clone(), - }); - } - } + .clone()) + }) + .collect::>>()?; + let planned_replace = PlannedReplaceSelectItem { + items: replace.items.into_iter().map(|i| *i).collect(), + planned_expressions: replace_expr, + }; + Ok(planned_option.with_replace(planned_replace)) + } else { + Ok(planned_option) } - Ok(exprs) } /// Wrap a plan in a projection @@ -715,7 +698,6 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let plan = LogicalPlanBuilder::from(input.clone()) .aggregate(group_by_exprs.to_vec(), aggr_exprs.to_vec())? .build()?; - let group_by_exprs = if let LogicalPlan::Aggregate(agg) = &plan { &agg.group_expr } else { diff --git a/datafusion/sql/src/unparser/expr.rs b/datafusion/sql/src/unparser/expr.rs index de130754ab1a..39511ea4d03a 100644 --- a/datafusion/sql/src/unparser/expr.rs +++ b/datafusion/sql/src/unparser/expr.rs @@ -21,11 +21,13 @@ use datafusion_expr::ScalarUDF; use sqlparser::ast::Value::SingleQuotedString; use sqlparser::ast::{ self, BinaryOperator, Expr as AstExpr, Function, FunctionArg, Ident, Interval, - TimezoneInfo, UnaryOperator, + ObjectName, TimezoneInfo, UnaryOperator, }; use std::sync::Arc; use std::{fmt::Display, vec}; +use super::dialect::{DateFieldExtractStyle, IntervalStyle}; +use super::Unparser; use arrow::datatypes::{Decimal128Type, Decimal256Type, DecimalType}; use arrow::util::display::array_value_to_string; use arrow_array::types::{ @@ -44,9 +46,6 @@ use datafusion_expr::{ Between, BinaryExpr, Case, Cast, Expr, GroupingSet, Like, Operator, TryCast, }; -use super::dialect::{DateFieldExtractStyle, IntervalStyle}; -use super::Unparser; - /// DataFusion's Exprs can represent either an `Expr` or an `OrderByExpr` pub enum Unparsed { // SQL Expression @@ -159,7 +158,13 @@ impl Unparser<'_> { let args = args .iter() .map(|e| { - if matches!(e, Expr::Wildcard { qualifier: None }) { + if matches!( + e, + Expr::Wildcard { + qualifier: None, + .. + } + ) { Ok(FunctionArg::Unnamed(ast::FunctionArgExpr::Wildcard)) } else { self.expr_to_sql_inner(e).map(|e| { @@ -477,8 +482,15 @@ impl Unparser<'_> { format: None, }) } - Expr::Wildcard { qualifier: _ } => { - not_impl_err!("Unsupported Expr conversion: {expr:?}") + // TODO: unparsing wildcard addition options + Expr::Wildcard { qualifier, .. } => { + if let Some(qualifier) = qualifier { + let idents: Vec = + qualifier.to_vec().into_iter().map(Ident::new).collect(); + Ok(ast::Expr::QualifiedWildcard(ObjectName(idents))) + } else { + Ok(ast::Expr::Wildcard) + } } Expr::GroupingSet(grouping_set) => match grouping_set { GroupingSet::GroupingSets(grouping_sets) => { @@ -643,7 +655,13 @@ impl Unparser<'_> { fn function_args_to_sql(&self, args: &[Expr]) -> Result> { args.iter() .map(|e| { - if matches!(e, Expr::Wildcard { qualifier: None }) { + if matches!( + e, + Expr::Wildcard { + qualifier: None, + .. 
+ } + ) { Ok(ast::FunctionArg::Unnamed(ast::FunctionArgExpr::Wildcard)) } else { self.expr_to_sql(e) @@ -1503,6 +1521,7 @@ mod tests { use arrow_schema::DataType::Int8; use ast::ObjectName; use datafusion_common::TableReference; + use datafusion_expr::expr::WildcardOptions; use datafusion_expr::{ case, col, cube, exists, grouping_set, interval_datetime_lit, interval_year_month_lit, lit, not, not_exists, out_ref_col, placeholder, rollup, @@ -1558,7 +1577,10 @@ mod tests { fn expr_to_sql_ok() -> Result<()> { let dummy_schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]); let dummy_logical_plan = table_scan(Some("t"), &dummy_schema, None)? - .project(vec![Expr::Wildcard { qualifier: None }])? + .project(vec![Expr::Wildcard { + qualifier: None, + options: WildcardOptions::default(), + }])? .filter(col("a").eq(lit(1)))? .build()?; @@ -1749,7 +1771,10 @@ mod tests { (sum(col("a")), r#"sum(a)"#), ( count_udaf() - .call(vec![Expr::Wildcard { qualifier: None }]) + .call(vec![Expr::Wildcard { + qualifier: None, + options: WildcardOptions::default(), + }]) .distinct() .build() .unwrap(), @@ -1757,7 +1782,10 @@ mod tests { ), ( count_udaf() - .call(vec![Expr::Wildcard { qualifier: None }]) + .call(vec![Expr::Wildcard { + qualifier: None, + options: WildcardOptions::default(), + }]) .filter(lit(true)) .build() .unwrap(), @@ -1833,11 +1861,11 @@ mod tests { (Expr::Negative(Box::new(col("a"))), r#"-a"#), ( exists(Arc::new(dummy_logical_plan.clone())), - r#"EXISTS (SELECT t.a FROM t WHERE (t.a = 1))"#, + r#"EXISTS (SELECT * FROM t WHERE (t.a = 1))"#, ), ( not_exists(Arc::new(dummy_logical_plan.clone())), - r#"NOT EXISTS (SELECT t.a FROM t WHERE (t.a = 1))"#, + r#"NOT EXISTS (SELECT * FROM t WHERE (t.a = 1))"#, ), ( try_cast(col("a"), DataType::Date64), diff --git a/datafusion/sql/src/unparser/plan.rs b/datafusion/sql/src/unparser/plan.rs index 277efd5fe700..024f33fb2c7d 100644 --- a/datafusion/sql/src/unparser/plan.rs +++ b/datafusion/sql/src/unparser/plan.rs @@ -359,18 +359,14 @@ impl Unparser<'_> { .iter() .map(|e| self.select_item_to_sql(e)) .collect::>>()?; - match &on.sort_expr { - Some(sort_expr) => { - if let Some(query_ref) = query { - query_ref - .order_by(self.sort_to_sql(sort_expr.clone())?); - } else { - return internal_err!( - "Sort operator only valid in a statement context." - ); - } + if let Some(sort_expr) = &on.sort_expr { + if let Some(query_ref) = query { + query_ref.order_by(self.sort_to_sql(sort_expr.clone())?); + } else { + return internal_err!( + "Sort operator only valid in a statement context." 
+ ); } - None => {} } select.projection(items); (ast::Distinct::On(exprs), on.input.as_ref()) diff --git a/datafusion/sql/tests/cases/plan_to_sql.rs b/datafusion/sql/tests/cases/plan_to_sql.rs index 179fc108e6d2..ed23fada0cfb 100644 --- a/datafusion/sql/tests/cases/plan_to_sql.rs +++ b/datafusion/sql/tests/cases/plan_to_sql.rs @@ -33,7 +33,7 @@ use datafusion_functions::core::planner::CoreFunctionPlanner; use sqlparser::dialect::{Dialect, GenericDialect, MySqlDialect}; use sqlparser::parser::Parser; -use crate::common::MockContextProvider; +use crate::common::{MockContextProvider, MockSessionState}; #[test] fn roundtrip_expr() { @@ -59,8 +59,8 @@ fn roundtrip_expr() { let roundtrip = |table, sql: &str| -> Result { let dialect = GenericDialect {}; let sql_expr = Parser::new(&dialect).try_with_sql(sql)?.parse_expr()?; - - let context = MockContextProvider::default().with_udaf(sum_udaf()); + let state = MockSessionState::default().with_aggregate_function(sum_udaf()); + let context = MockContextProvider { state }; let schema = context.get_table_source(table)?.schema(); let df_schema = DFSchema::try_from(schema.as_ref().clone())?; let sql_to_rel = SqlToRel::new(&context); @@ -156,11 +156,11 @@ fn roundtrip_statement() -> Result<()> { let statement = Parser::new(&dialect) .try_with_sql(query)? .parse_statement()?; - - let context = MockContextProvider::default() - .with_udaf(sum_udaf()) - .with_udaf(count_udaf()) + let state = MockSessionState::default() + .with_aggregate_function(sum_udaf()) + .with_aggregate_function(count_udaf()) .with_expr_planner(Arc::new(CoreFunctionPlanner::default())); + let context = MockContextProvider { state }; let sql_to_rel = SqlToRel::new(&context); let plan = sql_to_rel.sql_statement_to_plan(statement).unwrap(); @@ -189,8 +189,10 @@ fn roundtrip_crossjoin() -> Result<()> { .try_with_sql(query)? .parse_statement()?; - let context = MockContextProvider::default() + let state = MockSessionState::default() .with_expr_planner(Arc::new(CoreFunctionPlanner::default())); + + let context = MockContextProvider { state }; let sql_to_rel = SqlToRel::new(&context); let plan = sql_to_rel.sql_statement_to_plan(statement).unwrap(); @@ -412,10 +414,12 @@ fn roundtrip_statement_with_dialect() -> Result<()> { .try_with_sql(query.sql)? .parse_statement()?; - let context = MockContextProvider::default() - .with_expr_planner(Arc::new(CoreFunctionPlanner::default())) - .with_udaf(max_udaf()) - .with_udaf(min_udaf()); + let state = MockSessionState::default() + .with_aggregate_function(max_udaf()) + .with_aggregate_function(min_udaf()) + .with_expr_planner(Arc::new(CoreFunctionPlanner::default())); + + let context = MockContextProvider { state }; let sql_to_rel = SqlToRel::new(&context); let plan = sql_to_rel .sql_statement_to_plan(statement) @@ -443,7 +447,9 @@ fn test_unnest_logical_plan() -> Result<()> { .try_with_sql(query)? 
.parse_statement()?; - let context = MockContextProvider::default(); + let context = MockContextProvider { + state: MockSessionState::default(), + }; let sql_to_rel = SqlToRel::new(&context); let plan = sql_to_rel.sql_statement_to_plan(statement).unwrap(); @@ -516,7 +522,9 @@ fn test_pretty_roundtrip() -> Result<()> { let df_schema = DFSchema::try_from(schema)?; - let context = MockContextProvider::default(); + let context = MockContextProvider { + state: MockSessionState::default(), + }; let sql_to_rel = SqlToRel::new(&context); let unparser = Unparser::default().with_pretty(true); @@ -589,7 +597,9 @@ fn sql_round_trip(query: &str, expect: &str) { .parse_statement() .unwrap(); - let context = MockContextProvider::default(); + let context = MockContextProvider { + state: MockSessionState::default(), + }; let sql_to_rel = SqlToRel::new(&context); let plan = sql_to_rel.sql_statement_to_plan(statement).unwrap(); diff --git a/datafusion/sql/tests/common/mod.rs b/datafusion/sql/tests/common/mod.rs index 374aa9db6714..fe0e5f7283a4 100644 --- a/datafusion/sql/tests/common/mod.rs +++ b/datafusion/sql/tests/common/mod.rs @@ -50,36 +50,40 @@ impl Display for MockCsvType { } #[derive(Default)] -pub(crate) struct MockContextProvider { - options: ConfigOptions, - udfs: HashMap>, - udafs: HashMap>, +pub(crate) struct MockSessionState { + scalar_functions: HashMap>, + aggregate_functions: HashMap>, expr_planners: Vec>, + pub config_options: ConfigOptions, } -impl MockContextProvider { - // Suppressing dead code warning, as this is used in integration test crates - #[allow(dead_code)] - pub(crate) fn options_mut(&mut self) -> &mut ConfigOptions { - &mut self.options +impl MockSessionState { + pub fn with_expr_planner(mut self, expr_planner: Arc) -> Self { + self.expr_planners.push(expr_planner); + self } - #[allow(dead_code)] - pub(crate) fn with_udf(mut self, udf: ScalarUDF) -> Self { - self.udfs.insert(udf.name().to_string(), Arc::new(udf)); + pub fn with_scalar_function(mut self, scalar_function: Arc) -> Self { + self.scalar_functions + .insert(scalar_function.name().to_string(), scalar_function); self } - pub(crate) fn with_udaf(mut self, udaf: Arc) -> Self { + pub fn with_aggregate_function( + mut self, + aggregate_function: Arc, + ) -> Self { // TODO: change to to_string() if all the function name is converted to lowercase - self.udafs.insert(udaf.name().to_lowercase(), udaf); + self.aggregate_functions.insert( + aggregate_function.name().to_string().to_lowercase(), + aggregate_function, + ); self } +} - pub(crate) fn with_expr_planner(mut self, planner: Arc) -> Self { - self.expr_planners.push(planner); - self - } +pub(crate) struct MockContextProvider { + pub(crate) state: MockSessionState, } impl ContextProvider for MockContextProvider { @@ -202,11 +206,11 @@ impl ContextProvider for MockContextProvider { } fn get_function_meta(&self, name: &str) -> Option> { - self.udfs.get(name).cloned() + self.state.scalar_functions.get(name).cloned() } fn get_aggregate_meta(&self, name: &str) -> Option> { - self.udafs.get(name).cloned() + self.state.aggregate_functions.get(name).cloned() } fn get_variable_type(&self, _: &[String]) -> Option { @@ -218,7 +222,7 @@ impl ContextProvider for MockContextProvider { } fn options(&self) -> &ConfigOptions { - &self.options + &self.state.config_options } fn get_file_type( @@ -237,11 +241,11 @@ impl ContextProvider for MockContextProvider { } fn udf_names(&self) -> Vec { - self.udfs.keys().cloned().collect() + self.state.scalar_functions.keys().cloned().collect() } fn 
udaf_names(&self) -> Vec { - self.udafs.keys().cloned().collect() + self.state.aggregate_functions.keys().cloned().collect() } fn udwf_names(&self) -> Vec { @@ -249,7 +253,7 @@ impl ContextProvider for MockContextProvider { } fn get_expr_planners(&self) -> &[Arc] { - &self.expr_planners + &self.state.expr_planners } } diff --git a/datafusion/sql/tests/sql_integration.rs b/datafusion/sql/tests/sql_integration.rs index 4d7e60805657..7ce3565fa29f 100644 --- a/datafusion/sql/tests/sql_integration.rs +++ b/datafusion/sql/tests/sql_integration.rs @@ -41,6 +41,7 @@ use datafusion_sql::{ planner::{ParserOptions, SqlToRel}, }; +use crate::common::MockSessionState; use datafusion_functions::core::planner::CoreFunctionPlanner; use datafusion_functions_aggregate::{ approx_median::approx_median_udaf, count::count_udaf, min_max::max_udaf, @@ -57,7 +58,7 @@ mod common; fn test_schema_support() { quick_test( "SELECT * FROM s1.test", - "Projection: s1.test.t_date32, s1.test.t_date64\ + "Projection: *\ \n TableScan: s1.test", ); } @@ -516,7 +517,7 @@ fn plan_copy_to_query() { let plan = r#" CopyTo: format=csv output_url=output.csv options: () Limit: skip=0, fetch=10 - Projection: test_decimal.id, test_decimal.price + Projection: * TableScan: test_decimal "# .trim(); @@ -636,23 +637,13 @@ fn select_repeated_column() { ); } -#[test] -fn select_wildcard_with_repeated_column() { - let sql = "SELECT *, age FROM person"; - let err = logical_plan(sql).expect_err("query should have failed"); - assert_eq!( - "Error during planning: Projections require unique expression names but the expression \"person.age\" at position 3 and \"person.age\" at position 8 have the same name. Consider aliasing (\"AS\") one of them.", - err.strip_backtrace() - ); -} - #[test] fn select_wildcard_with_repeated_column_but_is_aliased() { quick_test( - "SELECT *, first_name AS fn from person", - "Projection: person.id, person.first_name, person.last_name, person.age, person.state, person.salary, person.birth_date, person.😀, person.first_name AS fn\ + "SELECT *, first_name AS fn from person", + "Projection: *, person.first_name AS fn\ \n TableScan: person", - ); + ); } #[test] @@ -869,7 +860,7 @@ fn where_selection_with_ambiguous_column() { #[test] fn natural_join() { let sql = "SELECT * FROM lineitem a NATURAL JOIN lineitem b"; - let expected = "Projection: a.l_item_id, a.l_description, a.price\ + let expected = "Projection: *\ \n Inner Join: Using a.l_item_id = b.l_item_id, a.l_description = b.l_description, a.price = b.price\ \n SubqueryAlias: a\ \n TableScan: lineitem\ @@ -905,7 +896,7 @@ fn natural_right_join() { #[test] fn natural_join_no_common_becomes_cross_join() { let sql = "SELECT * FROM person a NATURAL JOIN lineitem b"; - let expected = "Projection: a.id, a.first_name, a.last_name, a.age, a.state, a.salary, a.birth_date, a.😀, b.l_item_id, b.l_description, b.price\ + let expected = "Projection: *\ \n CrossJoin:\ \n SubqueryAlias: a\ \n TableScan: person\ @@ -917,8 +908,7 @@ fn natural_join_no_common_becomes_cross_join() { #[test] fn using_join_multiple_keys() { let sql = "SELECT * FROM person a join person b using (id, age)"; - let expected = "Projection: a.id, a.first_name, a.last_name, a.age, a.state, a.salary, a.birth_date, a.😀, \ - b.first_name, b.last_name, b.state, b.salary, b.birth_date, b.😀\ + let expected = "Projection: *\ \n Inner Join: Using a.id = b.id, a.age = b.age\ \n SubqueryAlias: a\ \n TableScan: person\ @@ -932,8 +922,7 @@ fn using_join_multiple_keys_subquery() { let sql = "SELECT age FROM (SELECT * FROM person 
a join person b using (id, age, state))"; let expected = "Projection: a.age\ - \n Projection: a.id, a.first_name, a.last_name, a.age, a.state, a.salary, a.birth_date, a.😀, \ - b.first_name, b.last_name, b.salary, b.birth_date, b.😀\ + \n Projection: *\ \n Inner Join: Using a.id = b.id, a.age = b.age, a.state = b.state\ \n SubqueryAlias: a\ \n TableScan: person\ @@ -945,8 +934,7 @@ fn using_join_multiple_keys_subquery() { #[test] fn using_join_multiple_keys_qualified_wildcard_select() { let sql = "SELECT a.* FROM person a join person b using (id, age)"; - let expected = - "Projection: a.id, a.first_name, a.last_name, a.age, a.state, a.salary, a.birth_date, a.😀\ + let expected = "Projection: a.*\ \n Inner Join: Using a.id = b.id, a.age = b.age\ \n SubqueryAlias: a\ \n TableScan: person\ @@ -958,8 +946,7 @@ fn using_join_multiple_keys_qualified_wildcard_select() { #[test] fn using_join_multiple_keys_select_all_columns() { let sql = "SELECT a.*, b.* FROM person a join person b using (id, age)"; - let expected = "Projection: a.id, a.first_name, a.last_name, a.age, a.state, a.salary, a.birth_date, a.😀, \ - b.id, b.first_name, b.last_name, b.age, b.state, b.salary, b.birth_date, b.😀\ + let expected = "Projection: a.*, b.*\ \n Inner Join: Using a.id = b.id, a.age = b.age\ \n SubqueryAlias: a\ \n TableScan: person\ @@ -971,9 +958,7 @@ fn using_join_multiple_keys_select_all_columns() { #[test] fn using_join_multiple_keys_multiple_joins() { let sql = "SELECT * FROM person a join person b using (id, age, state) join person c using (id, age, state)"; - let expected = "Projection: a.id, a.first_name, a.last_name, a.age, a.state, a.salary, a.birth_date, a.😀, \ - b.first_name, b.last_name, b.salary, b.birth_date, b.😀, \ - c.first_name, c.last_name, c.salary, c.birth_date, c.😀\ + let expected = "Projection: *\ \n Inner Join: Using a.id = c.id, a.age = c.age, a.state = c.state\ \n Inner Join: Using a.id = b.id, a.age = b.age, a.state = b.state\ \n SubqueryAlias: a\ @@ -1304,13 +1289,13 @@ fn select_binary_expr_nested() { fn select_wildcard_with_groupby() { quick_test( r#"SELECT * FROM person GROUP BY id, first_name, last_name, age, state, salary, birth_date, "😀""#, - "Projection: person.id, person.first_name, person.last_name, person.age, person.state, person.salary, person.birth_date, person.😀\ + "Projection: *\ \n Aggregate: groupBy=[[person.id, person.first_name, person.last_name, person.age, person.state, person.salary, person.birth_date, person.😀]], aggr=[[]]\ \n TableScan: person", ); quick_test( "SELECT * FROM (SELECT first_name, last_name FROM person) AS a GROUP BY first_name, last_name", - "Projection: a.first_name, a.last_name\ + "Projection: *\ \n Aggregate: groupBy=[[a.first_name, a.last_name]], aggr=[[]]\ \n SubqueryAlias: a\ \n Projection: person.first_name, person.last_name\ @@ -1473,7 +1458,7 @@ fn recursive_ctes() { select * from numbers;"; quick_test( sql, - "Projection: numbers.n\ + "Projection: *\ \n SubqueryAlias: numbers\ \n RecursiveQuery: is_distinct=false\ \n Projection: Int64(1) AS n\ @@ -1495,8 +1480,9 @@ fn recursive_ctes_disabled() { select * from numbers;"; // manually setting up test here so that we can disable recursive ctes - let mut context = MockContextProvider::default(); - context.options_mut().execution.enable_recursive_ctes = false; + let mut state = MockSessionState::default(); + state.config_options.execution.enable_recursive_ctes = false; + let context = MockContextProvider { state }; let planner = SqlToRel::new_with_options(&context, ParserOptions::default()); let 
result = DFParser::parse_sql_with_dialect(sql, &GenericDialect {}); @@ -1685,10 +1671,10 @@ fn select_aggregate_with_non_column_inner_expression_with_groupby() { #[test] fn test_wildcard() { quick_test( - "SELECT * from person", - "Projection: person.id, person.first_name, person.last_name, person.age, person.state, person.salary, person.birth_date, person.😀\ + "SELECT * from person", + "Projection: *\ \n TableScan: person", - ); + ); } #[test] @@ -2116,7 +2102,7 @@ fn project_wildcard_on_join_with_using() { FROM lineitem \ JOIN lineitem as lineitem2 \ USING (l_item_id)"; - let expected = "Projection: lineitem.l_item_id, lineitem.l_description, lineitem.price, lineitem2.l_description, lineitem2.price\ + let expected = "Projection: *\ \n Inner Join: Using lineitem.l_item_id = lineitem2.l_item_id\ \n TableScan: lineitem\ \n SubqueryAlias: lineitem2\ @@ -2174,148 +2160,6 @@ fn union_all() { quick_test(sql, expected); } -#[test] -fn union_with_different_column_names() { - let sql = "SELECT order_id from orders UNION ALL SELECT customer_id FROM orders"; - let expected = "Union\ - \n Projection: orders.order_id\ - \n TableScan: orders\ - \n Projection: orders.customer_id AS order_id\ - \n TableScan: orders"; - quick_test(sql, expected); -} - -#[test] -fn union_values_with_no_alias() { - let sql = "SELECT 1, 2 UNION ALL SELECT 3, 4"; - let expected = "Union\ - \n Projection: Int64(1) AS Int64(1), Int64(2) AS Int64(2)\ - \n EmptyRelation\ - \n Projection: Int64(3) AS Int64(1), Int64(4) AS Int64(2)\ - \n EmptyRelation"; - quick_test(sql, expected); -} - -#[test] -fn union_with_incompatible_data_type() { - let sql = "SELECT interval '1 year 1 day' UNION ALL SELECT 1"; - let err = logical_plan(sql) - .expect_err("query should have failed") - .strip_backtrace(); - assert_eq!( - "Error during planning: UNION Column Int64(1) (type: Int64) is not compatible with column IntervalMonthDayNano(\"IntervalMonthDayNano { months: 12, days: 1, nanoseconds: 0 }\") (type: Interval(MonthDayNano))", - err - ); -} - -#[test] -fn union_with_different_decimal_data_types() { - let sql = "SELECT 1 a UNION ALL SELECT 1.1 a"; - let expected = "Union\ - \n Projection: CAST(Int64(1) AS Float64) AS a\ - \n EmptyRelation\ - \n Projection: Float64(1.1) AS a\ - \n EmptyRelation"; - quick_test(sql, expected); -} - -#[test] -fn union_with_null() { - let sql = "SELECT NULL a UNION ALL SELECT 1.1 a"; - let expected = "Union\ - \n Projection: CAST(NULL AS Float64) AS a\ - \n EmptyRelation\ - \n Projection: Float64(1.1) AS a\ - \n EmptyRelation"; - quick_test(sql, expected); -} - -#[test] -fn union_with_float_and_string() { - let sql = "SELECT 'a' a UNION ALL SELECT 1.1 a"; - let expected = "Union\ - \n Projection: Utf8(\"a\") AS a\ - \n EmptyRelation\ - \n Projection: CAST(Float64(1.1) AS Utf8) AS a\ - \n EmptyRelation"; - quick_test(sql, expected); -} - -#[test] -fn union_with_multiply_cols() { - let sql = "SELECT 'a' a, 1 b UNION ALL SELECT 1.1 a, 1.1 b"; - let expected = "Union\ - \n Projection: Utf8(\"a\") AS a, CAST(Int64(1) AS Float64) AS b\ - \n EmptyRelation\ - \n Projection: CAST(Float64(1.1) AS Utf8) AS a, Float64(1.1) AS b\ - \n EmptyRelation"; - quick_test(sql, expected); -} - -#[test] -fn sorted_union_with_different_types_and_group_by() { - let sql = "SELECT a FROM (select 1 a) x GROUP BY 1 UNION ALL (SELECT a FROM (select 1.1 a) x GROUP BY 1) ORDER BY 1"; - let expected = "Sort: x.a ASC NULLS LAST\ - \n Union\ - \n Projection: CAST(x.a AS Float64) AS a\ - \n Aggregate: groupBy=[[x.a]], aggr=[[]]\ - \n SubqueryAlias: x\ - 
\n Projection: Int64(1) AS a\ - \n EmptyRelation\ - \n Projection: x.a\ - \n Aggregate: groupBy=[[x.a]], aggr=[[]]\ - \n SubqueryAlias: x\ - \n Projection: Float64(1.1) AS a\ - \n EmptyRelation"; - quick_test(sql, expected); -} - -#[test] -fn union_with_binary_expr_and_cast() { - let sql = "SELECT cast(0.0 + a as integer) FROM (select 1 a) x GROUP BY 1 UNION ALL (SELECT 2.1 + a FROM (select 1 a) x GROUP BY 1)"; - let expected = "Union\ - \n Projection: CAST(Float64(0) + x.a AS Float64) AS Float64(0) + x.a\ - \n Aggregate: groupBy=[[CAST(Float64(0) + x.a AS Int32)]], aggr=[[]]\ - \n SubqueryAlias: x\ - \n Projection: Int64(1) AS a\ - \n EmptyRelation\ - \n Projection: Float64(2.1) + x.a AS Float64(0) + x.a\ - \n Aggregate: groupBy=[[Float64(2.1) + x.a]], aggr=[[]]\ - \n SubqueryAlias: x\ - \n Projection: Int64(1) AS a\ - \n EmptyRelation"; - quick_test(sql, expected); -} - -#[test] -fn union_with_aliases() { - let sql = "SELECT a as a1 FROM (select 1 a) x GROUP BY 1 UNION ALL (SELECT a as a1 FROM (select 1.1 a) x GROUP BY 1)"; - let expected = "Union\ - \n Projection: CAST(x.a AS Float64) AS a1\ - \n Aggregate: groupBy=[[x.a]], aggr=[[]]\ - \n SubqueryAlias: x\ - \n Projection: Int64(1) AS a\ - \n EmptyRelation\ - \n Projection: x.a AS a1\ - \n Aggregate: groupBy=[[x.a]], aggr=[[]]\ - \n SubqueryAlias: x\ - \n Projection: Float64(1.1) AS a\ - \n EmptyRelation"; - quick_test(sql, expected); -} - -#[test] -fn union_with_incompatible_data_types() { - let sql = "SELECT 'a' a UNION ALL SELECT true a"; - let err = logical_plan(sql) - .expect_err("query should have failed") - .strip_backtrace(); - assert_eq!( - "Error during planning: UNION Column a (type: Boolean) is not compatible with column a (type: Utf8)", - err - ); -} - #[test] fn empty_over() { let sql = "SELECT order_id, MAX(order_id) OVER () from orders"; @@ -2727,7 +2571,8 @@ fn logical_plan_with_options(sql: &str, options: ParserOptions) -> Result Result { - let context = MockContextProvider::default().with_udaf(sum_udaf()); + let state = MockSessionState::default().with_aggregate_function(sum_udaf()); + let context = MockContextProvider { state }; let planner = SqlToRel::new(&context); let result = DFParser::parse_sql_with_dialect(sql, dialect); let mut ast = result?; @@ -2739,39 +2584,44 @@ fn logical_plan_with_dialect_and_options( dialect: &dyn Dialect, options: ParserOptions, ) -> Result { - let context = MockContextProvider::default() - .with_udf(unicode::character_length().as_ref().clone()) - .with_udf(string::concat().as_ref().clone()) - .with_udf(make_udf( + let state = MockSessionState::default() + .with_scalar_function(Arc::new(unicode::character_length().as_ref().clone())) + .with_scalar_function(Arc::new(string::concat().as_ref().clone())) + .with_scalar_function(Arc::new(make_udf( "nullif", vec![DataType::Int32, DataType::Int32], DataType::Int32, - )) - .with_udf(make_udf( + ))) + .with_scalar_function(Arc::new(make_udf( "round", vec![DataType::Float64, DataType::Int64], DataType::Float32, - )) - .with_udf(make_udf( + ))) + .with_scalar_function(Arc::new(make_udf( "arrow_cast", vec![DataType::Int64, DataType::Utf8], DataType::Float64, - )) - .with_udf(make_udf( + ))) + .with_scalar_function(Arc::new(make_udf( "date_trunc", vec![DataType::Utf8, DataType::Timestamp(Nanosecond, None)], DataType::Int32, - )) - .with_udf(make_udf("sqrt", vec![DataType::Int64], DataType::Int64)) - .with_udaf(sum_udaf()) - .with_udaf(approx_median_udaf()) - .with_udaf(count_udaf()) - .with_udaf(avg_udaf()) - .with_udaf(min_udaf()) - 
.with_udaf(max_udaf()) - .with_udaf(grouping_udaf()) + ))) + .with_scalar_function(Arc::new(make_udf( + "sqrt", + vec![DataType::Int64], + DataType::Int64, + ))) + .with_aggregate_function(sum_udaf()) + .with_aggregate_function(approx_median_udaf()) + .with_aggregate_function(count_udaf()) + .with_aggregate_function(avg_udaf()) + .with_aggregate_function(min_udaf()) + .with_aggregate_function(max_udaf()) + .with_aggregate_function(grouping_udaf()) .with_expr_planner(Arc::new(CoreFunctionPlanner::default())); + let context = MockContextProvider { state }; let planner = SqlToRel::new_with_options(&context, options); let result = DFParser::parse_sql_with_dialect(sql, dialect); let mut ast = result?; @@ -2997,7 +2847,7 @@ fn exists_subquery_wildcard() { let expected = "Projection: p.id\ \n Filter: EXISTS ()\ \n Subquery:\ - \n Projection: person.id, person.first_name, person.last_name, person.age, person.state, person.salary, person.birth_date, person.😀\ + \n Projection: *\ \n Filter: person.last_name = outer_ref(p.last_name) AND person.state = outer_ref(p.state)\ \n TableScan: person\ \n SubqueryAlias: p\ @@ -3084,13 +2934,13 @@ fn subquery_references_cte() { cte AS (SELECT * FROM person) \ SELECT * FROM person WHERE EXISTS (SELECT * FROM cte WHERE id = person.id)"; - let expected = "Projection: person.id, person.first_name, person.last_name, person.age, person.state, person.salary, person.birth_date, person.😀\ + let expected = "Projection: *\ \n Filter: EXISTS ()\ \n Subquery:\ - \n Projection: cte.id, cte.first_name, cte.last_name, cte.age, cte.state, cte.salary, cte.birth_date, cte.😀\ + \n Projection: *\ \n Filter: cte.id = outer_ref(person.id)\ \n SubqueryAlias: cte\ - \n Projection: person.id, person.first_name, person.last_name, person.age, person.state, person.salary, person.birth_date, person.😀\ + \n Projection: *\ \n TableScan: person\ \n TableScan: person"; @@ -3105,7 +2955,7 @@ fn cte_with_no_column_names() { ) \ SELECT * FROM numbers;"; - let expected = "Projection: numbers.a, numbers.b, numbers.c\ + let expected = "Projection: *\ \n SubqueryAlias: numbers\ \n Projection: Int64(1) AS a, Int64(2) AS b, Int64(3) AS c\ \n EmptyRelation"; @@ -3121,7 +2971,7 @@ fn cte_with_column_names() { ) \ SELECT * FROM numbers;"; - let expected = "Projection: numbers.a, numbers.b, numbers.c\ + let expected = "Projection: *\ \n SubqueryAlias: numbers\ \n Projection: Int64(1) AS a, Int64(2) AS b, Int64(3) AS c\ \n Projection: Int64(1), Int64(2), Int64(3)\ @@ -3139,7 +2989,7 @@ fn cte_with_column_aliases_precedence() { ) \ SELECT * FROM numbers;"; - let expected = "Projection: numbers.a, numbers.b, numbers.c\ + let expected = "Projection: *\ \n SubqueryAlias: numbers\ \n Projection: x AS a, y AS b, z AS c\ \n Projection: Int64(1) AS x, Int64(2) AS y, Int64(3) AS z\ @@ -3520,7 +3370,7 @@ fn test_select_all_inner_join() { INNER JOIN orders \ ON orders.customer_id * 2 = person.id + 10"; - let expected = "Projection: person.id, person.first_name, person.last_name, person.age, person.state, person.salary, person.birth_date, person.😀, orders.order_id, orders.customer_id, orders.o_item_id, orders.qty, orders.price, orders.delivered\ + let expected = "Projection: *\ \n Inner Join: Filter: orders.customer_id * Int64(2) = person.id + Int64(10)\ \n TableScan: person\ \n TableScan: orders"; @@ -4237,7 +4087,7 @@ fn test_prepare_statement_to_plan_value_list() { let sql = "PREPARE my_plan(STRING, STRING) AS SELECT * FROM (VALUES(1, $1), (2, $2)) AS t (num, letter);"; let expected_plan = "Prepare: \"my_plan\" 
[Utf8, Utf8] \ - \n Projection: t.num, t.letter\ + \n Projection: *\ \n SubqueryAlias: t\ \n Projection: column1 AS num, column2 AS letter\ \n Values: (Int64(1), $1), (Int64(2), $2)"; @@ -4252,7 +4102,7 @@ fn test_prepare_statement_to_plan_value_list() { ScalarValue::from("a".to_string()), ScalarValue::from("b".to_string()), ]; - let expected_plan = "Projection: t.num, t.letter\ + let expected_plan = "Projection: *\ \n SubqueryAlias: t\ \n Projection: column1 AS num, column2 AS letter\ \n Values: (Int64(1), Utf8(\"a\")), (Int64(2), Utf8(\"b\"))"; @@ -4302,7 +4152,7 @@ fn test_table_alias() { (select age from person) t2 \ ) as f"; - let expected = "Projection: f.id, f.age\ + let expected = "Projection: *\ \n SubqueryAlias: f\ \n CrossJoin:\ \n SubqueryAlias: t1\ @@ -4319,7 +4169,7 @@ fn test_table_alias() { (select age from person) t2 \ ) as f (c1, c2)"; - let expected = "Projection: f.c1, f.c2\ + let expected = "Projection: *\ \n SubqueryAlias: f\ \n Projection: t1.id AS c1, t2.age AS c2\ \n CrossJoin:\ diff --git a/datafusion/sqllogictest/src/engines/datafusion_engine/runner.rs b/datafusion/sqllogictest/src/engines/datafusion_engine/runner.rs index afd0a241ca5e..5c24b49cfe86 100644 --- a/datafusion/sqllogictest/src/engines/datafusion_engine/runner.rs +++ b/datafusion/sqllogictest/src/engines/datafusion_engine/runner.rs @@ -15,10 +15,13 @@ // specific language governing permissions and limitations // under the License. +use std::sync::Arc; use std::{path::PathBuf, time::Duration}; use arrow::record_batch::RecordBatch; use async_trait::async_trait; +use datafusion::physical_plan::common::collect; +use datafusion::physical_plan::execute_stream; use datafusion::prelude::SessionContext; use log::info; use sqllogictest::DBOutput; @@ -69,9 +72,12 @@ impl sqllogictest::AsyncDB for DataFusion { async fn run_query(ctx: &SessionContext, sql: impl Into) -> Result { let df = ctx.sql(sql.into().as_str()).await?; + let task_ctx = Arc::new(df.task_ctx()); + let plan = df.create_physical_plan().await?; - let types = normalize::convert_schema_to_types(df.schema().fields()); - let results: Vec = df.collect().await?; + let stream = execute_stream(plan, task_ctx)?; + let types = normalize::convert_schema_to_types(stream.schema().fields()); + let results: Vec = collect(stream).await?; let rows = normalize::convert_batches(results)?; if rows.is_empty() && types.is_empty() { diff --git a/datafusion/sqllogictest/test_files/aggregate_skip_partial.slt b/datafusion/sqllogictest/test_files/aggregate_skip_partial.slt index 6c0cf5f800d8..ba378f4230f8 100644 --- a/datafusion/sqllogictest/test_files/aggregate_skip_partial.slt +++ b/datafusion/sqllogictest/test_files/aggregate_skip_partial.slt @@ -209,6 +209,21 @@ SELECT c2, sum(c3), sum(c11) FROM aggregate_test_100 GROUP BY c2 ORDER BY c2; 4 29 9.531112968922 5 -194 7.074412226677 +# Test avg for tinyint / float +query TRR +SELECT + c1, + avg(c2), + avg(c11) +FROM aggregate_test_100 GROUP BY c1 ORDER BY c1; +---- +a 2.857142857143 0.438223421574 +b 3.263157894737 0.496481208425 +c 2.666666666667 0.425241138254 +d 2.444444444444 0.541519476308 +e 3 0.505440263521 + + # Enabling PG dialect for filtered aggregates tests statement ok set datafusion.sql_parser.dialect = 'Postgres'; @@ -267,6 +282,20 @@ FROM aggregate_test_100_null GROUP BY c2 ORDER BY c2; 4 11 14 5 8 7 +# Test avg for tinyint / float +query TRR +SELECT + c1, + avg(c2) FILTER (WHERE c2 != 5), + avg(c11) FILTER (WHERE c2 != 5) +FROM aggregate_test_100 GROUP BY c1 ORDER BY c1; +---- +a 2.5 0.449071887467 +b 
2.642857142857 0.445486298629 +c 2.421052631579 0.422882117723 +d 2.125 0.518706191331 +e 2.789473684211 0.536785323369 + # Test count with nullable fields and nullable filter query III SELECT c2, diff --git a/datafusion/sqllogictest/test_files/coalesce.slt b/datafusion/sqllogictest/test_files/coalesce.slt index d16b79734c62..0e977666ccfd 100644 --- a/datafusion/sqllogictest/test_files/coalesce.slt +++ b/datafusion/sqllogictest/test_files/coalesce.slt @@ -23,7 +23,7 @@ select coalesce(1, 2, 3); 1 # test with first null -query ?T +query IT select coalesce(null, 3, 2, 1), arrow_typeof(coalesce(null, 3, 2, 1)); ---- 3 Int64 @@ -35,7 +35,7 @@ select coalesce(null, null); NULL # cast to float -query IT +query RT select coalesce(1, 2.0), arrow_typeof(coalesce(1, 2.0)) @@ -51,7 +51,7 @@ select ---- 2 Float64 -query IT +query RT select coalesce(1, arrow_cast(2.0, 'Float32')), arrow_typeof(coalesce(1, arrow_cast(2.0, 'Float32'))) @@ -177,7 +177,7 @@ select 2 Decimal256(22, 2) # coalesce string -query T? +query TT select coalesce('', 'test'), coalesce(null, 'test'); @@ -246,7 +246,7 @@ drop table test1 statement ok create table t(c varchar) as values ('a'), (null); -query TT +query ?T select coalesce(c, arrow_cast('b', 'Dictionary(Int32, Utf8)')), arrow_typeof(coalesce(c, arrow_cast('b', 'Dictionary(Int32, Utf8)'))) @@ -295,7 +295,7 @@ statement ok drop table t; # test dict(int32, int8) -query I +query ? select coalesce(34, arrow_cast(123, 'Dictionary(Int32, Int8)')); ---- 34 diff --git a/datafusion/sqllogictest/test_files/copy.slt b/datafusion/sqllogictest/test_files/copy.slt index ff7040926caa..ebb3ca2173b8 100644 --- a/datafusion/sqllogictest/test_files/copy.slt +++ b/datafusion/sqllogictest/test_files/copy.slt @@ -20,13 +20,13 @@ statement ok create table source_table(col1 integer, col2 varchar) as values (1, 'Foo'), (2, 'Bar'); # Copy to directory as multiple files -query IT +query I COPY source_table TO 'test_files/scratch/copy/table/' STORED AS parquet OPTIONS ('format.compression' 'zstd(10)'); ---- 2 # Copy to directory as partitioned files -query IT +query I COPY source_table TO 'test_files/scratch/copy/partitioned_table1/' STORED AS parquet PARTITIONED BY (col2) OPTIONS ('format.compression' 'zstd(10)'); ---- 2 @@ -53,7 +53,7 @@ select * from validate_partitioned_parquet_bar order by col1; 2 # Copy to directory as partitioned files -query ITT +query I COPY (values (1, 'a', 'x'), (2, 'b', 'y'), (3, 'c', 'z')) TO 'test_files/scratch/copy/partitioned_table2/' STORED AS parquet PARTITIONED BY (column2, column3) OPTIONS ('format.compression' 'zstd(10)'); ---- @@ -81,7 +81,7 @@ select * from validate_partitioned_parquet_a_x order by column1; 1 # Copy to directory as partitioned files -query TTT +query I COPY (values ('1', 'a', 'x'), ('2', 'b', 'y'), ('3', 'c', 'z')) TO 'test_files/scratch/copy/partitioned_table3/' STORED AS parquet PARTITIONED BY (column1, column3) OPTIONS ('format.compression' 'zstd(10)'); ---- @@ -167,7 +167,7 @@ physical_plan 02)--MemoryExec: partitions=1, partition_sizes=[1] # Copy to directory as partitioned files with keep_partition_by_columns enabled -query TT +query I COPY (values ('1', 'a'), ('2', 'b'), ('3', 'c')) TO 'test_files/scratch/copy/partitioned_table4/' STORED AS parquet PARTITIONED BY (column1) OPTIONS (execution.keep_partition_by_columns true); ---- @@ -184,7 +184,7 @@ select column1, column2 from validate_partitioned_parquet4 order by column1,colu 1 a # Copy more files to directory via query -query IT +query I COPY (select * from source_table UNION ALL 
select * from source_table) to 'test_files/scratch/copy/table/' STORED AS PARQUET; ---- 4 @@ -203,7 +203,7 @@ select * from validate_parquet; 1 Foo 2 Bar -query ? +query I copy (values (struct(timestamp '2021-01-01 01:00:01', 1)), (struct(timestamp '2022-01-01 01:00:01', 2)), (struct(timestamp '2023-01-03 01:00:01', 3)), (struct(timestamp '2024-01-01 01:00:01', 4))) to 'test_files/scratch/copy/table_nested2/' STORED AS PARQUET; @@ -221,7 +221,7 @@ select * from validate_parquet_nested2; {c0: 2023-01-03T01:00:01, c1: 3} {c0: 2024-01-01T01:00:01, c1: 4} -query ?? +query I COPY (values (struct ('foo', (struct ('foo', make_array(struct('a',1), struct('b',2))))), make_array(timestamp '2023-01-01 01:00:01',timestamp '2023-01-01 01:00:01')), (struct('bar', (struct ('foo', make_array(struct('aa',10), struct('bb',20))))), make_array(timestamp '2024-01-01 01:00:01', timestamp '2024-01-01 01:00:01'))) @@ -239,7 +239,7 @@ select * from validate_parquet_nested; {c0: foo, c1: {c0: foo, c1: [{c0: a, c1: 1}, {c0: b, c1: 2}]}} [2023-01-01T01:00:01, 2023-01-01T01:00:01] {c0: bar, c1: {c0: foo, c1: [{c0: aa, c1: 10}, {c0: bb, c1: 20}]}} [2024-01-01T01:00:01, 2024-01-01T01:00:01] -query ? +query I copy (values ([struct('foo', 1), struct('bar', 2)])) to 'test_files/scratch/copy/array_of_struct/' STORED AS PARQUET; @@ -255,7 +255,7 @@ select * from validate_array_of_struct; ---- [{c0: foo, c1: 1}, {c0: bar, c1: 2}] -query ? +query I copy (values (struct('foo', [1,2,3], struct('bar', [2,3,4])))) to 'test_files/scratch/copy/struct_with_array/' STORED AS PARQUET; ---- @@ -272,7 +272,7 @@ select * from validate_struct_with_array; # Copy parquet with all supported statement overrides -query IT +query I COPY source_table TO 'test_files/scratch/copy/table_with_options/' STORED AS PARQUET @@ -378,7 +378,7 @@ select * from validate_parquet_with_options; 2 Bar # Copy from table to single file -query IT +query I COPY source_table to 'test_files/scratch/copy/table.parquet'; ---- 2 @@ -394,7 +394,7 @@ select * from validate_parquet_single; 2 Bar # copy from table to folder of compressed json files -query IT +query I COPY source_table to 'test_files/scratch/copy/table_json_gz' STORED AS JSON OPTIONS ('format.compression' gzip); ---- 2 @@ -410,7 +410,7 @@ select * from validate_json_gz; 2 Bar # copy from table to folder of compressed csv files -query IT +query I COPY source_table to 'test_files/scratch/copy/table_csv' STORED AS CSV OPTIONS ('format.has_header' false, 'format.compression' gzip); ---- 2 @@ -426,7 +426,7 @@ select * from validate_csv; 2 Bar # Copy from table to single csv -query IT +query I COPY source_table to 'test_files/scratch/copy/table.csv'; ---- 2 @@ -442,7 +442,7 @@ select * from validate_single_csv; 2 Bar # Copy from table to folder of json -query IT +query I COPY source_table to 'test_files/scratch/copy/table_json' STORED AS JSON; ---- 2 @@ -458,7 +458,7 @@ select * from validate_json; 2 Bar # Copy from table to single json file -query IT +query I COPY source_table to 'test_files/scratch/copy/table.json' STORED AS JSON ; ---- 2 @@ -474,7 +474,7 @@ select * from validate_single_json; 2 Bar # COPY csv files with all options set -query IT +query I COPY source_table to 'test_files/scratch/copy/table_csv_with_options' STORED AS CSV OPTIONS ( @@ -499,7 +499,7 @@ select * from validate_csv_with_options; 2;Bar # Copy from table to single arrow file -query IT +query I COPY source_table to 'test_files/scratch/copy/table.arrow' STORED AS ARROW; ---- 2 @@ -517,7 +517,7 @@ select * from validate_arrow_file; 2 Bar # 
Copy from dict encoded values to single arrow file -query T? +query I COPY (values ('c', arrow_cast('foo', 'Dictionary(Int32, Utf8)')), ('d', arrow_cast('bar', 'Dictionary(Int32, Utf8)'))) to 'test_files/scratch/copy/table_dict.arrow' STORED AS ARROW; @@ -538,7 +538,7 @@ d bar # Copy from table to folder of json -query IT +query I COPY source_table to 'test_files/scratch/copy/table_arrow' STORED AS ARROW; ---- 2 @@ -556,7 +556,7 @@ select * from validate_arrow; # Format Options Support without the 'format.' prefix # Copy with format options for Parquet without the 'format.' prefix -query IT +query I COPY source_table TO 'test_files/scratch/copy/format_table.parquet' OPTIONS ( compression snappy, @@ -566,14 +566,14 @@ OPTIONS ( 2 # Copy with format options for JSON without the 'format.' prefix -query IT +query I COPY source_table to 'test_files/scratch/copy/format_table' STORED AS JSON OPTIONS (compression gzip); ---- 2 # Copy with format options for CSV without the 'format.' prefix -query IT +query I COPY source_table to 'test_files/scratch/copy/format_table.csv' OPTIONS ( has_header false, diff --git a/datafusion/sqllogictest/test_files/csv_files.slt b/datafusion/sqllogictest/test_files/csv_files.slt index f7f5aa54dd0d..3fb9a6f20c24 100644 --- a/datafusion/sqllogictest/test_files/csv_files.slt +++ b/datafusion/sqllogictest/test_files/csv_files.slt @@ -115,14 +115,14 @@ CREATE TABLE src_table_2 ( (7, 'ggg', 700, 2), (8, 'hhh', 800, 2); -query ITII +query I COPY src_table_1 TO 'test_files/scratch/csv_files/csv_partitions/1.csv' STORED AS CSV; ---- 4 -query ITII +query I COPY src_table_2 TO 'test_files/scratch/csv_files/csv_partitions/2.csv' STORED AS CSV; ---- @@ -175,7 +175,7 @@ CREATE TABLE table_with_necessary_quoting ( (4, 'h|h|h'); # quote is required because `|` is delimiter and part of the data -query IT +query I COPY table_with_necessary_quoting TO 'test_files/scratch/csv_files/table_with_necessary_quoting.csv' STORED AS csv OPTIONS ('format.quote' '~', @@ -247,7 +247,7 @@ id2 "value2" id3 "value3" # ensure that double quote option is used when writing to csv -query TT +query I COPY csv_with_double_quote TO 'test_files/scratch/csv_files/table_with_double_quotes.csv' STORED AS csv OPTIONS ('format.double_quote' 'true'); @@ -271,7 +271,7 @@ id2 "value2" id3 "value3" # ensure when double quote option is disabled that quotes are escaped instead -query TT +query I COPY csv_with_double_quote TO 'test_files/scratch/csv_files/table_with_escaped_quotes.csv' STORED AS csv OPTIONS ('format.double_quote' 'false', 'format.escape' '#'); diff --git a/datafusion/sqllogictest/test_files/explain.slt b/datafusion/sqllogictest/test_files/explain.slt index eae4f428b4b4..1e8850efadff 100644 --- a/datafusion/sqllogictest/test_files/explain.slt +++ b/datafusion/sqllogictest/test_files/explain.slt @@ -177,6 +177,7 @@ initial_logical_plan 01)Projection: simple_explain_test.a, simple_explain_test.b, simple_explain_test.c 02)--TableScan: simple_explain_test logical_plan after inline_table_scan SAME TEXT AS ABOVE +logical_plan after expand_wildcard_rule SAME TEXT AS ABOVE logical_plan after type_coercion SAME TEXT AS ABOVE logical_plan after count_wildcard_rule SAME TEXT AS ABOVE analyzed_logical_plan SAME TEXT AS ABOVE diff --git a/datafusion/sqllogictest/test_files/expr.slt b/datafusion/sqllogictest/test_files/expr.slt index 1d5f9ba23d58..3c3b0631e3ff 100644 --- a/datafusion/sqllogictest/test_files/expr.slt +++ b/datafusion/sqllogictest/test_files/expr.slt @@ -22,7 +22,7 @@ SELECT true, false, false = false, 
true = false true false true false # test_mathematical_expressions_with_null -query RRRRRRRRRRRRRRRRRR?RRRRRRRIRRRRRRBB +query RRRRRRRRRRRRRRRRRR?RRRRRIIIRRRRRRBB SELECT sqrt(NULL), cbrt(NULL), @@ -365,7 +365,7 @@ SELECT bit_length('josé') ---- 40 -query ? +query I SELECT bit_length(NULL) ---- NULL @@ -395,7 +395,7 @@ SELECT btrim('\nxyxtrimyyx\n', 'xyz\n') ---- trim -query ? +query T SELECT btrim(NULL, 'xyz') ---- NULL @@ -476,7 +476,7 @@ SELECT initcap('hi THOMAS') ---- Hi Thomas -query ? +query T SELECT initcap(NULL) ---- NULL @@ -491,7 +491,7 @@ SELECT lower('TOM') ---- tom -query ? +query T SELECT lower(NULL) ---- NULL @@ -511,7 +511,7 @@ SELECT ltrim('zzzytest', 'xyz') ---- test -query ? +query T SELECT ltrim(NULL, 'xyz') ---- NULL @@ -531,7 +531,7 @@ SELECT octet_length('josé') ---- 5 -query ? +query I SELECT octet_length(NULL) ---- NULL @@ -551,7 +551,7 @@ SELECT repeat('Pg', CAST(NULL AS INT)) ---- NULL -query ? +query T SELECT repeat(NULL, 4) ---- NULL @@ -576,7 +576,7 @@ SELECT replace('abcdefabcdef', NULL, 'XX') ---- NULL -query ? +query T SELECT replace(NULL, 'cd', 'XX') ---- NULL @@ -596,7 +596,7 @@ SELECT rtrim('testxxzx', 'xyz') ---- test -query ? +query T SELECT rtrim(NULL, 'xyz') ---- NULL @@ -611,7 +611,7 @@ SELECT split_part('abc~@~def~@~ghi', '~@~', 20) ---- (empty) -query ? +query T SELECT split_part(NULL, '~@~', 20) ---- NULL @@ -788,7 +788,7 @@ SELECT upper('tom') ---- TOM -query ? +query T SELECT upper(NULL) ---- NULL @@ -1774,7 +1774,7 @@ SELECT arrow_cast(decode(arrow_cast('746f6d', 'LargeBinary'),'hex'), 'Utf8'); ---- tom -query ? +query T SELECT encode(NULL,'base64'); ---- NULL @@ -1784,7 +1784,7 @@ SELECT decode(NULL,'base64'); ---- NULL -query ? +query T SELECT encode(NULL,'hex'); ---- NULL @@ -1829,7 +1829,7 @@ SELECT md5(''); ---- d41d8cd98f00b204e9800998ecf8427e -query ? +query T SELECT md5(NULL); ---- NULL diff --git a/datafusion/sqllogictest/test_files/functions.slt b/datafusion/sqllogictest/test_files/functions.slt index 8a4855ea2c05..f728942b38c3 100644 --- a/datafusion/sqllogictest/test_files/functions.slt +++ b/datafusion/sqllogictest/test_files/functions.slt @@ -33,7 +33,7 @@ SELECT char_length('josé') ---- 4 -query ? +query I SELECT char_length(NULL) ---- NULL @@ -53,7 +53,7 @@ SELECT character_length('josé') ---- 4 -query ? +query I SELECT character_length(NULL) ---- NULL @@ -93,12 +93,12 @@ SELECT left('abcde', CAST(NULL AS INT)) ---- NULL -query ? +query T SELECT left(NULL, 2) ---- NULL -query ? +query T SELECT left(NULL, CAST(NULL AS INT)) ---- NULL @@ -128,7 +128,7 @@ SELECT length(arrow_cast('josé', 'Dictionary(Int32, Utf8)')) ---- 4 -query ? +query I SELECT length(NULL) ---- NULL @@ -193,12 +193,12 @@ SELECT lpad('xyxhi', 3) ---- xyx -query ? +query T SELECT lpad(NULL, 0) ---- NULL -query ? +query T SELECT lpad(NULL, 5, 'xy') ---- NULL @@ -244,7 +244,7 @@ SELECT reverse('loẅks') ---- sk̈wol -query ? +query T SELECT reverse(NULL) ---- NULL @@ -284,12 +284,12 @@ SELECT right('abcde', CAST(NULL AS INT)) ---- NULL -query ? +query T SELECT right(NULL, 2) ---- NULL -query ? +query T SELECT right(NULL, CAST(NULL AS INT)) ---- NULL @@ -374,7 +374,7 @@ SELECT strpos('joséésoj', 'abc') ---- 0 -query ? +query I SELECT strpos(NULL, 'abc') ---- NULL @@ -455,7 +455,7 @@ SELECT translate(arrow_cast('12345', 'Dictionary(Int32, Utf8)'), '143', 'ax') ---- a2x5 -query ? +query T SELECT translate(NULL, '143', 'ax') ---- NULL @@ -949,12 +949,12 @@ SELECT levenshtein('kitten', NULL) ---- NULL -query ? 
+query I SELECT levenshtein(NULL, 'sitting') ---- NULL -query ? +query I SELECT levenshtein(NULL, NULL) ---- NULL @@ -1041,7 +1041,7 @@ arrow 1 arrow arrow 2 arrow # Test substring_index with NULL values -query ?TT? +query TTTT SELECT substring_index(NULL, '.', 1), substring_index('arrow.apache.org', NULL, 1), @@ -1092,7 +1092,7 @@ docs.apache.com docs com community.influxdata.com community com arrow.apache.org arrow org - +# find_in_set tests query I SELECT find_in_set('b', 'a,b,c,d') ---- @@ -1120,7 +1120,7 @@ SELECT find_in_set('', '') ---- 1 -query ? +query I SELECT find_in_set(NULL, 'a,b,c,d') ---- NULL @@ -1131,11 +1131,28 @@ SELECT find_in_set('a', NULL) NULL -query ? +query I SELECT find_in_set(NULL, NULL) ---- NULL +# find_in_set tests with utf8view +query I +SELECT find_in_set(arrow_cast('b', 'Utf8View'), 'a,b,c,d') +---- +2 + + +query I +SELECT find_in_set('a', arrow_cast('a,b,c,d,a', 'Utf8View')) +---- +1 + +query I +SELECT find_in_set(arrow_cast('', 'Utf8View'), arrow_cast('a,b,c,d,a', 'Utf8View')) +---- +0 + # Verify that multiple calls to volatile functions like `random()` are not combined / optimized away query B SELECT r FROM (SELECT r1 == r2 r, r1, r2 FROM (SELECT random()+1 r1, random()+1 r2) WHERE r1 > 0 AND r2 > 0) diff --git a/datafusion/sqllogictest/test_files/group_by.slt b/datafusion/sqllogictest/test_files/group_by.slt index a4a886c75a77..5571315e2acc 100644 --- a/datafusion/sqllogictest/test_files/group_by.slt +++ b/datafusion/sqllogictest/test_files/group_by.slt @@ -4502,28 +4502,28 @@ CREATE TABLE src_table ( ('2020-12-19T00:00:00.00Z', 9); # Use src_table to create a partitioned file -query PI +query I COPY (SELECT * FROM src_table) TO 'test_files/scratch/group_by/timestamp_table/0.csv' STORED AS CSV; ---- 10 -query PI +query I COPY (SELECT * FROM src_table) TO 'test_files/scratch/group_by/timestamp_table/1.csv' STORED AS CSV; ---- 10 -query PI +query I COPY (SELECT * FROM src_table) TO 'test_files/scratch/group_by/timestamp_table/2.csv' STORED AS CSV; ---- 10 -query PI +query I COPY (SELECT * FROM src_table) TO 'test_files/scratch/group_by/timestamp_table/3.csv' STORED AS CSV; diff --git a/datafusion/sqllogictest/test_files/limit.slt b/datafusion/sqllogictest/test_files/limit.slt index 4cdd40ac8c34..439df7fede51 100644 --- a/datafusion/sqllogictest/test_files/limit.slt +++ b/datafusion/sqllogictest/test_files/limit.slt @@ -518,7 +518,7 @@ drop table aggregate_test_100; ## Test limit pushdown in StreamingTableExec ## Create sorted table with 5 rows -query IT +query I COPY (select * from (values (1, 'a'), (2, 'b'), (3, 'c'), (4, 'd'), (5, 'e') )) TO 'test_files/scratch/limit/data.csv' STORED AS CSV; diff --git a/datafusion/sqllogictest/test_files/map.slt b/datafusion/sqllogictest/test_files/map.slt index eb350c22bb5d..0dc37c68bca4 100644 --- a/datafusion/sqllogictest/test_files/map.slt +++ b/datafusion/sqllogictest/test_files/map.slt @@ -199,25 +199,50 @@ SELECT MAP(arrow_cast(make_array('POST', 'HEAD', 'PATCH'), 'LargeList(Utf8)'), a statement ok create table t as values -('a', 1, 'k1', 10, ['k1', 'k2'], [1, 2]), -('b', 2, 'k3', 30, ['k3'], [3]), -('d', 4, 'k5', 50, ['k5'], [5]); +('a', 1, 'k1', 10, ['k1', 'k2'], [1, 2], 'POST', [[1,2,3]], ['a']), +('b', 2, 'k3', 30, ['k3'], [3], 'PUT', [[4]], ['b']), +('d', 4, 'k5', 50, ['k5'], [5], null, [[1,2]], ['c']); -query error +query ? 
SELECT make_map(column1, column2, column3, column4) FROM t; -# TODO: support array value -# ---- -# {a: 1, k1: 10} -# {b: 2, k3: 30} -# {d: 4, k5: 50} +---- +{a: 1, k1: 10} +{b: 2, k3: 30} +{d: 4, k5: 50} -query error +query ? SELECT map(column5, column6) FROM t; -# TODO: support array value -# ---- -# {k1:1, k2:2} -# {k3: 3} -# {k5: 5} +---- +{k1: 1, k2: 2} +{k3: 3} +{k5: 5} + +query ? +SELECT map(column8, column9) FROM t; +---- +{[1, 2, 3]: a} +{[4]: b} +{[1, 2]: c} + +query error +SELECT map(column6, column7) FROM t; + +query ? +select Map {column6: column7} from t; +---- +{[1, 2]: POST} +{[3]: PUT} +{[5]: } + +query ? +select Map {column8: column7} from t; +---- +{[[1, 2, 3]]: POST} +{[[4]]: PUT} +{[[1, 2]]: } + +query error +select Map {column7: column8} from t; query ? SELECT MAKE_MAP('POST', 41, 'HEAD', 33, 'PATCH', 30, 'OPTION', 29, 'GET', 27, 'PUT', 25, 'DELETE', 24) AS method_count from t; diff --git a/datafusion/sqllogictest/test_files/nvl.slt b/datafusion/sqllogictest/test_files/nvl.slt index c77214cc302a..81e79e1eb5b0 100644 --- a/datafusion/sqllogictest/test_files/nvl.slt +++ b/datafusion/sqllogictest/test_files/nvl.slt @@ -114,7 +114,7 @@ SELECT NVL(1, 3); ---- 1 -query ? +query I SELECT NVL(NULL, NULL); ---- NULL diff --git a/datafusion/sqllogictest/test_files/parquet.slt b/datafusion/sqllogictest/test_files/parquet.slt index 3342f85c8141..34d4ed6ff284 100644 --- a/datafusion/sqllogictest/test_files/parquet.slt +++ b/datafusion/sqllogictest/test_files/parquet.slt @@ -42,7 +42,7 @@ CREATE TABLE src_table ( # Setup 2 files, i.e., as many as there are partitions: # File 1: -query ITID +query I COPY (SELECT * FROM src_table LIMIT 3) TO 'test_files/scratch/parquet/test_table/0.parquet' STORED AS PARQUET; @@ -50,7 +50,7 @@ STORED AS PARQUET; 3 # File 2: -query ITID +query I COPY (SELECT * FROM src_table WHERE int_col > 3 LIMIT 3) TO 'test_files/scratch/parquet/test_table/1.parquet' STORED AS PARQUET; @@ -123,7 +123,7 @@ physical_plan 02)--ParquetExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/1.parquet]]}, projection=[int_col, string_col], output_ordering=[string_col@1 ASC NULLS LAST, int_col@0 ASC NULLS LAST] # Add another file to the directory underlying test_table -query ITID +query I COPY (SELECT * FROM src_table WHERE int_col > 6 LIMIT 3) TO 'test_files/scratch/parquet/test_table/2.parquet' STORED AS PARQUET; @@ -251,31 +251,29 @@ SELECT COUNT(*) FROM timestamp_with_tz; ---- 131072 -# FIXME(#TODO) fails with feature `force_hash_collisions` -# https://github.com/apache/datafusion/issues/11660 # Perform the query: -# query IPT -# SELECT -# count, -# LAG(timestamp, 1) OVER (ORDER BY timestamp), -# arrow_typeof(LAG(timestamp, 1) OVER (ORDER BY timestamp)) -# FROM timestamp_with_tz -# LIMIT 10; -# ---- -# 0 NULL Timestamp(Millisecond, Some("UTC")) -# 0 2014-08-27T14:00:00Z Timestamp(Millisecond, Some("UTC")) -# 0 2014-08-27T14:00:00Z Timestamp(Millisecond, Some("UTC")) -# 4 2014-08-27T14:00:00Z Timestamp(Millisecond, Some("UTC")) -# 0 2014-08-27T14:00:00Z Timestamp(Millisecond, Some("UTC")) -# 0 2014-08-27T14:00:00Z Timestamp(Millisecond, Some("UTC")) -# 0 2014-08-27T14:00:00Z Timestamp(Millisecond, Some("UTC")) -# 14 2014-08-27T14:00:00Z Timestamp(Millisecond, Some("UTC")) -# 0 2014-08-27T14:00:00Z Timestamp(Millisecond, Some("UTC")) -# 0 2014-08-27T14:00:00Z Timestamp(Millisecond, Some("UTC")) +query IPT +SELECT + count, + 
LAG(timestamp, 1) OVER (ORDER BY timestamp), + arrow_typeof(LAG(timestamp, 1) OVER (ORDER BY timestamp)) +FROM timestamp_with_tz +LIMIT 10; +---- +0 NULL Timestamp(Millisecond, Some("UTC")) +0 2014-08-27T14:00:00Z Timestamp(Millisecond, Some("UTC")) +0 2014-08-27T14:00:00Z Timestamp(Millisecond, Some("UTC")) +4 2014-08-27T14:00:00Z Timestamp(Millisecond, Some("UTC")) +0 2014-08-27T14:00:00Z Timestamp(Millisecond, Some("UTC")) +0 2014-08-27T14:00:00Z Timestamp(Millisecond, Some("UTC")) +0 2014-08-27T14:00:00Z Timestamp(Millisecond, Some("UTC")) +14 2014-08-27T14:00:00Z Timestamp(Millisecond, Some("UTC")) +0 2014-08-27T14:00:00Z Timestamp(Millisecond, Some("UTC")) +0 2014-08-27T14:00:00Z Timestamp(Millisecond, Some("UTC")) # Test config listing_table_ignore_subdirectory: -query ITID +query I COPY (SELECT * FROM src_table WHERE int_col > 6 LIMIT 3) TO 'test_files/scratch/parquet/test_table/subdir/3.parquet' STORED AS PARQUET; diff --git a/datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt b/datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt index f7a81f08456f..b68d4f52d21c 100644 --- a/datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt +++ b/datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt @@ -56,7 +56,7 @@ CREATE TABLE src_table ( # Setup 3 files, in particular more files than there are partitions # File 1: -query IITIDII +query I COPY (SELECT * FROM src_table ORDER BY int_col LIMIT 3) TO 'test_files/scratch/parquet_sorted_statistics/test_table/partition_col=A/0.parquet' STORED AS PARQUET; @@ -64,7 +64,7 @@ STORED AS PARQUET; 3 # File 2: -query IITIDII +query I COPY (SELECT * FROM src_table WHERE int_col > 3 ORDER BY int_col LIMIT 3) TO 'test_files/scratch/parquet_sorted_statistics/test_table/partition_col=B/1.parquet' STORED AS PARQUET; @@ -72,7 +72,7 @@ STORED AS PARQUET; 3 # Add another file to the directory underlying test_table -query IITIDII +query I COPY (SELECT * FROM src_table WHERE int_col > 6 ORDER BY int_col LIMIT 3) TO 'test_files/scratch/parquet_sorted_statistics/test_table/partition_col=C/2.parquet' STORED AS PARQUET; diff --git a/datafusion/sqllogictest/test_files/regexp.slt b/datafusion/sqllogictest/test_files/regexp.slt index aa99a54c26ee..149ad7f6fdcd 100644 --- a/datafusion/sqllogictest/test_files/regexp.slt +++ b/datafusion/sqllogictest/test_files/regexp.slt @@ -94,7 +94,7 @@ SELECT regexp_like('aa', '.*-(\d)'); ---- false -query ? +query B SELECT regexp_like(NULL, '.*-(\d)'); ---- NULL @@ -104,7 +104,7 @@ SELECT regexp_like('aaa-0', NULL); ---- NULL -query ? +query B SELECT regexp_like(null, '.*-(\d)'); ---- NULL @@ -294,7 +294,7 @@ SELECT regexp_replace('Thomas', '.[mN]a.', 'M'); ---- ThM -query ? 
+query T SELECT regexp_replace(NULL, 'b(..)', 'X\\1Y', 'g'); ---- NULL diff --git a/datafusion/sqllogictest/test_files/scalar.slt b/datafusion/sqllogictest/test_files/scalar.slt index 188a2c5863e6..6eed72e914bd 100644 --- a/datafusion/sqllogictest/test_files/scalar.slt +++ b/datafusion/sqllogictest/test_files/scalar.slt @@ -706,7 +706,7 @@ select power(2, 0), power(2, 1), power(2, 2); 1 2 4 # power scalar nulls -query R rowsort +query I rowsort select power(null, 64); ---- NULL @@ -718,7 +718,7 @@ select power(2, null); NULL # power scalar nulls #2 -query R rowsort +query I rowsort select power(null, null); ---- NULL @@ -1720,7 +1720,7 @@ CREATE TABLE test( (-14, -14, -14.5, -14.5), (NULL, NULL, NULL, NULL); -query RRRRIR rowsort +query IRRRIR rowsort SELECT power(i32, exp_i) as power_i32, power(i64, exp_f) as power_i64, pow(f32, exp_i) as power_f32, @@ -1895,7 +1895,7 @@ select 100000 where position('legend' in 'league of legend') = 11; 100000 # test null -query ? +query I select position(null in null) ---- NULL diff --git a/datafusion/sqllogictest/test_files/select.slt b/datafusion/sqllogictest/test_files/select.slt index f217cbab074f..49a18ca09de4 100644 --- a/datafusion/sqllogictest/test_files/select.slt +++ b/datafusion/sqllogictest/test_files/select.slt @@ -1195,12 +1195,12 @@ LIMIT 5 200 2000 # Trying to exclude non-existing column should give error -statement error DataFusion error: Schema error: No field named e. Valid fields are table1.a, table1.b, table1.c, table1.d. +statement error SELECT * EXCLUDE e FROM table1 # similarly, except should raise error if excluded column is not in the table -statement error DataFusion error: Schema error: No field named e. Valid fields are table1.a, table1.b, table1.c, table1.d. +statement error SELECT * EXCEPT(e) FROM table1 @@ -1214,7 +1214,7 @@ FROM table1 2 20 20 200 2000 # EXCEPT, or EXCLUDE shouldn't contain duplicate column names -statement error DataFusion error: Error during planning: EXCLUDE or EXCEPT contains duplicate column names +statement error SELECT * EXCLUDE(a, a) FROM table1 diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt index dcc6784bf44a..264f85ff84b9 100644 --- a/datafusion/sqllogictest/test_files/string_view.slt +++ b/datafusion/sqllogictest/test_files/string_view.slt @@ -425,6 +425,50 @@ logical_plan 01)Projection: starts_with(test.column1_utf8view, Utf8View("äöüß")) AS c1, starts_with(test.column1_utf8view, Utf8View("")) AS c2, starts_with(test.column1_utf8view, Utf8View(NULL)) AS c3, starts_with(Utf8View(NULL), test.column1_utf8view) AS c4 02)--TableScan: test projection=[column1_utf8view] +### Initcap + +query TT +EXPLAIN SELECT + INITCAP(column1_utf8view) as c +FROM test; +---- +logical_plan +01)Projection: initcap(test.column1_utf8view) AS c +02)--TableScan: test projection=[column1_utf8view] + +# Create a table with lowercase strings +statement ok +CREATE TABLE test_lowercase AS SELECT + lower(column1_utf8) as column1_utf8_lower, + lower(column1_large_utf8) as column1_large_utf8_lower, + lower(column1_utf8view) as column1_utf8view_lower +FROM test; + +# Test INITCAP with utf8view, utf8, and largeutf8 +# Should not cast anything +query TT +EXPLAIN SELECT + INITCAP(column1_utf8view_lower) as c1, + INITCAP(column1_utf8_lower) as c2, + INITCAP(column1_large_utf8_lower) as c3 +FROM test_lowercase; +---- +logical_plan +01)Projection: initcap(test_lowercase.column1_utf8view_lower) AS c1, initcap(test_lowercase.column1_utf8_lower) AS c2, 
initcap(test_lowercase.column1_large_utf8_lower) AS c3 +02)--TableScan: test_lowercase projection=[column1_utf8_lower, column1_large_utf8_lower, column1_utf8view_lower] + +query TTT +SELECT + INITCAP(column1_utf8view_lower) as c1, + INITCAP(column1_utf8_lower) as c2, + INITCAP(column1_large_utf8_lower) as c3 +FROM test_lowercase; +---- +Andrew Andrew Andrew +Xiangpeng Xiangpeng Xiangpeng +Raphael Raphael Raphael +NULL NULL NULL + # Ensure string functions use native StringView implementation # and do not fall back to Utf8 or LargeUtf8 # Should see no casts to Utf8 in the plans below @@ -519,15 +563,143 @@ SELECT 228 0 NULL ## Ensure no casts for BTRIM +# Test BTRIM with Utf8View input +query TT +EXPLAIN SELECT + BTRIM(column1_utf8view) AS l +FROM test; +---- +logical_plan +01)Projection: btrim(test.column1_utf8view) AS l +02)--TableScan: test projection=[column1_utf8view] + +# Test BTRIM with Utf8View input and Utf8View pattern query TT EXPLAIN SELECT BTRIM(column1_utf8view, 'foo') AS l FROM test; ---- logical_plan -01)Projection: btrim(CAST(test.column1_utf8view AS Utf8), Utf8("foo")) AS l +01)Projection: btrim(test.column1_utf8view, Utf8View("foo")) AS l +02)--TableScan: test projection=[column1_utf8view] + +# Test BTRIM with Utf8View bytes longer than 12 +query TT +EXPLAIN SELECT + BTRIM(column1_utf8view, 'this is longer than 12') AS l +FROM test; +---- +logical_plan +01)Projection: btrim(test.column1_utf8view, Utf8View("this is longer than 12")) AS l +02)--TableScan: test projection=[column1_utf8view] + +# Test BTRIM outputs +query TTTT +SELECT + BTRIM(column1_utf8view, 'foo') AS l1, + BTRIM(column1_utf8view, 'A') AS l2, + BTRIM(column1_utf8view) AS l3, + BTRIM(column1_utf8view, NULL) AS l4 +FROM test; +---- +Andrew ndrew Andrew NULL +Xiangpeng Xiangpeng Xiangpeng NULL +Raphael Raphael Raphael NULL +NULL NULL NULL NULL + +## Ensure no casts for LTRIM +# Test LTRIM with Utf8View input +query TT +EXPLAIN SELECT + LTRIM(column1_utf8view) AS l +FROM test; +---- +logical_plan +01)Projection: ltrim(test.column1_utf8view) AS l +02)--TableScan: test projection=[column1_utf8view] + +# Test LTRIM with Utf8View input and Utf8View pattern +query TT +EXPLAIN SELECT + LTRIM(column1_utf8view, 'foo') AS l +FROM test; +---- +logical_plan +01)Projection: ltrim(test.column1_utf8view, Utf8View("foo")) AS l +02)--TableScan: test projection=[column1_utf8view] + +# Test LTRIM with Utf8View bytes longer than 12 +query TT +EXPLAIN SELECT + LTRIM(column1_utf8view, 'this is longer than 12') AS l +FROM test; +---- +logical_plan +01)Projection: ltrim(test.column1_utf8view, Utf8View("this is longer than 12")) AS l +02)--TableScan: test projection=[column1_utf8view] + +# Test LTRIM outputs +query TTTTT +SELECT + LTRIM(column1_utf8view, 'foo') AS l1, + LTRIM(column1_utf8view, column2_utf8view) AS l2, + LTRIM(column1_utf8view) AS l3, + LTRIM(column1_utf8view, NULL) AS l4, + LTRIM(column1_utf8view, 'Xiang') AS l5 +FROM test; +---- +Andrew Andrew Andrew NULL Andrew +Xiangpeng (empty) Xiangpeng NULL peng +Raphael aphael Raphael NULL Raphael +NULL NULL NULL NULL NULL + +## ensure no casts for RTRIM +# Test RTRIM with Utf8View input +query TT +EXPLAIN SELECT + RTRIM(column1_utf8view) AS l +FROM test; +---- +logical_plan +01)Projection: rtrim(test.column1_utf8view) AS l 02)--TableScan: test projection=[column1_utf8view] +# Test RTRIM with Utf8View input and Utf8View pattern +query TT +EXPLAIN SELECT + RTRIM(column1_utf8view, 'foo') AS l +FROM test; +---- +logical_plan +01)Projection: rtrim(test.column1_utf8view, 
Utf8View("foo")) AS l +02)--TableScan: test projection=[column1_utf8view] + +# Test RTRIM with Utf8View bytes longer than 12 +query TT +EXPLAIN SELECT + RTRIM(column1_utf8view, 'this is longer than 12') AS l +FROM test; +---- +logical_plan +01)Projection: rtrim(test.column1_utf8view, Utf8View("this is longer than 12")) AS l +02)--TableScan: test projection=[column1_utf8view] + +# Test RTRIM outputs +query TTTTT +SELECT + RTRIM(column1_utf8view, 'foo') AS l1, + RTRIM(column1_utf8view, column2_utf8view) AS l2, + RTRIM(column1_utf8view) AS l3, + RTRIM(column1_utf8view, NULL) AS l4, + RTRIM(column1_utf8view, 'peng') As l5 +FROM test; +---- +Andrew Andrew Andrew NULL Andrew +Xiangpeng (empty) Xiangpeng NULL Xia +Raphael Raphael Raphael NULL Raphael +NULL NULL NULL NULL NULL + + ## Ensure no casts for CHARACTER_LENGTH query TT EXPLAIN SELECT @@ -574,7 +746,6 @@ logical_plan 03)----TableScan: test projection=[column1_utf8view, column2_utf8view] ## Ensure no casts for ENDS_WITH -## TODO https://github.com/apache/datafusion/issues/11852 query TT EXPLAIN SELECT ENDS_WITH(column1_utf8view, 'foo') as c1, @@ -582,24 +753,10 @@ EXPLAIN SELECT FROM test; ---- logical_plan -01)Projection: ends_with(CAST(test.column1_utf8view AS Utf8), Utf8("foo")) AS c1, ends_with(__common_expr_1, __common_expr_1) AS c2 -02)--Projection: CAST(test.column2_utf8view AS Utf8) AS __common_expr_1, test.column1_utf8view -03)----TableScan: test projection=[column1_utf8view, column2_utf8view] - - -## Ensure no casts for INITCAP -## TODO https://github.com/apache/datafusion/issues/11853 -query TT -EXPLAIN SELECT - INITCAP(column1_utf8view) as c -FROM test; ----- -logical_plan -01)Projection: initcap(CAST(test.column1_utf8view AS Utf8)) AS c -02)--TableScan: test projection=[column1_utf8view] +01)Projection: ends_with(test.column1_utf8view, Utf8View("foo")) AS c1, ends_with(test.column2_utf8view, test.column2_utf8view) AS c2 +02)--TableScan: test projection=[column1_utf8view, column2_utf8view] ## Ensure no casts for LEVENSHTEIN -## TODO https://github.com/apache/datafusion/issues/11854 query TT EXPLAIN SELECT levenshtein(column1_utf8view, 'foo') as c1, @@ -607,9 +764,8 @@ EXPLAIN SELECT FROM test; ---- logical_plan -01)Projection: levenshtein(__common_expr_1, Utf8("foo")) AS c1, levenshtein(__common_expr_1, CAST(test.column2_utf8view AS Utf8)) AS c2 -02)--Projection: CAST(test.column1_utf8view AS Utf8) AS __common_expr_1, test.column2_utf8view -03)----TableScan: test projection=[column1_utf8view, column2_utf8view] +01)Projection: levenshtein(test.column1_utf8view, Utf8View("foo")) AS c1, levenshtein(test.column1_utf8view, test.column2_utf8view) AS c2 +02)--TableScan: test projection=[column1_utf8view, column2_utf8view] ## Ensure no casts for LOWER ## TODO https://github.com/apache/datafusion/issues/11855 @@ -622,16 +778,6 @@ logical_plan 01)Projection: lower(CAST(test.column1_utf8view AS Utf8)) AS c1 02)--TableScan: test projection=[column1_utf8view] -## Ensure no casts for LTRIM -## TODO https://github.com/apache/datafusion/issues/11856 -query TT -EXPLAIN SELECT - LTRIM(column1_utf8view) as c1 -FROM test; ----- -logical_plan -01)Projection: ltrim(CAST(test.column1_utf8view AS Utf8)) AS c1 -02)--TableScan: test projection=[column1_utf8view] ## Ensure no casts for LPAD query TT @@ -662,14 +808,13 @@ logical_plan 02)--TableScan: test projection=[column1_utf8view, column2_utf8view] ## Ensure no casts for OCTET_LENGTH -## TODO https://github.com/apache/datafusion/issues/11858 query TT EXPLAIN SELECT OCTET_LENGTH(column1_utf8view) as c1 
FROM test; ---- logical_plan -01)Projection: octet_length(CAST(test.column1_utf8view AS Utf8)) AS c1 +01)Projection: octet_length(test.column1_utf8view) AS c1 02)--TableScan: test projection=[column1_utf8view] ## Ensure no casts for OVERLAY @@ -749,30 +894,30 @@ logical_plan 01)Projection: reverse(CAST(test.column1_utf8view AS Utf8)) AS c1 02)--TableScan: test projection=[column1_utf8view] -## Ensure no casts for RTRIM -## TODO file ticket -query TT -EXPLAIN SELECT - RTRIM(column1_utf8view) as c1, - RTRIM(column1_utf8view, 'foo') as c2 -FROM test; ----- -logical_plan -01)Projection: rtrim(__common_expr_1) AS c1, rtrim(__common_expr_1, Utf8("foo")) AS c2 -02)--Projection: CAST(test.column1_utf8view AS Utf8) AS __common_expr_1 -03)----TableScan: test projection=[column1_utf8view] ## Ensure no casts for RIGHT -## TODO file ticket query TT EXPLAIN SELECT RIGHT(column1_utf8view, 3) as c2 FROM test; ---- logical_plan -01)Projection: right(CAST(test.column1_utf8view AS Utf8), Int64(3)) AS c2 +01)Projection: right(test.column1_utf8view, Int64(3)) AS c2 02)--TableScan: test projection=[column1_utf8view] +# Test outputs of RIGHT +query TTT +SELECT + RIGHT(column1_utf8view, 3) as c1, + RIGHT(column1_utf8view, 0) as c2, + RIGHT(column1_utf8view, -3) as c3 +FROM test; +---- +rew (empty) rew +eng (empty) ngpeng +ael (empty) hael +NULL NULL NULL + ## Ensure no casts for RPAD ## TODO file ticket query TT @@ -787,19 +932,6 @@ logical_plan 03)----TableScan: test projection=[column1_utf8view, column2_utf8view] -## Ensure no casts for RTRIM -## TODO file ticket -query TT -EXPLAIN SELECT - RTRIM(column1_utf8view) as c, - RTRIM(column1_utf8view, column2_utf8view) as c1 -FROM test; ----- -logical_plan -01)Projection: rtrim(__common_expr_1) AS c, rtrim(__common_expr_1, CAST(test.column2_utf8view AS Utf8)) AS c1 -02)--Projection: CAST(test.column1_utf8view AS Utf8) AS __common_expr_1, test.column2_utf8view -03)----TableScan: test projection=[column1_utf8view, column2_utf8view] - ## Ensure no casts for SPLIT_PART ## TODO file ticket query TT @@ -860,18 +992,24 @@ logical_plan 02)--TableScan: test projection=[column1_utf8view] ## Ensure no casts for FIND_IN_SET -## TODO file ticket query TT EXPLAIN SELECT FIND_IN_SET(column1_utf8view, 'a,b,c,d') as c FROM test; ---- logical_plan -01)Projection: find_in_set(CAST(test.column1_utf8view AS Utf8), Utf8("a,b,c,d")) AS c +01)Projection: find_in_set(test.column1_utf8view, Utf8View("a,b,c,d")) AS c 02)--TableScan: test projection=[column1_utf8view] - - +query I +SELECT + FIND_IN_SET(column1_utf8view, 'a,b,c,d') as c +FROM test; +---- +0 +0 +0 +NULL statement ok drop table test; diff --git a/datafusion/sqllogictest/test_files/struct.slt b/datafusion/sqllogictest/test_files/struct.slt index caa612f556fe..f3ac6549ad06 100644 --- a/datafusion/sqllogictest/test_files/struct.slt +++ b/datafusion/sqllogictest/test_files/struct.slt @@ -72,6 +72,14 @@ select struct(a, b, c)['c1'] from values; 2.2 3.3 +# explicit invocation of get_field +query R +select get_field(struct(a, b, c), 'c1') from values; +---- +1.1 +2.2 +3.3 + # struct scalar function #1 query ? select struct(1, 3.14, 'e'); @@ -218,9 +226,6 @@ select named_struct('field_a', 1, 'field_b', 2); ---- {field_a: 1, field_b: 2} -statement ok -drop table values; - query T select arrow_typeof(named_struct('first', 1, 'second', 2, 'third', 3)); ---- @@ -236,3 +241,44 @@ query ? 
select {'animal': {'cat': 1, 'dog': 2, 'bird': {'parrot': 3, 'canary': 1}}, 'genre': {'fiction': ['mystery', 'sci-fi', 'fantasy'], 'non-fiction': {'biography': 5, 'history': 7, 'science': {'physics': 2, 'biology': 3}}}, 'vehicle': {'car': {'sedan': 4, 'suv': 2}, 'bicycle': 3, 'boat': ['sailboat', 'motorboat']}, 'weather': {'sunny': True, 'temperature': 25.5, 'wind': {'speed': 10, 'direction': 'NW'}}}; ---- {animal: {cat: 1, dog: 2, bird: {parrot: 3, canary: 1}}, genre: {fiction: [mystery, sci-fi, fantasy], non-fiction: {biography: 5, history: 7, science: {physics: 2, biology: 3}}}, vehicle: {car: {sedan: 4, suv: 2}, bicycle: 3, boat: [sailboat, motorboat]}, weather: {sunny: true, temperature: 25.5, wind: {speed: 10, direction: NW}}} + +# test tuple as struct +query B +select ('x', 'y') = ('x', 'y'); +---- +true + +query B +select ('x', 'y') = ('y', 'x'); +---- +false + +query error DataFusion error: Error during planning: Cannot infer common argument type for comparison operation Struct.* +select ('x', 'y') = ('x', 'y', 'z'); + +query B +select ('x', 'y') IN (('x', 'y')); +---- +true + +query B +select ('x', 'y') IN (('x', 'y'), ('y', 'x')); +---- +true + +query I +select a from values where (a, c) = (1, 'a'); +---- +1 + +query I +select a from values where (a, c) IN ((1, 'a'), (2, 'b')); +---- +1 +2 + +statement ok +drop table values; + +statement ok +drop table struct_values; diff --git a/datafusion/sqllogictest/test_files/type_coercion.slt b/datafusion/sqllogictest/test_files/type_coercion.slt index aa1e6826eca5..e420c0cc7155 100644 --- a/datafusion/sqllogictest/test_files/type_coercion.slt +++ b/datafusion/sqllogictest/test_files/type_coercion.slt @@ -49,3 +49,179 @@ select interval '1 month' - '2023-05-01'::date; # interval - timestamp query error DataFusion error: Error during planning: Cannot coerce arithmetic expression Interval\(MonthDayNano\) \- Timestamp\(Nanosecond, None\) to valid types SELECT interval '1 month' - '2023-05-01 12:30:00'::timestamp; + + +#################################### +## Test type coercion with UNIONs ## +#################################### + +# Disable optimizer to test only the analyzer with type coercion +statement ok +set datafusion.optimizer.max_passes = 0; + +statement ok +set datafusion.explain.logical_plan_only = true; + +# Create test table +statement ok +CREATE TABLE orders( + order_id INT UNSIGNED NOT NULL, + customer_id INT UNSIGNED NOT NULL, + o_item_id VARCHAR NOT NULL, + qty INT NOT NULL, + price DOUBLE NOT NULL, + delivered BOOLEAN NOT NULL +); + +# union_different_num_columns_error() / UNION +query error Error during planning: Union schemas have different number of fields: query 1 has 1 fields whereas query 2 has 2 fields +SELECT order_id FROM orders UNION SELECT customer_id, o_item_id FROM orders + +# union_different_num_columns_error() / UNION ALL +query error Error during planning: Union schemas have different number of fields: query 1 has 1 fields whereas query 2 has 2 fields +SELECT order_id FROM orders UNION ALL SELECT customer_id, o_item_id FROM orders + +# union_with_different_column_names() +query TT +EXPLAIN SELECT order_id from orders UNION ALL SELECT customer_id FROM orders +---- +logical_plan +01)Union +02)--Projection: orders.order_id +03)----TableScan: orders +04)--Projection: orders.customer_id AS order_id +05)----TableScan: orders + +# union_values_with_no_alias() +query TT +EXPLAIN SELECT 1, 2 UNION ALL SELECT 3, 4 +---- +logical_plan +01)Union +02)--Projection: Int64(1) AS Int64(1), Int64(2) AS Int64(2) 
+03)----EmptyRelation +04)--Projection: Int64(3) AS Int64(1), Int64(4) AS Int64(2) +05)----EmptyRelation + +# union_with_incompatible_data_type() +query error Incompatible inputs for Union: Previous inputs were of type Interval\(MonthDayNano\), but got incompatible type Int64 on column 'Int64\(1\)' +SELECT interval '1 year 1 day' UNION ALL SELECT 1 + +# union_with_different_decimal_data_types() +query TT +EXPLAIN SELECT 1 a UNION ALL SELECT 1.1 a +---- +logical_plan +01)Union +02)--Projection: CAST(Int64(1) AS Float64) AS a +03)----EmptyRelation +04)--Projection: Float64(1.1) AS a +05)----EmptyRelation + +# union_with_null() +query TT +EXPLAIN SELECT NULL a UNION ALL SELECT 1.1 a +---- +logical_plan +01)Union +02)--Projection: CAST(NULL AS Float64) AS a +03)----EmptyRelation +04)--Projection: Float64(1.1) AS a +05)----EmptyRelation + +# union_with_float_and_string() +query TT +EXPLAIN SELECT 'a' a UNION ALL SELECT 1.1 a +---- +logical_plan +01)Union +02)--Projection: Utf8("a") AS a +03)----EmptyRelation +04)--Projection: CAST(Float64(1.1) AS Utf8) AS a +05)----EmptyRelation + +# union_with_multiply_cols() +query TT +EXPLAIN SELECT 'a' a, 1 b UNION ALL SELECT 1.1 a, 1.1 b +---- +logical_plan +01)Union +02)--Projection: Utf8("a") AS a, CAST(Int64(1) AS Float64) AS b +03)----EmptyRelation +04)--Projection: CAST(Float64(1.1) AS Utf8) AS a, Float64(1.1) AS b +05)----EmptyRelation + +# sorted_union_with_different_types_and_group_by() +query TT +EXPLAIN SELECT a FROM (select 1 a) x GROUP BY 1 + UNION ALL +(SELECT a FROM (select 1.1 a) x GROUP BY 1) ORDER BY 1 +---- +logical_plan +01)Sort: x.a ASC NULLS LAST +02)--Union +03)----Projection: CAST(x.a AS Float64) AS a +04)------Aggregate: groupBy=[[x.a]], aggr=[[]] +05)--------SubqueryAlias: x +06)----------Projection: Int64(1) AS a +07)------------EmptyRelation +08)----Projection: x.a +09)------Aggregate: groupBy=[[x.a]], aggr=[[]] +10)--------SubqueryAlias: x +11)----------Projection: Float64(1.1) AS a +12)------------EmptyRelation + +# union_with_binary_expr_and_cast() +query TT +EXPLAIN SELECT cast(0.0 + a as integer) FROM (select 1 a) x GROUP BY 1 + UNION ALL +(SELECT 2.1 + a FROM (select 1 a) x GROUP BY 1) +---- +logical_plan +01)Union +02)--Projection: CAST(Float64(0) + x.a AS Float64) AS Float64(0) + x.a +03)----Aggregate: groupBy=[[CAST(Float64(0) + CAST(x.a AS Float64) AS Int32)]], aggr=[[]] +04)------SubqueryAlias: x +05)--------Projection: Int64(1) AS a +06)----------EmptyRelation +07)--Projection: Float64(2.1) + x.a AS Float64(0) + x.a +08)----Aggregate: groupBy=[[Float64(2.1) + CAST(x.a AS Float64)]], aggr=[[]] +09)------SubqueryAlias: x +10)--------Projection: Int64(1) AS a +11)----------EmptyRelation + +# union_with_aliases() +query TT +EXPLAIN SELECT a as a1 FROM (select 1 a) x GROUP BY 1 + UNION ALL +(SELECT a as a1 FROM (select 1.1 a) x GROUP BY 1) +---- +logical_plan +01)Union +02)--Projection: CAST(x.a AS Float64) AS a1 +03)----Aggregate: groupBy=[[x.a]], aggr=[[]] +04)------SubqueryAlias: x +05)--------Projection: Int64(1) AS a +06)----------EmptyRelation +07)--Projection: x.a AS a1 +08)----Aggregate: groupBy=[[x.a]], aggr=[[]] +09)------SubqueryAlias: x +10)--------Projection: Float64(1.1) AS a +11)----------EmptyRelation + +# union_with_incompatible_data_types() +query error Incompatible inputs for Union: Previous inputs were of type Utf8, but got incompatible type Boolean on column 'a' +SELECT 'a' a UNION ALL SELECT true a + +statement ok +SET datafusion.optimizer.max_passes = 3; + +statement ok +SET 
datafusion.explain.logical_plan_only = false; + +statement ok +DROP TABLE orders; + +######################################## +## Test type coercion with UNIONs end ## +######################################## diff --git a/datafusion/sqllogictest/test_files/union.slt b/datafusion/sqllogictest/test_files/union.slt index 476ebe7ebebe..288f99d82c10 100644 --- a/datafusion/sqllogictest/test_files/union.slt +++ b/datafusion/sqllogictest/test_files/union.slt @@ -637,8 +637,54 @@ SELECT t1.v2, t1.v0 FROM t2 NATURAL JOIN t1 SELECT t1.v2, t1.v0 FROM t2 NATURAL JOIN t1 WHERE (t1.v2 IS NULL); ---- +statement ok +CREATE TABLE t3 ( + id INT +) as VALUES + (1), + (2), + (3) +; + +statement ok +CREATE TABLE t4 ( + id TEXT +) as VALUES + ('4'), + ('5'), + ('6') +; + +# test type coersion for wildcard expansion +query T rowsort +(SELECT * FROM t3 ) UNION ALL (SELECT * FROM t4) +---- +1 +2 +3 +4 +5 +6 + statement ok DROP TABLE t1; statement ok DROP TABLE t2; + +statement ok +DROP TABLE t3; + +statement ok +DROP TABLE t4; + +# Test issue: https://github.com/apache/datafusion/issues/11742 +query R rowsort +WITH + tt(v1) AS (VALUES (1::INT),(NULL::INT)) +SELECT NVL(v1, 0.5) FROM tt + UNION ALL +SELECT NULL WHERE FALSE; +---- +0.5 +1 diff --git a/datafusion/sqllogictest/test_files/unnest.slt b/datafusion/sqllogictest/test_files/unnest.slt index 4957011b8ba2..afa576d12746 100644 --- a/datafusion/sqllogictest/test_files/unnest.slt +++ b/datafusion/sqllogictest/test_files/unnest.slt @@ -235,7 +235,7 @@ NULL 10 NULL NULL NULL 17 NULL NULL 18 -query IIII +query IIIT select unnest(column1), unnest(column2) + 2, column3 * 10, unnest(array_remove(column1, '4')) diff --git a/datafusion/sqllogictest/test_files/window.slt b/datafusion/sqllogictest/test_files/window.slt index dfc882667617..ddf6a7aabffc 100644 --- a/datafusion/sqllogictest/test_files/window.slt +++ b/datafusion/sqllogictest/test_files/window.slt @@ -3929,7 +3929,8 @@ b 1 3 a 1 4 b 5 5 -statement error DataFusion error: Error during planning: Projection references non-aggregate values: Expression aggregate_test_100.c1 could not be resolved from available columns: rn +# Schema error: No field named aggregate_test_100.c1. Valid fields are rn. +statement error SELECT * FROM (SELECT c1, c2, ROW_NUMBER() OVER(PARTITION BY c1) as rn FROM aggregate_test_100 diff --git a/dev/release/README.md b/dev/release/README.md index 1817b3002578..397369a41aa3 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -256,20 +256,7 @@ to all of the DataFusion crates. Download and unpack the official release tarball Verify that the Cargo.toml in the tarball contains the correct version -(e.g. `version = "38.0.0"`) and then publish the crates by running the script `release-crates.sh` -in a directory extracted from the source tarball that was voted on. Note that this script doesn't -work if run in a Git repo. - -Alternatively the crates can be published one at a time with the following commands. Crates need to be -published in the correct order as shown in this diagram. - -![](crate-deps.svg) - -_To update this diagram, manually edit the dependencies in [crate-deps.dot](crate-deps.dot) and then run:_ - -```shell -dot -Tsvg dev/release/crate-deps.dot > dev/release/crate-deps.svg -``` +(e.g. 
`version = "38.0.0"`) and then publish the crates by running the following commands ```shell (cd datafusion/common && cargo publish) @@ -283,7 +270,9 @@ dot -Tsvg dev/release/crate-deps.dot > dev/release/crate-deps.svg (cd datafusion/sql && cargo publish) (cd datafusion/optimizer && cargo publish) (cd datafusion/common-runtime && cargo publish) +(cd datafusion/catalog && cargo publish) (cd datafusion/physical-plan && cargo publish) +(cd datafusion/physical-optimizer && cargo publish) (cd datafusion/core && cargo publish) (cd datafusion/proto-common && cargo publish) (cd datafusion/proto && cargo publish) diff --git a/dev/release/crate-deps.dot b/dev/release/crate-deps.dot deleted file mode 100644 index 1d903a56021d..000000000000 --- a/dev/release/crate-deps.dot +++ /dev/null @@ -1,91 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -digraph G { - datafusion_examples - datafusion_examples -> datafusion - datafusion_examples -> datafusion_common - datafusion_examples -> datafusion_expr - datafusion_examples -> datafusion_optimizer - datafusion_examples -> datafusion_physical_expr - datafusion_examples -> datafusion_sql - datafusion_expr - datafusion_expr -> datafusion_common - datafusion_functions - datafusion_functions -> datafusion_common - datafusion_functions -> datafusion_execution - datafusion_functions -> datafusion_expr - datafusion_wasmtest - datafusion_wasmtest -> datafusion - datafusion_wasmtest -> datafusion_common - datafusion_wasmtest -> datafusion_execution - datafusion_wasmtest -> datafusion_expr - datafusion_wasmtest -> datafusion_optimizer - datafusion_wasmtest -> datafusion_physical_expr - datafusion_wasmtest -> datafusion_physical_plan - datafusion_wasmtest -> datafusion_sql - datafusion_common - datafusion_sql - datafusion_sql -> datafusion_common - datafusion_sql -> datafusion_expr - datafusion_physical_plan - datafusion_physical_plan -> datafusion_common - datafusion_physical_plan -> datafusion_execution - datafusion_physical_plan -> datafusion_expr - datafusion_physical_plan -> datafusion_physical_expr - datafusion_benchmarks - datafusion_benchmarks -> datafusion - datafusion_benchmarks -> datafusion_common - datafusion_benchmarks -> datafusion_proto - datafusion_docs_tests - datafusion_docs_tests -> datafusion - datafusion_optimizer - datafusion_optimizer -> datafusion_common - datafusion_optimizer -> datafusion_expr - datafusion_optimizer -> datafusion_physical_expr - datafusion_optimizer -> datafusion_sql - datafusion_proto - datafusion_proto -> datafusion - datafusion_proto -> datafusion_common - datafusion_proto -> datafusion_expr - datafusion_physical_expr - datafusion_physical_expr -> datafusion_common - datafusion_physical_expr -> datafusion_execution - datafusion_physical_expr -> datafusion_expr - 
datafusion_sqllogictest - datafusion_sqllogictest -> datafusion - datafusion_sqllogictest -> datafusion_common - datafusion - datafusion -> datafusion_common - datafusion -> datafusion_execution - datafusion -> datafusion_expr - datafusion -> datafusion_functions - datafusion -> datafusion_functions_nested - datafusion -> datafusion_optimizer - datafusion -> datafusion_physical_expr - datafusion -> datafusion_physical_plan - datafusion -> datafusion_sql - datafusion_functions_nested - datafusion_functions_nested -> datafusion_common - datafusion_functions_nested -> datafusion_execution - datafusion_functions_nested -> datafusion_expr - datafusion_execution - datafusion_execution -> datafusion_common - datafusion_execution -> datafusion_expr - datafusion_substrait - datafusion_substrait -> datafusion -} \ No newline at end of file diff --git a/dev/release/crate-deps.svg b/dev/release/crate-deps.svg deleted file mode 100644 index c76fe3abb4ac..000000000000 --- a/dev/release/crate-deps.svg +++ /dev/null @@ -1,445 +0,0 @@
[445 deleted lines of dev/release/crate-deps.svg omitted: the Graphviz-rendered SVG of the crate dependency graph defined by crate-deps.dot above; only its node and edge labels survived extraction, so the SVG markup is summarized here rather than reproduced.]
diff --git a/dev/release/release-crates.sh b/dev/release/release-crates.sh deleted file mode 100644 index b9bda68b780b..000000000000 --- a/dev/release/release-crates.sh +++ /dev/null @@ -1,45 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -# This script publishes datafusion crates to crates.io. -# -# This script should only be run after the release has been approved -# by the Apache DataFusion PMC committee. -# -# See release/README.md for full release instructions - -set -eu - -# Do not run inside a git repo -if !
-if ! [ git rev-parse --is-inside-work-tree ]; then
- cd datafusion/common && cargo publish
- cd datafusion/expr && cargo publish
- cd datafusion/sql && cargo publish
- cd datafusion/physical-expr && cargo publish
- cd datafusion/optimizer && cargo publish
- cd datafusion/core && cargo publish
- cd datafusion/proto && cargo publish
- cd datafusion/execution && cargo publish
- cd datafusion/substrait && cargo publish
- cd datafusion-cli && cargo publish --no-verify
-else
- echo "Crates must be released from the source tarball that was voted on, not from the repo"
- exit 1
-fi
diff --git a/docs/source/contributor-guide/howtos.md b/docs/source/contributor-guide/howtos.md
index 254b1de6521e..4e52a2fbcaa6 100644
--- a/docs/source/contributor-guide/howtos.md
+++ b/docs/source/contributor-guide/howtos.md
@@ -24,7 +24,7 @@ Below is a checklist of what you need to do to add a new scalar function to
DataFusion:
- Add the actual implementation of the function to a new module file within:
- - [here](https://github.com/apache/datafusion/tree/main/datafusion/functions-array) for array functions
+ - [here](https://github.com/apache/datafusion/tree/main/datafusion/functions-nested) for array, map and struct functions
- [here](https://github.com/apache/datafusion/tree/main/datafusion/functions/src/crypto) for crypto functions
- [here](https://github.com/apache/datafusion/tree/main/datafusion/functions/src/datetime) for datetime functions
- [here](https://github.com/apache/datafusion/tree/main/datafusion/functions/src/encoding) for encoding functions