diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index ab6a615ab60b..44ca5aaf4eda 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -61,4 +61,4 @@ jobs: git add --all git commit -m 'Publish built docs triggered by ${{ github.sha }}' git push || git push --force - fi + fi \ No newline at end of file diff --git a/benchmarks/src/parquet_filter.rs b/benchmarks/src/parquet_filter.rs index eb9e09a7cb7c..5c98a2f8be3d 100644 --- a/benchmarks/src/parquet_filter.rs +++ b/benchmarks/src/parquet_filter.rs @@ -15,8 +15,10 @@ // specific language governing permissions and limitations // under the License. -use crate::AccessLogOpt; -use crate::{BenchmarkRun, CommonOpt}; +use std::path::PathBuf; + +use crate::{AccessLogOpt, BenchmarkRun, CommonOpt}; + use arrow::util::pretty; use datafusion::common::Result; use datafusion::logical_expr::utils::disjunction; @@ -25,7 +27,7 @@ use datafusion::physical_plan::collect; use datafusion::prelude::{col, SessionContext}; use datafusion::test_util::parquet::{ParquetScanOptions, TestParquetFile}; use datafusion_common::instant::Instant; -use std::path::PathBuf; + use structopt::StructOpt; /// Test performance of parquet filter pushdown @@ -179,7 +181,7 @@ async fn exec_scan( debug: bool, ) -> Result<(usize, std::time::Duration)> { let start = Instant::now(); - let exec = test_file.create_scan(Some(filter)).await?; + let exec = test_file.create_scan(ctx, Some(filter)).await?; let task_ctx = ctx.task_ctx(); let result = collect(exec, task_ctx).await?; diff --git a/benchmarks/src/sort.rs b/benchmarks/src/sort.rs index bda0f4ae3f43..19eec2949ef6 100644 --- a/benchmarks/src/sort.rs +++ b/benchmarks/src/sort.rs @@ -15,9 +15,11 @@ // specific language governing permissions and limitations // under the License. -use crate::AccessLogOpt; -use crate::BenchmarkRun; -use crate::CommonOpt; +use std::path::PathBuf; +use std::sync::Arc; + +use crate::{AccessLogOpt, BenchmarkRun, CommonOpt}; + use arrow::util::pretty; use datafusion::common::Result; use datafusion::physical_expr::PhysicalSortExpr; @@ -26,8 +28,7 @@ use datafusion::physical_plan::sorts::sort::SortExec; use datafusion::prelude::{SessionConfig, SessionContext}; use datafusion::test_util::parquet::TestParquetFile; use datafusion_common::instant::Instant; -use std::path::PathBuf; -use std::sync::Arc; + use structopt::StructOpt; /// Test performance of sorting large datasets @@ -174,7 +175,7 @@ async fn exec_sort( debug: bool, ) -> Result<(usize, std::time::Duration)> { let start = Instant::now(); - let scan = test_file.create_scan(None).await?; + let scan = test_file.create_scan(ctx, None).await?; let exec = Arc::new(SortExec::new(expr.to_owned(), scan)); let task_ctx = ctx.task_ctx(); let result = collect(exec, task_ctx).await?; diff --git a/benchmarks/src/tpch/run.rs b/benchmarks/src/tpch/run.rs index 5497315fa3ba..564a2f05b6fe 100644 --- a/benchmarks/src/tpch/run.rs +++ b/benchmarks/src/tpch/run.rs @@ -15,8 +15,14 @@ // specific language governing permissions and limitations // under the License. 
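The two benchmark hunks above change `TestParquetFile::create_scan` so it now receives the session context in addition to the optional pushdown filter. A minimal sketch of the new call shape, mirroring `exec_scan` above (the helper name `scan_with_filter` is illustrative and not part of the patch):

```rust
use datafusion::common::Result;
use datafusion::physical_plan::collect;
use datafusion::prelude::{Expr, SessionContext};
use datafusion::test_util::parquet::TestParquetFile;

// Mirrors `exec_scan` above: the session context is now threaded through to
// `create_scan` alongside the optional pushdown filter.
async fn scan_with_filter(
    ctx: &SessionContext,
    test_file: &TestParquetFile,
    filter: Expr,
) -> Result<usize> {
    let exec = test_file.create_scan(ctx, Some(filter)).await?;
    let batches = collect(exec, ctx.task_ctx()).await?;
    Ok(batches.iter().map(|b| b.num_rows()).sum())
}
```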
-use super::get_query_sql; +use std::path::PathBuf; +use std::sync::Arc; + +use super::{ + get_query_sql, get_tbl_tpch_table_schema, get_tpch_table_schema, TPCH_TABLES, +}; use crate::{BenchmarkRun, CommonOpt}; + use arrow::record_batch::RecordBatch; use arrow::util::pretty::{self, pretty_format_batches}; use datafusion::datasource::file_format::csv::CsvFormat; @@ -26,21 +32,16 @@ use datafusion::datasource::listing::{ ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl, }; use datafusion::datasource::{MemTable, TableProvider}; +use datafusion::error::Result; use datafusion::physical_plan::display::DisplayableExecutionPlan; use datafusion::physical_plan::{collect, displayable}; -use datafusion_common::{DEFAULT_CSV_EXTENSION, DEFAULT_PARQUET_EXTENSION}; -use log::info; - +use datafusion::prelude::*; use datafusion_common::instant::Instant; -use std::path::PathBuf; -use std::sync::Arc; +use datafusion_common::{DEFAULT_CSV_EXTENSION, DEFAULT_PARQUET_EXTENSION}; -use datafusion::error::Result; -use datafusion::prelude::*; +use log::info; use structopt::StructOpt; -use super::{get_tbl_tpch_table_schema, get_tpch_table_schema, TPCH_TABLES}; - /// Run the tpch benchmark. /// /// This benchmarks is derived from the [TPC-H][1] version @@ -253,7 +254,7 @@ impl RunOpt { } "parquet" => { let path = format!("{path}/{table}"); - let format = ParquetFormat::default().with_enable_pruning(Some(true)); + let format = ParquetFormat::default().with_enable_pruning(true); (Arc::new(format), path, DEFAULT_PARQUET_EXTENSION) } @@ -298,11 +299,12 @@ struct QueryResult { // Only run with "ci" mode when we have the data #[cfg(feature = "ci")] mod tests { + use std::path::Path; + use super::*; + use datafusion::common::exec_err; use datafusion::error::{DataFusionError, Result}; - use std::path::Path; - use datafusion_proto::bytes::{ logical_plan_from_bytes, logical_plan_to_bytes, physical_plan_from_bytes, physical_plan_to_bytes, diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 6efb657ea899..a0f68c76e4a8 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -1238,8 +1238,8 @@ dependencies = [ "datafusion-common", "paste", "sqlparser", - "strum 0.26.1", - "strum_macros 0.26.1", + "strum 0.26.2", + "strum_macros 0.26.2", ] [[package]] @@ -1265,6 +1265,9 @@ name = "datafusion-functions-array" version = "36.0.0" dependencies = [ "arrow", + "arrow-array", + "arrow-buffer", + "arrow-schema", "datafusion-common", "datafusion-execution", "datafusion-expr", @@ -1363,7 +1366,7 @@ dependencies = [ "datafusion-expr", "log", "sqlparser", - "strum 0.26.1", + "strum 0.26.2", ] [[package]] @@ -3257,11 +3260,11 @@ checksum = "290d54ea6f91c969195bdbcd7442c8c2a2ba87da8bf60a7ee86a235d4bc1e125" [[package]] name = "strum" -version = "0.26.1" +version = "0.26.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "723b93e8addf9aa965ebe2d11da6d7540fa2283fcea14b3371ff055f7ba13f5f" +checksum = "5d8cec3501a5194c432b2b7976db6b7d10ec95c253208b45f83f7136aa985e29" dependencies = [ - "strum_macros 0.26.1", + "strum_macros 0.26.2", ] [[package]] @@ -3279,9 +3282,9 @@ dependencies = [ [[package]] name = "strum_macros" -version = "0.26.1" +version = "0.26.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a3417fc93d76740d974a01654a09777cb500428cc874ca9f45edfe0c4d4cd18" +checksum = "c6cf59daf282c0a494ba14fd21610a0325f9f90ec9d1231dea26bcb1d696c946" dependencies = [ "heck", "proc-macro2", diff --git a/datafusion-cli/src/catalog.rs 
b/datafusion-cli/src/catalog.rs index bcedf7248cec..a8ecb98637cb 100644 --- a/datafusion-cli/src/catalog.rs +++ b/datafusion-cli/src/catalog.rs @@ -15,8 +15,11 @@ // specific language governing permissions and limitations // under the License. -use crate::object_storage::get_object_store; -use async_trait::async_trait; +use std::any::Any; +use std::sync::{Arc, Weak}; + +use crate::object_storage::{get_object_store, AwsOptions, GcpOptions}; + use datafusion::catalog::schema::SchemaProvider; use datafusion::catalog::{CatalogProvider, CatalogProviderList}; use datafusion::common::plan_datafusion_err; @@ -26,12 +29,10 @@ use datafusion::datasource::listing::{ use datafusion::datasource::TableProvider; use datafusion::error::Result; use datafusion::execution::context::SessionState; + +use async_trait::async_trait; use dirs::home_dir; use parking_lot::RwLock; -use std::any::Any; -use std::collections::HashMap; -use std::sync::{Arc, Weak}; -use url::Url; /// Wraps another catalog, automatically creating table providers /// for local files if needed @@ -155,7 +156,7 @@ impl SchemaProvider for DynamicFileSchemaProvider { // if the inner schema provider didn't have a table by // that name, try to treat it as a listing table - let state = self + let mut state = self .state .upgrade() .ok_or_else(|| plan_datafusion_err!("locking error"))? @@ -163,7 +164,8 @@ impl SchemaProvider for DynamicFileSchemaProvider { .clone(); let optimized_name = substitute_tilde(name.to_owned()); let table_url = ListingTableUrl::parse(optimized_name.as_str())?; - let url: &Url = table_url.as_ref(); + let scheme = table_url.scheme(); + let url = table_url.as_ref(); // If the store is already registered for this URL then `get_store` // will return `Ok` which means we don't need to register it again. However, @@ -174,10 +176,22 @@ impl SchemaProvider for DynamicFileSchemaProvider { Err(_) => { // Register the store for this URL. Here we don't have access // to any command options so the only choice is to use an empty collection - let mut options = HashMap::new(); - let store = - get_object_store(&state, &mut options, table_url.scheme(), url) - .await?; + match scheme { + "s3" | "oss" => { + state = state.add_table_options_extension(AwsOptions::default()); + } + "gs" | "gcs" => { + state = state.add_table_options_extension(GcpOptions::default()) + } + _ => {} + }; + let store = get_object_store( + &state, + table_url.scheme(), + url, + state.default_table_options(), + ) + .await?; state.runtime_env().register_object_store(url, store); } } @@ -215,6 +229,7 @@ fn substitute_tilde(cur: String) -> String { #[cfg(test)] mod tests { use super::*; + use datafusion::catalog::schema::SchemaProvider; use datafusion::prelude::SessionContext; diff --git a/datafusion-cli/src/exec.rs b/datafusion-cli/src/exec.rs index 4eae5ffdd7e7..b11f1c202284 100644 --- a/datafusion-cli/src/exec.rs +++ b/datafusion-cli/src/exec.rs @@ -17,7 +17,6 @@ //! 
Execution functions -use datafusion_common::instant::Instant; use std::collections::HashMap; use std::fs::File; use std::io::prelude::*; @@ -27,15 +26,15 @@ use crate::print_format::PrintFormat; use crate::{ command::{Command, OutputFormat}, helper::{unescape_input, CliHelper}, - object_storage::get_object_store, + object_storage::{get_object_store, register_options}, print_options::{MaxRows, PrintOptions}, }; +use datafusion::common::instant::Instant; use datafusion::common::plan_datafusion_err; use datafusion::datasource::listing::ListingTableUrl; use datafusion::error::{DataFusionError, Result}; -use datafusion::logical_expr::dml::CopyTo; -use datafusion::logical_expr::{CreateExternalTable, DdlStatement, LogicalPlan}; +use datafusion::logical_expr::{DdlStatement, LogicalPlan}; use datafusion::physical_plan::{collect, execute_stream, ExecutionPlanProperties}; use datafusion::prelude::SessionContext; use datafusion::sql::parser::{DFParser, Statement}; @@ -44,7 +43,6 @@ use datafusion::sql::sqlparser::dialect::dialect_from_str; use rustyline::error::ReadlineError; use rustyline::Editor; use tokio::signal; -use url::Url; /// run and execute SQL statements and commands, against a context with the given print options pub async fn exec_from_commands( @@ -258,42 +256,74 @@ async fn create_plan( // Note that cmd is a mutable reference so that create_external_table function can remove all // datafusion-cli specific options before passing through to datafusion. Otherwise, datafusion // will raise Configuration errors. - if let LogicalPlan::Ddl(DdlStatement::CreateExternalTable(cmd)) = &mut plan { - create_external_table(ctx, cmd).await?; + if let LogicalPlan::Ddl(DdlStatement::CreateExternalTable(cmd)) = &plan { + register_object_store_and_config_extensions(ctx, &cmd.location, &cmd.options) + .await?; } if let LogicalPlan::Copy(copy_to) = &mut plan { - register_object_store(ctx, copy_to).await?; + register_object_store_and_config_extensions( + ctx, + &copy_to.output_url, + &copy_to.options, + ) + .await?; } Ok(plan) } -async fn register_object_store( - ctx: &SessionContext, - copy_to: &mut CopyTo, -) -> Result<(), DataFusionError> { - let url = ListingTableUrl::parse(copy_to.output_url.as_str())?; - let store = get_object_store( - &ctx.state(), - &mut HashMap::new(), - url.scheme(), - url.as_ref(), - ) - .await?; - ctx.runtime_env().register_object_store(url.as_ref(), store); - Ok(()) -} - -async fn create_external_table( +/// Asynchronously registers an object store and its configuration extensions +/// to the session context. +/// +/// This function dynamically registers a cloud object store based on the given +/// location and options. It first parses the location to determine the scheme +/// and constructs the URL accordingly. Depending on the scheme, it also registers +/// relevant options. The function then alters the default table options with the +/// given custom options. Finally, it retrieves and registers the object store +/// in the session context. +/// +/// # Parameters +/// +/// * `ctx`: A reference to the `SessionContext` for registering the object store. +/// * `location`: A string reference representing the location of the object store. +/// * `options`: A reference to a hash map containing configuration options for +/// the object store. +/// +/// # Returns +/// +/// A `Result<()>` which is an Ok value indicating successful registration, or +/// an error upon failure.
+/// +/// # Errors +/// +/// This function can return an error if the location parsing fails, options +/// alteration fails, or if the object store cannot be retrieved and registered +/// successfully. +pub(crate) async fn register_object_store_and_config_extensions( ctx: &SessionContext, - cmd: &mut CreateExternalTable, + location: &String, + options: &HashMap, ) -> Result<()> { - let table_path = ListingTableUrl::parse(&cmd.location)?; + // Parse the location URL to extract the scheme and other components + let table_path = ListingTableUrl::parse(location)?; + + // Extract the scheme (e.g., "s3", "gcs") from the parsed URL let scheme = table_path.scheme(); - let url: &Url = table_path.as_ref(); - // registering the cloud object store dynamically using cmd.options - let store = get_object_store(&ctx.state(), &mut cmd.options, scheme, url).await?; + // Obtain a reference to the URL + let url = table_path.as_ref(); + + // Register the options based on the scheme extracted from the location + register_options(ctx, scheme); + + // Clone and modify the default table options based on the provided options + let mut table_options = ctx.state().default_table_options().clone(); + table_options.alter_with_string_hash_map(options)?; + + // Retrieve the appropriate object store based on the scheme, URL, and modified table options + let store = get_object_store(&ctx.state(), scheme, url, &table_options).await?; + + // Register the retrieved object store in the session context's runtime environment ctx.runtime_env().register_object_store(url, store); Ok(()) @@ -301,33 +331,48 @@ async fn create_external_table( #[cfg(test)] mod tests { - use std::str::FromStr; - use super::*; - use datafusion::common::{plan_err, FileType, FileTypeWriterOptions}; - use datafusion_common::file_options::StatementOptions; + use datafusion_common::config::FormatOptions; + use datafusion_common::plan_err; + + use url::Url; async fn create_external_table_test(location: &str, sql: &str) -> Result<()> { let ctx = SessionContext::new(); - let mut plan = ctx.state().create_logical_plan(sql).await?; - - if let LogicalPlan::Ddl(DdlStatement::CreateExternalTable(cmd)) = &mut plan { - create_external_table(&ctx, cmd).await?; - let options: Vec<_> = cmd - .options - .iter() - .map(|(k, v)| (k.clone(), v.clone())) - .collect(); - let statement_options = StatementOptions::new(options); - let file_type = - datafusion_common::FileType::from_str(cmd.file_type.as_str())?; - - let _file_type_writer_options = FileTypeWriterOptions::build( - &file_type, - ctx.state().config_options(), - &statement_options, - )?; + let plan = ctx.state().create_logical_plan(sql).await?; + + if let LogicalPlan::Ddl(DdlStatement::CreateExternalTable(cmd)) = &plan { + register_object_store_and_config_extensions( + &ctx, + &cmd.location, + &cmd.options, + ) + .await?; + } else { + return plan_err!("LogicalPlan is not a CreateExternalTable"); + } + + // Ensure the URL is supported by the object store + ctx.runtime_env() + .object_store(ListingTableUrl::parse(location)?)?; + + Ok(()) + } + + async fn copy_to_table_test(location: &str, sql: &str) -> Result<()> { + let ctx = SessionContext::new(); + // AWS CONFIG register. 
+ + let plan = ctx.state().create_logical_plan(sql).await?; + + if let LogicalPlan::Copy(cmd) = &plan { + register_object_store_and_config_extensions( + &ctx, + &cmd.output_url, + &cmd.options, + ) + .await?; } else { return plan_err!("LogicalPlan is not a CreateExternalTable"); } @@ -374,7 +419,7 @@ mod tests { let mut plan = create_plan(&mut ctx, statement).await?; if let LogicalPlan::Copy(copy_to) = &mut plan { assert_eq!(copy_to.output_url, location); - assert_eq!(copy_to.file_format, FileType::PARQUET); + assert!(matches!(copy_to.format_options, FormatOptions::PARQUET(_))); ctx.runtime_env() .object_store_registry .get_store(&Url::parse(©_to.output_url).unwrap())?; @@ -386,6 +431,20 @@ mod tests { Ok(()) } + #[tokio::test] + async fn copy_to_object_store_table_s3() -> Result<()> { + let access_key_id = "fake_access_key_id"; + let secret_access_key = "fake_secret_access_key"; + let location = "s3://bucket/path/file.parquet"; + + // Missing region, use object_store defaults + let sql = format!("COPY (values (1,2)) TO '{location}' + (format parquet, 'aws.access_key_id' '{access_key_id}', 'aws.secret_access_key' '{secret_access_key}')"); + copy_to_table_test(location, &sql).await?; + + Ok(()) + } + #[tokio::test] async fn create_object_store_table_s3() -> Result<()> { let access_key_id = "fake_access_key_id"; @@ -396,12 +455,12 @@ mod tests { // Missing region, use object_store defaults let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET - OPTIONS('access_key_id' '{access_key_id}', 'secret_access_key' '{secret_access_key}') LOCATION '{location}'"); + OPTIONS('aws.access_key_id' '{access_key_id}', 'aws.secret_access_key' '{secret_access_key}') LOCATION '{location}'"); create_external_table_test(location, &sql).await?; // Should be OK let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET - OPTIONS('access_key_id' '{access_key_id}', 'secret_access_key' '{secret_access_key}', 'region' '{region}', 'session_token' '{session_token}') LOCATION '{location}'"); + OPTIONS('aws.access_key_id' '{access_key_id}', 'aws.secret_access_key' '{secret_access_key}', 'aws.region' '{region}', 'aws.session_token' '{session_token}') LOCATION '{location}'"); create_external_table_test(location, &sql).await?; Ok(()) @@ -416,7 +475,7 @@ mod tests { // Should be OK let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET - OPTIONS('access_key_id' '{access_key_id}', 'secret_access_key' '{secret_access_key}', 'endpoint' '{endpoint}') LOCATION '{location}'"); + OPTIONS('aws.access_key_id' '{access_key_id}', 'aws.secret_access_key' '{secret_access_key}', 'aws.oss.endpoint' '{endpoint}') LOCATION '{location}'"); create_external_table_test(location, &sql).await?; Ok(()) @@ -432,14 +491,14 @@ mod tests { // for service_account_path let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET - OPTIONS('service_account_path' '{service_account_path}') LOCATION '{location}'"); + OPTIONS('gcp.service_account_path' '{service_account_path}') LOCATION '{location}'"); let err = create_external_table_test(location, &sql) .await .unwrap_err(); assert!(err.to_string().contains("os error 2")); // for service_account_key - let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET OPTIONS('service_account_key' '{service_account_key}') LOCATION '{location}'"); + let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET OPTIONS('gcp.service_account_key' '{service_account_key}') LOCATION '{location}'"); let err = create_external_table_test(location, &sql) .await .unwrap_err() @@ -448,7 +507,7 @@ 
mod tests { // for application_credentials_path let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET - OPTIONS('application_credentials_path' '{application_credentials_path}') LOCATION '{location}'"); + OPTIONS('gcp.application_credentials_path' '{application_credentials_path}') LOCATION '{location}'"); let err = create_external_table_test(location, &sql) .await .unwrap_err(); diff --git a/datafusion-cli/src/object_storage.rs b/datafusion-cli/src/object_storage.rs index 897f3796550d..033c8f839ab2 100644 --- a/datafusion-cli/src/object_storage.rs +++ b/datafusion-cli/src/object_storage.rs @@ -15,40 +15,41 @@ // specific language governing permissions and limitations // under the License. -use async_trait::async_trait; -use aws_credential_types::provider::ProvideCredentials; -use datafusion::common::exec_datafusion_err; +use std::any::Any; +use std::fmt::{Debug, Display}; +use std::sync::Arc; + +use datafusion::common::{config_namespace, exec_datafusion_err, exec_err, internal_err}; use datafusion::error::{DataFusionError, Result}; use datafusion::execution::context::SessionState; -use object_store::aws::AwsCredential; -use object_store::http::HttpBuilder; -use object_store::ObjectStore; -use object_store::{ - aws::AmazonS3Builder, gcp::GoogleCloudStorageBuilder, CredentialProvider, +use datafusion::prelude::SessionContext; +use datafusion_common::config::{ + ConfigEntry, ConfigExtension, ConfigField, ExtensionOptions, TableOptions, Visit, }; -use std::collections::HashMap; -use std::sync::Arc; + +use async_trait::async_trait; +use aws_credential_types::provider::ProvideCredentials; +use object_store::aws::{AmazonS3Builder, AwsCredential}; +use object_store::gcp::GoogleCloudStorageBuilder; +use object_store::http::HttpBuilder; +use object_store::{CredentialProvider, ObjectStore}; use url::Url; pub async fn get_s3_object_store_builder( url: &Url, - options: &mut HashMap, + aws_options: &AwsOptions, ) -> Result { let bucket_name = get_bucket_name(url)?; let mut builder = AmazonS3Builder::from_env().with_bucket_name(bucket_name); - if let (Some(access_key_id), Some(secret_access_key)) = ( - // These options are datafusion-cli specific and must be removed before passing through to datafusion. - // Otherwise, a Configuration error will be raised. - options.remove("access_key_id"), - options.remove("secret_access_key"), - ) { - println!("removing secret access key!"); + if let (Some(access_key_id), Some(secret_access_key)) = + (&aws_options.access_key_id, &aws_options.secret_access_key) + { builder = builder .with_access_key_id(access_key_id) .with_secret_access_key(secret_access_key); - if let Some(session_token) = options.remove("session_token") { + if let Some(session_token) = &aws_options.session_token { builder = builder.with_token(session_token); } } else { @@ -62,7 +63,7 @@ pub async fn get_s3_object_store_builder( .ok_or_else(|| { DataFusionError::ObjectStore(object_store::Error::Generic { store: "S3", - source: "Failed to get S3 credentials from environment".into(), + source: "Failed to get S3 credentials from the environment".into(), }) })? 
.clone(); @@ -71,7 +72,7 @@ builder = builder.with_credentials(credentials); } - if let Some(region) = options.remove("region") { + if let Some(region) = &aws_options.region { builder = builder.with_region(region); } @@ -104,7 +105,7 @@ impl CredentialProvider for S3CredentialProvider { pub fn get_oss_object_store_builder( url: &Url, - cmd: &mut HashMap<String, String>, + aws_options: &AwsOptions, ) -> Result<AmazonS3Builder> { let bucket_name = get_bucket_name(url)?; let mut builder = AmazonS3Builder::from_env() @@ -114,14 +115,14 @@ .with_virtual_hosted_style_request(false) .with_bucket_name(bucket_name) .with_region("do_not_care"); if let (Some(access_key_id), Some(secret_access_key)) = - (cmd.remove("access_key_id"), cmd.remove("secret_access_key")) + (&aws_options.access_key_id, &aws_options.secret_access_key) { builder = builder .with_access_key_id(access_key_id) .with_secret_access_key(secret_access_key); } - if let Some(endpoint) = cmd.remove("endpoint") { + if let Some(endpoint) = &aws_options.oss.endpoint { builder = builder.with_endpoint(endpoint); } @@ -130,21 +131,20 @@ pub fn get_gcs_object_store_builder( url: &Url, - cmd: &mut HashMap<String, String>, + gs_options: &GcpOptions, ) -> Result<GoogleCloudStorageBuilder> { let bucket_name = get_bucket_name(url)?; let mut builder = GoogleCloudStorageBuilder::from_env().with_bucket_name(bucket_name); - if let Some(service_account_path) = cmd.remove("service_account_path") { + if let Some(service_account_path) = &gs_options.service_account_path { builder = builder.with_service_account_path(service_account_path); } - if let Some(service_account_key) = cmd.remove("service_account_key") { + if let Some(service_account_key) = &gs_options.service_account_key { builder = builder.with_service_account_key(service_account_key); } - if let Some(application_credentials_path) = cmd.remove("application_credentials_path") - { + if let Some(application_credentials_path) = &gs_options.application_credentials_path { builder = builder.with_application_credentials(application_credentials_path); } @@ -160,32 +160,277 @@ fn get_bucket_name(url: &Url) -> Result<&str> { }) } +/// This struct encapsulates AWS options one uses when setting up object storage. +#[derive(Default, Debug, Clone)] +pub struct AwsOptions { + /// Access Key ID + pub access_key_id: Option<String>, + /// Secret Access Key + pub secret_access_key: Option<String>, + /// Session token + pub session_token: Option<String>, + /// AWS Region + pub region: Option<String>, + /// Object Storage Service options + pub oss: OssOptions, +} + +config_namespace!
{ + pub struct OssOptions { + pub endpoint: Option, default = None + } +} + +impl ExtensionOptions for AwsOptions { + fn as_any(&self) -> &dyn Any { + self + } + + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } + + fn cloned(&self) -> Box { + Box::new(self.clone()) + } + + fn set(&mut self, key: &str, value: &str) -> Result<()> { + let (_key, aws_key) = key.split_once('.').unwrap_or((key, "")); + let (key, rem) = aws_key.split_once('.').unwrap_or((aws_key, "")); + match key { + "access_key_id" => { + self.access_key_id.set(rem, value)?; + } + "secret_access_key" => { + self.secret_access_key.set(rem, value)?; + } + "session_token" => { + self.session_token.set(rem, value)?; + } + "region" => { + self.region.set(rem, value)?; + } + "oss" => { + self.oss.set(rem, value)?; + } + _ => { + return internal_err!("Config value \"{}\" not found on AwsOptions", rem); + } + } + Ok(()) + } + + fn entries(&self) -> Vec { + struct Visitor(Vec); + + impl Visit for Visitor { + fn some( + &mut self, + key: &str, + value: V, + description: &'static str, + ) { + self.0.push(ConfigEntry { + key: key.to_string(), + value: Some(value.to_string()), + description, + }) + } + + fn none(&mut self, key: &str, description: &'static str) { + self.0.push(ConfigEntry { + key: key.to_string(), + value: None, + description, + }) + } + } + + let mut v = Visitor(vec![]); + self.access_key_id.visit(&mut v, "access_key_id", ""); + self.secret_access_key + .visit(&mut v, "secret_access_key", ""); + self.session_token.visit(&mut v, "session_token", ""); + self.region.visit(&mut v, "region", ""); + self.oss.visit(&mut v, "oss", ""); + v.0 + } +} + +impl ConfigExtension for AwsOptions { + const PREFIX: &'static str = "aws"; +} + +/// This struct encapsulates GCP options one uses when setting up object storage. 
+#[derive(Debug, Clone, Default)] +pub struct GcpOptions { + /// Service account path + pub service_account_path: Option, + /// Service account key + pub service_account_key: Option, + /// Application credentials path + pub application_credentials_path: Option, +} + +impl ExtensionOptions for GcpOptions { + fn as_any(&self) -> &dyn Any { + self + } + + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } + + fn cloned(&self) -> Box { + Box::new(self.clone()) + } + + fn set(&mut self, key: &str, value: &str) -> Result<()> { + let (_key, rem) = key.split_once('.').unwrap_or((key, "")); + match rem { + "service_account_path" => { + self.service_account_path.set(rem, value)?; + } + "service_account_key" => { + self.service_account_key.set(rem, value)?; + } + "application_credentials_path" => { + self.application_credentials_path.set(rem, value)?; + } + _ => { + return internal_err!("Config value \"{}\" not found on GcpOptions", rem); + } + } + Ok(()) + } + + fn entries(&self) -> Vec { + struct Visitor(Vec); + + impl Visit for Visitor { + fn some( + &mut self, + key: &str, + value: V, + description: &'static str, + ) { + self.0.push(ConfigEntry { + key: key.to_string(), + value: Some(value.to_string()), + description, + }) + } + + fn none(&mut self, key: &str, description: &'static str) { + self.0.push(ConfigEntry { + key: key.to_string(), + value: None, + description, + }) + } + } + + let mut v = Visitor(vec![]); + self.service_account_path + .visit(&mut v, "service_account_path", ""); + self.service_account_key + .visit(&mut v, "service_account_key", ""); + self.application_credentials_path.visit( + &mut v, + "application_credentials_path", + "", + ); + v.0 + } +} + +impl ConfigExtension for GcpOptions { + const PREFIX: &'static str = "gcp"; +} + +/// Registers storage options for different cloud storage schemes in a given +/// session context. +/// +/// This function is responsible for extending the session context with specific +/// options based on the storage scheme being used. These options are essential +/// for handling interactions with different cloud storage services such as Amazon +/// S3, Alibaba Cloud OSS, Google Cloud Storage, etc. +/// +/// # Parameters +/// +/// * `ctx` - A mutable reference to the session context where table options are +/// to be registered. The session context holds configuration and environment +/// for the current session. +/// * `scheme` - A string slice that represents the cloud storage scheme. This +/// determines which set of options will be registered in the session context. +/// +/// # Supported Schemes +/// +/// * `s3` or `oss` - Registers `AwsOptions` which are configurations specific to +/// Amazon S3 and Alibaba Cloud OSS. +/// * `gs` or `gcs` - Registers `GcpOptions` which are configurations specific to +/// Google Cloud Storage. +/// +/// NOTE: This function will not perform any action when given an unsupported scheme. 
+pub(crate) fn register_options(ctx: &SessionContext, scheme: &str) { + // Match the provided scheme against supported cloud storage schemes: + match scheme { + // For Amazon S3 or Alibaba Cloud OSS + "s3" | "oss" => { + // Register AWS specific table options in the session context: + ctx.register_table_options_extension(AwsOptions::default()) + } + // For Google Cloud Storage + "gs" | "gcs" => { + // Register GCP specific table options in the session context: + ctx.register_table_options_extension(GcpOptions::default()) + } + // For unsupported schemes, do nothing: + _ => {} + } +} + pub(crate) async fn get_object_store( state: &SessionState, - options: &mut HashMap, scheme: &str, url: &Url, + table_options: &TableOptions, ) -> Result, DataFusionError> { - let store = match scheme { + let store: Arc = match scheme { "s3" => { + let Some(options) = table_options.extensions.get::() else { + return exec_err!( + "Given table options incompatible with the 's3' scheme" + ); + }; let builder = get_s3_object_store_builder(url, options).await?; - Arc::new(builder.build()?) as Arc + Arc::new(builder.build()?) } "oss" => { + let Some(options) = table_options.extensions.get::() else { + return exec_err!( + "Given table options incompatible with the 'oss' scheme" + ); + }; let builder = get_oss_object_store_builder(url, options)?; - Arc::new(builder.build()?) as Arc + Arc::new(builder.build()?) } "gs" | "gcs" => { + let Some(options) = table_options.extensions.get::() else { + return exec_err!( + "Given table options incompatible with the 'gs'/'gcs' scheme" + ); + }; let builder = get_gcs_object_store_builder(url, options)?; - Arc::new(builder.build()?) as Arc + Arc::new(builder.build()?) } "http" | "https" => Arc::new( HttpBuilder::new() .with_url(url.origin().ascii_serialization()) .build()?, - ) as Arc, + ), _ => { - // for other types, try to get from the object_store_registry + // For other types, try to get from `object_store_registry`: state .runtime_env() .object_store_registry @@ -201,12 +446,14 @@ pub(crate) async fn get_object_store( #[cfg(test)] mod tests { use super::*; + use datafusion::common::plan_err; use datafusion::{ datasource::listing::ListingTableUrl, logical_expr::{DdlStatement, LogicalPlan}, prelude::SessionContext, }; + use object_store::{aws::AmazonS3ConfigKey, gcp::GoogleConfigKey}; #[tokio::test] @@ -218,14 +465,19 @@ mod tests { let location = "s3://bucket/path/file.parquet"; let table_url = ListingTableUrl::parse(location)?; - let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET OPTIONS('access_key_id' '{access_key_id}', 'secret_access_key' '{secret_access_key}', 'region' '{region}', 'session_token' {session_token}) LOCATION '{location}'"); + let scheme = table_url.scheme(); + let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET OPTIONS('aws.access_key_id' '{access_key_id}', 'aws.secret_access_key' '{secret_access_key}', 'aws.region' '{region}', 'aws.session_token' {session_token}) LOCATION '{location}'"); let ctx = SessionContext::new(); let mut plan = ctx.state().create_logical_plan(&sql).await?; if let LogicalPlan::Ddl(DdlStatement::CreateExternalTable(cmd)) = &mut plan { + register_options(&ctx, scheme); + let mut table_options = ctx.state().default_table_options().clone(); + table_options.alter_with_string_hash_map(&cmd.options)?; + let aws_options = table_options.extensions.get::().unwrap(); let builder = - get_s3_object_store_builder(table_url.as_ref(), &mut cmd.options).await?; + get_s3_object_store_builder(table_url.as_ref(), 
aws_options).await?; // get the actual configuration information, then assert_eq! let config = [ (AmazonS3ConfigKey::AccessKeyId, access_key_id), @@ -251,14 +503,18 @@ mod tests { let location = "oss://bucket/path/file.parquet"; let table_url = ListingTableUrl::parse(location)?; - let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET OPTIONS('access_key_id' '{access_key_id}', 'secret_access_key' '{secret_access_key}', 'endpoint' '{endpoint}') LOCATION '{location}'"); + let scheme = table_url.scheme(); + let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET OPTIONS('aws.access_key_id' '{access_key_id}', 'aws.secret_access_key' '{secret_access_key}', 'aws.oss.endpoint' '{endpoint}') LOCATION '{location}'"); let ctx = SessionContext::new(); let mut plan = ctx.state().create_logical_plan(&sql).await?; if let LogicalPlan::Ddl(DdlStatement::CreateExternalTable(cmd)) = &mut plan { - let builder = - get_oss_object_store_builder(table_url.as_ref(), &mut cmd.options)?; + register_options(&ctx, scheme); + let mut table_options = ctx.state().default_table_options().clone(); + table_options.alter_with_string_hash_map(&cmd.options)?; + let aws_options = table_options.extensions.get::().unwrap(); + let builder = get_oss_object_store_builder(table_url.as_ref(), aws_options)?; // get the actual configuration information, then assert_eq! let config = [ (AmazonS3ConfigKey::AccessKeyId, access_key_id), @@ -284,14 +540,18 @@ mod tests { let location = "gcs://bucket/path/file.parquet"; let table_url = ListingTableUrl::parse(location)?; - let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET OPTIONS('service_account_path' '{service_account_path}', 'service_account_key' '{service_account_key}', 'application_credentials_path' '{application_credentials_path}') LOCATION '{location}'"); + let scheme = table_url.scheme(); + let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET OPTIONS('gcp.service_account_path' '{service_account_path}', 'gcp.service_account_key' '{service_account_key}', 'gcp.application_credentials_path' '{application_credentials_path}') LOCATION '{location}'"); let ctx = SessionContext::new(); let mut plan = ctx.state().create_logical_plan(&sql).await?; if let LogicalPlan::Ddl(DdlStatement::CreateExternalTable(cmd)) = &mut plan { - let builder = - get_gcs_object_store_builder(table_url.as_ref(), &mut cmd.options)?; + register_options(&ctx, scheme); + let mut table_options = ctx.state().default_table_options().clone(); + table_options.alter_with_string_hash_map(&cmd.options)?; + let gcp_options = table_options.extensions.get::().unwrap(); + let builder = get_gcs_object_store_builder(table_url.as_ref(), gcp_options)?; // get the actual configuration information, then assert_eq! 
let config = [ (GoogleConfigKey::ServiceAccount, service_account_path), diff --git a/datafusion-examples/README.md b/datafusion-examples/README.md index e1fb401e7b73..dbc8050555b9 100644 --- a/datafusion-examples/README.md +++ b/datafusion-examples/README.md @@ -45,7 +45,7 @@ cargo run --example csv_sql - [`avro_sql.rs`](examples/avro_sql.rs): Build and run a query plan from a SQL statement against a local AVRO file - [`csv_sql.rs`](examples/csv_sql.rs): Build and run a query plan from a SQL statement against a local CSV file - [`csv_sql_streaming.rs`](examples/csv_sql_streaming.rs): Build and run a streaming query plan from a SQL statement against a local CSV file -- [`catalog.rs`](examples/external_dependency/catalog.rs): Register the table into a custom catalog +- [`catalog.rs`](examples/catalog.rs): Register the table into a custom catalog - [`custom_datasource.rs`](examples/custom_datasource.rs): Run queries against a custom datasource (TableProvider) - [`dataframe.rs`](examples/dataframe.rs): Run a query using a DataFrame against a local parquet file - [`dataframe-to-s3.rs`](examples/external_dependency/dataframe-to-s3.rs): Run a query using a DataFrame against a parquet file from s3 and writing back to s3 diff --git a/datafusion-examples/examples/dataframe_output.rs b/datafusion-examples/examples/dataframe_output.rs index c773384dfcd5..60ca090d722d 100644 --- a/datafusion-examples/examples/dataframe_output.rs +++ b/datafusion-examples/examples/dataframe_output.rs @@ -16,6 +16,7 @@ // under the License. use datafusion::{dataframe::DataFrameWriteOptions, prelude::*}; +use datafusion_common::config::CsvOptions; use datafusion_common::{parsers::CompressionTypeVariant, DataFusionError}; /// This example demonstrates the various methods to write out a DataFrame to local storage. @@ -60,8 +61,8 @@ async fn main() -> Result<(), DataFusionError> { "./datafusion-examples/test_csv/", // DataFrameWriteOptions contains options which control how data is written // such as compression codec - DataFrameWriteOptions::new().with_compression(CompressionTypeVariant::GZIP), - None, + DataFrameWriteOptions::new(), + Some(CsvOptions::default().with_compression(CompressionTypeVariant::GZIP)), ) .await?; @@ -69,6 +70,7 @@ async fn main() -> Result<(), DataFusionError> { .write_json( "./datafusion-examples/test_json/", DataFrameWriteOptions::new(), + None, ) .await?; diff --git a/datafusion-examples/examples/external_dependency/dataframe-to-s3.rs b/datafusion-examples/examples/external_dependency/dataframe-to-s3.rs index 883da7d0d13d..8d56c440da36 100644 --- a/datafusion-examples/examples/external_dependency/dataframe-to-s3.rs +++ b/datafusion-examples/examples/external_dependency/dataframe-to-s3.rs @@ -15,6 +15,9 @@ // specific language governing permissions and limitations // under the License. 
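The `dataframe_output.rs` hunk above and the `dataframe-to-s3.rs` hunk below show the new DataFrame write API shape: per-format options such as `CsvOptions` are now passed as a third argument instead of being configured on `DataFrameWriteOptions`. A minimal sketch under that assumption; the function name and output paths are illustrative and not part of the patch:

```rust
use datafusion::dataframe::{DataFrame, DataFrameWriteOptions};
use datafusion::error::Result;
use datafusion_common::config::CsvOptions;
use datafusion_common::parsers::CompressionTypeVariant;

// Writes the same DataFrame twice, showing where format options now go.
async fn write_both(df: DataFrame) -> Result<()> {
    // CSV: compression now travels in `CsvOptions`, not `DataFrameWriteOptions`.
    df.clone()
        .write_csv(
            "./out/csv/",
            DataFrameWriteOptions::new(),
            Some(CsvOptions::default().with_compression(CompressionTypeVariant::GZIP)),
        )
        .await?;

    // JSON: pass `None` when no format-specific options are needed.
    df.write_json("./out/json/", DataFrameWriteOptions::new(), None)
        .await?;
    Ok(())
}
```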
+use std::env; +use std::sync::Arc; + use datafusion::dataframe::DataFrameWriteOptions; use datafusion::datasource::file_format::parquet::ParquetFormat; use datafusion::datasource::listing::ListingOptions; @@ -23,8 +26,6 @@ use datafusion::prelude::*; use datafusion_common::{FileType, GetExt}; use object_store::aws::AmazonS3Builder; -use std::env; -use std::sync::Arc; use url::Url; /// This example demonstrates querying data from AmazonS3 and writing @@ -52,7 +53,7 @@ async fn main() -> Result<()> { .register_object_store(&s3_url, arc_s3.clone()); let path = format!("s3://{bucket_name}/test_data/"); - let file_format = ParquetFormat::default().with_enable_pruning(Some(true)); + let file_format = ParquetFormat::default().with_enable_pruning(true); let listing_options = ListingOptions::new(Arc::new(file_format)) .with_file_extension(FileType::PARQUET.get_ext()); ctx.register_listing_table("test", &path, listing_options, None, None) @@ -69,7 +70,7 @@ async fn main() -> Result<()> { //write as JSON to s3 let json_out = format!("s3://{bucket_name}/json_out"); df.clone() - .write_json(&json_out, DataFrameWriteOptions::new()) + .write_json(&json_out, DataFrameWriteOptions::new(), None) .await?; //write as csv to s3 @@ -77,7 +78,7 @@ async fn main() -> Result<()> { df.write_csv(&csv_out, DataFrameWriteOptions::new(), None) .await?; - let file_format = ParquetFormat::default().with_enable_pruning(Some(true)); + let file_format = ParquetFormat::default().with_enable_pruning(true); let listing_options = ListingOptions::new(Arc::new(file_format)) .with_file_extension(FileType::PARQUET.get_ext()); ctx.register_listing_table("test2", &out_path, listing_options, None, None) diff --git a/datafusion-examples/examples/parquet_sql_multiple_files.rs b/datafusion-examples/examples/parquet_sql_multiple_files.rs index 0e2968f20356..30ca1df73d91 100644 --- a/datafusion-examples/examples/parquet_sql_multiple_files.rs +++ b/datafusion-examples/examples/parquet_sql_multiple_files.rs @@ -15,12 +15,14 @@ // specific language governing permissions and limitations // under the License. +use std::path::Path; +use std::sync::Arc; + use datafusion::datasource::file_format::parquet::ParquetFormat; use datafusion::datasource::listing::ListingOptions; use datafusion::prelude::*; + use object_store::local::LocalFileSystem; -use std::path::Path; -use std::sync::Arc; /// This example demonstrates executing a simple query against an Arrow data source (a directory /// with multiple Parquet files) and fetching results. The query is run twice, once showing @@ -34,7 +36,7 @@ async fn main() -> Result<(), Box> { let test_data = datafusion::test_util::parquet_test_data(); // Configure listing options - let file_format = ParquetFormat::default().with_enable_pruning(Some(true)); + let file_format = ParquetFormat::default().with_enable_pruning(true); let listing_options = ListingOptions::new(Arc::new(file_format)) // This is a workaround for this example since `test_data` contains // many different parquet different files, diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 181f318d3eb3..72d51cb15a88 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -16,11 +16,15 @@ // under the License. //! 
Runtime configuration, via [`ConfigOptions`] -use crate::error::_internal_err; -use crate::{DataFusionError, Result}; + use std::any::Any; use std::collections::{BTreeMap, HashMap}; -use std::fmt::Display; +use std::fmt::{self, Display}; +use std::str::FromStr; + +use crate::error::_config_err; +use crate::parsers::CompressionTypeVariant; +use crate::{DataFusionError, FileType, Result}; /// A macro that wraps a configuration struct and automatically derives /// [`Default`] and [`ConfigField`] for it, allowing it to be used @@ -98,6 +102,7 @@ use std::fmt::Display; /// /// NB: Misplaced commas may result in nonsensical errors /// +#[macro_export] macro_rules! config_namespace { ( $(#[doc = $struct_d:tt])* @@ -110,8 +115,7 @@ macro_rules! config_namespace { ) => { $(#[doc = $struct_d])* - #[derive(Debug, Clone)] - #[non_exhaustive] + #[derive(Debug, Clone, PartialEq)] $vis struct $struct_name{ $( $(#[doc = $d])* @@ -126,9 +130,9 @@ macro_rules! config_namespace { $( stringify!($field_name) => self.$field_name.set(rem, value), )* - _ => _internal_err!( + _ => return Err(DataFusionError::Configuration(format!( "Config value \"{}\" not found on {}", key, stringify!($struct_name) - ) + ))) } } @@ -635,7 +639,7 @@ impl ConfigField for ConfigOptions { "optimizer" => self.optimizer.set(rem, value), "explain" => self.explain.set(rem, value), "sql_parser" => self.sql_parser.set(rem, value), - _ => _internal_err!("Config value \"{key}\" not found on ConfigOptions"), + _ => _config_err!("Config value \"{key}\" not found on ConfigOptions"), } } @@ -663,9 +667,9 @@ impl ConfigOptions { /// Set a configuration option pub fn set(&mut self, key: &str, value: &str) -> Result<()> { let (prefix, key) = key.split_once('.').ok_or_else(|| { - DataFusionError::External( - format!("could not find config namespace for key \"{key}\"",).into(), - ) + DataFusionError::Configuration(format!( + "could not find config namespace for key \"{key}\"", + )) })?; if prefix == "datafusion" { @@ -674,9 +678,9 @@ impl ConfigOptions { let e = self.extensions.0.get_mut(prefix); let e = e.ok_or_else(|| { - DataFusionError::External( - format!("Could not find config namespace \"{prefix}\"",).into(), - ) + DataFusionError::Configuration(format!( + "Could not find config namespace \"{prefix}\"" + )) })?; e.0.set(key, value) } @@ -886,7 +890,7 @@ impl Clone for ExtensionBox { /// A trait implemented by `config_namespace` and for field types that provides /// the ability to walk and mutate the configuration tree -trait ConfigField { +pub trait ConfigField { fn visit(&self, v: &mut V, key: &str, description: &'static str); fn set(&mut self, key: &str, value: &str) -> Result<()>; @@ -905,6 +909,7 @@ impl ConfigField for Option { } } +#[macro_export] macro_rules! config_field { ($t:ty) => { impl ConfigField for $t { @@ -929,11 +934,52 @@ config_field!(String); config_field!(bool); config_field!(usize); config_field!(f64); -config_field!(u8); config_field!(u64); +impl ConfigField for u8 { + fn visit(&self, v: &mut V, key: &str, description: &'static str) { + v.some(key, self, description) + } + + fn set(&mut self, key: &str, value: &str) -> Result<()> { + if value.is_empty() { + return Err(DataFusionError::Configuration(format!( + "Input string for {} key is empty", + key + ))); + } + // Check if the string is a valid number + if let Ok(num) = value.parse::() { + // TODO: Let's decide how we treat the numerical strings. 
+ *self = num; + } else { + let bytes = value.as_bytes(); + // Check if the first character is ASCII (single byte) + if bytes.len() > 1 || !value.chars().next().unwrap().is_ascii() { + return Err(DataFusionError::Configuration(format!( + "Error parsing {} as u8. Non-ASCII string provided", + value + ))); + } + *self = bytes[0]; + } + Ok(()) + } +} + +impl ConfigField for CompressionTypeVariant { + fn visit(&self, v: &mut V, key: &str, description: &'static str) { + v.some(key, self, description) + } + + fn set(&mut self, _: &str, value: &str) -> Result<()> { + *self = CompressionTypeVariant::from_str(value)?; + Ok(()) + } +} + /// An implementation trait used to recursively walk configuration -trait Visit { +pub trait Visit { fn some(&mut self, key: &str, value: V, description: &'static str); fn none(&mut self, key: &str, description: &'static str); @@ -1044,7 +1090,7 @@ macro_rules! extensions_options { Ok(()) } )* - _ => Err($crate::DataFusionError::Internal( + _ => Err($crate::DataFusionError::Configuration( format!(concat!("Config value \"{}\" not found on ", stringify!($struct_name)), key) )) } @@ -1064,3 +1110,556 @@ macro_rules! extensions_options { } } } + +#[derive(Debug, Clone, Default)] +pub struct TableOptions { + pub csv: CsvOptions, + pub parquet: TableParquetOptions, + pub json: JsonOptions, + pub current_format: Option, + /// Optional extensions registered using [`Extensions::insert`] + pub extensions: Extensions, +} + +impl ConfigField for TableOptions { + fn visit(&self, v: &mut V, _key_prefix: &str, _description: &'static str) { + self.csv.visit(v, "csv", ""); + self.parquet.visit(v, "parquet", ""); + self.json.visit(v, "json", ""); + } + + fn set(&mut self, key: &str, value: &str) -> Result<()> { + // Extensions are handled in the public `ConfigOptions::set` + let (key, rem) = key.split_once('.').unwrap_or((key, "")); + match key { + "csv" => self.csv.set(rem, value), + "parquet" => self.parquet.set(rem, value), + "json" => self.json.set(rem, value), + _ => _config_err!("Config value \"{key}\" not found on TableOptions"), + } + } +} + +impl TableOptions { + /// Creates a new [`ConfigOptions`] with default values + pub fn new() -> Self { + Self::default() + } + + pub fn set_file_format(&mut self, format: FileType) { + self.current_format = Some(format); + } + + pub fn default_from_session_config(config: &ConfigOptions) -> Self { + let mut initial = TableOptions::default(); + initial.parquet.global = config.execution.parquet.clone(); + initial + } + + /// Set extensions to provided value + pub fn with_extensions(mut self, extensions: Extensions) -> Self { + self.extensions = extensions; + self + } + + /// Set a configuration option + pub fn set(&mut self, key: &str, value: &str) -> Result<()> { + let (prefix, _) = key.split_once('.').ok_or_else(|| { + DataFusionError::Configuration(format!( + "could not find config namespace for key \"{key}\"" + )) + })?; + + if prefix == "csv" || prefix == "json" || prefix == "parquet" { + if let Some(format) = &self.current_format { + match format { + FileType::CSV if prefix != "csv" => { + return Err(DataFusionError::Configuration(format!( + "Key \"{key}\" is not applicable for CSV format" + ))) + } + #[cfg(feature = "parquet")] + FileType::PARQUET if prefix != "parquet" => { + return Err(DataFusionError::Configuration(format!( + "Key \"{key}\" is not applicable for PARQUET format" + ))) + } + FileType::JSON if prefix != "json" => { + return Err(DataFusionError::Configuration(format!( + "Key \"{key}\" is not applicable for JSON format" 
+ ))) + } + _ => {} + } + } + return ConfigField::set(self, key, value); + } + + let e = self.extensions.0.get_mut(prefix); + let e = e.ok_or_else(|| { + DataFusionError::Configuration(format!( + "Could not find config namespace \"{prefix}\"" + )) + })?; + e.0.set(key, value) + } + + pub fn from_string_hash_map(settings: &HashMap) -> Result { + let mut ret = Self::default(); + for (k, v) in settings { + ret.set(k, v)?; + } + + Ok(ret) + } + + pub fn alter_with_string_hash_map( + &mut self, + settings: &HashMap, + ) -> Result<()> { + for (k, v) in settings { + self.set(k, v)?; + } + Ok(()) + } + + /// Returns the [`ConfigEntry`] stored within this [`ConfigOptions`] + pub fn entries(&self) -> Vec { + struct Visitor(Vec); + + impl Visit for Visitor { + fn some( + &mut self, + key: &str, + value: V, + description: &'static str, + ) { + self.0.push(ConfigEntry { + key: key.to_string(), + value: Some(value.to_string()), + description, + }) + } + + fn none(&mut self, key: &str, description: &'static str) { + self.0.push(ConfigEntry { + key: key.to_string(), + value: None, + description, + }) + } + } + + let mut v = Visitor(vec![]); + self.visit(&mut v, "csv", ""); + self.visit(&mut v, "json", ""); + self.visit(&mut v, "parquet", ""); + + v.0.extend(self.extensions.0.values().flat_map(|e| e.0.entries())); + v.0 + } +} + +#[derive(Clone, Default, Debug, PartialEq)] +pub struct TableParquetOptions { + /// Global Parquet options that propagates to all columns. + pub global: ParquetOptions, + /// Column specific options. Default usage is parquet.XX::column. + pub column_specific_options: HashMap, +} + +impl ConfigField for TableParquetOptions { + fn visit(&self, v: &mut V, key_prefix: &str, description: &'static str) { + self.global.visit(v, key_prefix, description); + self.column_specific_options + .visit(v, key_prefix, description) + } + + fn set(&mut self, key: &str, value: &str) -> Result<()> { + // Determine the key if it's a global or column-specific setting + if key.contains("::") { + self.column_specific_options.set(key, value) + } else { + self.global.set(key, value) + } + } +} + +macro_rules! 
config_namespace_with_hashmap { + ( + $(#[doc = $struct_d:tt])* + $vis:vis struct $struct_name:ident { + $( + $(#[doc = $d:tt])* + $field_vis:vis $field_name:ident : $field_type:ty, default = $default:expr + )*$(,)* + } + ) => { + + $(#[doc = $struct_d])* + #[derive(Debug, Clone, PartialEq)] + $vis struct $struct_name{ + $( + $(#[doc = $d])* + $field_vis $field_name : $field_type, + )* + } + + impl ConfigField for $struct_name { + fn set(&mut self, key: &str, value: &str) -> Result<()> { + let (key, rem) = key.split_once('.').unwrap_or((key, "")); + match key { + $( + stringify!($field_name) => self.$field_name.set(rem, value), + )* + _ => _config_err!( + "Config value \"{}\" not found on {}", key, stringify!($struct_name) + ) + } + } + + fn visit(&self, v: &mut V, key_prefix: &str, _description: &'static str) { + $( + let key = format!(concat!("{}.", stringify!($field_name)), key_prefix); + let desc = concat!($($d),*).trim(); + self.$field_name.visit(v, key.as_str(), desc); + )* + } + } + + impl Default for $struct_name { + fn default() -> Self { + Self { + $($field_name: $default),* + } + } + } + + impl ConfigField for HashMap { + fn set(&mut self, key: &str, value: &str) -> Result<()> { + let parts: Vec<&str> = key.splitn(2, "::").collect(); + match parts.as_slice() { + [inner_key, hashmap_key] => { + // Get or create the ColumnOptions for the specified column + let inner_value = self + .entry((*hashmap_key).to_owned()) + .or_insert_with($struct_name::default); + + inner_value.set(inner_key, value) + } + _ => Err(DataFusionError::Configuration(format!( + "Unrecognized key '{}'.", + key + ))), + } + } + + fn visit(&self, v: &mut V, key_prefix: &str, _description: &'static str) { + for (column_name, col_options) in self { + $( + let key = format!("{}.{field}::{}", key_prefix, column_name, field = stringify!($field_name)); + let desc = concat!($($d),*).trim(); + col_options.$field_name.visit(v, key.as_str(), desc); + )* + } + } + } + } +} + +config_namespace_with_hashmap! { + pub struct ColumnOptions { + /// Sets if bloom filter is enabled for the column path. + pub bloom_filter_enabled: Option, default = None + + /// Sets encoding for the column path. + /// Valid values are: plain, plain_dictionary, rle, + /// bit_packed, delta_binary_packed, delta_length_byte_array, + /// delta_byte_array, rle_dictionary, and byte_stream_split. + /// These values are not case-sensitive. If NULL, uses + /// default parquet options + pub encoding: Option, default = None + + /// Sets if dictionary encoding is enabled for the column path. If NULL, uses + /// default parquet options + pub dictionary_enabled: Option, default = None + + /// Sets default parquet compression codec for the column path. + /// Valid values are: uncompressed, snappy, gzip(level), + /// lzo, brotli(level), lz4, zstd(level), and lz4_raw. + /// These values are not case-sensitive. If NULL, uses + /// default parquet options + pub compression: Option, default = None + + /// Sets if statistics are enabled for the column + /// Valid values are: "none", "chunk", and "page" + /// These values are not case sensitive. If NULL, uses + /// default parquet options + pub statistics_enabled: Option, default = None + + /// Sets bloom filter false positive probability for the column path. If NULL, uses + /// default parquet options + pub bloom_filter_fpp: Option, default = None + + /// Sets bloom filter number of distinct values. 
If NULL, uses + /// default parquet options + pub bloom_filter_ndv: Option<u64>, default = None + + /// Sets max statistics size for the column path. If NULL, uses + /// default parquet options + pub max_statistics_size: Option<usize>, default = None + } +} + +config_namespace! { + /// Options controlling CSV format + pub struct CsvOptions { + pub has_header: bool, default = true + pub delimiter: u8, default = b',' + pub quote: u8, default = b'"' + pub escape: Option<u8>, default = None + pub compression: CompressionTypeVariant, default = CompressionTypeVariant::UNCOMPRESSED + pub schema_infer_max_rec: usize, default = 100 + pub date_format: Option<String>, default = None + pub datetime_format: Option<String>, default = None + pub timestamp_format: Option<String>, default = None + pub timestamp_tz_format: Option<String>, default = None + pub time_format: Option<String>, default = None + pub null_value: Option<String>, default = None + } +} + +impl CsvOptions { + /// Set the CSV compression type. + /// - defaults to `CompressionTypeVariant::UNCOMPRESSED` + pub fn with_compression( + mut self, + compression_type_variant: CompressionTypeVariant, + ) -> Self { + self.compression = compression_type_variant; + self + } + + /// Set a limit in terms of records to scan to infer the schema + /// - default to `DEFAULT_SCHEMA_INFER_MAX_RECORD` + pub fn with_schema_infer_max_rec(mut self, max_rec: usize) -> Self { + self.schema_infer_max_rec = max_rec; + self + } + + /// Set true to indicate that the first line is a header. + /// - default to true + pub fn with_has_header(mut self, has_header: bool) -> Self { + self.has_header = has_header; + self + } + + /// True if the first line is a header. + pub fn has_header(&self) -> bool { + self.has_header + } + + /// The character separating values within a row. + /// - default to ',' + pub fn with_delimiter(mut self, delimiter: u8) -> Self { + self.delimiter = delimiter; + self + } + + /// The quote character in a row. + /// - default to '"' + pub fn with_quote(mut self, quote: u8) -> Self { + self.quote = quote; + self + } + + /// The escape character in a row. + /// - default is None + pub fn with_escape(mut self, escape: Option<u8>) -> Self { + self.escape = escape; + self + } + + /// Set a `CompressionTypeVariant` of CSV + /// - defaults to `CompressionTypeVariant::UNCOMPRESSED` + pub fn with_file_compression_type( + mut self, + compression: CompressionTypeVariant, + ) -> Self { + self.compression = compression; + self + } + + /// The delimiter character. + pub fn delimiter(&self) -> u8 { + self.delimiter + } + + /// The quote character. + pub fn quote(&self) -> u8 { + self.quote + } + + /// The escape character. + pub fn escape(&self) -> Option<u8> { + self.escape + } +} + +config_namespace!
{ + /// Options controlling JSON format + pub struct JsonOptions { + pub compression: CompressionTypeVariant, default = CompressionTypeVariant::UNCOMPRESSED + pub schema_infer_max_rec: usize, default = 100 + } +} + +#[derive(Debug, Clone, PartialEq)] +pub enum FormatOptions { + CSV(CsvOptions), + JSON(JsonOptions), + #[cfg(feature = "parquet")] + PARQUET(TableParquetOptions), + AVRO, + ARROW, +} +impl Display for FormatOptions { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let out = match self { + FormatOptions::CSV(_) => "csv", + FormatOptions::JSON(_) => "json", + #[cfg(feature = "parquet")] + FormatOptions::PARQUET(_) => "parquet", + FormatOptions::AVRO => "avro", + FormatOptions::ARROW => "arrow", + }; + write!(f, "{}", out) + } +} + +impl From for FormatOptions { + fn from(value: FileType) -> Self { + match value { + FileType::ARROW => FormatOptions::ARROW, + FileType::AVRO => FormatOptions::AVRO, + #[cfg(feature = "parquet")] + FileType::PARQUET => FormatOptions::PARQUET(TableParquetOptions::default()), + FileType::CSV => FormatOptions::CSV(CsvOptions::default()), + FileType::JSON => FormatOptions::JSON(JsonOptions::default()), + } + } +} + +#[cfg(test)] +mod tests { + use std::any::Any; + use std::collections::HashMap; + + use crate::config::{ + ConfigEntry, ConfigExtension, ExtensionOptions, Extensions, TableOptions, + }; + + #[derive(Default, Debug, Clone)] + pub struct TestExtensionConfig { + /// Should "foo" be replaced by "bar"? + pub properties: HashMap, + } + + impl ExtensionOptions for TestExtensionConfig { + fn as_any(&self) -> &dyn Any { + self + } + + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } + + fn cloned(&self) -> Box { + Box::new(self.clone()) + } + + fn set(&mut self, key: &str, value: &str) -> crate::Result<()> { + let (key, rem) = key.split_once('.').unwrap_or((key, "")); + assert_eq!(key, "test"); + self.properties.insert(rem.to_owned(), value.to_owned()); + Ok(()) + } + + fn entries(&self) -> Vec { + self.properties + .iter() + .map(|(k, v)| ConfigEntry { + key: k.into(), + value: Some(v.into()), + description: "", + }) + .collect() + } + } + + impl ConfigExtension for TestExtensionConfig { + const PREFIX: &'static str = "test"; + } + + #[test] + fn create_table_config() { + let mut extension = Extensions::new(); + extension.insert(TestExtensionConfig::default()); + let table_config = TableOptions::new().with_extensions(extension); + let kafka_config = table_config.extensions.get::(); + assert!(kafka_config.is_some()) + } + + #[test] + fn alter_kafka_config() { + let mut extension = Extensions::new(); + extension.insert(TestExtensionConfig::default()); + let mut table_config = TableOptions::new().with_extensions(extension); + table_config.set("parquet.write_batch_size", "10").unwrap(); + assert_eq!(table_config.parquet.global.write_batch_size, 10); + table_config.set("test.bootstrap.servers", "asd").unwrap(); + let kafka_config = table_config + .extensions + .get::() + .unwrap(); + assert_eq!( + kafka_config.properties.get("bootstrap.servers").unwrap(), + "asd" + ); + } + + #[test] + fn parquet_table_options() { + let mut table_config = TableOptions::new(); + table_config + .set("parquet.bloom_filter_enabled::col1", "true") + .unwrap(); + assert_eq!( + table_config.parquet.column_specific_options["col1"].bloom_filter_enabled, + Some(true) + ); + } + + #[test] + fn csv_u8_table_options() { + let mut table_config = TableOptions::new(); + table_config.set("csv.delimiter", ";").unwrap(); + assert_eq!(table_config.csv.delimiter as char, 
';'); + table_config.set("csv.escape", "\"").unwrap(); + assert_eq!(table_config.csv.escape.unwrap() as char, '"'); + table_config.set("csv.escape", "\'").unwrap(); + assert_eq!(table_config.csv.escape.unwrap() as char, '\''); + } + + #[test] + fn parquet_table_options_config_entry() { + let mut table_config = TableOptions::new(); + table_config + .set("parquet.bloom_filter_enabled::col1", "true") + .unwrap(); + let entries = table_config.entries(); + assert!(entries + .iter() + .any(|item| item.key == "parquet.bloom_filter_enabled::col1")) + } +} diff --git a/datafusion/common/src/error.rs b/datafusion/common/src/error.rs index 0f4e97905938..1ecd5b62bee8 100644 --- a/datafusion/common/src/error.rs +++ b/datafusion/common/src/error.rs @@ -535,6 +535,9 @@ make_error!(not_impl_err, not_impl_datafusion_err, NotImplemented); // Exposes a macro to create `DataFusionError::Execution` with optional backtrace make_error!(exec_err, exec_datafusion_err, Execution); +// Exposes a macro to create `DataFusionError::Configuration` with optional backtrace +make_error!(config_err, config_datafusion_err, Configuration); + // Exposes a macro to create `DataFusionError::Substrait` with optional backtrace make_error!(substrait_err, substrait_datafusion_err, Substrait); @@ -594,6 +597,7 @@ macro_rules! schema_err { // To avoid compiler error when using macro in the same crate: // macros from the current crate cannot be referred to by absolute paths +pub use config_err as _config_err; pub use internal_datafusion_err as _internal_datafusion_err; pub use internal_err as _internal_err; pub use not_impl_err as _not_impl_err; diff --git a/datafusion/common/src/file_options/arrow_writer.rs b/datafusion/common/src/file_options/arrow_writer.rs index cb921535aba5..99513eecf3f1 100644 --- a/datafusion/common/src/file_options/arrow_writer.rs +++ b/datafusion/common/src/file_options/arrow_writer.rs @@ -17,13 +17,6 @@ //! Options related to how Arrow files should be written -use crate::{ - config::ConfigOptions, - error::{DataFusionError, Result}, -}; - -use super::StatementOptions; - #[derive(Clone, Debug)] pub struct ArrowWriterOptions {} @@ -38,11 +31,3 @@ impl Default for ArrowWriterOptions { Self::new() } } - -impl TryFrom<(&ConfigOptions, &StatementOptions)> for ArrowWriterOptions { - type Error = DataFusionError; - - fn try_from(_value: (&ConfigOptions, &StatementOptions)) -> Result { - Ok(ArrowWriterOptions {}) - } -} diff --git a/datafusion/common/src/file_options/avro_writer.rs b/datafusion/common/src/file_options/avro_writer.rs index 2e3a64705842..51d923e2c315 100644 --- a/datafusion/common/src/file_options/avro_writer.rs +++ b/datafusion/common/src/file_options/avro_writer.rs @@ -17,20 +17,5 @@ //! Options related to how avro files should be written -use crate::{ - config::ConfigOptions, - error::{DataFusionError, Result}, -}; - -use super::StatementOptions; - #[derive(Clone, Debug)] pub struct AvroWriterOptions {} - -impl TryFrom<(&ConfigOptions, &StatementOptions)> for AvroWriterOptions { - type Error = DataFusionError; - - fn try_from(_value: (&ConfigOptions, &StatementOptions)) -> Result { - Ok(AvroWriterOptions {}) - } -} diff --git a/datafusion/common/src/file_options/csv_writer.rs b/datafusion/common/src/file_options/csv_writer.rs index d6046f0219dd..5f1a62682f8d 100644 --- a/datafusion/common/src/file_options/csv_writer.rs +++ b/datafusion/common/src/file_options/csv_writer.rs @@ -17,18 +17,12 @@ //! 
Options related to how csv files should be written -use std::str::FromStr; +use crate::config::CsvOptions; +use crate::error::{DataFusionError, Result}; +use crate::parsers::CompressionTypeVariant; use arrow::csv::WriterBuilder; -use crate::{ - config::ConfigOptions, - error::{DataFusionError, Result}, - parsers::CompressionTypeVariant, -}; - -use super::StatementOptions; - /// Options for writing CSV files #[derive(Clone, Debug)] pub struct CsvWriterOptions { @@ -51,58 +45,32 @@ impl CsvWriterOptions { } } -impl TryFrom<(&ConfigOptions, &StatementOptions)> for CsvWriterOptions { +impl TryFrom<&CsvOptions> for CsvWriterOptions { type Error = DataFusionError; - fn try_from(value: (&ConfigOptions, &StatementOptions)) -> Result { - let _configs = value.0; - let statement_options = value.1; - let mut builder = WriterBuilder::default(); - let mut compression = CompressionTypeVariant::UNCOMPRESSED; - for (option, value) in &statement_options.options { - builder = match option.to_lowercase().as_str(){ - "header" => { - let has_header = value.parse() - .map_err(|_| DataFusionError::Configuration(format!("Unable to parse {value} as bool as required for {option}!")))?; - builder.with_header(has_header) - }, - "date_format" => builder.with_date_format(value.to_owned()), - "datetime_format" => builder.with_datetime_format(value.to_owned()), - "timestamp_format" => builder.with_timestamp_format(value.to_owned()), - "time_format" => builder.with_time_format(value.to_owned()), - "rfc3339" => builder, // No-op - "null_value" => builder.with_null(value.to_owned()), - "compression" => { - compression = CompressionTypeVariant::from_str(value.replace('\'', "").as_str())?; - builder - }, - "delimiter" => { - // Ignore string literal single quotes passed from sql parsing - let value = value.replace('\'', ""); - let chars: Vec = value.chars().collect(); - if chars.len()>1{ - return Err(DataFusionError::Configuration(format!( - "CSV Delimiter Option must be a single char, got: {}", value - ))) - } - builder.with_delimiter(chars[0].try_into().map_err(|_| { - DataFusionError::Internal( - "Unable to convert CSV delimiter into u8".into(), - ) - })?) - }, - "quote" | "escape" => { - // https://github.com/apache/arrow-rs/issues/5146 - // These two attributes are only available when reading csv files. - // To avoid error - builder - }, - _ => return Err(DataFusionError::Configuration(format!("Found unsupported option {option} with value {value} for CSV format!"))) - } + fn try_from(value: &CsvOptions) -> Result { + let mut builder = WriterBuilder::default() + .with_header(value.has_header) + .with_delimiter(value.delimiter); + + if let Some(v) = &value.date_format { + builder = builder.with_date_format(v.into()) + } + if let Some(v) = &value.datetime_format { + builder = builder.with_datetime_format(v.into()) + } + if let Some(v) = &value.timestamp_format { + builder = builder.with_timestamp_format(v.into()) + } + if let Some(v) = &value.time_format { + builder = builder.with_time_format(v.into()) + } + if let Some(v) = &value.null_value { + builder = builder.with_null(v.into()) } Ok(CsvWriterOptions { writer_options: builder, - compression, + compression: value.compression, }) } } diff --git a/datafusion/common/src/file_options/file_type.rs b/datafusion/common/src/file_options/file_type.rs index 97362bdad3cc..812cb02a5f77 100644 --- a/datafusion/common/src/file_options/file_type.rs +++ b/datafusion/common/src/file_options/file_type.rs @@ -17,12 +17,11 @@ //! 
File type abstraction -use crate::error::{DataFusionError, Result}; - -use core::fmt; -use std::fmt::Display; +use std::fmt::{self, Display}; use std::str::FromStr; +use crate::error::{DataFusionError, Result}; + /// The default file extension of arrow files pub const DEFAULT_ARROW_EXTENSION: &str = ".arrow"; /// The default file extension of avro files @@ -105,10 +104,11 @@ impl FromStr for FileType { #[cfg(test)] #[cfg(feature = "parquet")] mod tests { - use crate::error::DataFusionError; - use crate::file_options::FileType; use std::str::FromStr; + use crate::error::DataFusionError; + use crate::FileType; + #[test] fn from_str() { for (ext, file_type) in [ diff --git a/datafusion/common/src/file_options/json_writer.rs b/datafusion/common/src/file_options/json_writer.rs index 7f988016c69d..750d2972329b 100644 --- a/datafusion/common/src/file_options/json_writer.rs +++ b/datafusion/common/src/file_options/json_writer.rs @@ -17,16 +17,12 @@ //! Options related to how json files should be written -use std::str::FromStr; - use crate::{ - config::ConfigOptions, + config::JsonOptions, error::{DataFusionError, Result}, parsers::CompressionTypeVariant, }; -use super::StatementOptions; - /// Options for writing JSON files #[derive(Clone, Debug)] pub struct JsonWriterOptions { @@ -39,21 +35,12 @@ impl JsonWriterOptions { } } -impl TryFrom<(&ConfigOptions, &StatementOptions)> for JsonWriterOptions { +impl TryFrom<&JsonOptions> for JsonWriterOptions { type Error = DataFusionError; - fn try_from(value: (&ConfigOptions, &StatementOptions)) -> Result { - let _configs = value.0; - let statement_options = value.1; - let mut compression = CompressionTypeVariant::UNCOMPRESSED; - for (option, value) in &statement_options.options { - match option.to_lowercase().as_str(){ - "compression" => { - compression = CompressionTypeVariant::from_str(value.replace('\'', "").as_str())?; - }, - _ => return Err(DataFusionError::Configuration(format!("Found unsupported option {option} with value {value} for JSON format!"))) - } - } - Ok(JsonWriterOptions { compression }) + fn try_from(value: &JsonOptions) -> Result { + Ok(JsonWriterOptions { + compression: value.compression, + }) } } diff --git a/datafusion/common/src/file_options/mod.rs b/datafusion/common/src/file_options/mod.rs index 3a48f188fb97..a72b812adc8d 100644 --- a/datafusion/common/src/file_options/mod.rs +++ b/datafusion/common/src/file_options/mod.rs @@ -24,346 +24,61 @@ pub mod file_type; pub mod json_writer; #[cfg(feature = "parquet")] pub mod parquet_writer; -pub(crate) mod parse_utils; - -use std::{ - collections::HashMap, - fmt::{self, Display}, - path::Path, - str::FromStr, -}; - -use crate::{ - config::ConfigOptions, file_options::parse_utils::parse_boolean_string, - DataFusionError, FileType, Result, -}; - -#[cfg(feature = "parquet")] -use self::parquet_writer::ParquetWriterOptions; - -use self::{ - arrow_writer::ArrowWriterOptions, avro_writer::AvroWriterOptions, - csv_writer::CsvWriterOptions, json_writer::JsonWriterOptions, -}; - -/// Represents a single arbitrary setting in a -/// [StatementOptions] where OptionTuple.0 determines -/// the specific setting to be modified and OptionTuple.1 -/// determines the value which should be applied -pub type OptionTuple = (String, String); - -/// Represents arbitrary tuples of options passed as String -/// tuples from SQL statements. As in the following statement: -/// COPY ... TO ... (setting1 value1, setting2 value2, ...) 
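
Note on what replaces the string-keyed `StatementOptions` being removed here: the same key/value strings now flow through `TableOptions::set`, and the `::column` suffix is routed into `column_specific_options` by the `config_namespace_with_hashmap!` macro introduced earlier in this patch. A minimal sketch mirroring the `parquet_table_options` test above; the `datafusion_common::config` import path is assumed from the diff context.

```rust
use datafusion_common::config::TableOptions;
use datafusion_common::Result;

fn main() -> Result<()> {
    let mut table_options = TableOptions::new();

    // Global option: applies to the whole parquet writer.
    table_options.set("parquet.max_row_group_size", "8192")?;

    // Column-specific option: the `::<column path>` suffix is split off by the
    // HashMap-aware ConfigField impl and stored in `column_specific_options`.
    table_options.set("parquet.bloom_filter_enabled::col1", "true")?;

    assert_eq!(
        table_options.parquet.column_specific_options["col1"].bloom_filter_enabled,
        Some(true)
    );
    Ok(())
}
```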
-#[derive(Clone, PartialEq, Eq, Hash, Debug)] -pub struct StatementOptions { - options: Vec, -} - -/// Useful for conversion from external tables which use Hashmap -impl From<&HashMap> for StatementOptions { - fn from(value: &HashMap) -> Self { - Self { - options: value - .iter() - .map(|(k, v)| (k.to_owned(), v.to_owned())) - .collect::>(), - } - } -} - -impl StatementOptions { - pub fn new(options: Vec) -> Self { - Self { options } - } - - pub fn into_inner(self) -> Vec { - self.options - } - - /// Scans for option and if it exists removes it and attempts to parse as a boolean - /// Returns none if it does not exist. - pub fn take_bool_option(&mut self, find: &str) -> Result> { - let maybe_option = self.scan_and_remove_option(find); - maybe_option - .map(|(_, v)| parse_boolean_string(find, v)) - .transpose() - } - - /// Scans for option and if it exists removes it and returns it - /// Returns none if it does not exist - pub fn take_str_option(&mut self, find: &str) -> Option { - let maybe_option = self.scan_and_remove_option(find); - maybe_option.map(|(_, v)| v) - } - - /// Finds partition_by option if exists and parses into a `Vec`. - /// If option doesn't exist, returns empty `vec![]`. - /// E.g. (partition_by 'colA, colB, colC') -> `vec!['colA','colB','colC']` - pub fn take_partition_by(&mut self) -> Vec { - let partition_by = self.take_str_option("partition_by"); - match partition_by { - Some(part_cols) => { - let dequoted = part_cols - .chars() - .enumerate() - .filter(|(idx, c)| { - !((*idx == 0 || *idx == part_cols.len() - 1) - && (*c == '\'' || *c == '"')) - }) - .map(|(_idx, c)| c) - .collect::(); - dequoted - .split(',') - .map(|s| s.trim().replace("''", "'")) - .collect::>() - } - None => vec![], - } - } - - /// Infers the file_type given a target and arbitrary options. - /// If the options contain an explicit "format" option, that will be used. - /// Otherwise, attempt to infer file_type from the extension of target. - /// Finally, return an error if unable to determine the file_type - /// If found, format is removed from the options list. - pub fn try_infer_file_type(&mut self, target: &str) -> Result { - let explicit_format = self.scan_and_remove_option("format"); - let format = match explicit_format { - Some(s) => FileType::from_str(s.1.as_str()), - None => { - // try to infer file format from file extension - let extension: &str = &Path::new(target) - .extension() - .ok_or(DataFusionError::Configuration( - "Format not explicitly set and unable to get file extension!" - .to_string(), - ))? - .to_str() - .ok_or(DataFusionError::Configuration( - "Format not explicitly set and failed to parse file extension!" - .to_string(), - ))? - .to_lowercase(); - - FileType::from_str(extension) - } - }?; - - Ok(format) - } - - /// Finds an option in StatementOptions if exists, removes and returns it - /// along with the vec of remaining options. - fn scan_and_remove_option(&mut self, find: &str) -> Option { - let idx = self - .options - .iter() - .position(|(k, _)| k.to_lowercase() == find.to_lowercase()); - match idx { - Some(i) => Some(self.options.swap_remove(i)), - None => None, - } - } -} - -/// This type contains all options needed to initialize a particular -/// RecordBatchWriter type. Each element in the enum contains a thin wrapper -/// around a "writer builder" type (e.g. arrow::csv::WriterBuilder) -/// plus any DataFusion specific writing options (e.g. 
CSV compression) -#[derive(Clone, Debug)] -pub enum FileTypeWriterOptions { - #[cfg(feature = "parquet")] - Parquet(ParquetWriterOptions), - CSV(CsvWriterOptions), - JSON(JsonWriterOptions), - Avro(AvroWriterOptions), - Arrow(ArrowWriterOptions), -} - -impl FileTypeWriterOptions { - /// Constructs a FileTypeWriterOptions given a FileType to be written - /// and arbitrary String tuple options. May return an error if any - /// string setting is unrecognized or unsupported. - pub fn build( - file_type: &FileType, - config_defaults: &ConfigOptions, - statement_options: &StatementOptions, - ) -> Result { - let options = (config_defaults, statement_options); - - let file_type_write_options = match file_type { - #[cfg(feature = "parquet")] - FileType::PARQUET => { - FileTypeWriterOptions::Parquet(ParquetWriterOptions::try_from(options)?) - } - FileType::CSV => { - FileTypeWriterOptions::CSV(CsvWriterOptions::try_from(options)?) - } - FileType::JSON => { - FileTypeWriterOptions::JSON(JsonWriterOptions::try_from(options)?) - } - FileType::AVRO => { - FileTypeWriterOptions::Avro(AvroWriterOptions::try_from(options)?) - } - FileType::ARROW => { - FileTypeWriterOptions::Arrow(ArrowWriterOptions::try_from(options)?) - } - }; - - Ok(file_type_write_options) - } - - /// Constructs a FileTypeWriterOptions from session defaults only. - pub fn build_default( - file_type: &FileType, - config_defaults: &ConfigOptions, - ) -> Result { - let empty_statement = StatementOptions::new(vec![]); - let options = (config_defaults, &empty_statement); - - let file_type_write_options = match file_type { - #[cfg(feature = "parquet")] - FileType::PARQUET => { - FileTypeWriterOptions::Parquet(ParquetWriterOptions::try_from(options)?) - } - FileType::CSV => { - FileTypeWriterOptions::CSV(CsvWriterOptions::try_from(options)?) - } - FileType::JSON => { - FileTypeWriterOptions::JSON(JsonWriterOptions::try_from(options)?) - } - FileType::AVRO => { - FileTypeWriterOptions::Avro(AvroWriterOptions::try_from(options)?) - } - FileType::ARROW => { - FileTypeWriterOptions::Arrow(ArrowWriterOptions::try_from(options)?) - } - }; - - Ok(file_type_write_options) - } - - /// Tries to extract ParquetWriterOptions from this FileTypeWriterOptions enum. - /// Returns an error if a different type from parquet is set. - #[cfg(feature = "parquet")] - pub fn try_into_parquet(&self) -> Result<&ParquetWriterOptions> { - match self { - FileTypeWriterOptions::Parquet(opt) => Ok(opt), - _ => Err(DataFusionError::Internal(format!( - "Expected parquet options but found options for: {}", - self - ))), - } - } - - /// Tries to extract CsvWriterOptions from this FileTypeWriterOptions enum. - /// Returns an error if a different type from csv is set. - pub fn try_into_csv(&self) -> Result<&CsvWriterOptions> { - match self { - FileTypeWriterOptions::CSV(opt) => Ok(opt), - _ => Err(DataFusionError::Internal(format!( - "Expected csv options but found options for {}", - self - ))), - } - } - - /// Tries to extract JsonWriterOptions from this FileTypeWriterOptions enum. - /// Returns an error if a different type from json is set. - pub fn try_into_json(&self) -> Result<&JsonWriterOptions> { - match self { - FileTypeWriterOptions::JSON(opt) => Ok(opt), - _ => Err(DataFusionError::Internal(format!( - "Expected json options but found options for {}", - self, - ))), - } - } - - /// Tries to extract AvroWriterOptions from this FileTypeWriterOptions enum. - /// Returns an error if a different type from avro is set. 
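
For context on what replaces the `FileTypeWriterOptions::build` dispatch and the `try_into_*` accessors being deleted here: each format's writer options are now built directly from its typed options struct via the `TryFrom` impls added earlier in this diff. A hedged sketch, assuming `CsvWriterOptions` and `JsonWriterOptions` keep public `writer_options`/`compression` fields as the test code elsewhere in this patch suggests.

```rust
use datafusion_common::config::{CsvOptions, JsonOptions};
use datafusion_common::file_options::csv_writer::CsvWriterOptions;
use datafusion_common::file_options::json_writer::JsonWriterOptions;
use datafusion_common::parsers::CompressionTypeVariant;
use datafusion_common::Result;

fn main() -> Result<()> {
    // Typed options in, writer options out; no per-statement string parsing loop.
    let csv = CsvOptions::default()
        .with_delimiter(b';')
        .with_has_header(false)
        .with_compression(CompressionTypeVariant::GZIP);
    let csv_writer = CsvWriterOptions::try_from(&csv)?;
    assert_eq!(csv_writer.compression, CompressionTypeVariant::GZIP);

    // JSON takes the same shape; defaults carry UNCOMPRESSED through unchanged.
    let json = JsonOptions::default();
    let json_writer = JsonWriterOptions::try_from(&json)?;
    assert_eq!(json_writer.compression, CompressionTypeVariant::UNCOMPRESSED);
    Ok(())
}
```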
- pub fn try_into_avro(&self) -> Result<&AvroWriterOptions> { - match self { - FileTypeWriterOptions::Avro(opt) => Ok(opt), - _ => Err(DataFusionError::Internal(format!( - "Expected avro options but found options for {}!", - self - ))), - } - } - - /// Tries to extract ArrowWriterOptions from this FileTypeWriterOptions enum. - /// Returns an error if a different type from arrow is set. - pub fn try_into_arrow(&self) -> Result<&ArrowWriterOptions> { - match self { - FileTypeWriterOptions::Arrow(opt) => Ok(opt), - _ => Err(DataFusionError::Internal(format!( - "Expected arrow options but found options for {}", - self - ))), - } - } -} - -impl Display for FileTypeWriterOptions { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let name = match self { - FileTypeWriterOptions::Arrow(_) => "ArrowWriterOptions", - FileTypeWriterOptions::Avro(_) => "AvroWriterOptions", - FileTypeWriterOptions::CSV(_) => "CsvWriterOptions", - FileTypeWriterOptions::JSON(_) => "JsonWriterOptions", - #[cfg(feature = "parquet")] - FileTypeWriterOptions::Parquet(_) => "ParquetWriterOptions", - }; - write!(f, "{}", name) - } -} #[cfg(test)] #[cfg(feature = "parquet")] mod tests { use std::collections::HashMap; - use parquet::{ - basic::{Compression, Encoding, ZstdLevel}, - file::properties::{EnabledStatistics, WriterVersion}, - schema::types::ColumnPath, - }; - + use super::parquet_writer::ParquetWriterOptions; use crate::{ - config::ConfigOptions, + config::TableOptions, file_options::{csv_writer::CsvWriterOptions, json_writer::JsonWriterOptions}, parsers::CompressionTypeVariant, + Result, }; - use crate::Result; - - use super::{parquet_writer::ParquetWriterOptions, StatementOptions}; + use parquet::{ + basic::{Compression, Encoding, ZstdLevel}, + file::properties::{EnabledStatistics, WriterVersion}, + schema::types::ColumnPath, + }; #[test] fn test_writeroptions_parquet_from_statement_options() -> Result<()> { let mut option_map: HashMap = HashMap::new(); - option_map.insert("max_row_group_size".to_owned(), "123".to_owned()); - option_map.insert("data_pagesize_limit".to_owned(), "123".to_owned()); - option_map.insert("write_batch_size".to_owned(), "123".to_owned()); - option_map.insert("writer_version".to_owned(), "2.0".to_owned()); - option_map.insert("dictionary_page_size_limit".to_owned(), "123".to_owned()); - option_map.insert("created_by".to_owned(), "df write unit test".to_owned()); - option_map.insert("column_index_truncate_length".to_owned(), "123".to_owned()); - option_map.insert("data_page_row_count_limit".to_owned(), "123".to_owned()); - option_map.insert("bloom_filter_enabled".to_owned(), "true".to_owned()); - option_map.insert("encoding".to_owned(), "plain".to_owned()); - option_map.insert("dictionary_enabled".to_owned(), "true".to_owned()); - option_map.insert("compression".to_owned(), "zstd(4)".to_owned()); - option_map.insert("statistics_enabled".to_owned(), "page".to_owned()); - option_map.insert("bloom_filter_fpp".to_owned(), "0.123".to_owned()); - option_map.insert("bloom_filter_ndv".to_owned(), "123".to_owned()); - - let options = StatementOptions::from(&option_map); - let config = ConfigOptions::new(); - - let parquet_options = ParquetWriterOptions::try_from((&config, &options))?; + option_map.insert("parquet.max_row_group_size".to_owned(), "123".to_owned()); + option_map.insert("parquet.data_pagesize_limit".to_owned(), "123".to_owned()); + option_map.insert("parquet.write_batch_size".to_owned(), "123".to_owned()); + option_map.insert("parquet.writer_version".to_owned(), 
"2.0".to_owned()); + option_map.insert( + "parquet.dictionary_page_size_limit".to_owned(), + "123".to_owned(), + ); + option_map.insert( + "parquet.created_by".to_owned(), + "df write unit test".to_owned(), + ); + option_map.insert( + "parquet.column_index_truncate_length".to_owned(), + "123".to_owned(), + ); + option_map.insert( + "parquet.data_page_row_count_limit".to_owned(), + "123".to_owned(), + ); + option_map.insert("parquet.bloom_filter_enabled".to_owned(), "true".to_owned()); + option_map.insert("parquet.encoding".to_owned(), "plain".to_owned()); + option_map.insert("parquet.dictionary_enabled".to_owned(), "true".to_owned()); + option_map.insert("parquet.compression".to_owned(), "zstd(4)".to_owned()); + option_map.insert("parquet.statistics_enabled".to_owned(), "page".to_owned()); + option_map.insert("parquet.bloom_filter_fpp".to_owned(), "0.123".to_owned()); + option_map.insert("parquet.bloom_filter_ndv".to_owned(), "123".to_owned()); + + let mut table_config = TableOptions::new(); + table_config.alter_with_string_hash_map(&option_map)?; + + let parquet_options = ParquetWriterOptions::try_from(&table_config.parquet)?; let properties = parquet_options.writer_options(); // Verify the expected options propagated down to parquet crate WriterProperties struct @@ -415,37 +130,58 @@ mod tests { fn test_writeroptions_parquet_column_specific() -> Result<()> { let mut option_map: HashMap = HashMap::new(); - option_map.insert("bloom_filter_enabled::col1".to_owned(), "true".to_owned()); option_map.insert( - "bloom_filter_enabled::col2.nested".to_owned(), + "parquet.bloom_filter_enabled::col1".to_owned(), + "true".to_owned(), + ); + option_map.insert( + "parquet.bloom_filter_enabled::col2.nested".to_owned(), + "true".to_owned(), + ); + option_map.insert("parquet.encoding::col1".to_owned(), "plain".to_owned()); + option_map.insert("parquet.encoding::col2.nested".to_owned(), "rle".to_owned()); + option_map.insert( + "parquet.dictionary_enabled::col1".to_owned(), "true".to_owned(), ); - option_map.insert("encoding::col1".to_owned(), "plain".to_owned()); - option_map.insert("encoding::col2.nested".to_owned(), "rle".to_owned()); - option_map.insert("dictionary_enabled::col1".to_owned(), "true".to_owned()); option_map.insert( - "dictionary_enabled::col2.nested".to_owned(), + "parquet.dictionary_enabled::col2.nested".to_owned(), "true".to_owned(), ); - option_map.insert("compression::col1".to_owned(), "zstd(4)".to_owned()); - option_map.insert("compression::col2.nested".to_owned(), "zstd(10)".to_owned()); - option_map.insert("statistics_enabled::col1".to_owned(), "page".to_owned()); + option_map.insert("parquet.compression::col1".to_owned(), "zstd(4)".to_owned()); + option_map.insert( + "parquet.compression::col2.nested".to_owned(), + "zstd(10)".to_owned(), + ); + option_map.insert( + "parquet.statistics_enabled::col1".to_owned(), + "page".to_owned(), + ); option_map.insert( - "statistics_enabled::col2.nested".to_owned(), + "parquet.statistics_enabled::col2.nested".to_owned(), "none".to_owned(), ); - option_map.insert("bloom_filter_fpp::col1".to_owned(), "0.123".to_owned()); option_map.insert( - "bloom_filter_fpp::col2.nested".to_owned(), + "parquet.bloom_filter_fpp::col1".to_owned(), + "0.123".to_owned(), + ); + option_map.insert( + "parquet.bloom_filter_fpp::col2.nested".to_owned(), "0.456".to_owned(), ); - option_map.insert("bloom_filter_ndv::col1".to_owned(), "123".to_owned()); - option_map.insert("bloom_filter_ndv::col2.nested".to_owned(), "456".to_owned()); + option_map.insert( + 
"parquet.bloom_filter_ndv::col1".to_owned(), + "123".to_owned(), + ); + option_map.insert( + "parquet.bloom_filter_ndv::col2.nested".to_owned(), + "456".to_owned(), + ); - let options = StatementOptions::from(&option_map); - let config = ConfigOptions::new(); + let mut table_config = TableOptions::new(); + table_config.alter_with_string_hash_map(&option_map)?; - let parquet_options = ParquetWriterOptions::try_from((&config, &options))?; + let parquet_options = ParquetWriterOptions::try_from(&table_config.parquet)?; let properties = parquet_options.writer_options(); let col1 = ColumnPath::from(vec!["col1".to_owned()]); @@ -535,20 +271,20 @@ mod tests { // for StatementOptions fn test_writeroptions_csv_from_statement_options() -> Result<()> { let mut option_map: HashMap = HashMap::new(); - option_map.insert("header".to_owned(), "true".to_owned()); - option_map.insert("date_format".to_owned(), "123".to_owned()); - option_map.insert("datetime_format".to_owned(), "123".to_owned()); - option_map.insert("timestamp_format".to_owned(), "2.0".to_owned()); - option_map.insert("time_format".to_owned(), "123".to_owned()); - option_map.insert("rfc3339".to_owned(), "true".to_owned()); - option_map.insert("null_value".to_owned(), "123".to_owned()); - option_map.insert("compression".to_owned(), "gzip".to_owned()); - option_map.insert("delimiter".to_owned(), ";".to_owned()); - - let options = StatementOptions::from(&option_map); - let config = ConfigOptions::new(); - - let csv_options = CsvWriterOptions::try_from((&config, &options))?; + option_map.insert("csv.has_header".to_owned(), "true".to_owned()); + option_map.insert("csv.date_format".to_owned(), "123".to_owned()); + option_map.insert("csv.datetime_format".to_owned(), "123".to_owned()); + option_map.insert("csv.timestamp_format".to_owned(), "2.0".to_owned()); + option_map.insert("csv.time_format".to_owned(), "123".to_owned()); + option_map.insert("csv.null_value".to_owned(), "123".to_owned()); + option_map.insert("csv.compression".to_owned(), "gzip".to_owned()); + option_map.insert("csv.delimiter".to_owned(), ";".to_owned()); + + let mut table_config = TableOptions::new(); + table_config.alter_with_string_hash_map(&option_map)?; + + let csv_options = CsvWriterOptions::try_from(&table_config.csv)?; + let builder = csv_options.writer_options; assert!(builder.header()); let buff = Vec::new(); @@ -563,12 +299,12 @@ mod tests { // for StatementOptions fn test_writeroptions_json_from_statement_options() -> Result<()> { let mut option_map: HashMap = HashMap::new(); - option_map.insert("compression".to_owned(), "gzip".to_owned()); + option_map.insert("json.compression".to_owned(), "gzip".to_owned()); - let options = StatementOptions::from(&option_map); - let config = ConfigOptions::new(); + let mut table_config = TableOptions::new(); + table_config.alter_with_string_hash_map(&option_map)?; - let json_options = JsonWriterOptions::try_from((&config, &options))?; + let json_options = JsonWriterOptions::try_from(&table_config.json)?; assert_eq!(json_options.compression, CompressionTypeVariant::GZIP); Ok(()) diff --git a/datafusion/common/src/file_options/parquet_writer.rs b/datafusion/common/src/file_options/parquet_writer.rs index 80fa023587ee..e8a350e8d389 100644 --- a/datafusion/common/src/file_options/parquet_writer.rs +++ b/datafusion/common/src/file_options/parquet_writer.rs @@ -17,15 +17,11 @@ //! 
Options related to how parquet files should be written -use parquet::file::properties::{WriterProperties, WriterPropertiesBuilder}; - -use crate::{config::ConfigOptions, DataFusionError, Result}; - -use super::StatementOptions; +use crate::{config::TableParquetOptions, DataFusionError, Result}; use parquet::{ basic::{BrotliLevel, GzipLevel, ZstdLevel}, - file::properties::{EnabledStatistics, WriterVersion}, + file::properties::{EnabledStatistics, WriterProperties, WriterVersion}, schema::types::ColumnPath, }; @@ -47,165 +43,102 @@ impl ParquetWriterOptions { } } -/// Constructs a default Parquet WriterPropertiesBuilder using -/// Session level ConfigOptions to initialize settings -pub fn default_builder(options: &ConfigOptions) -> Result { - let parquet_session_options = &options.execution.parquet; - let mut builder = WriterProperties::builder() - .set_data_page_size_limit(parquet_session_options.data_pagesize_limit) - .set_write_batch_size(parquet_session_options.write_batch_size) - .set_writer_version(parse_version_string( - &parquet_session_options.writer_version, - )?) - .set_dictionary_page_size_limit( - parquet_session_options.dictionary_page_size_limit, - ) - .set_max_row_group_size(parquet_session_options.max_row_group_size) - .set_created_by(parquet_session_options.created_by.clone()) - .set_column_index_truncate_length( - parquet_session_options.column_index_truncate_length, - ) - .set_data_page_row_count_limit(parquet_session_options.data_page_row_count_limit) - .set_bloom_filter_enabled(parquet_session_options.bloom_filter_enabled); - - builder = match &parquet_session_options.encoding { - Some(encoding) => builder.set_encoding(parse_encoding_string(encoding)?), - None => builder, - }; - - builder = match &parquet_session_options.dictionary_enabled { - Some(enabled) => builder.set_dictionary_enabled(*enabled), - None => builder, - }; - - builder = match &parquet_session_options.compression { - Some(compression) => { - builder.set_compression(parse_compression_string(compression)?) +impl TryFrom<&TableParquetOptions> for ParquetWriterOptions { + type Error = DataFusionError; + + fn try_from(parquet_options: &TableParquetOptions) -> Result { + let parquet_session_options = &parquet_options.global; + let mut builder = WriterProperties::builder() + .set_data_page_size_limit(parquet_session_options.data_pagesize_limit) + .set_write_batch_size(parquet_session_options.write_batch_size) + .set_writer_version(parse_version_string( + &parquet_session_options.writer_version, + )?) 
+ .set_dictionary_page_size_limit( + parquet_session_options.dictionary_page_size_limit, + ) + .set_max_row_group_size(parquet_session_options.max_row_group_size) + .set_created_by(parquet_session_options.created_by.clone()) + .set_column_index_truncate_length( + parquet_session_options.column_index_truncate_length, + ) + .set_data_page_row_count_limit( + parquet_session_options.data_page_row_count_limit, + ) + .set_bloom_filter_enabled(parquet_session_options.bloom_filter_enabled); + + if let Some(encoding) = &parquet_session_options.encoding { + builder = builder.set_encoding(parse_encoding_string(encoding)?); + } + + if let Some(enabled) = parquet_session_options.dictionary_enabled { + builder = builder.set_dictionary_enabled(enabled); + } + + if let Some(compression) = &parquet_session_options.compression { + builder = builder.set_compression(parse_compression_string(compression)?); + } + + if let Some(statistics) = &parquet_session_options.statistics_enabled { + builder = + builder.set_statistics_enabled(parse_statistics_string(statistics)?); } - None => builder, - }; - builder = match &parquet_session_options.statistics_enabled { - Some(statistics) => { - builder.set_statistics_enabled(parse_statistics_string(statistics)?) + if let Some(size) = parquet_session_options.max_statistics_size { + builder = builder.set_max_statistics_size(size); } - None => builder, - }; - builder = match &parquet_session_options.max_statistics_size { - Some(size) => builder.set_max_statistics_size(*size), - None => builder, - }; + if let Some(fpp) = parquet_session_options.bloom_filter_fpp { + builder = builder.set_bloom_filter_fpp(fpp); + } - builder = match &parquet_session_options.bloom_filter_fpp { - Some(fpp) => builder.set_bloom_filter_fpp(*fpp), - None => builder, - }; + if let Some(ndv) = parquet_session_options.bloom_filter_ndv { + builder = builder.set_bloom_filter_ndv(ndv); + } - builder = match &parquet_session_options.bloom_filter_ndv { - Some(ndv) => builder.set_bloom_filter_ndv(*ndv), - None => builder, - }; + for (column, options) in &parquet_options.column_specific_options { + let path = ColumnPath::new(column.split('.').map(|s| s.to_owned()).collect()); - Ok(builder) -} + if let Some(bloom_filter_enabled) = options.bloom_filter_enabled { + builder = builder + .set_column_bloom_filter_enabled(path.clone(), bloom_filter_enabled); + } -impl TryFrom<(&ConfigOptions, &StatementOptions)> for ParquetWriterOptions { - type Error = DataFusionError; + if let Some(encoding) = &options.encoding { + let parsed_encoding = parse_encoding_string(encoding)?; + builder = builder.set_column_encoding(path.clone(), parsed_encoding); + } + + if let Some(dictionary_enabled) = options.dictionary_enabled { + builder = builder + .set_column_dictionary_enabled(path.clone(), dictionary_enabled); + } + + if let Some(compression) = &options.compression { + let parsed_compression = parse_compression_string(compression)?; + builder = + builder.set_column_compression(path.clone(), parsed_compression); + } + + if let Some(statistics_enabled) = &options.statistics_enabled { + let parsed_value = parse_statistics_string(statistics_enabled)?; + builder = + builder.set_column_statistics_enabled(path.clone(), parsed_value); + } + + if let Some(bloom_filter_fpp) = options.bloom_filter_fpp { + builder = + builder.set_column_bloom_filter_fpp(path.clone(), bloom_filter_fpp); + } + + if let Some(bloom_filter_ndv) = options.bloom_filter_ndv { + builder = + builder.set_column_bloom_filter_ndv(path.clone(), bloom_filter_ndv); + } - fn 
try_from( - configs_and_statement_options: (&ConfigOptions, &StatementOptions), - ) -> Result { - let configs = configs_and_statement_options.0; - let statement_options = configs_and_statement_options.1; - let mut builder = default_builder(configs)?; - for (option, value) in &statement_options.options { - let (option, col_path) = split_option_and_column_path(option); - builder = match option.to_lowercase().as_str(){ - "max_row_group_size" => builder - .set_max_row_group_size(value.parse() - .map_err(|_| DataFusionError::Configuration(format!("Unable to parse {value} as u64 as required for {option}!")))?), - "data_pagesize_limit" => builder - .set_data_page_size_limit(value.parse() - .map_err(|_| DataFusionError::Configuration(format!("Unable to parse {value} as usize as required for {option}!")))?), - "write_batch_size" => builder - .set_write_batch_size(value.parse() - .map_err(|_| DataFusionError::Configuration(format!("Unable to parse {value} as usize as required for {option}!")))?), - "writer_version" => builder - .set_writer_version(parse_version_string(value)?), - "dictionary_page_size_limit" => builder - .set_dictionary_page_size_limit(value.parse() - .map_err(|_| DataFusionError::Configuration(format!("Unable to parse {value} as usize as required for {option}!")))?), - "created_by" => builder - .set_created_by(value.to_owned()), - "column_index_truncate_length" => builder - .set_column_index_truncate_length(Some(value.parse() - .map_err(|_| DataFusionError::Configuration(format!("Unable to parse {value} as usize as required for {option}!")))?)), - "data_page_row_count_limit" => builder - .set_data_page_row_count_limit(value.parse() - .map_err(|_| DataFusionError::Configuration(format!("Unable to parse {value} as usize as required for {option}!")))?), - "bloom_filter_enabled" => { - let parsed_value = value.parse() - .map_err(|_| DataFusionError::Configuration(format!("Unable to parse {value} as bool as required for {option}!")))?; - match col_path{ - Some(path) => builder.set_column_bloom_filter_enabled(path, parsed_value), - None => builder.set_bloom_filter_enabled(parsed_value) - } - }, - "encoding" => { - let parsed_encoding = parse_encoding_string(value)?; - match col_path{ - Some(path) => builder.set_column_encoding(path, parsed_encoding), - None => builder.set_encoding(parsed_encoding) - } - }, - "dictionary_enabled" => { - let parsed_value = value.parse() - .map_err(|_| DataFusionError::Configuration(format!("Unable to parse {value} as bool as required for {option}!")))?; - match col_path{ - Some(path) => builder.set_column_dictionary_enabled(path, parsed_value), - None => builder.set_dictionary_enabled(parsed_value) - } - }, - "compression" => { - let parsed_compression = parse_compression_string(value)?; - match col_path{ - Some(path) => builder.set_column_compression(path, parsed_compression), - None => builder.set_compression(parsed_compression) - } - }, - "statistics_enabled" => { - let parsed_value = parse_statistics_string(value)?; - match col_path{ - Some(path) => builder.set_column_statistics_enabled(path, parsed_value), - None => builder.set_statistics_enabled(parsed_value) - } - }, - "max_statistics_size" => { - let parsed_value = value.parse() - .map_err(|_| DataFusionError::Configuration(format!("Unable to parse {value} as usize as required for {option}!")))?; - match col_path{ - Some(path) => builder.set_column_max_statistics_size(path, parsed_value), - None => builder.set_max_statistics_size(parsed_value) - } - }, - "bloom_filter_fpp" => { - let parsed_value = 
value.parse() - .map_err(|_| DataFusionError::Configuration(format!("Unable to parse {value} as f64 as required for {option}!")))?; - match col_path{ - Some(path) => builder.set_column_bloom_filter_fpp(path, parsed_value), - None => builder.set_bloom_filter_fpp(parsed_value) - } - }, - "bloom_filter_ndv" => { - let parsed_value = value.parse() - .map_err(|_| DataFusionError::Configuration(format!("Unable to parse {value} as u64 as required for {option}!")))?; - match col_path{ - Some(path) => builder.set_column_bloom_filter_ndv(path, parsed_value), - None => builder.set_bloom_filter_ndv(parsed_value) - } - }, - _ => return Err(DataFusionError::Configuration(format!("Found unsupported option {option} with value {value} for Parquet format!"))) + if let Some(max_statistics_size) = options.max_statistics_size { + builder = + builder.set_column_max_statistics_size(path, max_statistics_size); } } Ok(ParquetWriterOptions { @@ -282,7 +215,7 @@ fn require_level(codec: &str, level: Option) -> Result { } /// Parses datafusion.execution.parquet.compression String to a parquet::basic::Compression -pub(crate) fn parse_compression_string( +pub fn parse_compression_string( str_setting: &str, ) -> Result { let str_setting_lower: &str = &str_setting.to_lowercase(); @@ -359,15 +292,3 @@ pub(crate) fn parse_statistics_string(str_setting: &str) -> Result (String, Option) { - match str_setting.replace('\'', "").split_once("::") { - Some((s1, s2)) => { - let col_path = ColumnPath::new(s2.split('.').map(|s| s.to_owned()).collect()); - (s1.to_owned(), Some(col_path)) - } - None => (str_setting.to_owned(), None), - } -} diff --git a/datafusion/common/src/lib.rs b/datafusion/common/src/lib.rs index ef77d25f1ec0..da7d6579bfe6 100644 --- a/datafusion/common/src/lib.rs +++ b/datafusion/common/src/lib.rs @@ -55,7 +55,6 @@ pub use file_options::file_type::{ FileType, GetExt, DEFAULT_ARROW_EXTENSION, DEFAULT_AVRO_EXTENSION, DEFAULT_CSV_EXTENSION, DEFAULT_JSON_EXTENSION, DEFAULT_PARQUET_EXTENSION, }; -pub use file_options::FileTypeWriterOptions; pub use functional_dependencies::{ aggregate_functional_dependencies, get_required_group_by_exprs_indices, get_target_functional_dependencies, Constraint, Constraints, Dependency, diff --git a/datafusion/common/src/parsers.rs b/datafusion/common/src/parsers.rs index 9583ecbdb733..e23edb4e2adb 100644 --- a/datafusion/common/src/parsers.rs +++ b/datafusion/common/src/parsers.rs @@ -16,12 +16,13 @@ // under the License. //! 
Interval parsing logic -use sqlparser::parser::ParserError; -use std::fmt::Display; +use std::fmt::Display; use std::result; use std::str::FromStr; +use sqlparser::parser::ParserError; + /// Readable file compression type #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum CompressionTypeVariant { diff --git a/datafusion/core/benches/sql_planner.rs b/datafusion/core/benches/sql_planner.rs index d800bcfe5bfc..3f7d66f5cc15 100644 --- a/datafusion/core/benches/sql_planner.rs +++ b/datafusion/core/benches/sql_planner.rs @@ -166,6 +166,8 @@ fn create_context() -> SessionContext { .unwrap(); ctx.register_table("t700", create_table_provider("c", 700)) .unwrap(); + ctx.register_table("t1000", create_table_provider("d", 1000)) + .unwrap(); let tpch_schemas = create_tpch_schemas(); tpch_schemas.iter().for_each(|(name, schema)| { @@ -194,6 +196,16 @@ fn criterion_benchmark(c: &mut Criterion) { b.iter(|| physical_plan(&ctx, "SELECT c1 FROM t700")) }); + // Test simplest + c.bench_function("logical_select_all_from_1000", |b| { + b.iter(|| logical_plan(&ctx, "SELECT * FROM t1000")) + }); + + // Test simplest + c.bench_function("physical_select_all_from_1000", |b| { + b.iter(|| physical_plan(&ctx, "SELECT * FROM t1000")) + }); + c.bench_function("logical_trivial_join_low_numbered_columns", |b| { b.iter(|| { logical_plan( diff --git a/datafusion/core/benches/sql_query_with_io.rs b/datafusion/core/benches/sql_query_with_io.rs index c7a838385bd6..916f48ce40c6 100644 --- a/datafusion/core/benches/sql_query_with_io.rs +++ b/datafusion/core/benches/sql_query_with_io.rs @@ -123,7 +123,7 @@ async fn setup_context(object_store: Arc) -> SessionContext { for table_id in 0..TABLES { let table_name = table_name(table_id); - let file_format = ParquetFormat::default().with_enable_pruning(Some(true)); + let file_format = ParquetFormat::default().with_enable_pruning(true); let options = ListingOptions::new(Arc::new(file_format)) .with_table_partition_cols(vec![(String::from("partition"), DataType::UInt8)]) .with_target_partitions(THREADS); diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs index 3bdf2af4552d..5f192b83fdd9 100644 --- a/datafusion/core/src/dataframe/mod.rs +++ b/datafusion/core/src/dataframe/mod.rs @@ -21,6 +21,7 @@ mod parquet; use std::any::Any; +use std::collections::HashMap; use std::sync::Arc; use crate::arrow::record_batch::RecordBatch; @@ -41,16 +42,12 @@ use crate::prelude::SessionContext; use arrow::array::{Array, ArrayRef, Int64Array, StringArray}; use arrow::compute::{cast, concat}; -use arrow::csv::WriterBuilder; -use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; -use datafusion_common::file_options::csv_writer::CsvWriterOptions; -use datafusion_common::file_options::json_writer::JsonWriterOptions; -use datafusion_common::parsers::CompressionTypeVariant; +use arrow::datatypes::{DataType, Field}; +use arrow_schema::{Schema, SchemaRef}; +use datafusion_common::config::{CsvOptions, FormatOptions, JsonOptions}; use datafusion_common::{ - plan_err, Column, DFSchema, DataFusionError, FileType, FileTypeWriterOptions, - ParamValues, SchemaError, UnnestOptions, + plan_err, Column, DFSchema, DataFusionError, ParamValues, SchemaError, UnnestOptions, }; -use datafusion_expr::dml::CopyOptions; use datafusion_expr::{ avg, count, is_null, max, median, min, stddev, utils::COUNT_STAR_EXPANSION, TableProviderFilterPushDown, UNNAMED_TABLE, @@ -66,10 +63,6 @@ pub struct DataFrameWriteOptions { /// Controls if all partitions should be coalesced into a single 
output file /// Generally will have slower performance when set to true. single_file_output: bool, - /// Sets compression by DataFusion applied after file serialization. - /// Allows compression of CSV and JSON. - /// Not supported for parquet. - compression: CompressionTypeVariant, /// Sets which columns should be used for hive-style partitioned writes by name. /// Can be set to empty vec![] for non-partitioned writes. partition_by: Vec, @@ -81,7 +74,6 @@ impl DataFrameWriteOptions { DataFrameWriteOptions { overwrite: false, single_file_output: false, - compression: CompressionTypeVariant::UNCOMPRESSED, partition_by: vec![], } } @@ -97,12 +89,6 @@ impl DataFrameWriteOptions { self } - /// Sets the compression type applied to the output file(s) - pub fn with_compression(mut self, compression: CompressionTypeVariant) -> Self { - self.compression = compression; - self - } - /// Sets the partition_by columns for output partitioning pub fn with_partition_by(mut self, partition_by: Vec) -> Self { self.partition_by = partition_by; @@ -1168,28 +1154,22 @@ impl DataFrame { self, path: &str, options: DataFrameWriteOptions, - writer_properties: Option, + writer_options: Option, ) -> Result, DataFusionError> { if options.overwrite { return Err(DataFusionError::NotImplemented( "Overwrites are not implemented for DataFrame::write_csv.".to_owned(), )); } - let props = match writer_properties { - Some(props) => props, - None => WriterBuilder::new(), - }; - - let file_type_writer_options = - FileTypeWriterOptions::CSV(CsvWriterOptions::new(props, options.compression)); - let copy_options = CopyOptions::WriterOptions(Box::new(file_type_writer_options)); + let table_options = self.session_state.default_table_options(); + let props = writer_options.unwrap_or_else(|| table_options.csv.clone()); let plan = LogicalPlanBuilder::copy_to( self.plan, path.into(), - FileType::CSV, + FormatOptions::CSV(props), + HashMap::new(), options.partition_by, - copy_options, )? .build()?; DataFrame::new(self.session_state, plan).collect().await @@ -1212,6 +1192,7 @@ impl DataFrame { /// .write_json( /// "output.json", /// DataFrameWriteOptions::new(), + /// None /// ).await?; /// # fs::remove_file("output.json")?; /// # Ok(()) @@ -1221,21 +1202,24 @@ impl DataFrame { self, path: &str, options: DataFrameWriteOptions, + writer_options: Option, ) -> Result, DataFusionError> { if options.overwrite { return Err(DataFusionError::NotImplemented( "Overwrites are not implemented for DataFrame::write_json.".to_owned(), )); } - let file_type_writer_options = - FileTypeWriterOptions::JSON(JsonWriterOptions::new(options.compression)); - let copy_options = CopyOptions::WriterOptions(Box::new(file_type_writer_options)); + + let table_options = self.session_state.default_table_options(); + + let props = writer_options.unwrap_or_else(|| table_options.json.clone()); + let plan = LogicalPlanBuilder::copy_to( self.plan, path.into(), - FileType::JSON, + FormatOptions::JSON(props), + Default::default(), options.partition_by, - copy_options, )? .build()?; DataFrame::new(self.session_state, plan).collect().await diff --git a/datafusion/core/src/dataframe/parquet.rs b/datafusion/core/src/dataframe/parquet.rs index b7d63bf0a4b7..f4e8c9dfcd6f 100644 --- a/datafusion/core/src/dataframe/parquet.rs +++ b/datafusion/core/src/dataframe/parquet.rs @@ -15,16 +15,12 @@ // specific language governing permissions and limitations // under the License. 
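
To show how the reworked DataFrame write API above is called: `write_csv` and `write_json` now take an optional typed options struct, and compression rides on those options rather than on `DataFrameWriteOptions`. A sketch under assumed import paths and a placeholder input file; passing `None` falls back to the session's default table options.

```rust
use datafusion::dataframe::DataFrameWriteOptions;
use datafusion::error::Result;
use datafusion::prelude::{CsvReadOptions, SessionContext};
use datafusion_common::config::CsvOptions;
use datafusion_common::parsers::CompressionTypeVariant;

#[tokio::main]
async fn main() -> Result<()> {
    let ctx = SessionContext::new();
    // `example.csv` is a placeholder input path.
    let df = ctx.read_csv("example.csv", CsvReadOptions::new()).await?;

    // Compression now lives on CsvOptions, not on DataFrameWriteOptions.
    let csv_options = CsvOptions::default()
        .with_delimiter(b'|')
        .with_compression(CompressionTypeVariant::GZIP);

    df.clone()
        .write_csv("out_csv/", DataFrameWriteOptions::new(), Some(csv_options))
        .await?;

    // None uses the session's default JSON table options.
    df.write_json("out_json/", DataFrameWriteOptions::new(), None)
        .await?;
    Ok(())
}
```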
-use datafusion_common::file_options::parquet_writer::{ - default_builder, ParquetWriterOptions, -}; -use parquet::file::properties::WriterProperties; - use super::{ - CompressionTypeVariant, CopyOptions, DataFrame, DataFrameWriteOptions, - DataFusionError, FileType, FileTypeWriterOptions, LogicalPlanBuilder, RecordBatch, + DataFrame, DataFrameWriteOptions, DataFusionError, LogicalPlanBuilder, RecordBatch, }; +use datafusion_common::config::{FormatOptions, TableParquetOptions}; + impl DataFrame { /// Execute the `DataFrame` and write the results to Parquet file(s). /// @@ -53,30 +49,24 @@ impl DataFrame { self, path: &str, options: DataFrameWriteOptions, - writer_properties: Option, + writer_options: Option, ) -> Result, DataFusionError> { if options.overwrite { return Err(DataFusionError::NotImplemented( "Overwrites are not implemented for DataFrame::write_parquet.".to_owned(), )); } - match options.compression{ - CompressionTypeVariant::UNCOMPRESSED => (), - _ => return Err(DataFusionError::Configuration("DataFrame::write_parquet method does not support compression set via DataFrameWriteOptions. Set parquet compression via writer_properties instead.".to_owned())) - } - let props = match writer_properties { - Some(props) => props, - None => default_builder(self.session_state.config_options())?.build(), - }; - let file_type_writer_options = - FileTypeWriterOptions::Parquet(ParquetWriterOptions::new(props)); - let copy_options = CopyOptions::WriterOptions(Box::new(file_type_writer_options)); + + let table_options = self.session_state.default_table_options(); + + let props = writer_options.unwrap_or_else(|| table_options.parquet.clone()); + let plan = LogicalPlanBuilder::copy_to( self.plan, path.into(), - FileType::PARQUET, + FormatOptions::PARQUET(props), + Default::default(), options.partition_by, - copy_options, )? 
.build()?; DataFrame::new(self.session_state, plan).collect().await @@ -87,21 +77,20 @@ impl DataFrame { mod tests { use std::sync::Arc; - use object_store::local::LocalFileSystem; - use parquet::basic::{BrotliLevel, GzipLevel, ZstdLevel}; - use parquet::file::reader::FileReader; - use tempfile::TempDir; - use url::Url; - - use datafusion_expr::{col, lit}; - + use super::super::Result; + use super::*; use crate::arrow::util::pretty; use crate::execution::context::SessionContext; use crate::execution::options::ParquetReadOptions; use crate::test_util; - use super::super::Result; - use super::*; + use datafusion_common::file_options::parquet_writer::parse_compression_string; + use datafusion_expr::{col, lit}; + + use object_store::local::LocalFileSystem; + use parquet::file::reader::FileReader; + use tempfile::TempDir; + use url::Url; #[tokio::test] async fn filter_pushdown_dataframe() -> Result<()> { @@ -136,15 +125,14 @@ mod tests { #[tokio::test] async fn write_parquet_with_compression() -> Result<()> { let test_df = test_util::test_table().await?; - let output_path = "file://local/test.parquet"; let test_compressions = vec![ - parquet::basic::Compression::SNAPPY, - parquet::basic::Compression::LZ4, - parquet::basic::Compression::LZ4_RAW, - parquet::basic::Compression::GZIP(GzipLevel::default()), - parquet::basic::Compression::BROTLI(BrotliLevel::default()), - parquet::basic::Compression::ZSTD(ZstdLevel::default()), + "snappy", + "brotli(1)", + "lz4", + "lz4_raw", + "gzip(6)", + "zstd(1)", ]; for compression in test_compressions.into_iter() { let df = test_df.clone(); @@ -153,14 +141,12 @@ mod tests { let local_url = Url::parse("file://local").unwrap(); let ctx = &test_df.session_state; ctx.runtime_env().register_object_store(&local_url, local); + let mut options = TableParquetOptions::default(); + options.global.compression = Some(compression.to_string()); df.write_parquet( output_path, DataFrameWriteOptions::new().with_single_file_output(true), - Some( - WriterProperties::builder() - .set_compression(compression) - .build(), - ), + Some(options), ) .await?; @@ -176,7 +162,7 @@ mod tests { let written_compression = parquet_metadata.row_group(0).column(0).compression(); - assert_eq!(written_compression, compression); + assert_eq!(written_compression, parse_compression_string(compression)?); } Ok(()) diff --git a/datafusion/core/src/datasource/file_format/arrow.rs b/datafusion/core/src/datasource/file_format/arrow.rs index 90417a978137..99bfbbad9d10 100644 --- a/datafusion/core/src/datasource/file_format/arrow.rs +++ b/datafusion/core/src/datasource/file_format/arrow.rs @@ -24,40 +24,36 @@ use std::borrow::Cow; use std::fmt::{self, Debug}; use std::sync::Arc; +use super::file_compression_type::FileCompressionType; +use super::write::demux::start_demuxer_task; +use super::write::{create_writer, SharedBuffer}; use crate::datasource::file_format::FileFormat; use crate::datasource::physical_plan::{ ArrowExec, FileGroupDisplay, FileScanConfig, FileSinkConfig, }; use crate::error::Result; use crate::execution::context::SessionState; -use crate::physical_plan::ExecutionPlan; +use crate::physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan}; use arrow::ipc::convert::fb_to_schema; use arrow::ipc::reader::FileReader; -use arrow::ipc::root_as_message; -use arrow_ipc::writer::IpcWriteOptions; -use arrow_ipc::CompressionType; +use arrow::ipc::writer::IpcWriteOptions; +use arrow::ipc::{root_as_message, CompressionType}; use arrow_schema::{ArrowError, Schema, SchemaRef}; - -use bytes::Bytes; use 
datafusion_common::{not_impl_err, DataFusionError, FileType, Statistics}; use datafusion_execution::{SendableRecordBatchStream, TaskContext}; use datafusion_physical_expr::{PhysicalExpr, PhysicalSortRequirement}; - -use crate::physical_plan::{DisplayAs, DisplayFormatType}; -use async_trait::async_trait; use datafusion_physical_plan::insert::{DataSink, FileSinkExec}; use datafusion_physical_plan::metrics::MetricsSet; + +use async_trait::async_trait; +use bytes::Bytes; use futures::stream::BoxStream; use futures::StreamExt; use object_store::{GetResultPayload, ObjectMeta, ObjectStore}; use tokio::io::AsyncWriteExt; use tokio::task::JoinSet; -use super::file_compression_type::FileCompressionType; -use super::write::demux::start_demuxer_task; -use super::write::{create_writer, SharedBuffer}; - /// Initial writing buffer size. Note this is just a size hint for efficiency. It /// will grow beyond the set value if needed. const INITIAL_BUFFER_BYTES: usize = 1048576; @@ -215,11 +211,6 @@ impl DataSink for ArrowFileSink { data: SendableRecordBatchStream, context: &Arc, ) -> Result { - // No props are supported yet, but can be by updating FileTypeWriterOptions - // to populate this struct and use those options to initialize the arrow_ipc::writer::FileWriter - // https://github.com/apache/arrow-datafusion/issues/8635 - let _arrow_props = self.config.file_type_writer_options.try_into_arrow()?; - let object_store = context .runtime_env() .object_store(&self.config.object_store_url)?; @@ -390,12 +381,11 @@ async fn collect_at_least_n_bytes( #[cfg(test)] mod tests { - use chrono::DateTime; - use object_store::{chunked::ChunkedStore, memory::InMemory, path::Path}; - + use super::*; use crate::execution::context::SessionContext; - use super::*; + use chrono::DateTime; + use object_store::{chunked::ChunkedStore, memory::InMemory, path::Path}; #[tokio::test] async fn test_infer_schema_stream() -> Result<()> { diff --git a/datafusion/core/src/datasource/file_format/csv.rs b/datafusion/core/src/datasource/file_format/csv.rs index 9cae6675e825..a7849258329b 100644 --- a/datafusion/core/src/datasource/file_format/csv.rs +++ b/datafusion/core/src/datasource/file_format/csv.rs @@ -23,7 +23,7 @@ use std::fmt::{self, Debug}; use std::sync::Arc; use super::write::orchestration::stateless_multipart_put; -use super::{FileFormat, DEFAULT_SCHEMA_INFER_MAX_RECORD}; +use super::FileFormat; use crate::datasource::file_format::file_compression_type::FileCompressionType; use crate::datasource::file_format::write::BatchSerializer; use crate::datasource::physical_plan::{ @@ -39,6 +39,8 @@ use arrow::array::RecordBatch; use arrow::csv::WriterBuilder; use arrow::datatypes::{DataType, Field, Fields, Schema}; use arrow::{self, datatypes::SchemaRef}; +use datafusion_common::config::CsvOptions; +use datafusion_common::file_options::csv_writer::CsvWriterOptions; use datafusion_common::{exec_err, not_impl_err, DataFusionError, FileType}; use datafusion_execution::TaskContext; use datafusion_physical_expr::{PhysicalExpr, PhysicalSortRequirement}; @@ -51,27 +53,9 @@ use futures::{pin_mut, Stream, StreamExt, TryStreamExt}; use object_store::{delimited::newline_delimited_stream, ObjectMeta, ObjectStore}; /// Character Separated Value `FileFormat` implementation. 
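
Tying together the `write_parquet` changes shown above: parquet writer settings, including compression, are now expressed as a `TableParquetOptions` whose string values go through `parse_compression_string`, rather than as a prebuilt parquet `WriterProperties`. A minimal sketch mirroring the rewritten `write_parquet_with_compression` test, with assumed paths and placeholder file names.

```rust
use datafusion::dataframe::DataFrameWriteOptions;
use datafusion::error::Result;
use datafusion::prelude::{ParquetReadOptions, SessionContext};
use datafusion_common::config::TableParquetOptions;

#[tokio::main]
async fn main() -> Result<()> {
    let ctx = SessionContext::new();
    // `input.parquet` is a placeholder input path.
    let df = ctx
        .read_parquet("input.parquet", ParquetReadOptions::default())
        .await?;

    // Compression is a string setting, parsed the same way as the session-level option.
    let mut parquet_options = TableParquetOptions::default();
    parquet_options.global.compression = Some("zstd(3)".to_string());

    df.write_parquet(
        "out/",
        DataFrameWriteOptions::new().with_single_file_output(true),
        Some(parquet_options),
    )
    .await?;
    Ok(())
}
```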
-#[derive(Debug)] +#[derive(Debug, Default)] pub struct CsvFormat { - has_header: bool, - delimiter: u8, - quote: u8, - escape: Option, - schema_infer_max_rec: Option, - file_compression_type: FileCompressionType, -} - -impl Default for CsvFormat { - fn default() -> Self { - Self { - schema_infer_max_rec: Some(DEFAULT_SCHEMA_INFER_MAX_RECORD), - has_header: true, - delimiter: b',', - quote: b'"', - escape: None, - file_compression_type: FileCompressionType::UNCOMPRESSED, - } - } + options: CsvOptions, } impl CsvFormat { @@ -110,7 +94,7 @@ impl CsvFormat { &self, stream: BoxStream<'static, Result>, ) -> BoxStream<'static, Result> { - let file_compression_type = self.file_compression_type.to_owned(); + let file_compression_type: FileCompressionType = self.options.compression.into(); let decoder = file_compression_type.convert_stream(stream); let steam = match decoder { Ok(decoded_stream) => { @@ -131,43 +115,54 @@ impl CsvFormat { steam.boxed() } + /// Set the csv options + pub fn with_options(mut self, options: CsvOptions) -> Self { + self.options = options; + self + } + + /// Retrieve the csv options + pub fn options(&self) -> &CsvOptions { + &self.options + } + /// Set a limit in terms of records to scan to infer the schema /// - default to `DEFAULT_SCHEMA_INFER_MAX_RECORD` - pub fn with_schema_infer_max_rec(mut self, max_rec: Option) -> Self { - self.schema_infer_max_rec = max_rec; + pub fn with_schema_infer_max_rec(mut self, max_rec: usize) -> Self { + self.options.schema_infer_max_rec = max_rec; self } /// Set true to indicate that the first line is a header. /// - default to true pub fn with_has_header(mut self, has_header: bool) -> Self { - self.has_header = has_header; + self.options.has_header = has_header; self } /// True if the first line is a header. pub fn has_header(&self) -> bool { - self.has_header + self.options.has_header } /// The character separating values within a row. /// - default to ',' pub fn with_delimiter(mut self, delimiter: u8) -> Self { - self.delimiter = delimiter; + self.options.delimiter = delimiter; self } /// The quote character in a row. /// - default to '"' pub fn with_quote(mut self, quote: u8) -> Self { - self.quote = quote; + self.options.quote = quote; self } /// The escape character in a row. /// - default is None pub fn with_escape(mut self, escape: Option) -> Self { - self.escape = escape; + self.options.escape = escape; self } @@ -177,23 +172,23 @@ impl CsvFormat { mut self, file_compression_type: FileCompressionType, ) -> Self { - self.file_compression_type = file_compression_type; + self.options.compression = file_compression_type.into(); self } /// The delimiter character. pub fn delimiter(&self) -> u8 { - self.delimiter + self.options.delimiter } /// The quote character. pub fn quote(&self) -> u8 { - self.quote + self.options.quote } /// The escape character. 
pub fn escape(&self) -> Option { - self.escape + self.options.escape } } @@ -211,7 +206,7 @@ impl FileFormat for CsvFormat { ) -> Result { let mut schemas = vec![]; - let mut records_to_read = self.schema_infer_max_rec.unwrap_or(usize::MAX); + let mut records_to_read = self.options.schema_infer_max_rec; for object in objects { let stream = self.read_to_delimited_chunks(store, object).await; @@ -247,11 +242,11 @@ impl FileFormat for CsvFormat { ) -> Result> { let exec = CsvExec::new( conf, - self.has_header, - self.delimiter, - self.quote, - self.escape, - self.file_compression_type.to_owned(), + self.options.has_header, + self.options.delimiter, + self.options.quote, + self.options.escape, + self.options.compression.into(), ); Ok(Arc::new(exec)) } @@ -267,12 +262,10 @@ impl FileFormat for CsvFormat { return not_impl_err!("Overwrites are not implemented yet for CSV"); } - if self.file_compression_type != FileCompressionType::UNCOMPRESSED { - return not_impl_err!("Inserting compressed CSV is not implemented yet."); - } + let writer_options = CsvWriterOptions::try_from(&self.options)?; let sink_schema = conf.output_schema().clone(); - let sink = Arc::new(CsvSink::new(conf)); + let sink = Arc::new(CsvSink::new(conf, writer_options)); Ok(Arc::new(FileSinkExec::new( input, @@ -305,8 +298,8 @@ impl CsvFormat { while let Some(chunk) = stream.next().await.transpose()? { let format = arrow::csv::reader::Format::default() - .with_header(self.has_header && first_chunk) - .with_delimiter(self.delimiter); + .with_header(self.options.has_header && first_chunk) + .with_delimiter(self.options.delimiter); let (Schema { fields, .. }, records_read) = format.infer_schema(chunk.reader(), Some(records_to_read))?; @@ -439,6 +432,7 @@ impl BatchSerializer for CsvSerializer { pub struct CsvSink { /// Config options for writing data config: FileSinkConfig, + writer_options: CsvWriterOptions, } impl Debug for CsvSink { @@ -461,8 +455,11 @@ impl DisplayAs for CsvSink { impl CsvSink { /// Create from config. - pub fn new(config: FileSinkConfig) -> Self { - Self { config } + pub fn new(config: FileSinkConfig, writer_options: CsvWriterOptions) -> Self { + Self { + config, + writer_options, + } } /// Retrieve the inner [`FileSinkConfig`]. 
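For orientation, a minimal sketch (not part of this patch) of how the options-backed CsvFormat builder introduced above might be used; the CsvOptions field names follow the diff, while the import paths and the wrapper function are illustrative assumptions.

use datafusion::datasource::file_format::csv::CsvFormat;
use datafusion_common::config::CsvOptions;

fn example_csv_format() -> CsvFormat {
    // Start from a CsvOptions value (for example one taken from the session's
    // TableOptions) and then adjust individual settings; the builder methods
    // now write into the same embedded options struct.
    let mut csv_opts = CsvOptions::default();
    csv_opts.has_header = false;
    csv_opts.delimiter = b'|';
    CsvFormat::default()
        .with_options(csv_opts)
        .with_schema_infer_max_rec(100)
}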
@@ -475,11 +472,10 @@ impl CsvSink { data: SendableRecordBatchStream, context: &Arc, ) -> Result { - let writer_options = self.config.file_type_writer_options.try_into_csv()?; - let builder = &writer_options.writer_options; + let builder = &self.writer_options.writer_options; let builder_clone = builder.clone(); - let options_clone = writer_options.clone(); + let options_clone = self.writer_options.clone(); let get_serializer = move || { Arc::new( CsvSerializer::new() @@ -494,10 +490,15 @@ impl CsvSink { "csv".into(), Box::new(get_serializer), &self.config, - writer_options.compression.into(), + self.writer_options.compression.into(), ) .await } + + /// Retrieve the writer options + pub fn writer_options(&self) -> &CsvWriterOptions { + &self.writer_options + } } #[async_trait] @@ -668,11 +669,9 @@ mod tests { }; let num_rows_to_read = 100; - let csv_format = CsvFormat { - has_header: false, - schema_infer_max_rec: Some(num_rows_to_read), - ..Default::default() - }; + let csv_format = CsvFormat::default() + .with_has_header(false) + .with_schema_infer_max_rec(num_rows_to_read); let inferred_schema = csv_format .infer_schema( &state, @@ -723,7 +722,7 @@ mod tests { let path = Path::from("csv/aggregate_test_100.csv"); let csv = CsvFormat::default().with_has_header(true); - let records_to_read = csv.schema_infer_max_rec.unwrap_or(usize::MAX); + let records_to_read = csv.options().schema_infer_max_rec; let store = Arc::new(integration) as Arc; let original_stream = store.get(&path).await?; diff --git a/datafusion/core/src/datasource/file_format/file_compression_type.rs b/datafusion/core/src/datasource/file_format/file_compression_type.rs index 48094eede87b..c538819e2684 100644 --- a/datafusion/core/src/datasource/file_format/file_compression_type.rs +++ b/datafusion/core/src/datasource/file_format/file_compression_type.rs @@ -17,7 +17,13 @@ //! 
File Compression type abstraction +use std::str::FromStr; + use crate::error::{DataFusionError, Result}; + +use datafusion_common::parsers::CompressionTypeVariant::{self, *}; +use datafusion_common::{FileType, GetExt}; + #[cfg(feature = "compression")] use async_compression::tokio::bufread::{ BzDecoder as AsyncBzDecoder, BzEncoder as AsyncBzEncoder, @@ -31,15 +37,12 @@ use async_compression::tokio::write::{BzEncoder, GzipEncoder, XzEncoder, ZstdEnc use bytes::Bytes; #[cfg(feature = "compression")] use bzip2::read::MultiBzDecoder; -use datafusion_common::{parsers::CompressionTypeVariant, FileType, GetExt}; #[cfg(feature = "compression")] use flate2::read::MultiGzDecoder; - use futures::stream::BoxStream; use futures::StreamExt; #[cfg(feature = "compression")] use futures::TryStreamExt; -use std::str::FromStr; use tokio::io::AsyncWrite; #[cfg(feature = "compression")] use tokio_util::io::{ReaderStream, StreamReader}; @@ -47,7 +50,6 @@ use tokio_util::io::{ReaderStream, StreamReader}; use xz2::read::XzDecoder; #[cfg(feature = "compression")] use zstd::Decoder as ZstdDecoder; -use CompressionTypeVariant::*; /// Readable file compression type #[derive(Debug, Clone, Copy, PartialEq, Eq)] @@ -73,6 +75,12 @@ impl From for FileCompressionType { } } +impl From for CompressionTypeVariant { + fn from(t: FileCompressionType) -> Self { + t.variant + } +} + impl FromStr for FileCompressionType { type Err = DataFusionError; @@ -261,14 +269,17 @@ impl FileTypeExt for FileType { #[cfg(test)] mod tests { + use std::str::FromStr; + use crate::datasource::file_format::file_compression_type::{ FileCompressionType, FileTypeExt, }; use crate::error::DataFusionError; - use bytes::Bytes; + use datafusion_common::file_options::file_type::FileType; + + use bytes::Bytes; use futures::StreamExt; - use std::str::FromStr; #[test] fn get_ext_with_compression() { diff --git a/datafusion/core/src/datasource/file_format/json.rs b/datafusion/core/src/datasource/file_format/json.rs index 121fe5e8dcb1..0cc38bbb5554 100644 --- a/datafusion/core/src/datasource/file_format/json.rs +++ b/datafusion/core/src/datasource/file_format/json.rs @@ -27,7 +27,6 @@ use super::write::orchestration::stateless_multipart_put; use super::{FileFormat, FileScanConfig}; use crate::datasource::file_format::file_compression_type::FileCompressionType; use crate::datasource::file_format::write::BatchSerializer; -use crate::datasource::file_format::DEFAULT_SCHEMA_INFER_MAX_RECORD; use crate::datasource::physical_plan::FileGroupDisplay; use crate::datasource::physical_plan::{FileSinkConfig, NdJsonExec}; use crate::error::Result; @@ -42,6 +41,8 @@ use arrow::datatypes::SchemaRef; use arrow::json; use arrow::json::reader::{infer_json_schema_from_iterator, ValueIter}; use arrow_array::RecordBatch; +use datafusion_common::config::JsonOptions; +use datafusion_common::file_options::json_writer::JsonWriterOptions; use datafusion_common::{not_impl_err, FileType}; use datafusion_execution::TaskContext; use datafusion_physical_expr::{PhysicalExpr, PhysicalSortRequirement}; @@ -53,26 +54,27 @@ use bytes::{Buf, Bytes}; use object_store::{GetResultPayload, ObjectMeta, ObjectStore}; /// New line delimited JSON `FileFormat` implementation. 
-#[derive(Debug)] +#[derive(Debug, Default)] pub struct JsonFormat { - schema_infer_max_rec: Option, - file_compression_type: FileCompressionType, + options: JsonOptions, } -impl Default for JsonFormat { - fn default() -> Self { - Self { - schema_infer_max_rec: Some(DEFAULT_SCHEMA_INFER_MAX_RECORD), - file_compression_type: FileCompressionType::UNCOMPRESSED, - } +impl JsonFormat { + /// Set JSON options + pub fn with_options(mut self, options: JsonOptions) -> Self { + self.options = options; + self + } + + /// Retrieve JSON options + pub fn options(&self) -> &JsonOptions { + &self.options } -} -impl JsonFormat { /// Set a limit in terms of records to scan to infer the schema /// - defaults to `DEFAULT_SCHEMA_INFER_MAX_RECORD` - pub fn with_schema_infer_max_rec(mut self, max_rec: Option) -> Self { - self.schema_infer_max_rec = max_rec; + pub fn with_schema_infer_max_rec(mut self, max_rec: usize) -> Self { + self.options.schema_infer_max_rec = max_rec; self } @@ -82,7 +84,7 @@ impl JsonFormat { mut self, file_compression_type: FileCompressionType, ) -> Self { - self.file_compression_type = file_compression_type; + self.options.compression = file_compression_type.into(); self } } @@ -100,8 +102,8 @@ impl FileFormat for JsonFormat { objects: &[ObjectMeta], ) -> Result { let mut schemas = Vec::new(); - let mut records_to_read = self.schema_infer_max_rec.unwrap_or(usize::MAX); - let file_compression_type = self.file_compression_type.to_owned(); + let mut records_to_read = self.options.schema_infer_max_rec; + let file_compression_type = FileCompressionType::from(self.options.compression); for object in objects { let mut take_while = || { let should_take = records_to_read > 0; @@ -154,7 +156,8 @@ impl FileFormat for JsonFormat { conf: FileScanConfig, _filters: Option<&Arc>, ) -> Result> { - let exec = NdJsonExec::new(conf, self.file_compression_type.to_owned()); + let exec = + NdJsonExec::new(conf, FileCompressionType::from(self.options.compression)); Ok(Arc::new(exec)) } @@ -169,11 +172,10 @@ impl FileFormat for JsonFormat { return not_impl_err!("Overwrites are not implemented yet for Json"); } - if self.file_compression_type != FileCompressionType::UNCOMPRESSED { - return not_impl_err!("Inserting compressed JSON is not implemented yet."); - } + let writer_options = JsonWriterOptions::try_from(&self.options)?; + let sink_schema = conf.output_schema().clone(); - let sink = Arc::new(JsonSink::new(conf)); + let sink = Arc::new(JsonSink::new(conf, writer_options)); Ok(Arc::new(FileSinkExec::new( input, @@ -217,6 +219,8 @@ impl BatchSerializer for JsonSerializer { pub struct JsonSink { /// Config options for writing data config: FileSinkConfig, + /// + writer_options: JsonWriterOptions, } impl Debug for JsonSink { @@ -239,8 +243,11 @@ impl DisplayAs for JsonSink { impl JsonSink { /// Create from config. - pub fn new(config: FileSinkConfig) -> Self { - Self { config } + pub fn new(config: FileSinkConfig, writer_options: JsonWriterOptions) -> Self { + Self { + config, + writer_options, + } } /// Retrieve the inner [`FileSinkConfig`]. 
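By analogy, a small sketch (again not part of this patch) of the JsonFormat equivalent; the JsonOptions fields mirror the diff, while the compression variant and import paths are assumptions.

use datafusion::datasource::file_format::json::JsonFormat;
use datafusion_common::config::JsonOptions;
use datafusion_common::parsers::CompressionTypeVariant;

fn example_json_format() -> JsonFormat {
    // JsonOptions now carries both schema-inference and writer settings, so the
    // compression chosen here reaches the scan as well as the new JsonSink.
    let mut json_opts = JsonOptions::default();
    json_opts.compression = CompressionTypeVariant::GZIP;
    JsonFormat::default()
        .with_options(json_opts)
        .with_schema_infer_max_rec(1000)
}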
@@ -253,9 +260,6 @@ impl JsonSink { data: SendableRecordBatchStream, context: &Arc, ) -> Result { - let writer_options = self.config.file_type_writer_options.try_into_json()?; - let compression = &writer_options.compression; - let get_serializer = move || Arc::new(JsonSerializer::new()) as _; stateless_multipart_put( @@ -264,10 +268,14 @@ impl JsonSink { "json".into(), Box::new(get_serializer), &self.config, - (*compression).into(), + self.writer_options.compression.into(), ) .await } + /// Retrieve the writer options + pub fn writer_options(&self) -> &JsonWriterOptions { + &self.writer_options + } } #[async_trait] @@ -293,21 +301,22 @@ impl DataSink for JsonSink { #[cfg(test)] mod tests { use super::super::test_util::scan_format; + use super::*; + use crate::execution::options::NdJsonReadOptions; + use crate::physical_plan::collect; + use crate::prelude::{SessionConfig, SessionContext}; + use crate::test::object_store::local_unpartitioned_file; + use arrow::util::pretty; use datafusion_common::cast::as_int64_array; use datafusion_common::stats::Precision; use datafusion_common::{assert_batches_eq, internal_err}; + use futures::StreamExt; use object_store::local::LocalFileSystem; use regex::Regex; use rstest::rstest; - use super::*; - use crate::execution::options::NdJsonReadOptions; - use crate::physical_plan::collect; - use crate::prelude::{SessionConfig, SessionContext}; - use crate::test::object_store::local_unpartitioned_file; - #[tokio::test] async fn read_small_batches() -> Result<()> { let config = SessionConfig::new().with_batch_size(2); @@ -413,7 +422,7 @@ mod tests { let ctx = session.state(); let store = Arc::new(LocalFileSystem::new()) as _; let filename = "tests/data/schema_infer_limit.json"; - let format = JsonFormat::default().with_schema_infer_max_rec(Some(3)); + let format = JsonFormat::default().with_schema_infer_max_rec(3); let file_schema = format .infer_schema(&ctx, &store, &[local_unpartitioned_file(filename)]) diff --git a/datafusion/core/src/datasource/file_format/options.rs b/datafusion/core/src/datasource/file_format/options.rs index d389137785ff..f66683c311c1 100644 --- a/datafusion/core/src/datasource/file_format/options.rs +++ b/datafusion/core/src/datasource/file_format/options.rs @@ -19,9 +19,6 @@ use std::sync::Arc; -use arrow::datatypes::{DataType, Schema, SchemaRef}; -use async_trait::async_trait; - use crate::datasource::file_format::arrow::ArrowFormat; use crate::datasource::file_format::file_compression_type::FileCompressionType; #[cfg(feature = "parquet")] @@ -35,11 +32,16 @@ use crate::datasource::{ use crate::error::Result; use crate::execution::context::{SessionConfig, SessionState}; use crate::logical_expr::Expr; + +use arrow::datatypes::{DataType, Schema, SchemaRef}; +use datafusion_common::config::TableOptions; use datafusion_common::{ DEFAULT_ARROW_EXTENSION, DEFAULT_AVRO_EXTENSION, DEFAULT_CSV_EXTENSION, DEFAULT_JSON_EXTENSION, DEFAULT_PARQUET_EXTENSION, }; +use async_trait::async_trait; + /// Options that control the reading of CSV files. /// /// Note this structure is supplied when a datasource is created and @@ -430,7 +432,11 @@ impl<'a> NdJsonReadOptions<'a> { /// ['ReadOptions'] is implemented by Options like ['CsvReadOptions'] that control the reading of respective files/sources. 
pub trait ReadOptions<'a> { /// Helper to convert these user facing options to `ListingTable` options - fn to_listing_options(&self, config: &SessionConfig) -> ListingOptions; + fn to_listing_options( + &self, + config: &SessionConfig, + table_options: TableOptions, + ) -> ListingOptions; /// Infer and resolve the schema from the files/sources provided. async fn get_resolved_schema( @@ -455,7 +461,7 @@ pub trait ReadOptions<'a> { return Ok(Arc::new(s.to_owned())); } - self.to_listing_options(config) + self.to_listing_options(config, state.default_table_options().clone()) .infer_schema(&state, &table_path) .await } @@ -463,13 +469,18 @@ pub trait ReadOptions<'a> { #[async_trait] impl ReadOptions<'_> for CsvReadOptions<'_> { - fn to_listing_options(&self, config: &SessionConfig) -> ListingOptions { + fn to_listing_options( + &self, + config: &SessionConfig, + table_options: TableOptions, + ) -> ListingOptions { let file_format = CsvFormat::default() + .with_options(table_options.csv) .with_has_header(self.has_header) .with_delimiter(self.delimiter) .with_quote(self.quote) .with_escape(self.escape) - .with_schema_infer_max_rec(Some(self.schema_infer_max_records)) + .with_schema_infer_max_rec(self.schema_infer_max_records) .with_file_compression_type(self.file_compression_type.to_owned()); ListingOptions::new(Arc::new(file_format)) @@ -493,10 +504,19 @@ impl ReadOptions<'_> for CsvReadOptions<'_> { #[cfg(feature = "parquet")] #[async_trait] impl ReadOptions<'_> for ParquetReadOptions<'_> { - fn to_listing_options(&self, config: &SessionConfig) -> ListingOptions { - let file_format = ParquetFormat::new() - .with_enable_pruning(self.parquet_pruning) - .with_skip_metadata(self.skip_metadata); + fn to_listing_options( + &self, + config: &SessionConfig, + table_options: TableOptions, + ) -> ListingOptions { + let mut file_format = ParquetFormat::new().with_options(table_options.parquet); + + if let Some(parquet_pruning) = self.parquet_pruning { + file_format = file_format.with_enable_pruning(parquet_pruning) + } + if let Some(skip_metadata) = self.skip_metadata { + file_format = file_format.with_skip_metadata(skip_metadata) + } ListingOptions::new(Arc::new(file_format)) .with_file_extension(self.file_extension) @@ -518,9 +538,14 @@ impl ReadOptions<'_> for ParquetReadOptions<'_> { #[async_trait] impl ReadOptions<'_> for NdJsonReadOptions<'_> { - fn to_listing_options(&self, config: &SessionConfig) -> ListingOptions { + fn to_listing_options( + &self, + config: &SessionConfig, + table_options: TableOptions, + ) -> ListingOptions { let file_format = JsonFormat::default() - .with_schema_infer_max_rec(Some(self.schema_infer_max_records)) + .with_options(table_options.json) + .with_schema_infer_max_rec(self.schema_infer_max_records) .with_file_compression_type(self.file_compression_type.to_owned()); ListingOptions::new(Arc::new(file_format)) @@ -543,7 +568,11 @@ impl ReadOptions<'_> for NdJsonReadOptions<'_> { #[async_trait] impl ReadOptions<'_> for AvroReadOptions<'_> { - fn to_listing_options(&self, config: &SessionConfig) -> ListingOptions { + fn to_listing_options( + &self, + config: &SessionConfig, + _table_options: TableOptions, + ) -> ListingOptions { let file_format = AvroFormat; ListingOptions::new(Arc::new(file_format)) @@ -565,7 +594,11 @@ impl ReadOptions<'_> for AvroReadOptions<'_> { #[async_trait] impl ReadOptions<'_> for ArrowReadOptions<'_> { - fn to_listing_options(&self, config: &SessionConfig) -> ListingOptions { + fn to_listing_options( + &self, + config: &SessionConfig, + 
_table_options: TableOptions, + ) -> ListingOptions { let file_format = ArrowFormat; ListingOptions::new(Arc::new(file_format)) diff --git a/datafusion/core/src/datasource/file_format/parquet.rs b/datafusion/core/src/datasource/file_format/parquet.rs index 3824177cb363..c04c536e7ca6 100644 --- a/datafusion/core/src/datasource/file_format/parquet.rs +++ b/datafusion/core/src/datasource/file_format/parquet.rs @@ -17,56 +17,23 @@ //! [`ParquetFormat`]: Parquet [`FileFormat`] abstractions -use arrow_array::RecordBatch; -use async_trait::async_trait; -use datafusion_common::stats::Precision; -use datafusion_physical_plan::metrics::MetricsSet; -use parquet::arrow::arrow_writer::{ - compute_leaves, get_column_writers, ArrowColumnChunk, ArrowColumnWriter, - ArrowLeafColumn, -}; -use parquet::file::writer::SerializedFileWriter; use std::any::Any; use std::fmt; use std::fmt::Debug; use std::sync::Arc; -use tokio::io::{AsyncWrite, AsyncWriteExt}; -use tokio::sync::mpsc::{self, Receiver, Sender}; -use tokio::task::JoinSet; - -use crate::datasource::file_format::file_compression_type::FileCompressionType; -use crate::datasource::statistics::{create_max_min_accs, get_col_stats}; -use arrow::datatypes::SchemaRef; -use arrow::datatypes::{Fields, Schema}; -use bytes::{BufMut, BytesMut}; -use datafusion_common::{exec_err, not_impl_err, DataFusionError, FileType}; -use datafusion_common_runtime::SpawnedTask; -use datafusion_execution::TaskContext; -use datafusion_physical_expr::{PhysicalExpr, PhysicalSortRequirement}; -use futures::{StreamExt, TryStreamExt}; -use hashbrown::HashMap; -use object_store::path::Path; -use object_store::{ObjectMeta, ObjectStore}; -use parquet::arrow::{ - arrow_to_parquet_schema, parquet_to_arrow_schema, AsyncArrowWriter, -}; -use parquet::file::footer::{decode_footer, decode_metadata}; -use parquet::file::metadata::ParquetMetaData; -use parquet::file::properties::WriterProperties; -use parquet::file::statistics::Statistics as ParquetStatistics; use super::write::demux::start_demuxer_task; use super::write::{create_writer, AbortableWrite, SharedBuffer}; use super::{FileFormat, FileScanConfig}; use crate::arrow::array::{ - BooleanArray, Float32Array, Float64Array, Int32Array, Int64Array, + BooleanArray, Float32Array, Float64Array, Int32Array, Int64Array, RecordBatch, }; -use crate::arrow::datatypes::DataType; -use crate::config::ConfigOptions; - +use crate::arrow::datatypes::{DataType, Fields, Schema, SchemaRef}; +use crate::datasource::file_format::file_compression_type::FileCompressionType; use crate::datasource::physical_plan::{ FileGroupDisplay, FileSinkConfig, ParquetExec, SchemaAdapter, }; +use crate::datasource::statistics::{create_max_min_accs, get_col_stats}; use crate::error::Result; use crate::execution::context::SessionState; use crate::physical_plan::expressions::{MaxAccumulator, MinAccumulator}; @@ -76,6 +43,41 @@ use crate::physical_plan::{ Statistics, }; +use datafusion_common::config::TableParquetOptions; +use datafusion_common::file_options::parquet_writer::ParquetWriterOptions; +use datafusion_common::stats::Precision; +use datafusion_common::{ + exec_err, internal_datafusion_err, not_impl_err, DataFusionError, FileType, +}; +use datafusion_common_runtime::SpawnedTask; +use datafusion_execution::TaskContext; +use datafusion_physical_expr::{PhysicalExpr, PhysicalSortRequirement}; +use datafusion_physical_plan::metrics::MetricsSet; + +use async_trait::async_trait; +use bytes::{BufMut, BytesMut}; +use parquet::arrow::arrow_writer::{ + compute_leaves, 
get_column_writers, ArrowColumnChunk, ArrowColumnWriter, + ArrowLeafColumn, +}; +use parquet::arrow::{ + arrow_to_parquet_schema, parquet_to_arrow_schema, AsyncArrowWriter, +}; +use parquet::file::footer::{decode_footer, decode_metadata}; +use parquet::file::metadata::ParquetMetaData; +use parquet::file::properties::WriterProperties; +use parquet::file::statistics::Statistics as ParquetStatistics; +use parquet::file::writer::SerializedFileWriter; +use parquet::format::FileMetaData; +use tokio::io::{AsyncWrite, AsyncWriteExt}; +use tokio::sync::mpsc::{self, Receiver, Sender}; +use tokio::task::JoinSet; + +use futures::{StreamExt, TryStreamExt}; +use hashbrown::HashMap; +use object_store::path::Path; +use object_store::{ObjectMeta, ObjectStore}; + /// Size of the buffer for [`AsyncArrowWriter`]. const PARQUET_WRITER_BUFFER_SIZE: usize = 10485760; @@ -88,20 +90,9 @@ const INITIAL_BUFFER_BYTES: usize = 1048576; const BUFFER_FLUSH_BYTES: usize = 1024000; /// The Apache Parquet `FileFormat` implementation -/// -/// Note it is recommended these are instead configured on the [`ConfigOptions`] -/// associated with the [`SessionState`] instead of overridden on a format-basis -/// -/// TODO: Deprecate and remove overrides -/// #[derive(Debug, Default)] pub struct ParquetFormat { - /// Override the global setting for `enable_pruning` - enable_pruning: Option, - /// Override the global setting for `metadata_size_hint` - metadata_size_hint: Option, - /// Override the global setting for `skip_metadata` - skip_metadata: Option, + options: TableParquetOptions, } impl ParquetFormat { @@ -112,15 +103,14 @@ impl ParquetFormat { /// Activate statistics based row group level pruning /// - If `None`, defaults to value on `config_options` - pub fn with_enable_pruning(mut self, enable: Option) -> Self { - self.enable_pruning = enable; + pub fn with_enable_pruning(mut self, enable: bool) -> Self { + self.options.global.pruning = enable; self } /// Return `true` if pruning is enabled - pub fn enable_pruning(&self, config_options: &ConfigOptions) -> bool { - self.enable_pruning - .unwrap_or(config_options.execution.parquet.pruning) + pub fn enable_pruning(&self) -> bool { + self.options.global.pruning } /// Provide a hint to the size of the file metadata. If a hint is provided @@ -130,14 +120,13 @@ impl ParquetFormat { /// /// - If `None`, defaults to value on `config_options` pub fn with_metadata_size_hint(mut self, size_hint: Option) -> Self { - self.metadata_size_hint = size_hint; + self.options.global.metadata_size_hint = size_hint; self } /// Return the metadata size hint if set - pub fn metadata_size_hint(&self, config_options: &ConfigOptions) -> Option { - let hint = config_options.execution.parquet.metadata_size_hint; - self.metadata_size_hint.or(hint) + pub fn metadata_size_hint(&self) -> Option { + self.options.global.metadata_size_hint } /// Tell the parquet reader to skip any metadata that may be in @@ -145,16 +134,26 @@ impl ParquetFormat { /// metadata. /// /// - If `None`, defaults to value on `config_options` - pub fn with_skip_metadata(mut self, skip_metadata: Option) -> Self { - self.skip_metadata = skip_metadata; + pub fn with_skip_metadata(mut self, skip_metadata: bool) -> Self { + self.options.global.skip_metadata = skip_metadata; self } /// Returns `true` if schema metadata will be cleared prior to /// schema merging. 
- pub fn skip_metadata(&self, config_options: &ConfigOptions) -> bool { - self.skip_metadata - .unwrap_or(config_options.execution.parquet.skip_metadata) + pub fn skip_metadata(&self) -> bool { + self.options.global.skip_metadata + } + + /// Set Parquet options for the ParquetFormat + pub fn with_options(mut self, options: TableParquetOptions) -> Self { + self.options = options; + self + } + + /// Parquet options + pub fn options(&self) -> &TableParquetOptions { + &self.options } } @@ -202,7 +201,7 @@ impl FileFormat for ParquetFormat { fetch_schema_with_location( store.as_ref(), object, - self.metadata_size_hint, + self.metadata_size_hint(), ) }) .boxed() // Workaround https://github.com/rust-lang/rust/issues/64552 @@ -223,7 +222,7 @@ impl FileFormat for ParquetFormat { .map(|(_, schema)| schema) .collect::>(); - let schema = if self.skip_metadata(state.config_options()) { + let schema = if self.skip_metadata() { Schema::try_merge(clear_metadata(schemas)) } else { Schema::try_merge(schemas) @@ -243,7 +242,7 @@ impl FileFormat for ParquetFormat { store.as_ref(), table_schema, object, - self.metadata_size_hint, + self.metadata_size_hint(), ) .await?; Ok(stats) @@ -251,22 +250,20 @@ impl FileFormat for ParquetFormat { async fn create_physical_plan( &self, - state: &SessionState, + _state: &SessionState, conf: FileScanConfig, filters: Option<&Arc>, ) -> Result> { // If enable pruning then combine the filters to build the predicate. // If disable pruning then set the predicate to None, thus readers // will not prune data based on the statistics. - let predicate = self - .enable_pruning(state.config_options()) - .then(|| filters.cloned()) - .flatten(); + let predicate = self.enable_pruning().then(|| filters.cloned()).flatten(); Ok(Arc::new(ParquetExec::new( conf, predicate, - self.metadata_size_hint(state.config_options()), + self.metadata_size_hint(), + self.options.clone(), ))) } @@ -282,7 +279,7 @@ impl FileFormat for ParquetFormat { } let sink_schema = conf.output_schema().clone(); - let sink = Arc::new(ParquetSink::new(conf)); + let sink = Arc::new(ParquetSink::new(conf, self.options.clone())); Ok(Arc::new(FileSinkExec::new( input, @@ -541,6 +538,11 @@ async fn fetch_statistics( pub struct ParquetSink { /// Config options for writing data config: FileSinkConfig, + /// + parquet_options: TableParquetOptions, + /// File metadata from successfully produced parquet files. The Mutex is only used + /// to allow inserting to HashMap from behind borrowed reference in DataSink::write_all. + written: Arc>>, } impl Debug for ParquetSink { @@ -563,14 +565,25 @@ impl DisplayAs for ParquetSink { impl ParquetSink { /// Create from config. - pub fn new(config: FileSinkConfig) -> Self { - Self { config } + pub fn new(config: FileSinkConfig, parquet_options: TableParquetOptions) -> Self { + Self { + config, + parquet_options, + written: Default::default(), + } } /// Retrieve the inner [`FileSinkConfig`]. pub fn config(&self) -> &FileSinkConfig { &self.config } + + /// Retrieve the file metadata for the written files, keyed to the path + /// which may be partitioned (in the case of hive style partitioning). + pub fn written(&self) -> HashMap { + self.written.lock().clone() + } + /// Converts table schema to writer schema, which may differ in the case /// of hive style partitioning where some columns are removed from the /// underlying files. 
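As a rough illustration (not part of this patch) of the TableParquetOptions plumbing described above: method and field names follow the diff, while the import paths and the metadata hint value are assumptions.

use datafusion::datasource::file_format::parquet::ParquetFormat;
use datafusion_common::config::TableParquetOptions;

fn example_parquet_format() -> ParquetFormat {
    // The former per-format overrides (pruning, metadata size hint, skip_metadata)
    // now live in TableParquetOptions::global instead of Option fields on the format.
    let mut parquet_opts = TableParquetOptions::default();
    parquet_opts.global.metadata_size_hint = Some(512 * 1024);
    ParquetFormat::default()
        .with_options(parquet_opts)
        .with_enable_pruning(true)
        .with_skip_metadata(false)
}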
@@ -616,8 +629,14 @@ impl ParquetSink { PARQUET_WRITER_BUFFER_SIZE, Some(parquet_props), )?; + Ok(writer) } + + /// Parquet options + pub fn parquet_options(&self) -> &TableParquetOptions { + &self.parquet_options + } } #[async_trait] @@ -635,18 +654,15 @@ impl DataSink for ParquetSink { data: SendableRecordBatchStream, context: &Arc, ) -> Result { - let parquet_props = self - .config - .file_type_writer_options - .try_into_parquet()? - .writer_options(); + let parquet_props = ParquetWriterOptions::try_from(&self.parquet_options)?; let object_store = context .runtime_env() .object_store(&self.config.object_store_url)?; - let parquet_opts = &context.session_config().options().execution.parquet; - let allow_single_file_parallelism = parquet_opts.allow_single_file_parallelism; + let parquet_opts = &self.parquet_options; + let allow_single_file_parallelism = + parquet_opts.global.allow_single_file_parallelism; let part_col = if !self.config.table_partition_cols.is_empty() { Some(self.config.table_partition_cols.clone()) @@ -655,8 +671,11 @@ impl DataSink for ParquetSink { }; let parallel_options = ParallelParquetWriterOptions { - max_parallel_row_groups: parquet_opts.maximum_parallel_row_group_writers, + max_parallel_row_groups: parquet_opts + .global + .maximum_parallel_row_group_writers, max_buffered_record_batches_per_stream: parquet_opts + .global .maximum_buffered_record_batches_per_stream, }; @@ -668,25 +687,28 @@ impl DataSink for ParquetSink { "parquet".into(), ); - let mut file_write_tasks: JoinSet> = - JoinSet::new(); + let mut file_write_tasks: JoinSet< + std::result::Result<(Path, FileMetaData), DataFusionError>, + > = JoinSet::new(); + while let Some((path, mut rx)) = file_stream_rx.recv().await { if !allow_single_file_parallelism { let mut writer = self .create_async_arrow_writer( &path, object_store.clone(), - parquet_props.clone(), + parquet_props.writer_options().clone(), ) .await?; file_write_tasks.spawn(async move { - let mut row_count = 0; while let Some(batch) = rx.recv().await { - row_count += batch.num_rows(); writer.write(&batch).await?; } - writer.close().await?; - Ok(row_count) + let file_metadata = writer + .close() + .await + .map_err(DataFusionError::ParquetError)?; + Ok((path, file_metadata)) }); } else { let writer = create_writer( @@ -701,14 +723,15 @@ impl DataSink for ParquetSink { let props = parquet_props.clone(); let parallel_options_clone = parallel_options.clone(); file_write_tasks.spawn(async move { - output_single_parquet_file_parallelized( + let file_metadata = output_single_parquet_file_parallelized( writer, rx, schema, - &props, + props.writer_options(), parallel_options_clone, ) - .await + .await?; + Ok((path, file_metadata)) }); } } @@ -717,7 +740,13 @@ impl DataSink for ParquetSink { while let Some(result) = file_write_tasks.join_next().await { match result { Ok(r) => { - row_count += r?; + let (path, file_metadata) = r?; + row_count += file_metadata.num_rows; + let mut written_files = self.written.lock(); + written_files + .try_insert(path.clone(), file_metadata) + .map_err(|e| internal_datafusion_err!("duplicate entry detected for partitioned file {path}: {e}"))?; + drop(written_files); } Err(e) => { if e.is_panic() { @@ -919,7 +948,7 @@ async fn concatenate_parallel_row_groups( schema: Arc, writer_props: Arc, mut object_store_writer: AbortableWrite>, -) -> Result { +) -> Result { let merged_buff = SharedBuffer::new(INITIAL_BUFFER_BYTES); let schema_desc = arrow_to_parquet_schema(schema.as_ref())?; @@ -929,13 +958,10 @@ async fn 
concatenate_parallel_row_groups( writer_props, )?; - let mut row_count = 0; - while let Some(task) = serialize_rx.recv().await { let result = task.join_unwind().await; let mut rg_out = parquet_writer.next_row_group()?; - let (serialized_columns, cnt) = result?; - row_count += cnt; + let (serialized_columns, _cnt) = result?; for chunk in serialized_columns { chunk.append_to_row_group(&mut rg_out)?; let mut buff_to_flush = merged_buff.buffer.try_lock().unwrap(); @@ -949,13 +975,13 @@ async fn concatenate_parallel_row_groups( rg_out.close()?; } - let inner_writer = parquet_writer.into_inner()?; - let final_buff = inner_writer.buffer.try_lock().unwrap(); + let file_metadata = parquet_writer.close()?; + let final_buff = merged_buff.buffer.try_lock().unwrap(); object_store_writer.write_all(final_buff.as_slice()).await?; object_store_writer.shutdown().await?; - Ok(row_count) + Ok(file_metadata) } /// Parallelizes the serialization of a single parquet file, by first serializing N @@ -968,7 +994,7 @@ async fn output_single_parquet_file_parallelized( output_schema: Arc, parquet_props: &WriterProperties, parallel_options: ParallelParquetWriterOptions, -) -> Result { +) -> Result { let max_rowgroups = parallel_options.max_parallel_row_groups; // Buffer size of this channel limits maximum number of RowGroups being worked on in parallel let (serialize_tx, serialize_rx) = @@ -982,7 +1008,7 @@ async fn output_single_parquet_file_parallelized( arc_props.clone(), parallel_options, ); - let row_count = concatenate_parallel_row_groups( + let file_metadata = concatenate_parallel_row_groups( serialize_rx, output_schema.clone(), arc_props.clone(), @@ -991,14 +1017,16 @@ async fn output_single_parquet_file_parallelized( .await?; launch_serialization_task.join_unwind().await?; - Ok(row_count) + Ok(file_metadata) } #[cfg(test)] pub(crate) mod test_util { use super::*; use crate::test::object_store::local_unpartitioned_file; + use arrow::record_batch::RecordBatch; + use parquet::arrow::ArrowWriter; use parquet::file::properties::WriterProperties; use tempfile::NamedTempFile; @@ -1077,6 +1105,7 @@ pub(crate) mod test_util { #[cfg(test)] mod tests { use super::super::test_util::scan_format; + use crate::datasource::listing::{ListingTableUrl, PartitionedFile}; use crate::physical_plan::collect; use std::fmt::{Display, Formatter}; use std::sync::atomic::{AtomicUsize, Ordering}; @@ -1088,13 +1117,19 @@ mod tests { use crate::prelude::{SessionConfig, SessionContext}; use arrow::array::{Array, ArrayRef, StringArray}; use arrow::record_batch::RecordBatch; + use arrow_schema::Field; use async_trait::async_trait; use bytes::Bytes; use datafusion_common::cast::{ as_binary_array, as_boolean_array, as_float32_array, as_float64_array, as_int32_array, as_timestamp_nanosecond_array, }; + use datafusion_common::config::ParquetOptions; + use datafusion_common::config::TableParquetOptions; use datafusion_common::ScalarValue; + use datafusion_execution::object_store::ObjectStoreUrl; + use datafusion_execution::runtime_env::RuntimeEnv; + use datafusion_physical_plan::stream::RecordBatchStreamAdapter; use futures::stream::BoxStream; use futures::StreamExt; use log::error; @@ -1789,4 +1824,183 @@ mod tests { let format = ParquetFormat::default(); scan_format(state, &format, &testdata, file_name, projection, limit).await } + + fn build_ctx(store_url: &url::Url) -> Arc { + let tmp_dir = tempfile::TempDir::new().unwrap(); + let local = Arc::new( + LocalFileSystem::new_with_prefix(&tmp_dir) + .expect("should create object store"), + ); + + 
let mut session = SessionConfig::default(); + let mut parquet_opts = ParquetOptions { + allow_single_file_parallelism: true, + ..Default::default() + }; + parquet_opts.allow_single_file_parallelism = true; + session.options_mut().execution.parquet = parquet_opts; + + let runtime = RuntimeEnv::default(); + runtime + .object_store_registry + .register_store(store_url, local); + + Arc::new( + TaskContext::default() + .with_session_config(session) + .with_runtime(Arc::new(runtime)), + ) + } + + #[tokio::test] + async fn parquet_sink_write() -> Result<()> { + let field_a = Field::new("a", DataType::Utf8, false); + let field_b = Field::new("b", DataType::Utf8, false); + let schema = Arc::new(Schema::new(vec![field_a, field_b])); + let object_store_url = ObjectStoreUrl::local_filesystem(); + + let file_sink_config = FileSinkConfig { + object_store_url: object_store_url.clone(), + file_groups: vec![PartitionedFile::new("/tmp".to_string(), 1)], + table_paths: vec![ListingTableUrl::parse("file:///")?], + output_schema: schema.clone(), + table_partition_cols: vec![], + overwrite: true, + }; + let parquet_sink = Arc::new(ParquetSink::new( + file_sink_config, + TableParquetOptions::default(), + )); + + // create data + let col_a: ArrayRef = Arc::new(StringArray::from(vec!["foo", "bar"])); + let col_b: ArrayRef = Arc::new(StringArray::from(vec!["baz", "baz"])); + let batch = RecordBatch::try_from_iter(vec![("a", col_a), ("b", col_b)]).unwrap(); + + // write stream + parquet_sink + .write_all( + Box::pin(RecordBatchStreamAdapter::new( + schema, + futures::stream::iter(vec![Ok(batch)]), + )), + &build_ctx(object_store_url.as_ref()), + ) + .await + .unwrap(); + + // assert written + let mut written = parquet_sink.written(); + let written = written.drain(); + assert_eq!( + written.len(), + 1, + "expected a single parquet file to be written, instead found {}", + written.len() + ); + + // check the file metadata + let ( + path, + FileMetaData { + num_rows, schema, .. 
+ }, + ) = written.take(1).next().unwrap(); + let path_parts = path.parts().collect::<Vec<_>>(); + assert_eq!(path_parts.len(), 1, "should not have path prefix"); + + assert_eq!(num_rows, 2, "file metadata to have 2 rows"); + assert!( + schema.iter().any(|col_schema| col_schema.name == "a"), + "output file metadata should contain col a" + ); + assert!( + schema.iter().any(|col_schema| col_schema.name == "b"), + "output file metadata should contain col b" + ); + + Ok(()) + } + + #[tokio::test] + async fn parquet_sink_write_partitions() -> Result<()> { + let field_a = Field::new("a", DataType::Utf8, false); + let field_b = Field::new("b", DataType::Utf8, false); + let schema = Arc::new(Schema::new(vec![field_a, field_b])); + let object_store_url = ObjectStoreUrl::local_filesystem(); + + // set file config to include partitioning on field_a + let file_sink_config = FileSinkConfig { + object_store_url: object_store_url.clone(), + file_groups: vec![PartitionedFile::new("/tmp".to_string(), 1)], + table_paths: vec![ListingTableUrl::parse("file:///")?], + output_schema: schema.clone(), + table_partition_cols: vec![("a".to_string(), DataType::Utf8)], // add partitioning + overwrite: true, + }; + let parquet_sink = Arc::new(ParquetSink::new( + file_sink_config, + TableParquetOptions::default(), + )); + + // create data with 2 partitions + let col_a: ArrayRef = Arc::new(StringArray::from(vec!["foo", "bar"])); + let col_b: ArrayRef = Arc::new(StringArray::from(vec!["baz", "baz"])); + let batch = RecordBatch::try_from_iter(vec![("a", col_a), ("b", col_b)]).unwrap(); + + // write stream + parquet_sink + .write_all( + Box::pin(RecordBatchStreamAdapter::new( + schema, + futures::stream::iter(vec![Ok(batch)]), + )), + &build_ctx(object_store_url.as_ref()), + ) + .await + .unwrap(); + + // assert written + let mut written = parquet_sink.written(); + let written = written.drain(); + assert_eq!( + written.len(), + 2, + "expected two parquet files to be written, instead found {}", + written.len() + ); + + // check the file metadata includes partitions + let mut expected_partitions = std::collections::HashSet::from(["a=foo", "a=bar"]); + for ( + path, + FileMetaData { + num_rows, schema, .. 
+ }, + ) in written.take(2) + { + let path_parts = path.parts().collect::<Vec<_>>(); + assert_eq!(path_parts.len(), 2, "should have path prefix"); + + let prefix = path_parts[0].as_ref(); + assert!( + expected_partitions.contains(prefix), + "expected path prefix to match partition, instead found {:?}", + prefix + ); + expected_partitions.remove(prefix); + + assert_eq!(num_rows, 1, "file metadata to have 1 row"); + assert!( + !schema.iter().any(|col_schema| col_schema.name == "a"), + "output file metadata will not contain partitioned col a" + ); + assert!( + schema.iter().any(|col_schema| col_schema.name == "b"), + "output file metadata should contain col b" + ); + } + + Ok(()) + } } diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs index 88476ffb0966..2a2551236e1b 100644 --- a/datafusion/core/src/datasource/listing/table.rs +++ b/datafusion/core/src/datasource/listing/table.rs @@ -51,8 +51,7 @@ use crate::{ use arrow::datatypes::{DataType, Field, SchemaBuilder, SchemaRef}; use arrow_schema::Schema; use datafusion_common::{ - internal_err, plan_err, project_schema, Constraints, FileType, FileTypeWriterOptions, - SchemaExt, ToDFSchema, + internal_err, plan_err, project_schema, Constraints, FileType, SchemaExt, ToDFSchema, }; use datafusion_execution::cache::cache_manager::FileStatisticsCache; use datafusion_execution::cache::cache_unit::DefaultFileStatisticsCache; @@ -247,9 +246,6 @@ pub struct ListingOptions { /// multiple equivalent orderings, the outer `Vec` will have a /// single element. pub file_sort_order: Vec<Vec<Expr>>, - /// This setting holds file format specific options which should be used - /// when inserting into this table. - pub file_type_write_options: Option<FileTypeWriterOptions>, } impl ListingOptions { @@ -267,7 +263,6 @@ impl ListingOptions { collect_stat: true, target_partitions: 1, file_sort_order: vec![], - file_type_write_options: None, } } @@ -418,15 +413,6 @@ impl ListingOptions { self } - /// Configure file format specific writing options. - pub fn with_write_options( - mut self, - file_type_write_options: FileTypeWriterOptions, - ) -> Self { - self.file_type_write_options = Some(file_type_write_options); - self - } - /// Infer the schema of the files at the given path on the provided object store. /// The inferred schema does not include the partitioning columns. 
/// @@ -760,15 +746,6 @@ impl TableProvider for ListingTable { .await?; let file_groups = file_list_stream.try_collect::>().await?; - let file_format = self.options().format.as_ref(); - - let file_type_writer_options = match &self.options().file_type_write_options { - Some(opt) => opt.clone(), - None => FileTypeWriterOptions::build_default( - &file_format.file_type(), - state.config_options(), - )?, - }; // Sink related option, apart from format let config = FileSinkConfig { @@ -778,7 +755,6 @@ impl TableProvider for ListingTable { output_schema: self.schema(), table_partition_cols: self.options.table_partition_cols.clone(), overwrite, - file_type_writer_options, }; let unsorted: Vec> = vec![]; diff --git a/datafusion/core/src/datasource/listing_table_factory.rs b/datafusion/core/src/datasource/listing_table_factory.rs index bcf1f81b3a0b..4e126bbba9f9 100644 --- a/datafusion/core/src/datasource/listing_table_factory.rs +++ b/datafusion/core/src/datasource/listing_table_factory.rs @@ -24,8 +24,7 @@ use std::sync::Arc; #[cfg(feature = "parquet")] use crate::datasource::file_format::parquet::ParquetFormat; use crate::datasource::file_format::{ - arrow::ArrowFormat, avro::AvroFormat, csv::CsvFormat, - file_compression_type::FileCompressionType, json::JsonFormat, FileFormat, + arrow::ArrowFormat, avro::AvroFormat, csv::CsvFormat, json::JsonFormat, FileFormat, }; use crate::datasource::listing::{ ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl, @@ -35,7 +34,7 @@ use crate::datasource::TableProvider; use crate::execution::context::SessionState; use arrow::datatypes::{DataType, SchemaRef}; -use datafusion_common::file_options::{FileTypeWriterOptions, StatementOptions}; +use datafusion_common::config::TableOptions; use datafusion_common::{arrow_datafusion_err, DataFusionError, FileType}; use datafusion_expr::CreateExternalTable; @@ -59,34 +58,32 @@ impl TableProviderFactory for ListingTableFactory { state: &SessionState, cmd: &CreateExternalTable, ) -> datafusion_common::Result> { - let file_compression_type = FileCompressionType::from(cmd.file_compression_type); + let mut table_options = + TableOptions::default_from_session_config(state.config_options()); let file_type = FileType::from_str(cmd.file_type.as_str()).map_err(|_| { DataFusionError::Execution(format!("Unknown FileType {}", cmd.file_type)) })?; - + table_options.set_file_format(file_type.clone()); + table_options.alter_with_string_hash_map(&cmd.options)?; let file_extension = get_extension(cmd.location.as_str()); - let file_format: Arc = match file_type { FileType::CSV => { - let mut statement_options = StatementOptions::from(&cmd.options); - let mut csv_format = CsvFormat::default() - .with_has_header(cmd.has_header) - .with_delimiter(cmd.delimiter as u8) - .with_file_compression_type(file_compression_type); - if let Some(quote) = statement_options.take_str_option("quote") { - csv_format = csv_format.with_quote(quote.as_bytes()[0]) - } - if let Some(escape) = statement_options.take_str_option("escape") { - csv_format = csv_format.with_escape(Some(escape.as_bytes()[0])) - } - Arc::new(csv_format) + let mut csv_options = table_options.csv; + csv_options.has_header = cmd.has_header; + csv_options.delimiter = cmd.delimiter as u8; + csv_options.compression = cmd.file_compression_type; + Arc::new(CsvFormat::default().with_options(csv_options)) } #[cfg(feature = "parquet")] - FileType::PARQUET => Arc::new(ParquetFormat::default()), + FileType::PARQUET => { + 
Arc::new(ParquetFormat::default().with_options(table_options.parquet)) + } FileType::AVRO => Arc::new(AvroFormat), - FileType::JSON => Arc::new( - JsonFormat::default().with_file_compression_type(file_compression_type), - ), + FileType::JSON => { + let mut json_options = table_options.json; + json_options.compression = cmd.file_compression_type; + Arc::new(JsonFormat::default().with_options(json_options)) + } FileType::ARROW => Arc::new(ArrowFormat), }; @@ -133,48 +130,6 @@ impl TableProviderFactory for ListingTableFactory { (Some(schema), table_partition_cols) }; - let mut statement_options = StatementOptions::from(&cmd.options); - - statement_options.take_str_option("unbounded"); - - let file_type = file_format.file_type(); - - // Use remaining options and session state to build FileTypeWriterOptions - let file_type_writer_options = FileTypeWriterOptions::build( - &file_type, - state.config_options(), - &statement_options, - )?; - - // Some options have special syntax which takes precedence - // e.g. "WITH HEADER ROW" overrides (header false, ...) - let file_type_writer_options = match file_type { - FileType::CSV => { - let mut csv_writer_options = - file_type_writer_options.try_into_csv()?.clone(); - csv_writer_options.writer_options = csv_writer_options - .writer_options - .with_header(cmd.has_header) - .with_delimiter(cmd.delimiter.try_into().map_err(|_| { - DataFusionError::Internal( - "Unable to convert CSV delimiter into u8".into(), - ) - })?); - csv_writer_options.compression = cmd.file_compression_type; - FileTypeWriterOptions::CSV(csv_writer_options) - } - FileType::JSON => { - let mut json_writer_options = - file_type_writer_options.try_into_json()?.clone(); - json_writer_options.compression = cmd.file_compression_type; - FileTypeWriterOptions::JSON(json_writer_options) - } - #[cfg(feature = "parquet")] - FileType::PARQUET => file_type_writer_options, - FileType::ARROW => file_type_writer_options, - FileType::AVRO => file_type_writer_options, - }; - let table_path = ListingTableUrl::parse(&cmd.location)?; let options = ListingOptions::new(file_format) @@ -182,8 +137,7 @@ impl TableProviderFactory for ListingTableFactory { .with_file_extension(file_extension) .with_target_partitions(state.config().target_partitions()) .with_table_partition_cols(table_partition_cols) - .with_file_sort_order(cmd.order_exprs.clone()) - .with_write_options(file_type_writer_options); + .with_file_sort_order(cmd.order_exprs.clone()); let resolved_schema = match provided_schema { None => options.infer_schema(state, &table_path).await?, @@ -258,4 +212,50 @@ mod tests { let listing_options = listing_table.options(); assert_eq!(".tbl", listing_options.file_extension); } + + #[tokio::test] + async fn test_create_using_non_std_file_ext_csv_options() { + let csv_file = tempfile::Builder::new() + .prefix("foo") + .suffix(".tbl") + .tempfile() + .unwrap(); + + let factory = ListingTableFactory::new(); + let context = SessionContext::new(); + let state = context.state(); + let name = OwnedTableReference::bare("foo".to_string()); + + let mut options = HashMap::new(); + options.insert("csv.schema_infer_max_rec".to_owned(), "1000".to_owned()); + let cmd = CreateExternalTable { + name, + location: csv_file.path().to_str().unwrap().to_string(), + file_type: "csv".to_string(), + has_header: true, + delimiter: ',', + schema: Arc::new(DFSchema::empty()), + table_partition_cols: vec![], + if_not_exists: false, + file_compression_type: CompressionTypeVariant::UNCOMPRESSED, + definition: None, + order_exprs: vec![], + 
unbounded: false, + options, + constraints: Constraints::empty(), + column_defaults: HashMap::new(), + }; + let table_provider = factory.create(&state, &cmd).await.unwrap(); + let listing_table = table_provider + .as_any() + .downcast_ref::() + .unwrap(); + + let format = listing_table.options().format.clone(); + let csv_format = format.as_any().downcast_ref::().unwrap(); + let csv_options = csv_format.options().clone(); + assert_eq!(csv_options.schema_infer_max_rec, 1000); + let listing_options = listing_table.options(); + assert_eq!(".tbl", listing_options.file_extension); + } } diff --git a/datafusion/core/src/datasource/physical_plan/json.rs b/datafusion/core/src/datasource/physical_plan/json.rs index ca466b5c6a92..068426e0fdcb 100644 --- a/datafusion/core/src/datasource/physical_plan/json.rs +++ b/datafusion/core/src/datasource/physical_plan/json.rs @@ -756,7 +756,7 @@ mod tests { let out_dir = tmp_dir.as_ref().to_str().unwrap().to_string() + "/out/"; let out_dir_url = "file://local/out/"; let df = ctx.sql("SELECT a, b FROM test").await?; - df.write_json(out_dir_url, DataFrameWriteOptions::new()) + df.write_json(out_dir_url, DataFrameWriteOptions::new(), None) .await?; // create a new context and verify that the results were saved to a partitioned csv file @@ -850,7 +850,7 @@ mod tests { let df = ctx.read_csv("tests/data/corrupt.csv", options).await?; let out_dir_url = "file://local/out"; let e = df - .write_json(out_dir_url, DataFrameWriteOptions::new()) + .write_json(out_dir_url, DataFrameWriteOptions::new(), None) .await .expect_err("should fail because input file does not match inferred schema"); assert_eq!(e.strip_backtrace(), "Arrow error: Parser error: Error while parsing value d for column 0 at line 4"); diff --git a/datafusion/core/src/datasource/physical_plan/mod.rs b/datafusion/core/src/datasource/physical_plan/mod.rs index 08f1cc9f2726..ddb8d032f3d8 100644 --- a/datafusion/core/src/datasource/physical_plan/mod.rs +++ b/datafusion/core/src/datasource/physical_plan/mod.rs @@ -26,16 +26,16 @@ mod file_stream; mod json; #[cfg(feature = "parquet")] pub mod parquet; -pub use file_groups::FileGroupPartitioner; pub(crate) use self::csv::plan_to_csv; -pub use self::csv::{CsvConfig, CsvExec, CsvOpener}; pub(crate) use self::json::plan_to_json; #[cfg(feature = "parquet")] pub use self::parquet::{ParquetExec, ParquetFileMetrics, ParquetFileReaderFactory}; pub use arrow_file::ArrowExec; pub use avro::AvroExec; +pub use csv::{CsvConfig, CsvExec, CsvOpener}; +pub use file_groups::FileGroupPartitioner; pub use file_scan_config::{ wrap_partition_type_in_dict, wrap_partition_value_in_dict, FileScanConfig, }; @@ -66,7 +66,7 @@ use arrow::{ datatypes::{DataType, Schema, SchemaRef}, record_batch::{RecordBatch, RecordBatchOptions}, }; -use datafusion_common::{file_options::FileTypeWriterOptions, plan_err}; +use datafusion_common::plan_err; use datafusion_physical_expr::expressions::Column; use datafusion_physical_expr::PhysicalSortExpr; @@ -90,8 +90,6 @@ pub struct FileSinkConfig { pub table_partition_cols: Vec<(String, DataType)>, /// Controls whether existing data should be overwritten by this sink pub overwrite: bool, - /// Contains settings specific to writing a given FileType, e.g. 
parquet max_row_group_size - pub file_type_writer_options: FileTypeWriterOptions, } impl FileSinkConfig { diff --git a/datafusion/core/src/datasource/physical_plan/parquet/mod.rs b/datafusion/core/src/datasource/physical_plan/parquet/mod.rs index 2f3b151e7763..2cfbb578da66 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/mod.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/mod.rs @@ -31,7 +31,7 @@ use crate::datasource::physical_plan::{ FileMeta, FileScanConfig, SchemaAdapter, }; use crate::{ - config::ConfigOptions, + config::{ConfigOptions, TableParquetOptions}, datasource::listing::ListingTableUrl, error::{DataFusionError, Result}, execution::context::TaskContext, @@ -73,18 +73,6 @@ pub use metrics::ParquetFileMetrics; /// Execution plan for scanning one or more Parquet partitions #[derive(Debug, Clone)] pub struct ParquetExec { - /// Override for `Self::with_pushdown_filters`. If None, uses - /// values from base_config - pushdown_filters: Option, - /// Override for `Self::with_reorder_filters`. If None, uses - /// values from base_config - reorder_filters: Option, - /// Override for `Self::with_enable_page_index`. If None, uses - /// values from base_config - enable_page_index: Option, - /// Override for `Self::with_enable_bloom_filter`. If None, uses - /// values from base_config - enable_bloom_filter: Option, /// Base configuration for this scan base_config: FileScanConfig, projected_statistics: Statistics, @@ -101,6 +89,8 @@ pub struct ParquetExec { /// Optional user defined parquet file reader factory parquet_file_reader_factory: Option>, cache: PlanProperties, + /// Parquet Options + parquet_options: TableParquetOptions, } impl ParquetExec { @@ -109,6 +99,7 @@ impl ParquetExec { base_config: FileScanConfig, predicate: Option>, metadata_size_hint: Option, + parquet_options: TableParquetOptions, ) -> Self { debug!("Creating ParquetExec, files: {:?}, projection {:?}, predicate: {:?}, limit: {:?}", base_config.file_groups, base_config.projection, predicate, base_config.limit); @@ -154,10 +145,6 @@ impl ParquetExec { &base_config, ); Self { - pushdown_filters: None, - reorder_filters: None, - enable_page_index: None, - enable_bloom_filter: None, base_config, projected_statistics, metrics, @@ -167,6 +154,7 @@ impl ParquetExec { metadata_size_hint, parquet_file_reader_factory: None, cache, + parquet_options, } } @@ -208,14 +196,13 @@ impl ParquetExec { /// /// [`Expr`]: datafusion_expr::Expr pub fn with_pushdown_filters(mut self, pushdown_filters: bool) -> Self { - self.pushdown_filters = Some(pushdown_filters); + self.parquet_options.global.pushdown_filters = pushdown_filters; self } /// Return the value described in [`Self::with_pushdown_filters`] - fn pushdown_filters(&self, config_options: &ConfigOptions) -> bool { - self.pushdown_filters - .unwrap_or(config_options.execution.parquet.pushdown_filters) + fn pushdown_filters(&self) -> bool { + self.parquet_options.global.pushdown_filters } /// If true, the `RowFilter` made by `pushdown_filters` may try to @@ -225,14 +212,13 @@ impl ParquetExec { /// /// [`Expr`]: datafusion_expr::Expr pub fn with_reorder_filters(mut self, reorder_filters: bool) -> Self { - self.reorder_filters = Some(reorder_filters); + self.parquet_options.global.reorder_filters = reorder_filters; self } /// Return the value described in [`Self::with_reorder_filters`] - fn reorder_filters(&self, config_options: &ConfigOptions) -> bool { - self.reorder_filters - .unwrap_or(config_options.execution.parquet.reorder_filters) + fn 
reorder_filters(&self) -> bool { + self.parquet_options.global.reorder_filters } /// If enabled, the reader will read the page index @@ -240,26 +226,24 @@ impl ParquetExec { /// via `RowSelector` and `RowFilter` by /// eliminating unnecessary IO and decoding pub fn with_enable_page_index(mut self, enable_page_index: bool) -> Self { - self.enable_page_index = Some(enable_page_index); + self.parquet_options.global.enable_page_index = enable_page_index; self } /// Return the value described in [`Self::with_enable_page_index`] - fn enable_page_index(&self, config_options: &ConfigOptions) -> bool { - self.enable_page_index - .unwrap_or(config_options.execution.parquet.enable_page_index) + fn enable_page_index(&self) -> bool { + self.parquet_options.global.enable_page_index } /// If enabled, the reader will read by the bloom filter pub fn with_enable_bloom_filter(mut self, enable_bloom_filter: bool) -> Self { - self.enable_bloom_filter = Some(enable_bloom_filter); + self.parquet_options.global.bloom_filter_enabled = enable_bloom_filter; self } /// Return the value described in [`Self::with_enable_bloom_filter`] - fn enable_bloom_filter(&self, config_options: &ConfigOptions) -> bool { - self.enable_bloom_filter - .unwrap_or(config_options.execution.parquet.bloom_filter_enabled) + fn enable_bloom_filter(&self) -> bool { + self.parquet_options.global.bloom_filter_enabled } fn output_partitioning_helper(file_config: &FileScanConfig) -> Partitioning { @@ -397,8 +381,6 @@ impl ExecutionPlan for ParquetExec { }) })?; - let config_options = ctx.session_config().options(); - let opener = ParquetOpener { partition_index, projection: Arc::from(projection), @@ -411,10 +393,10 @@ impl ExecutionPlan for ParquetExec { metadata_size_hint: self.metadata_size_hint, metrics: self.metrics.clone(), parquet_file_reader_factory, - pushdown_filters: self.pushdown_filters(config_options), - reorder_filters: self.reorder_filters(config_options), - enable_page_index: self.enable_page_index(config_options), - enable_bloom_filter: self.enable_bloom_filter(config_options), + pushdown_filters: self.pushdown_filters(), + reorder_filters: self.reorder_filters(), + enable_page_index: self.enable_page_index(), + enable_bloom_filter: self.enable_bloom_filter(), }; let stream = @@ -917,6 +899,7 @@ mod tests { }, predicate, None, + Default::default(), ); if pushdown_predicate { @@ -1573,6 +1556,7 @@ mod tests { }, None, None, + Default::default(), ); assert_eq!( parquet_exec @@ -1693,6 +1677,7 @@ mod tests { }, None, None, + Default::default(), ); assert_eq!( parquet_exec.cache.output_partitioning().partition_count(), @@ -1759,6 +1744,7 @@ mod tests { }, None, None, + Default::default(), ); let mut results = parquet_exec.execute(0, state.task_ctx())?; @@ -2021,7 +2007,7 @@ mod tests { ctx.runtime_env().register_object_store(&local_url, local); // Configure listing options - let file_format = ParquetFormat::default().with_enable_pruning(Some(true)); + let file_format = ParquetFormat::default().with_enable_pruning(true); let listing_options = ListingOptions::new(Arc::new(file_format)) .with_file_extension(FileType::PARQUET.get_ext()); diff --git a/datafusion/core/src/datasource/provider.rs b/datafusion/core/src/datasource/provider.rs index e769084df636..f2e3e907e5ce 100644 --- a/datafusion/core/src/datasource/provider.rs +++ b/datafusion/core/src/datasource/provider.rs @@ -166,6 +166,7 @@ pub trait TableProvider: Sync + Send { /// Tests whether the table provider can make use of any or all filter expressions /// to optimise data 
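A short sketch of the changed `with_enable_pruning` signature, which now takes a plain `bool` instead of `Some(true)`; the extension string stands in for the `FileType::PARQUET.get_ext()` used in the test:

    use std::sync::Arc;
    use datafusion::datasource::file_format::parquet::ParquetFormat;
    use datafusion::datasource::listing::ListingOptions;

    let file_format = ParquetFormat::default().with_enable_pruning(true);
    let listing_options =
        ListingOptions::new(Arc::new(file_format)).with_file_extension(".parquet");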
retrieval. + /// Note: the returned vector much have the same size as the filters argument. #[allow(deprecated)] fn supports_filters_pushdown( &self, diff --git a/datafusion/core/src/execution/context/avro.rs b/datafusion/core/src/execution/context/avro.rs index d60e79862ef2..1eca3b133757 100644 --- a/datafusion/core/src/execution/context/avro.rs +++ b/datafusion/core/src/execution/context/avro.rs @@ -43,7 +43,8 @@ impl SessionContext { table_path: &str, options: AvroReadOptions<'_>, ) -> Result<()> { - let listing_options = options.to_listing_options(&self.copied_config()); + let listing_options = options + .to_listing_options(&self.copied_config(), self.copied_table_options()); self.register_listing_table( name, @@ -60,6 +61,7 @@ impl SessionContext { #[cfg(test)] mod tests { use super::*; + use async_trait::async_trait; // Test for compilation error when calling read_* functions from an #[async_trait] function. diff --git a/datafusion/core/src/execution/context/csv.rs b/datafusion/core/src/execution/context/csv.rs index f3675422c7d5..f59d77664645 100644 --- a/datafusion/core/src/execution/context/csv.rs +++ b/datafusion/core/src/execution/context/csv.rs @@ -59,7 +59,8 @@ impl SessionContext { table_path: &str, options: CsvReadOptions<'_>, ) -> Result<()> { - let listing_options = options.to_listing_options(&self.copied_config()); + let listing_options = options + .to_listing_options(&self.copied_config(), self.copied_table_options()); self.register_listing_table( name, @@ -88,6 +89,7 @@ mod tests { use super::*; use crate::assert_batches_eq; use crate::test_util::{plan_and_collect, populate_csv_partitions}; + use async_trait::async_trait; use tempfile::TempDir; diff --git a/datafusion/core/src/execution/context/json.rs b/datafusion/core/src/execution/context/json.rs index f67693aa8f31..c21e32cfdefb 100644 --- a/datafusion/core/src/execution/context/json.rs +++ b/datafusion/core/src/execution/context/json.rs @@ -45,7 +45,8 @@ impl SessionContext { table_path: &str, options: NdJsonReadOptions<'_>, ) -> Result<()> { - let listing_options = options.to_listing_options(&self.copied_config()); + let listing_options = options + .to_listing_options(&self.copied_config(), self.copied_table_options()); self.register_listing_table( name, diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs index dc4e39d37c5f..8bc65a0ca2cc 100644 --- a/datafusion/core/src/execution/context/mod.rs +++ b/datafusion/core/src/execution/context/mod.rs @@ -17,109 +17,85 @@ //! 
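A hedged sketch of the `supports_filters_pushdown` contract noted above, inside a hypothetical `TableProvider` impl (types as used elsewhere in this module): whatever the provider decides, it returns exactly one entry per input filter:

    fn supports_filters_pushdown(
        &self,
        filters: &[&Expr],
    ) -> Result<Vec<TableProviderFilterPushDown>> {
        // Same length as `filters`; this placeholder declines pushdown for all of them.
        Ok(vec![TableProviderFilterPushDown::Unsupported; filters.len()])
    }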
[`SessionContext`] contains methods for registering data sources and executing queries -mod avro; -mod csv; -mod json; -#[cfg(feature = "parquet")] -mod parquet; +use std::collections::{hash_map::Entry, HashMap, HashSet}; +use std::fmt::Debug; +use std::ops::ControlFlow; +use std::string::String; +use std::sync::{Arc, Weak}; +use super::options::ReadOptions; use crate::{ - catalog::{CatalogProviderList, MemoryCatalogProviderList}, + catalog::information_schema::{InformationSchemaProvider, INFORMATION_SCHEMA}, + catalog::listing_schema::ListingSchemaProvider, + catalog::schema::{MemorySchemaProvider, SchemaProvider}, + catalog::{ + CatalogProvider, CatalogProviderList, MemoryCatalogProvider, + MemoryCatalogProviderList, + }, + config::ConfigOptions, + dataframe::DataFrame, datasource::{ cte_worktable::CteWorkTable, function::{TableFunction, TableFunctionImpl}, - listing::{ListingOptions, ListingTable}, - provider::TableProviderFactory, + listing::{ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl}, + object_store::ObjectStoreUrl, + provider::{DefaultTableFactory, TableProviderFactory}, + }, + datasource::{provider_as_source, MemTable, TableProvider, ViewTable}, + error::{DataFusionError, Result}, + execution::{options::ArrowReadOptions, runtime_env::RuntimeEnv, FunctionRegistry}, + logical_expr::{ + CreateCatalog, CreateCatalogSchema, CreateExternalTable, CreateFunction, + CreateMemoryTable, CreateView, DropCatalogSchema, DropFunction, DropTable, + DropView, Explain, LogicalPlan, LogicalPlanBuilder, PlanType, SetVariable, + TableSource, TableType, ToStringifiedPlan, UNNAMED_TABLE, }, - datasource::{MemTable, ViewTable}, - logical_expr::{PlanType, ToStringifiedPlan}, - optimizer::optimizer::Optimizer, + optimizer::analyzer::{Analyzer, AnalyzerRule}, + optimizer::optimizer::{Optimizer, OptimizerConfig, OptimizerRule}, physical_optimizer::optimizer::{PhysicalOptimizer, PhysicalOptimizerRule}, + physical_plan::{udaf::AggregateUDF, udf::ScalarUDF, ExecutionPlan}, + physical_planner::{DefaultPhysicalPlanner, PhysicalPlanner}, + variable::{VarProvider, VarType}, }; + +use arrow::datatypes::{DataType, SchemaRef}; +use arrow::record_batch::RecordBatch; use arrow_schema::Schema; use datafusion_common::{ alias::AliasGenerator, + config::{ConfigExtension, TableOptions}, exec_err, not_impl_err, plan_datafusion_err, plan_err, tree_node::{TreeNode, TreeNodeRecursion, TreeNodeVisitor}, + OwnedTableReference, SchemaReference, }; use datafusion_execution::registry::SerializerRegistry; -pub use datafusion_expr::execution_props::ExecutionProps; -use datafusion_expr::var_provider::is_system_variables; use datafusion_expr::{ logical_plan::{DdlStatement, Statement}, + var_provider::is_system_variables, Expr, StringifiedPlan, UserDefinedLogicalNode, WindowUDF, }; -use parking_lot::RwLock; -use std::collections::hash_map::Entry; -use std::string::String; -use std::sync::Arc; -use std::{ - collections::{HashMap, HashSet}, - fmt::Debug, -}; -use std::{ops::ControlFlow, sync::Weak}; - -use arrow::datatypes::{DataType, SchemaRef}; -use arrow::record_batch::RecordBatch; - -use crate::catalog::{ - schema::{MemorySchemaProvider, SchemaProvider}, - {CatalogProvider, MemoryCatalogProvider}, -}; -use crate::dataframe::DataFrame; -use crate::datasource::{ - listing::{ListingTableConfig, ListingTableUrl}, - provider_as_source, TableProvider, -}; -use crate::error::{DataFusionError, Result}; -use crate::logical_expr::{ - CreateCatalog, CreateCatalogSchema, CreateExternalTable, CreateFunction, - 
CreateMemoryTable, CreateView, DropCatalogSchema, DropFunction, DropTable, DropView, - Explain, LogicalPlan, LogicalPlanBuilder, SetVariable, TableSource, TableType, - UNNAMED_TABLE, -}; -use crate::optimizer::OptimizerRule; use datafusion_sql::{ - parser::{CopyToSource, CopyToStatement}, - planner::ParserOptions, + parser::{CopyToSource, CopyToStatement, DFParser}, + planner::{object_name_to_table_reference, ContextProvider, ParserOptions, SqlToRel}, ResolvedTableReference, TableReference, }; -use sqlparser::dialect::dialect_from_str; -use crate::config::ConfigOptions; -use crate::execution::{runtime_env::RuntimeEnv, FunctionRegistry}; -use crate::physical_plan::udaf::AggregateUDF; -use crate::physical_plan::udf::ScalarUDF; -use crate::physical_plan::ExecutionPlan; -use crate::physical_planner::DefaultPhysicalPlanner; -use crate::physical_planner::PhysicalPlanner; -use crate::variable::{VarProvider, VarType}; use async_trait::async_trait; use chrono::{DateTime, Utc}; -use datafusion_common::{OwnedTableReference, SchemaReference}; -use datafusion_sql::{ - parser::DFParser, - planner::{ContextProvider, SqlToRel}, -}; +use parking_lot::RwLock; +use sqlparser::dialect::dialect_from_str; use url::Url; - -use crate::catalog::information_schema::{InformationSchemaProvider, INFORMATION_SCHEMA}; -use crate::catalog::listing_schema::ListingSchemaProvider; -use crate::datasource::object_store::ObjectStoreUrl; -use datafusion_optimizer::{ - analyzer::{Analyzer, AnalyzerRule}, - OptimizerConfig, -}; -use datafusion_sql::planner::object_name_to_table_reference; use uuid::Uuid; -// backwards compatibility -use crate::datasource::provider::DefaultTableFactory; -use crate::execution::options::ArrowReadOptions; pub use datafusion_execution::config::SessionConfig; pub use datafusion_execution::TaskContext; +pub use datafusion_expr::execution_props::ExecutionProps; -use super::options::ReadOptions; +mod avro; +mod csv; +mod json; +#[cfg(feature = "parquet")] +mod parquet; /// DataFilePaths adds a method to convert strings and vector of strings to vector of [`ListingTableUrl`] URLs. /// This allows methods such [`SessionContext::read_csv`] and [`SessionContext::read_avro`] @@ -407,6 +383,11 @@ impl SessionContext { self.state.read().config.clone() } + /// Return a copied version of config for this Session + pub fn copied_table_options(&self) -> TableOptions { + self.state.read().default_table_options().clone() + } + /// Creates a [`DataFrame`] from SQL query text. 
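A minimal sketch of the pattern these register_*/read_* paths now share: listing options are resolved from the copied session config plus the session's table options (here using `CsvReadOptions` as a representative read-options type):

    use datafusion::prelude::{CsvReadOptions, SessionContext};

    let ctx = SessionContext::new();
    let options = CsvReadOptions::new();
    let listing_options =
        options.to_listing_options(&ctx.copied_config(), ctx.copied_table_options());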
/// /// Note: This API implements DDL statements such as `CREATE TABLE` and @@ -936,7 +917,8 @@ impl SessionContext { ) -> Result { let table_paths = table_paths.to_urls()?; let session_config = self.copied_config(); - let listing_options = options.to_listing_options(&session_config); + let listing_options = + options.to_listing_options(&session_config, self.copied_table_options()); let option_extension = listing_options.file_extension.clone(); @@ -1073,7 +1055,8 @@ impl SessionContext { table_path: &str, options: ArrowReadOptions<'_>, ) -> Result<()> { - let listing_options = options.to_listing_options(&self.copied_config()); + let listing_options = options + .to_listing_options(&self.copied_config(), self.copied_table_options()); self.register_listing_table( name, @@ -1262,6 +1245,16 @@ impl SessionContext { pub fn register_catalog_list(&mut self, catalog_list: Arc) { self.state.write().catalog_list = catalog_list; } + + /// Registers a [`ConfigExtension`] as a table option extention that can be + /// referenced from SQL statements executed against this context. + pub fn register_table_options_extension(&self, extension: T) { + self.state + .write() + .table_option_namespace + .extensions + .insert(extension) + } } impl FunctionRegistry for SessionContext { @@ -1378,6 +1371,8 @@ pub struct SessionState { serializer_registry: Arc, /// Session configuration config: SessionConfig, + /// Table options + table_option_namespace: TableOptions, /// Execution properties execution_props: ExecutionProps, /// TableProviderFactories for different file formats. @@ -1478,6 +1473,9 @@ impl SessionState { aggregate_functions: HashMap::new(), window_functions: HashMap::new(), serializer_registry: Arc::new(EmptySerializerRegistry), + table_option_namespace: TableOptions::default_from_session_config( + config.options(), + ), config, execution_props: ExecutionProps::new(), runtime_env: runtime, @@ -1662,6 +1660,15 @@ impl SessionState { self } + /// Adds a new [`ConfigExtension`] to TableOptions + pub fn add_table_options_extension( + mut self, + extension: T, + ) -> Self { + self.table_option_namespace.extensions.insert(extension); + self + } + /// Registers a [`FunctionFactory`] to handle `CREATE FUNCTION` statements pub fn with_function_factory( mut self, @@ -1990,6 +1997,11 @@ impl SessionState { self.config.options() } + /// return the TableOptions options with its extensions + pub fn default_table_options(&self) -> &TableOptions { + &self.table_option_namespace + } + /// Get a new TaskContext to run in this session pub fn task_ctx(&self) -> Arc { Arc::new(TaskContext::from(self)) @@ -2155,10 +2167,16 @@ impl FunctionRegistry for SessionState { &mut self, udaf: Arc, ) -> Result>> { + udaf.aliases().iter().for_each(|alias| { + self.aggregate_functions.insert(alias.clone(), udaf.clone()); + }); Ok(self.aggregate_functions.insert(udaf.name().into(), udaf)) } fn register_udwf(&mut self, udwf: Arc) -> Result>> { + udwf.aliases().iter().for_each(|alias| { + self.window_functions.insert(alias.clone(), udwf.clone()); + }); Ok(self.window_functions.insert(udwf.name().into(), udwf)) } @@ -2173,11 +2191,23 @@ impl FunctionRegistry for SessionState { } fn deregister_udaf(&mut self, name: &str) -> Result>> { - Ok(self.aggregate_functions.remove(name)) + let udaf = self.aggregate_functions.remove(name); + if let Some(udaf) = &udaf { + for alias in udaf.aliases() { + self.aggregate_functions.remove(alias); + } + } + Ok(udaf) } fn deregister_udwf(&mut self, name: &str) -> Result>> { - 
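A hedged sketch of wiring a custom `ConfigExtension` into the new table-option namespace; `MyFormatExt` is hypothetical and stands for any type implementing `ConfigExtension`:

    // `MyFormatExt` is hypothetical; any ConfigExtension implementation works here.
    let ctx = SessionContext::new();
    ctx.register_table_options_extension(MyFormatExt::default());

    // Equivalent at SessionState build time:
    // let state = state.add_table_options_extension(MyFormatExt::default());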
Ok(self.window_functions.remove(name)) + let udwf = self.window_functions.remove(name); + if let Some(udwf) = &udwf { + for alias in udwf.aliases() { + self.window_functions.remove(alias); + } + } + Ok(udwf) } } @@ -2332,8 +2362,11 @@ impl<'a> TreeNodeVisitor for BadPlanVisitor<'a> { #[cfg(test)] mod tests { - use super::super::options::CsvReadOptions; - use super::*; + use std::env; + use std::path::PathBuf; + use std::sync::Weak; + + use super::{super::options::CsvReadOptions, *}; use crate::assert_batches_eq; use crate::execution::context::QueryPlanner; use crate::execution::memory_pool::MemoryConsumer; @@ -2341,12 +2374,11 @@ mod tests { use crate::test; use crate::test_util::{plan_and_collect, populate_csv_partitions}; use crate::variable::VarType; - use async_trait::async_trait; + use datafusion_common_runtime::SpawnedTask; use datafusion_expr::Expr; - use std::env; - use std::path::PathBuf; - use std::sync::Weak; + + use async_trait::async_trait; use tempfile::TempDir; #[tokio::test] diff --git a/datafusion/core/src/execution/context/parquet.rs b/datafusion/core/src/execution/context/parquet.rs index 7825d9b88297..528bb0fa05af 100644 --- a/datafusion/core/src/execution/context/parquet.rs +++ b/datafusion/core/src/execution/context/parquet.rs @@ -17,11 +17,11 @@ use std::sync::Arc; -use crate::datasource::physical_plan::parquet::plan_to_parquet; -use parquet::file::properties::WriterProperties; - use super::super::options::{ParquetReadOptions, ReadOptions}; use super::{DataFilePaths, DataFrame, ExecutionPlan, Result, SessionContext}; +use crate::datasource::physical_plan::parquet::plan_to_parquet; + +use parquet::file::properties::WriterProperties; impl SessionContext { /// Creates a [`DataFrame`] for reading a Parquet data source. @@ -46,7 +46,8 @@ impl SessionContext { table_path: &str, options: ParquetReadOptions<'_>, ) -> Result<()> { - let listing_options = options.to_listing_options(&self.state.read().config); + let listing_options = options + .to_listing_options(&self.copied_config(), self.copied_table_options()); self.register_listing_table( name, @@ -72,18 +73,19 @@ impl SessionContext { #[cfg(test)] mod tests { - use async_trait::async_trait; - + use super::*; use crate::arrow::array::{Float32Array, Int32Array}; use crate::arrow::datatypes::{DataType, Field, Schema}; use crate::arrow::record_batch::RecordBatch; use crate::dataframe::DataFrameWriteOptions; use crate::parquet::basic::Compression; use crate::test_util::parquet_test_data; + + use datafusion_common::config::TableParquetOptions; use datafusion_execution::config::SessionConfig; - use tempfile::tempdir; - use super::*; + use async_trait::async_trait; + use tempfile::tempdir; #[tokio::test] async fn read_with_glob_path() -> Result<()> { @@ -199,17 +201,16 @@ mod tests { .to_string(); std::fs::create_dir(dir).expect("create dir failed"); + let mut options = TableParquetOptions::default(); + options.global.compression = Some(Compression::SNAPPY.to_string()); + // Write the dataframe to a parquet file named 'output1.parquet' write_df .clone() .write_parquet( &path1, DataFrameWriteOptions::new().with_single_file_output(true), - Some( - WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .build(), - ), + Some(options.clone()), ) .await?; @@ -219,11 +220,7 @@ mod tests { .write_parquet( &path2, DataFrameWriteOptions::new().with_single_file_output(true), - Some( - WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .build(), - ), + Some(options.clone()), ) .await?; @@ -233,11 +230,7 @@ 
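A sketch of the updated `write_parquet` call: writer settings now come from a `TableParquetOptions` value rather than a `WriterProperties` builder (the `df` DataFrame and the output path are assumed):

    use datafusion::dataframe::DataFrameWriteOptions;
    use datafusion::parquet::basic::Compression;
    use datafusion_common::config::TableParquetOptions;

    let mut options = TableParquetOptions::default();
    options.global.compression = Some(Compression::SNAPPY.to_string());

    df.write_parquet(
        "/tmp/output1.parquet",
        DataFrameWriteOptions::new().with_single_file_output(true),
        Some(options),
    )
    .await?;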
mod tests { .write_parquet( &path3, DataFrameWriteOptions::new().with_single_file_output(true), - Some( - WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .build(), - ), + Some(options.clone()), ) .await?; @@ -246,11 +239,7 @@ mod tests { .write_parquet( &path5, DataFrameWriteOptions::new().with_single_file_output(true), - Some( - WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .build(), - ), + Some(options), ) .await?; diff --git a/datafusion/core/src/physical_optimizer/combine_partial_final_agg.rs b/datafusion/core/src/physical_optimizer/combine_partial_final_agg.rs index c45e14100e82..1cba8f025895 100644 --- a/datafusion/core/src/physical_optimizer/combine_partial_final_agg.rs +++ b/datafusion/core/src/physical_optimizer/combine_partial_final_agg.rs @@ -262,6 +262,7 @@ mod tests { }, None, None, + Default::default(), )) } diff --git a/datafusion/core/src/physical_optimizer/enforce_distribution.rs b/datafusion/core/src/physical_optimizer/enforce_distribution.rs index 3b4f7acf1be6..60ee1e20a9ac 100644 --- a/datafusion/core/src/physical_optimizer/enforce_distribution.rs +++ b/datafusion/core/src/physical_optimizer/enforce_distribution.rs @@ -1474,6 +1474,7 @@ pub(crate) mod tests { }, None, None, + Default::default(), )) } @@ -1501,6 +1502,7 @@ pub(crate) mod tests { }, None, None, + Default::default(), )) } diff --git a/datafusion/core/src/physical_optimizer/test_utils.rs b/datafusion/core/src/physical_optimizer/test_utils.rs index d280726d5acd..2e6e3af5dfe2 100644 --- a/datafusion/core/src/physical_optimizer/test_utils.rs +++ b/datafusion/core/src/physical_optimizer/test_utils.rs @@ -287,6 +287,7 @@ pub fn parquet_exec(schema: &SchemaRef) -> Arc { }, None, None, + Default::default(), )) } @@ -310,6 +311,7 @@ pub fn parquet_exec_sorted( }, None, None, + Default::default(), )) } diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index 6d49287debb4..0feff860fd93 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -75,11 +75,10 @@ use arrow::datatypes::{Schema, SchemaRef}; use arrow_array::builder::StringBuilder; use arrow_array::RecordBatch; use datafusion_common::display::ToStringifiedPlan; -use datafusion_common::file_options::FileTypeWriterOptions; use datafusion_common::{ exec_err, internal_err, not_impl_err, plan_err, DFSchema, FileType, ScalarValue, }; -use datafusion_expr::dml::{CopyOptions, CopyTo}; +use datafusion_expr::dml::CopyTo; use datafusion_expr::expr::{ self, AggregateFunction, AggregateFunctionDefinition, Alias, Between, BinaryExpr, Cast, GetFieldAccess, GetIndexedField, GroupingSet, InList, Like, TryCast, @@ -96,6 +95,7 @@ use datafusion_physical_plan::placeholder_row::PlaceholderRowExec; use datafusion_sql::utils::window_expr_common_partition_keys; use async_trait::async_trait; +use datafusion_common::config::FormatOptions; use futures::future::BoxFuture; use futures::{FutureExt, StreamExt, TryStreamExt}; use itertools::{multiunzip, Itertools}; @@ -568,9 +568,9 @@ impl DefaultPhysicalPlanner { LogicalPlan::Copy(CopyTo{ input, output_url, - file_format, - copy_options, + format_options, partition_by, + options: source_option_tuples }) => { let input_exec = self.create_initial_plan(input, session_state).await?; let parsed_url = ListingTableUrl::parse(output_url)?; @@ -578,16 +578,6 @@ impl DefaultPhysicalPlanner { let schema: Schema = (**input.schema()).clone().into(); - let file_type_writer_options = match copy_options{ - 
CopyOptions::SQLOptions(statement_options) => { - FileTypeWriterOptions::build( - file_format, - session_state.config_options(), - statement_options)? - }, - CopyOptions::WriterOptions(writer_options) => *writer_options.clone() - }; - // Note: the DataType passed here is ignored for the purposes of writing and inferred instead // from the schema of the RecordBatch being written. This allows COPY statements to specify only // the column name rather than column name + explicit data type. @@ -603,16 +593,30 @@ impl DefaultPhysicalPlanner { output_schema: Arc::new(schema), table_partition_cols, overwrite: false, - file_type_writer_options }; - - let sink_format: Arc = match file_format { - FileType::CSV => Arc::new(CsvFormat::default()), + let mut table_options = session_state.default_table_options().clone(); + let sink_format: Arc = match format_options { + FormatOptions::CSV(options) => { + table_options.csv = options.clone(); + table_options.set_file_format(FileType::CSV); + table_options.alter_with_string_hash_map(source_option_tuples)?; + Arc::new(CsvFormat::default().with_options(table_options.csv)) + }, + FormatOptions::JSON(options) => { + table_options.json = options.clone(); + table_options.set_file_format(FileType::JSON); + table_options.alter_with_string_hash_map(source_option_tuples)?; + Arc::new(JsonFormat::default().with_options(table_options.json)) + }, #[cfg(feature = "parquet")] - FileType::PARQUET => Arc::new(ParquetFormat::default()), - FileType::JSON => Arc::new(JsonFormat::default()), - FileType::AVRO => Arc::new(AvroFormat {} ), - FileType::ARROW => Arc::new(ArrowFormat {}), + FormatOptions::PARQUET(options) => { + table_options.parquet = options.clone(); + table_options.set_file_format(FileType::PARQUET); + table_options.alter_with_string_hash_map(source_option_tuples)?; + Arc::new(ParquetFormat::default().with_options(table_options.parquet)) + }, + FormatOptions::AVRO => Arc::new(AvroFormat {} ), + FormatOptions::ARROW => Arc::new(ArrowFormat {}), }; sink_format.create_writer_physical_plan(input_exec, session_state, config, None).await diff --git a/datafusion/core/src/test_util/parquet.rs b/datafusion/core/src/test_util/parquet.rs index 6d0711610b5a..7a466a666d8d 100644 --- a/datafusion/core/src/test_util/parquet.rs +++ b/datafusion/core/src/test_util/parquet.rs @@ -35,7 +35,7 @@ use crate::physical_expr::create_physical_expr; use crate::physical_plan::filter::FilterExec; use crate::physical_plan::metrics::MetricsSet; use crate::physical_plan::ExecutionPlan; -use crate::prelude::{Expr, SessionConfig}; +use crate::prelude::{Expr, SessionConfig, SessionContext}; use datafusion_common::Statistics; @@ -141,6 +141,7 @@ impl TestParquetFile { /// Otherwise if `maybe_filter` is None, return just a `ParquetExec` pub async fn create_scan( &self, + ctx: &SessionContext, maybe_filter: Option, ) -> Result> { let scan_config = FileScanConfig { @@ -164,6 +165,7 @@ impl TestParquetFile { // run coercion on the filters to coerce types etc. 
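A condensed sketch of the CSV branch above: statement-level option tuples are layered over the session's default table options before the sink format is built (names follow the diff; the JSON and parquet branches are analogous):

    let mut table_options = session_state.default_table_options().clone();
    table_options.csv = csv_options.clone();                         // from FormatOptions::CSV(csv_options)
    table_options.set_file_format(FileType::CSV);
    table_options.alter_with_string_hash_map(source_option_tuples)?; // COPY statement key/values
    let sink_format: Arc<dyn FileFormat> =
        Arc::new(CsvFormat::default().with_options(table_options.csv));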
let props = ExecutionProps::new(); let context = SimplifyContext::new(&props).with_schema(df_schema.clone()); + let parquet_options = ctx.state().default_table_options().parquet.clone(); if let Some(filter) = maybe_filter { let simplifier = ExprSimplifier::new(context); let filter = simplifier.coerce(filter, df_schema.clone()).unwrap(); @@ -173,12 +175,18 @@ impl TestParquetFile { scan_config, Some(physical_filter_expr.clone()), None, + parquet_options, )); let exec = Arc::new(FilterExec::try_new(physical_filter_expr, parquet_exec)?); Ok(exec) } else { - Ok(Arc::new(ParquetExec::new(scan_config, None, None))) + Ok(Arc::new(ParquetExec::new( + scan_config, + None, + None, + parquet_options, + ))) } } diff --git a/datafusion/core/tests/parquet/custom_reader.rs b/datafusion/core/tests/parquet/custom_reader.rs index e76b201e0222..4bacc80579ed 100644 --- a/datafusion/core/tests/parquet/custom_reader.rs +++ b/datafusion/core/tests/parquet/custom_reader.rs @@ -23,7 +23,6 @@ use std::time::SystemTime; use arrow::array::{ArrayRef, Int64Array, Int8Array, StringArray}; use arrow::datatypes::{Field, Schema, SchemaBuilder}; use arrow::record_batch::RecordBatch; -use bytes::Bytes; use datafusion::assert_batches_sorted_eq; use datafusion::datasource::file_format::parquet::fetch_parquet_metadata; use datafusion::datasource::listing::PartitionedFile; @@ -36,6 +35,7 @@ use datafusion::physical_plan::{collect, Statistics}; use datafusion::prelude::SessionContext; use datafusion_common::Result; +use bytes::Bytes; use futures::future::BoxFuture; use futures::{FutureExt, TryFutureExt}; use object_store::memory::InMemory; @@ -88,6 +88,7 @@ async fn route_data_access_ops_to_parquet_file_reader_factory() { }, None, None, + Default::default(), ) .with_parquet_file_reader_factory(Arc::new(InMemoryParquetFileReaderFactory( Arc::clone(&in_memory_object_store), diff --git a/datafusion/core/tests/parquet/filter_pushdown.rs b/datafusion/core/tests/parquet/filter_pushdown.rs index 64d3f45dee12..c0193fe04f04 100644 --- a/datafusion/core/tests/parquet/filter_pushdown.rs +++ b/datafusion/core/tests/parquet/filter_pushdown.rs @@ -34,6 +34,7 @@ use datafusion::prelude::{col, lit, lit_timestamp_nano, Expr, SessionContext}; use datafusion::test_util::parquet::{ParquetScanOptions, TestParquetFile}; use datafusion_common::instant::Instant; use datafusion_expr::utils::{conjunction, disjunction, split_conjunction}; + use itertools::Itertools; use parquet::file::properties::WriterProperties; use tempfile::TempDir; @@ -509,7 +510,7 @@ impl<'a> TestCase<'a> { let ctx = SessionContext::new_with_config(scan_options.config()); let exec = self .test_parquet_file - .create_scan(Some(filter.clone())) + .create_scan(&ctx, Some(filter.clone())) .await .unwrap(); let result = collect(exec.clone(), ctx.task_ctx()).await.unwrap(); diff --git a/datafusion/core/tests/parquet/page_pruning.rs b/datafusion/core/tests/parquet/page_pruning.rs index ac66d34798e4..3a43428f5bcf 100644 --- a/datafusion/core/tests/parquet/page_pruning.rs +++ b/datafusion/core/tests/parquet/page_pruning.rs @@ -83,6 +83,7 @@ async fn get_parquet_exec(state: &SessionState, filter: Expr) -> ParquetExec { }, Some(predicate), None, + Default::default(), ); parquet_exec.with_enable_page_index(true) } diff --git a/datafusion/core/tests/parquet/schema_coercion.rs b/datafusion/core/tests/parquet/schema_coercion.rs index 00f3eada496e..88f795d2a4fe 100644 --- a/datafusion/core/tests/parquet/schema_coercion.rs +++ b/datafusion/core/tests/parquet/schema_coercion.rs @@ -72,6 +72,7 @@ async 
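A minimal sketch of the updated `TestParquetFile::create_scan`, which now takes the `SessionContext` so the scan picks up that session's parquet options (the `test_parquet_file`, `scan_options`, and `filter` values are assumed to exist as in the test above):

    let ctx = SessionContext::new_with_config(scan_options.config());
    let exec = test_parquet_file
        .create_scan(&ctx, Some(filter.clone()))
        .await?;
    let result = collect(exec.clone(), ctx.task_ctx()).await?;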
fn multi_parquet_coercion() { }, None, None, + Default::default(), ); let session_ctx = SessionContext::new(); @@ -135,6 +136,7 @@ async fn multi_parquet_coercion_projection() { }, None, None, + Default::default(), ); let session_ctx = SessionContext::new(); diff --git a/datafusion/core/tests/user_defined/user_defined_aggregates.rs b/datafusion/core/tests/user_defined/user_defined_aggregates.rs index 9e231d25f298..3f40c55a3ed7 100644 --- a/datafusion/core/tests/user_defined/user_defined_aggregates.rs +++ b/datafusion/core/tests/user_defined/user_defined_aggregates.rs @@ -27,6 +27,7 @@ use std::sync::{ }; use datafusion::datasource::MemTable; +use datafusion::test_util::plan_and_collect; use datafusion::{ arrow::{ array::{ArrayRef, Float64Array, TimestampNanosecondArray}, @@ -320,6 +321,42 @@ async fn case_sensitive_identifiers_user_defined_aggregates() -> Result<()> { Ok(()) } +#[tokio::test] +async fn test_user_defined_functions_with_alias() -> Result<()> { + let ctx = SessionContext::new(); + let arr = Int32Array::from(vec![1]); + let batch = RecordBatch::try_from_iter(vec![("i", Arc::new(arr) as _)])?; + ctx.register_batch("t", batch).unwrap(); + + let my_avg = create_udaf( + "dummy", + vec![DataType::Float64], + Arc::new(DataType::Float64), + Volatility::Immutable, + Arc::new(|_| Ok(Box::::default())), + Arc::new(vec![DataType::UInt64, DataType::Float64]), + ) + .with_aliases(vec!["dummy_alias"]); + + ctx.register_udaf(my_avg); + + let expected = [ + "+------------+", + "| dummy(t.i) |", + "+------------+", + "| 1.0 |", + "+------------+", + ]; + + let result = plan_and_collect(&ctx, "SELECT dummy(i) FROM t").await?; + assert_batches_eq!(expected, &result); + + let alias_result = plan_and_collect(&ctx, "SELECT dummy_alias(i) FROM t").await?; + assert_batches_eq!(expected, &alias_result); + + Ok(()) +} + #[tokio::test] async fn test_groups_accumulator() -> Result<()> { let ctx = SessionContext::new(); diff --git a/datafusion/core/tests/user_defined/user_defined_window_functions.rs b/datafusion/core/tests/user_defined/user_defined_window_functions.rs index cfd74f8861e3..3c607301fc98 100644 --- a/datafusion/core/tests/user_defined/user_defined_window_functions.rs +++ b/datafusion/core/tests/user_defined/user_defined_window_functions.rs @@ -41,6 +41,10 @@ const UNBOUNDED_WINDOW_QUERY: &str = "SELECT x, y, val, \ odd_counter(val) OVER (PARTITION BY x ORDER BY y) \ from t ORDER BY x, y"; +const UNBOUNDED_WINDOW_QUERY_WITH_ALIAS: &str = "SELECT x, y, val, \ + odd_counter_alias(val) OVER (PARTITION BY x ORDER BY y) \ + from t ORDER BY x, y"; + /// A query with a window function evaluated over a moving window const BOUNDED_WINDOW_QUERY: &str = "SELECT x, y, val, \ @@ -118,6 +122,35 @@ async fn test_deregister_udwf() -> Result<()> { Ok(()) } +#[tokio::test] +async fn test_udwf_with_alias() { + let test_state = TestState::new(); + let TestContext { ctx, .. 
} = TestContext::new(test_state); + + let expected = vec![ + "+---+---+-----+-----------------------------------------------------------------------------------------------------------------------+", + "| x | y | val | odd_counter(t.val) PARTITION BY [t.x] ORDER BY [t.y ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW |", + "+---+---+-----+-----------------------------------------------------------------------------------------------------------------------+", + "| 1 | a | 0 | 1 |", + "| 1 | b | 1 | 1 |", + "| 1 | c | 2 | 1 |", + "| 2 | d | 3 | 2 |", + "| 2 | e | 4 | 2 |", + "| 2 | f | 5 | 2 |", + "| 2 | g | 6 | 2 |", + "| 2 | h | 6 | 2 |", + "| 2 | i | 6 | 2 |", + "| 2 | j | 6 | 2 |", + "+---+---+-----+-----------------------------------------------------------------------------------------------------------------------+", + ]; + assert_batches_eq!( + expected, + &execute(&ctx, UNBOUNDED_WINDOW_QUERY_WITH_ALIAS) + .await + .unwrap() + ); +} + /// Basic user defined window function with bounded window #[tokio::test] async fn test_udwf_bounded_window_ignores_frame() { @@ -491,6 +524,7 @@ impl OddCounter { signature: Signature, return_type: DataType, test_state: Arc, + aliases: Vec, } impl SimpleWindowUDF { @@ -502,6 +536,7 @@ impl OddCounter { signature, return_type, test_state, + aliases: vec!["odd_counter_alias".to_string()], } } } @@ -526,6 +561,10 @@ impl OddCounter { fn partition_evaluator(&self) -> Result> { Ok(Box::new(OddCounter::new(Arc::clone(&self.test_state)))) } + + fn aliases(&self) -> &[String] { + &self.aliases + } } ctx.register_udwf(WindowUDF::from(SimpleWindowUDF::new(test_state))) diff --git a/datafusion/execution/src/task.rs b/datafusion/execution/src/task.rs index b39b4a00327b..cae410655d10 100644 --- a/datafusion/execution/src/task.rs +++ b/datafusion/execution/src/task.rs @@ -207,9 +207,15 @@ impl FunctionRegistry for TaskContext { &mut self, udaf: Arc, ) -> Result>> { + udaf.aliases().iter().for_each(|alias| { + self.aggregate_functions.insert(alias.clone(), udaf.clone()); + }); Ok(self.aggregate_functions.insert(udaf.name().into(), udaf)) } fn register_udwf(&mut self, udwf: Arc) -> Result>> { + udwf.aliases().iter().for_each(|alias| { + self.window_functions.insert(alias.clone(), udwf.clone()); + }); Ok(self.window_functions.insert(udwf.name().into(), udwf)) } fn register_udf(&mut self, udf: Arc) -> Result>> { diff --git a/datafusion/expr/src/built_in_function.rs b/datafusion/expr/src/built_in_function.rs index 15dfe48b34f6..b881af18d92c 100644 --- a/datafusion/expr/src/built_in_function.rs +++ b/datafusion/expr/src/built_in_function.rs @@ -17,18 +17,16 @@ //! Built-in functions module contains all the built-in functions definitions. 
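A hedged sketch of what the alias bookkeeping above provides: once registered, the registry resolves the window function by either name (assuming the context's `FunctionRegistry` lookup method `udwf`):

    ctx.register_udwf(WindowUDF::from(SimpleWindowUDF::new(test_state)));

    // Both the primary name and the alias resolve to the same function.
    let by_name = ctx.udwf("odd_counter")?;
    let by_alias = ctx.udwf("odd_counter_alias")?;
    assert_eq!(by_name.name(), by_alias.name());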
-use std::cmp::Ordering; use std::collections::HashMap; use std::fmt; use std::str::FromStr; use std::sync::{Arc, OnceLock}; use crate::signature::TIMEZONE_WILDCARD; -use crate::type_coercion::binary::get_wider_type; use crate::type_coercion::functions::data_types; use crate::{FuncMonotonicity, Signature, TypeSignature, Volatility}; -use arrow::datatypes::{DataType, Field, Fields, TimeUnit}; +use arrow::datatypes::{DataType, Field, TimeUnit}; use datafusion_common::{plan_err, DataFusionError, Result}; use strum::IntoEnumIterator; @@ -102,44 +100,28 @@ pub enum BuiltinScalarFunction { Sinh, /// sqrt Sqrt, - /// tan - Tan, - /// tanh - Tanh, /// trunc Trunc, /// cot Cot, // array functions - /// array_append - ArrayAppend, - /// array_sort - ArraySort, - /// array_concat - ArrayConcat, /// array_pop_front ArrayPopFront, /// array_pop_back ArrayPopBack, - /// array_distinct - ArrayDistinct, /// array_element ArrayElement, /// array_position ArrayPosition, /// array_positions ArrayPositions, - /// array_prepend - ArrayPrepend, /// array_remove ArrayRemove, /// array_remove_n ArrayRemoveN, /// array_remove_all ArrayRemoveAll, - /// array_repeat - ArrayRepeat, /// array_replace ArrayReplace, /// array_replace_n @@ -158,12 +140,6 @@ pub enum BuiltinScalarFunction { ArrayExcept, /// array_resize ArrayResize, - /// construct an array from columns - MakeArray, - - // struct functions - /// struct - Struct, // string functions /// ascii @@ -220,8 +196,6 @@ pub enum BuiltinScalarFunction { SHA512, /// split_part SplitPart, - /// string_to_array - StringToArray, /// starts_with StartsWith, /// strpos @@ -240,8 +214,6 @@ pub enum BuiltinScalarFunction { Upper, /// uuid Uuid, - /// arrow_typeof - ArrowTypeof, /// overlay OverLay, /// levenshtein @@ -334,21 +306,13 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::Sqrt => Volatility::Immutable, BuiltinScalarFunction::Cbrt => Volatility::Immutable, BuiltinScalarFunction::Cot => Volatility::Immutable, - BuiltinScalarFunction::Tan => Volatility::Immutable, - BuiltinScalarFunction::Tanh => Volatility::Immutable, BuiltinScalarFunction::Trunc => Volatility::Immutable, - BuiltinScalarFunction::ArrayAppend => Volatility::Immutable, - BuiltinScalarFunction::ArraySort => Volatility::Immutable, - BuiltinScalarFunction::ArrayConcat => Volatility::Immutable, - BuiltinScalarFunction::ArrayDistinct => Volatility::Immutable, BuiltinScalarFunction::ArrayElement => Volatility::Immutable, BuiltinScalarFunction::ArrayExcept => Volatility::Immutable, BuiltinScalarFunction::ArrayPopFront => Volatility::Immutable, BuiltinScalarFunction::ArrayPopBack => Volatility::Immutable, BuiltinScalarFunction::ArrayPosition => Volatility::Immutable, BuiltinScalarFunction::ArrayPositions => Volatility::Immutable, - BuiltinScalarFunction::ArrayPrepend => Volatility::Immutable, - BuiltinScalarFunction::ArrayRepeat => Volatility::Immutable, BuiltinScalarFunction::ArrayRemove => Volatility::Immutable, BuiltinScalarFunction::ArrayRemoveN => Volatility::Immutable, BuiltinScalarFunction::ArrayRemoveAll => Volatility::Immutable, @@ -360,7 +324,6 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::ArrayIntersect => Volatility::Immutable, BuiltinScalarFunction::ArrayUnion => Volatility::Immutable, BuiltinScalarFunction::ArrayResize => Volatility::Immutable, - BuiltinScalarFunction::MakeArray => Volatility::Immutable, BuiltinScalarFunction::Ascii => Volatility::Immutable, BuiltinScalarFunction::BitLength => Volatility::Immutable, BuiltinScalarFunction::Btrim => Volatility::Immutable, @@ -389,7 
+352,6 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::SHA512 => Volatility::Immutable, BuiltinScalarFunction::Digest => Volatility::Immutable, BuiltinScalarFunction::SplitPart => Volatility::Immutable, - BuiltinScalarFunction::StringToArray => Volatility::Immutable, BuiltinScalarFunction::StartsWith => Volatility::Immutable, BuiltinScalarFunction::Strpos => Volatility::Immutable, BuiltinScalarFunction::Substr => Volatility::Immutable, @@ -399,9 +361,7 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::Translate => Volatility::Immutable, BuiltinScalarFunction::Trim => Volatility::Immutable, BuiltinScalarFunction::Upper => Volatility::Immutable, - BuiltinScalarFunction::Struct => Volatility::Immutable, - BuiltinScalarFunction::ArrowTypeof => Volatility::Immutable, - BuiltinScalarFunction::OverLay => Volatility::Immutable, + BuiltinScalarFunction::OverLay => Volatility::Immutable, BuiltinScalarFunction::Levenshtein => Volatility::Immutable, BuiltinScalarFunction::SubstrIndex => Volatility::Immutable, BuiltinScalarFunction::FindInSet => Volatility::Immutable, @@ -412,25 +372,6 @@ impl BuiltinScalarFunction { } } - /// Returns the dimension [`DataType`] of [`DataType::List`] if - /// treated as a N-dimensional array. - /// - /// ## Examples: - /// - /// * `Int64` has dimension 1 - /// * `List(Int64)` has dimension 2 - /// * `List(List(Int64))` has dimension 3 - /// * etc. - fn return_dimension(self, input_expr_type: &DataType) -> u64 { - let mut result: u64 = 1; - let mut current_data_type = input_expr_type; - while let DataType::List(field) = current_data_type { - current_data_type = field.data_type(); - result += 1; - } - result - } - /// Returns the output [`DataType`] of this function /// /// This method should be invoked only after `input_expr_types` have been validated @@ -448,39 +389,6 @@ impl BuiltinScalarFunction { // the return type of the built in function. // Some built-in functions' return type depends on the incoming type. match self { - BuiltinScalarFunction::ArrayAppend => Ok(input_expr_types[0].clone()), - BuiltinScalarFunction::ArraySort => Ok(input_expr_types[0].clone()), - BuiltinScalarFunction::ArrayConcat => { - let mut expr_type = Null; - let mut max_dims = 0; - for input_expr_type in input_expr_types { - match input_expr_type { - List(field) => { - if !field.data_type().equals_datatype(&Null) { - let dims = self.return_dimension(input_expr_type); - expr_type = match max_dims.cmp(&dims) { - Ordering::Greater => expr_type, - Ordering::Equal => { - get_wider_type(&expr_type, input_expr_type)? - } - Ordering::Less => { - max_dims = dims; - input_expr_type.clone() - } - }; - } - } - _ => { - return plan_err!( - "The {self} function can only accept list as the args." 
- ); - } - } - } - - Ok(expr_type) - } - BuiltinScalarFunction::ArrayDistinct => Ok(input_expr_types[0].clone()), BuiltinScalarFunction::ArrayElement => match &input_expr_types[0] { List(field) | LargeList(field) @@ -495,12 +403,6 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::ArrayPositions => { Ok(List(Arc::new(Field::new("item", UInt64, true)))) } - BuiltinScalarFunction::ArrayPrepend => Ok(input_expr_types[1].clone()), - BuiltinScalarFunction::ArrayRepeat => Ok(List(Arc::new(Field::new( - "item", - input_expr_types[0].clone(), - true, - )))), BuiltinScalarFunction::ArrayRemove => Ok(input_expr_types[0].clone()), BuiltinScalarFunction::ArrayRemoveN => Ok(input_expr_types[0].clone()), BuiltinScalarFunction::ArrayRemoveAll => Ok(input_expr_types[0].clone()), @@ -536,20 +438,6 @@ impl BuiltinScalarFunction { (dt, _) => Ok(dt), } } - BuiltinScalarFunction::MakeArray => match input_expr_types.len() { - 0 => Ok(List(Arc::new(Field::new("item", Null, true)))), - _ => { - let mut expr_type = Null; - for input_expr_type in input_expr_types { - if !input_expr_type.equals_datatype(&Null) { - expr_type = input_expr_type.clone(); - break; - } - } - - Ok(List(Arc::new(Field::new("item", expr_type, true)))) - } - }, BuiltinScalarFunction::Ascii => Ok(Int32), BuiltinScalarFunction::BitLength => { utf8_to_int_type(&input_expr_types[0], "bit_length") @@ -620,11 +508,6 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::SplitPart => { utf8_to_str_type(&input_expr_types[0], "split_part") } - BuiltinScalarFunction::StringToArray => Ok(List(Arc::new(Field::new( - "item", - input_expr_types[0].clone(), - true, - )))), BuiltinScalarFunction::StartsWith => Ok(Boolean), BuiltinScalarFunction::EndsWith => Ok(Boolean), BuiltinScalarFunction::Strpos => { @@ -664,14 +547,7 @@ impl BuiltinScalarFunction { _ => Ok(Float64), }, - BuiltinScalarFunction::Struct => { - let return_fields = input_expr_types - .iter() - .enumerate() - .map(|(pos, dt)| Field::new(format!("c{pos}"), dt.clone(), true)) - .collect::>(); - Ok(Struct(Fields::from(return_fields))) - } + BuiltinScalarFunction::Atan2 => match &input_expr_types[0] { Float32 => Ok(Float32), @@ -690,8 +566,6 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::Iszero => Ok(Boolean), - BuiltinScalarFunction::ArrowTypeof => Ok(Utf8), - BuiltinScalarFunction::OverLay => { utf8_to_str_type(&input_expr_types[0], "overlay") } @@ -720,8 +594,6 @@ impl BuiltinScalarFunction { | BuiltinScalarFunction::Sinh | BuiltinScalarFunction::Sqrt | BuiltinScalarFunction::Cbrt - | BuiltinScalarFunction::Tan - | BuiltinScalarFunction::Tanh | BuiltinScalarFunction::Trunc | BuiltinScalarFunction::Cot => match input_expr_types[0] { Float32 => Ok(Float32), @@ -739,36 +611,18 @@ impl BuiltinScalarFunction { // for now, the list is small, as we do not have many built-in functions. 
match self { - BuiltinScalarFunction::ArraySort => { - Signature::variadic_any(self.volatility()) - } - BuiltinScalarFunction::ArrayAppend => { - Signature::array_and_element(self.volatility()) - } - BuiltinScalarFunction::MakeArray => { - // 0 or more arguments of arbitrary type - Signature::one_of(vec![VariadicEqual, Any(0)], self.volatility()) - } BuiltinScalarFunction::ArrayPopFront => Signature::array(self.volatility()), BuiltinScalarFunction::ArrayPopBack => Signature::array(self.volatility()), - BuiltinScalarFunction::ArrayConcat => { - Signature::variadic_any(self.volatility()) - } BuiltinScalarFunction::ArrayElement => { Signature::array_and_index(self.volatility()) } BuiltinScalarFunction::ArrayExcept => Signature::any(2, self.volatility()), - BuiltinScalarFunction::ArrayDistinct => Signature::array(self.volatility()), BuiltinScalarFunction::ArrayPosition => { Signature::array_and_element_and_optional_index(self.volatility()) } BuiltinScalarFunction::ArrayPositions => { Signature::array_and_element(self.volatility()) } - BuiltinScalarFunction::ArrayPrepend => { - Signature::element_and_array(self.volatility()) - } - BuiltinScalarFunction::ArrayRepeat => Signature::any(2, self.volatility()), BuiltinScalarFunction::ArrayRemove => { Signature::array_and_element(self.volatility()) } @@ -792,7 +646,6 @@ impl BuiltinScalarFunction { Signature::variadic_any(self.volatility()) } - BuiltinScalarFunction::Struct => Signature::variadic_any(self.volatility()), BuiltinScalarFunction::Concat | BuiltinScalarFunction::ConcatWithSeparator => { Signature::variadic(vec![Utf8], self.volatility()) @@ -901,13 +754,6 @@ impl BuiltinScalarFunction { ], self.volatility(), ), - BuiltinScalarFunction::StringToArray => Signature::one_of( - vec![ - TypeSignature::Uniform(2, vec![Utf8, LargeUtf8]), - TypeSignature::Uniform(3, vec![Utf8, LargeUtf8]), - ], - self.volatility(), - ), BuiltinScalarFunction::EndsWith | BuiltinScalarFunction::Strpos @@ -994,7 +840,6 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::Gcd | BuiltinScalarFunction::Lcm => { Signature::uniform(2, vec![Int64], self.volatility()) } - BuiltinScalarFunction::ArrowTypeof => Signature::any(1, self.volatility()), BuiltinScalarFunction::OverLay => Signature::one_of( vec![ Exact(vec![Utf8, Utf8, Int64, Int64]), @@ -1027,8 +872,6 @@ impl BuiltinScalarFunction { | BuiltinScalarFunction::Sin | BuiltinScalarFunction::Sinh | BuiltinScalarFunction::Sqrt - | BuiltinScalarFunction::Tan - | BuiltinScalarFunction::Tanh | BuiltinScalarFunction::Cot => { // math expressions expect 1 argument of type f64 or f32 // priority is given to f64 because e.g. 
`sqrt(1i32)` is in IR (real numbers) and thus we @@ -1073,7 +916,6 @@ impl BuiltinScalarFunction { | BuiltinScalarFunction::Sinh | BuiltinScalarFunction::Sqrt | BuiltinScalarFunction::Cbrt - | BuiltinScalarFunction::Tanh | BuiltinScalarFunction::Trunc | BuiltinScalarFunction::Pi ) { @@ -1119,8 +961,6 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::Sin => &["sin"], BuiltinScalarFunction::Sinh => &["sinh"], BuiltinScalarFunction::Sqrt => &["sqrt"], - BuiltinScalarFunction::Tan => &["tan"], - BuiltinScalarFunction::Tanh => &["tanh"], BuiltinScalarFunction::Trunc => &["trunc"], // conditional functions @@ -1150,9 +990,6 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::Rpad => &["rpad"], BuiltinScalarFunction::Rtrim => &["rtrim"], BuiltinScalarFunction::SplitPart => &["split_part"], - BuiltinScalarFunction::StringToArray => { - &["string_to_array", "string_to_list"] - } BuiltinScalarFunction::StartsWith => &["starts_with"], BuiltinScalarFunction::Strpos => &["strpos", "instr", "position"], BuiltinScalarFunction::Substr => &["substr"], @@ -1176,22 +1013,6 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::SHA256 => &["sha256"], BuiltinScalarFunction::SHA384 => &["sha384"], BuiltinScalarFunction::SHA512 => &["sha512"], - - // other functions - BuiltinScalarFunction::ArrowTypeof => &["arrow_typeof"], - - // array functions - BuiltinScalarFunction::ArrayAppend => &[ - "array_append", - "list_append", - "array_push_back", - "list_push_back", - ], - BuiltinScalarFunction::ArraySort => &["array_sort", "list_sort"], - BuiltinScalarFunction::ArrayConcat => { - &["array_concat", "array_cat", "list_concat", "list_cat"] - } - BuiltinScalarFunction::ArrayDistinct => &["array_distinct", "list_distinct"], BuiltinScalarFunction::ArrayElement => &[ "array_element", "array_extract", @@ -1212,13 +1033,6 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::ArrayPositions => { &["array_positions", "list_positions"] } - BuiltinScalarFunction::ArrayPrepend => &[ - "array_prepend", - "list_prepend", - "array_push_front", - "list_push_front", - ], - BuiltinScalarFunction::ArrayRepeat => &["array_repeat", "list_repeat"], BuiltinScalarFunction::ArrayRemove => &["array_remove", "list_remove"], BuiltinScalarFunction::ArrayRemoveN => &["array_remove_n", "list_remove_n"], BuiltinScalarFunction::ArrayRemoveAll => { @@ -1235,14 +1049,10 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::ArraySlice => &["array_slice", "list_slice"], BuiltinScalarFunction::ArrayUnion => &["array_union", "list_union"], BuiltinScalarFunction::ArrayResize => &["array_resize", "list_resize"], - BuiltinScalarFunction::MakeArray => &["make_array", "make_list"], BuiltinScalarFunction::ArrayIntersect => { &["array_intersect", "list_intersect"] } BuiltinScalarFunction::OverLay => &["overlay"], - - // struct functions - BuiltinScalarFunction::Struct => &["struct"], } } } diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs index d32d1e9c5cce..8212f75583ea 100644 --- a/datafusion/expr/src/expr_fn.rs +++ b/datafusion/expr/src/expr_fn.rs @@ -538,11 +538,9 @@ scalar_expr!(Sqrt, sqrt, num, "square root of a number"); scalar_expr!(Cbrt, cbrt, num, "cube root of a number"); scalar_expr!(Sin, sin, num, "sine"); scalar_expr!(Cos, cos, num, "cosine"); -scalar_expr!(Tan, tan, num, "tangent"); scalar_expr!(Cot, cot, num, "cotangent"); scalar_expr!(Sinh, sinh, num, "hyperbolic sine"); scalar_expr!(Cosh, cosh, num, "hyperbolic cosine"); -scalar_expr!(Tanh, tanh, num, "hyperbolic tangent"); scalar_expr!(Atan, atan, num, 
"inverse tangent"); scalar_expr!(Asinh, asinh, num, "inverse hyperbolic sine"); scalar_expr!(Acosh, acosh, num, "inverse hyperbolic cosine"); @@ -586,16 +584,6 @@ scalar_expr!( scalar_expr!(Uuid, uuid, , "returns uuid v4 as a string value"); scalar_expr!(Log, log, base x, "logarithm of a `x` for a particular `base`"); -// array functions -scalar_expr!( - ArrayAppend, - array_append, - array element, - "appends an element to the end of an array." -); - -scalar_expr!(ArraySort, array_sort, array desc null_first, "returns sorted array."); - scalar_expr!( ArrayPopBack, array_pop_back, @@ -610,7 +598,6 @@ scalar_expr!( "returns the array without the first element." ); -nary_scalar_expr!(ArrayConcat, array_concat, "concatenates arrays."); scalar_expr!( ArrayElement, array_element, @@ -623,12 +610,6 @@ scalar_expr!( first_array second_array, "Returns an array of the elements that appear in the first array but not in the second." ); -scalar_expr!( - ArrayDistinct, - array_distinct, - array, - "return distinct values from the array after removing duplicates." -); scalar_expr!( ArrayPosition, array_position, @@ -641,18 +622,6 @@ scalar_expr!( array element, "searches for an element in the array, returns all occurrences." ); -scalar_expr!( - ArrayPrepend, - array_prepend, - array element, - "prepends an element to the beginning of an array." -); -scalar_expr!( - ArrayRepeat, - array_repeat, - element count, - "returns an array containing element `count` times." -); scalar_expr!( ArrayRemove, array_remove, @@ -710,11 +679,6 @@ scalar_expr!( "returns an array with the specified size filled with the given value." ); -nary_scalar_expr!( - MakeArray, - array, - "returns an Arrow array using the specified input expressions." -); scalar_expr!( ArrayIntersect, array_intersect, @@ -774,7 +738,6 @@ scalar_expr!(SHA256, sha256, string, "SHA-256 hash"); scalar_expr!(SHA384, sha384, string, "SHA-384 hash"); scalar_expr!(SHA512, sha512, string, "SHA-512 hash"); scalar_expr!(SplitPart, split_part, string delimiter index, "splits a string based on a delimiter and picks out the desired field based on the index."); -scalar_expr!(StringToArray, string_to_array, string delimiter null_string, "splits a `string` based on a `delimiter` and returns an array of parts. Any parts matching the optional `null_string` will be replaced with `NULL`"); scalar_expr!(StartsWith, starts_with, string prefix, "whether the `string` starts with the `prefix`"); scalar_expr!(EndsWith, ends_with, string suffix, "whether the `string` ends with the `suffix`"); scalar_expr!(Strpos, strpos, string substring, "finds the position from where the `substring` matches the `string`"); @@ -834,18 +797,10 @@ scalar_expr!( "returns true if a given number is +0.0 or -0.0 otherwise returns false" ); -scalar_expr!(ArrowTypeof, arrow_typeof, val, "data type"); scalar_expr!(Levenshtein, levenshtein, string1 string2, "Returns the Levenshtein distance between the two given strings"); scalar_expr!(SubstrIndex, substr_index, string delimiter count, "Returns the substring from str before count occurrences of the delimiter"); scalar_expr!(FindInSet, find_in_set, str strlist, "Returns a value in the range of 1 to N if the string str is in the string list strlist consisting of N substrings"); -scalar_expr!( - Struct, - struct_fun, - val, - "returns a vector of fields from the struct" -); - /// Create a CASE WHEN statement with literal WHEN expressions for comparison to the base expression. 
pub fn case(expr: Expr) -> CaseBuilder { CaseBuilder::new(Some(Box::new(expr)), vec![], vec![], None) @@ -1230,11 +1185,9 @@ mod test { test_unary_scalar_expr!(Cbrt, cbrt); test_unary_scalar_expr!(Sin, sin); test_unary_scalar_expr!(Cos, cos); - test_unary_scalar_expr!(Tan, tan); test_unary_scalar_expr!(Cot, cot); test_unary_scalar_expr!(Sinh, sinh); test_unary_scalar_expr!(Cosh, cosh); - test_unary_scalar_expr!(Tanh, tanh); test_unary_scalar_expr!(Atan, atan); test_unary_scalar_expr!(Asinh, asinh); test_unary_scalar_expr!(Acosh, acosh); @@ -1286,7 +1239,6 @@ mod test { test_scalar_expr!(SHA384, sha384, string); test_scalar_expr!(SHA512, sha512, string); test_scalar_expr!(SplitPart, split_part, expr, delimiter, index); - test_scalar_expr!(StringToArray, string_to_array, expr, delimiter, null_value); test_scalar_expr!(StartsWith, starts_with, string, characters); test_scalar_expr!(EndsWith, ends_with, string, characters); test_scalar_expr!(Strpos, strpos, string, substring); @@ -1297,23 +1249,17 @@ mod test { test_scalar_expr!(Trim, trim, string); test_scalar_expr!(Upper, upper, string); - test_scalar_expr!(ArrayAppend, array_append, array, element); - test_scalar_expr!(ArraySort, array_sort, array, desc, null_first); test_scalar_expr!(ArrayPopFront, array_pop_front, array); test_scalar_expr!(ArrayPopBack, array_pop_back, array); test_scalar_expr!(ArrayPosition, array_position, array, element, index); test_scalar_expr!(ArrayPositions, array_positions, array, element); - test_scalar_expr!(ArrayPrepend, array_prepend, array, element); - test_scalar_expr!(ArrayRepeat, array_repeat, element, count); test_scalar_expr!(ArrayRemove, array_remove, array, element); test_scalar_expr!(ArrayRemoveN, array_remove_n, array, element, max); test_scalar_expr!(ArrayRemoveAll, array_remove_all, array, element); test_scalar_expr!(ArrayReplace, array_replace, array, from, to); test_scalar_expr!(ArrayReplaceN, array_replace_n, array, from, to, max); test_scalar_expr!(ArrayReplaceAll, array_replace_all, array, from, to); - test_nary_scalar_expr!(MakeArray, array, input); - test_unary_scalar_expr!(ArrowTypeof, arrow_typeof); test_nary_scalar_expr!(OverLay, overlay, string, characters, position, len); test_nary_scalar_expr!(OverLay, overlay, string, characters, position); test_scalar_expr!(Levenshtein, levenshtein, string1, string2); diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index 0662396f611b..01e6af948762 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -24,7 +24,7 @@ use std::convert::TryFrom; use std::iter::zip; use std::sync::Arc; -use crate::dml::{CopyOptions, CopyTo}; +use crate::dml::CopyTo; use crate::expr::Alias; use crate::expr_rewriter::{ coerce_plan_expr_for_schema, normalize_col, @@ -43,20 +43,19 @@ use crate::utils::{ expand_wildcard, find_valid_equijoin_key_pair, group_window_expr_by_sort_keys, }; use crate::{ - and, binary_expr, DmlStatement, Expr, ExprSchemable, Operator, + and, binary_expr, DmlStatement, Expr, ExprSchemable, Operator, RecursiveQuery, TableProviderFilterPushDown, TableSource, WriteOp, }; use arrow::datatypes::{DataType, Schema, SchemaRef}; +use datafusion_common::config::FormatOptions; use datafusion_common::display::ToStringifiedPlan; use datafusion_common::{ get_target_functional_dependencies, plan_datafusion_err, plan_err, Column, DFField, - DFSchema, DFSchemaRef, DataFusionError, FileType, OwnedTableReference, Result, - ScalarValue, TableReference, 
ToDFSchema, UnnestOptions, + DFSchema, DFSchemaRef, DataFusionError, OwnedTableReference, Result, ScalarValue, + TableReference, ToDFSchema, UnnestOptions, }; -use super::plan::RecursiveQuery; - /// Default table name for unnamed table pub const UNNAMED_TABLE: &str = "?table?"; @@ -262,16 +261,16 @@ impl LogicalPlanBuilder { pub fn copy_to( input: LogicalPlan, output_url: String, - file_format: FileType, + format_options: FormatOptions, + options: HashMap, partition_by: Vec, - copy_options: CopyOptions, ) -> Result { Ok(Self::from(LogicalPlan::Copy(CopyTo { input: Arc::new(input), output_url, - file_format, + format_options, + options, partition_by, - copy_options, }))) } diff --git a/datafusion/expr/src/logical_plan/dml.rs b/datafusion/expr/src/logical_plan/dml.rs index 7f04bd8973d6..6ab06a57c1c2 100644 --- a/datafusion/expr/src/logical_plan/dml.rs +++ b/datafusion/expr/src/logical_plan/dml.rs @@ -15,70 +15,46 @@ // specific language governing permissions and limitations // under the License. -use std::{ - fmt::{self, Display}, - sync::Arc, -}; +use std::collections::HashMap; +use std::fmt::{self, Display}; +use std::hash::{Hash, Hasher}; +use std::sync::Arc; -use datafusion_common::{ - file_options::StatementOptions, DFSchemaRef, FileType, FileTypeWriterOptions, - OwnedTableReference, -}; +use datafusion_common::config::FormatOptions; +use datafusion_common::{DFSchemaRef, OwnedTableReference}; use crate::LogicalPlan; /// Operator that copies the contents of a database to file(s) -#[derive(Clone, PartialEq, Eq, Hash)] +#[derive(Clone)] pub struct CopyTo { /// The relation that determines the tuples to write to the output file(s) pub input: Arc, /// The location to write the file(s) pub output_url: String, - /// The file format to output (explicitly defined or inferred from file extension) - pub file_format: FileType, /// Determines which, if any, columns should be used for hive-style partitioned writes pub partition_by: Vec, - /// Arbitrary options as tuples - pub copy_options: CopyOptions, -} - -/// When the logical plan is constructed from SQL, CopyOptions -/// will contain arbitrary string tuples which must be parsed into -/// FileTypeWriterOptions. When the logical plan is constructed directly -/// from rust code (such as via the DataFrame API), FileTypeWriterOptions -/// can be provided directly, avoiding the run time cost and fallibility of -/// parsing string based options. -#[derive(Clone)] -pub enum CopyOptions { - /// Holds StatementOptions parsed from a SQL statement - SQLOptions(StatementOptions), - /// Holds FileTypeWriterOptions directly provided - WriterOptions(Box), + /// File format options. 
+ pub format_options: FormatOptions, + /// SQL Options that can affect the formats + pub options: HashMap, } -impl PartialEq for CopyOptions { - fn eq(&self, other: &CopyOptions) -> bool { - match self { - Self::SQLOptions(statement1) => match other { - Self::SQLOptions(statement2) => statement1.eq(statement2), - Self::WriterOptions(_) => false, - }, - Self::WriterOptions(_) => false, - } +// Implement PartialEq manually +impl PartialEq for CopyTo { + fn eq(&self, other: &Self) -> bool { + self.input == other.input && self.output_url == other.output_url } } -impl Eq for CopyOptions {} +// Implement Eq (no need for additional logic over PartialEq) +impl Eq for CopyTo {} -impl std::hash::Hash for CopyOptions { - fn hash(&self, hasher: &mut H) - where - H: std::hash::Hasher, - { - match self { - Self::SQLOptions(statement) => statement.hash(hasher), - Self::WriterOptions(_) => (), - } +// Implement Hash manually +impl Hash for CopyTo { + fn hash(&self, state: &mut H) { + self.input.hash(state); + self.output_url.hash(state); } } diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index ca021c4bfc28..a3f027d9fdb2 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -25,7 +25,6 @@ use std::sync::Arc; use super::dml::CopyTo; use super::DdlStatement; use crate::builder::change_redundant_column; -use crate::dml::CopyOptions; use crate::expr::{ Alias, Exists, InSubquery, Placeholder, Sort as SortExpr, WindowFunction, }; @@ -613,15 +612,15 @@ impl LogicalPlan { LogicalPlan::Copy(CopyTo { input: _, output_url, - file_format, + format_options, + options, partition_by, - copy_options, }) => Ok(LogicalPlan::Copy(CopyTo { input: Arc::new(inputs.swap_remove(0)), output_url: output_url.clone(), - file_format: file_format.clone(), + format_options: format_options.clone(), + options: options.clone(), partition_by: partition_by.clone(), - copy_options: copy_options.clone(), })), LogicalPlan::Values(Values { schema, .. }) => { Ok(LogicalPlan::Values(Values { @@ -1544,22 +1543,17 @@ impl LogicalPlan { LogicalPlan::Copy(CopyTo { input: _, output_url, - file_format, - partition_by: _, - copy_options, + format_options, + options, + .. }) => { - let op_str = match copy_options { - CopyOptions::SQLOptions(statement) => statement - .clone() - .into_inner() - .iter() - .map(|(k, v)| format!("{k} {v}")) - .collect::>() - .join(", "), - CopyOptions::WriterOptions(_) => "".into(), - }; + let op_str = options + .iter() + .map(|(k, v)| format!("{k} {v}")) + .collect::>() + .join(", "); - write!(f, "CopyTo: format={file_format} output_url={output_url} options: ({op_str})") + write!(f, "CopyTo: format={format_options} output_url={output_url} options: ({op_str})") } LogicalPlan::Ddl(ddl) => { write!(f, "{}", ddl.display()) diff --git a/datafusion/expr/src/udaf.rs b/datafusion/expr/src/udaf.rs index e56723063e41..c46dd9cd3a6f 100644 --- a/datafusion/expr/src/udaf.rs +++ b/datafusion/expr/src/udaf.rs @@ -118,6 +118,14 @@ impl AggregateUDF { self.inner.clone() } + /// Adds additional names that can be used to invoke this function, in + /// addition to `name` + /// + /// If you implement [`AggregateUDFImpl`] directly you should return aliases directly. + pub fn with_aliases(self, aliases: impl IntoIterator) -> Self { + Self::new_from_impl(AliasedAggregateUDFImpl::new(self.inner.clone(), aliases)) + } + /// creates an [`Expr`] that calls the aggregate function. 
/// /// This utility allows using the UDAF without requiring access to @@ -139,6 +147,11 @@ impl AggregateUDF { self.inner.name() } + /// Returns the aliases for this function. + pub fn aliases(&self) -> &[String] { + self.inner.aliases() + } + /// Returns this function's signature (what input types are accepted) /// /// See [`AggregateUDFImpl::signature`] for more details. @@ -277,6 +290,64 @@ pub trait AggregateUDFImpl: Debug + Send + Sync { fn create_groups_accumulator(&self) -> Result> { not_impl_err!("GroupsAccumulator hasn't been implemented for {self:?} yet") } + + /// Returns any aliases (alternate names) for this function. + /// + /// Note: `aliases` should only include names other than [`Self::name`]. + /// Defaults to `[]` (no aliases) + fn aliases(&self) -> &[String] { + &[] + } +} + +/// AggregateUDF that adds an alias to the underlying function. It is better to +/// implement [`AggregateUDFImpl`], which supports aliases, directly if possible. +#[derive(Debug)] +struct AliasedAggregateUDFImpl { + inner: Arc, + aliases: Vec, +} + +impl AliasedAggregateUDFImpl { + pub fn new( + inner: Arc, + new_aliases: impl IntoIterator, + ) -> Self { + let mut aliases = inner.aliases().to_vec(); + aliases.extend(new_aliases.into_iter().map(|s| s.to_string())); + + Self { inner, aliases } + } +} + +impl AggregateUDFImpl for AliasedAggregateUDFImpl { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + self.inner.name() + } + + fn signature(&self) -> &Signature { + self.inner.signature() + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + self.inner.return_type(arg_types) + } + + fn accumulator(&self, arg: &DataType) -> Result> { + self.inner.accumulator(arg) + } + + fn state_type(&self, return_type: &DataType) -> Result> { + self.inner.state_type(return_type) + } + + fn aliases(&self) -> &[String] { + &self.aliases + } } /// Implementation of [`AggregateUDFImpl`] that wraps the function style pointers diff --git a/datafusion/expr/src/udwf.rs b/datafusion/expr/src/udwf.rs index 3ab40fe70a91..d3925f2e1925 100644 --- a/datafusion/expr/src/udwf.rs +++ b/datafusion/expr/src/udwf.rs @@ -80,7 +80,7 @@ impl WindowUDF { /// /// See [`WindowUDFImpl`] for a more convenient way to create a /// `WindowUDF` using trait objects - #[deprecated(since = "34.0.0", note = "please implement ScalarUDFImpl instead")] + #[deprecated(since = "34.0.0", note = "please implement WindowUDFImpl instead")] pub fn new( name: &str, signature: &Signature, @@ -112,6 +112,14 @@ impl WindowUDF { self.inner.clone() } + /// Adds additional names that can be used to invoke this function, in + /// addition to `name` + /// + /// If you implement [`WindowUDFImpl`] directly you should return aliases directly. + pub fn with_aliases(self, aliases: impl IntoIterator) -> Self { + Self::new_from_impl(AliasedWindowUDFImpl::new(self.inner.clone(), aliases)) + } + /// creates a [`Expr`] that calls the window function given /// the `partition_by`, `order_by`, and `window_frame` definition /// @@ -143,6 +151,11 @@ impl WindowUDF { self.inner.name() } + /// Returns the aliases for this function. + pub fn aliases(&self) -> &[String] { + self.inner.aliases() + } + /// Returns this function's signature (what input types are accepted) /// /// See [`WindowUDFImpl::signature`] for more details. 
@@ -217,7 +230,7 @@ where /// fn partition_evaluator(&self) -> Result> { unimplemented!() } /// } /// -/// // Create a new ScalarUDF from the implementation +/// // Create a new WindowUDF from the implementation /// let smooth_it = WindowUDF::from(SmoothIt::new()); /// /// // Call the function `add_one(col)` @@ -245,6 +258,60 @@ pub trait WindowUDFImpl: Debug + Send + Sync { /// Invoke the function, returning the [`PartitionEvaluator`] instance fn partition_evaluator(&self) -> Result>; + + /// Returns any aliases (alternate names) for this function. + /// + /// Note: `aliases` should only include names other than [`Self::name`]. + /// Defaults to `[]` (no aliases) + fn aliases(&self) -> &[String] { + &[] + } +} + +/// WindowUDF that adds an alias to the underlying function. It is better to +/// implement [`WindowUDFImpl`], which supports aliases, directly if possible. +#[derive(Debug)] +struct AliasedWindowUDFImpl { + inner: Arc, + aliases: Vec, +} + +impl AliasedWindowUDFImpl { + pub fn new( + inner: Arc, + new_aliases: impl IntoIterator, + ) -> Self { + let mut aliases = inner.aliases().to_vec(); + aliases.extend(new_aliases.into_iter().map(|s| s.to_string())); + + Self { inner, aliases } + } +} + +impl WindowUDFImpl for AliasedWindowUDFImpl { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + self.inner.name() + } + + fn signature(&self) -> &Signature { + self.inner.signature() + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + self.inner.return_type(arg_types) + } + + fn partition_evaluator(&self) -> Result> { + self.inner.partition_evaluator() + } + + fn aliases(&self) -> &[String] { + &self.aliases + } } /// Implementation of [`WindowUDFImpl`] that wraps the function style pointers diff --git a/datafusion/functions-array/Cargo.toml b/datafusion/functions-array/Cargo.toml index 17be817238c2..ba7d9e26ecaf 100644 --- a/datafusion/functions-array/Cargo.toml +++ b/datafusion/functions-array/Cargo.toml @@ -38,6 +38,9 @@ path = "src/lib.rs" [dependencies] arrow = { workspace = true } +arrow-array = { workspace = true } +arrow-buffer = { workspace = true } +arrow-schema = { workspace = true } datafusion-common = { workspace = true } datafusion-execution = { workspace = true } datafusion-expr = { workspace = true } diff --git a/datafusion/functions-array/src/concat.rs b/datafusion/functions-array/src/concat.rs new file mode 100644 index 000000000000..a8e7d1008f46 --- /dev/null +++ b/datafusion/functions-array/src/concat.rs @@ -0,0 +1,436 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
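// An illustrative, self-contained sketch (not part of this patch) of the
// `with_aliases` builder added above, using a made-up minimal window UDF.
// Crate paths follow the datafusion_expr API shown in this diff; anything not
// visible in the diff (the `SmoothIt` field, the no-op evaluator, the return
// type) is an assumption for the example only.
use std::any::Any;

use arrow::datatypes::DataType;
use datafusion_common::Result;
use datafusion_expr::{
    PartitionEvaluator, Signature, Volatility, WindowUDF, WindowUDFImpl,
};

#[derive(Debug)]
struct NoopEvaluator;
impl PartitionEvaluator for NoopEvaluator {}

#[derive(Debug)]
struct SmoothIt {
    signature: Signature,
}

impl WindowUDFImpl for SmoothIt {
    fn as_any(&self) -> &dyn Any {
        self
    }
    fn name(&self) -> &str {
        "smooth_it"
    }
    fn signature(&self) -> &Signature {
        &self.signature
    }
    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
        Ok(DataType::Float64)
    }
    fn partition_evaluator(&self) -> Result<Box<dyn PartitionEvaluator>> {
        Ok(Box::new(NoopEvaluator))
    }
}

fn main() {
    let udf = WindowUDF::from(SmoothIt {
        signature: Signature::any(1, Volatility::Immutable),
    })
    // wraps the inner impl in `AliasedWindowUDFImpl`, extending its alias list
    .with_aliases(["smooth", "smooth_it_out"]);

    // the primary name is unchanged; the extra names are reported by `aliases()`
    assert_eq!(udf.name(), "smooth_it");
    assert_eq!(
        udf.aliases(),
        &["smooth".to_string(), "smooth_it_out".to_string()]
    );
}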
+ +// Includes `array append`, `array prepend`, and `array concat` functions + +use std::{any::Any, cmp::Ordering, sync::Arc}; + +use arrow::array::{Capacities, MutableArrayData}; +use arrow_array::{Array, ArrayRef, GenericListArray, OffsetSizeTrait}; +use arrow_buffer::{BooleanBufferBuilder, NullBuffer, OffsetBuffer}; +use arrow_schema::{DataType, Field}; +use datafusion_common::Result; +use datafusion_common::{ + cast::as_generic_list_array, exec_err, not_impl_err, plan_err, utils::list_ndims, +}; +use datafusion_expr::expr::ScalarFunction; +use datafusion_expr::Expr; +use datafusion_expr::{ + type_coercion::binary::get_wider_type, ColumnarValue, ScalarUDFImpl, Signature, + Volatility, +}; + +use crate::utils::{align_array_dimensions, check_datatypes, make_scalar_function}; + +make_udf_function!( + ArrayAppend, + array_append, + array element, // arg name + "appends an element to the end of an array.", // doc + array_append_udf // internal function name +); + +#[derive(Debug)] +pub(super) struct ArrayAppend { + signature: Signature, + aliases: Vec, +} + +impl ArrayAppend { + pub fn new() -> Self { + Self { + signature: Signature::array_and_element(Volatility::Immutable), + aliases: vec![ + String::from("array_append"), + String::from("list_append"), + String::from("array_push_back"), + String::from("list_push_back"), + ], + } + } +} + +impl ScalarUDFImpl for ArrayAppend { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "array_append" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + Ok(arg_types[0].clone()) + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result { + make_scalar_function(array_append_inner)(args) + } + + fn aliases(&self) -> &[String] { + &self.aliases + } +} + +make_udf_function!( + ArrayPrepend, + array_prepend, + element array, + "Prepends an element to the beginning of an array.", + array_prepend_udf +); + +#[derive(Debug)] +pub(super) struct ArrayPrepend { + signature: Signature, + aliases: Vec, +} + +impl ArrayPrepend { + pub fn new() -> Self { + Self { + signature: Signature::element_and_array(Volatility::Immutable), + aliases: vec![ + String::from("array_prepend"), + String::from("list_prepend"), + String::from("array_push_front"), + String::from("list_push_front"), + ], + } + } +} + +impl ScalarUDFImpl for ArrayPrepend { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "array_prepend" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + Ok(arg_types[1].clone()) + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result { + make_scalar_function(array_prepend_inner)(args) + } + + fn aliases(&self) -> &[String] { + &self.aliases + } +} + +make_udf_function!( + ArrayConcat, + array_concat, + "Concatenates arrays.", + array_concat_udf +); + +#[derive(Debug)] +pub(super) struct ArrayConcat { + signature: Signature, + aliases: Vec, +} + +impl ArrayConcat { + pub fn new() -> Self { + Self { + signature: Signature::variadic_any(Volatility::Immutable), + aliases: vec![ + String::from("array_concat"), + String::from("array_cat"), + String::from("list_concat"), + String::from("list_cat"), + ], + } + } +} + +impl ScalarUDFImpl for ArrayConcat { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "array_concat" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + let 
mut expr_type = DataType::Null; + let mut max_dims = 0; + for arg_type in arg_types { + match arg_type { + DataType::List(field) => { + if !field.data_type().equals_datatype(&DataType::Null) { + let dims = list_ndims(arg_type); + expr_type = match max_dims.cmp(&dims) { + Ordering::Greater => expr_type, + Ordering::Equal => get_wider_type(&expr_type, arg_type)?, + Ordering::Less => { + max_dims = dims; + arg_type.clone() + } + }; + } + } + _ => { + return plan_err!( + "The array_concat function can only accept list as the args." + ) + } + } + } + + Ok(expr_type) + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result { + make_scalar_function(array_concat_inner)(args) + } + + fn aliases(&self) -> &[String] { + &self.aliases + } +} + +/// Array_concat/Array_cat SQL function +pub(crate) fn array_concat_inner(args: &[ArrayRef]) -> Result { + if args.is_empty() { + return exec_err!("array_concat expects at least one arguments"); + } + + let mut new_args = vec![]; + for arg in args { + let ndim = list_ndims(arg.data_type()); + let base_type = datafusion_common::utils::base_type(arg.data_type()); + if ndim == 0 { + return not_impl_err!("Array is not type '{base_type:?}'."); + } + if !base_type.eq(&DataType::Null) { + new_args.push(arg.clone()); + } + } + + match &args[0].data_type() { + DataType::LargeList(_) => concat_internal::(new_args.as_slice()), + _ => concat_internal::(new_args.as_slice()), + } +} + +fn concat_internal(args: &[ArrayRef]) -> Result { + let args = align_array_dimensions::(args.to_vec())?; + + let list_arrays = args + .iter() + .map(|arg| as_generic_list_array::(arg)) + .collect::>>()?; + // Assume number of rows is the same for all arrays + let row_count = list_arrays[0].len(); + + let mut array_lengths = vec![]; + let mut arrays = vec![]; + let mut valid = BooleanBufferBuilder::new(row_count); + for i in 0..row_count { + let nulls = list_arrays + .iter() + .map(|arr| arr.is_null(i)) + .collect::>(); + + // If all the arrays are null, the concatenated array is null + let is_null = nulls.iter().all(|&x| x); + if is_null { + array_lengths.push(0); + valid.append(false); + } else { + // Get all the arrays on i-th row + let values = list_arrays + .iter() + .map(|arr| arr.value(i)) + .collect::>(); + + let elements = values + .iter() + .map(|a| a.as_ref()) + .collect::>(); + + // Concatenated array on i-th row + let concated_array = arrow::compute::concat(elements.as_slice())?; + array_lengths.push(concated_array.len()); + arrays.push(concated_array); + valid.append(true); + } + } + // Assume all arrays have the same data type + let data_type = list_arrays[0].value_type(); + let buffer = valid.finish(); + + let elements = arrays + .iter() + .map(|a| a.as_ref()) + .collect::>(); + + let list_arr = GenericListArray::::new( + Arc::new(Field::new("item", data_type, true)), + OffsetBuffer::from_lengths(array_lengths), + Arc::new(arrow::compute::concat(elements.as_slice())?), + Some(NullBuffer::new(buffer)), + ); + + Ok(Arc::new(list_arr)) +} + +/// Kernal functions + +/// Array_append SQL function +pub(crate) fn array_append_inner(args: &[ArrayRef]) -> Result { + if args.len() != 2 { + return exec_err!("array_append expects two arguments"); + } + + match args[0].data_type() { + DataType::LargeList(_) => general_append_and_prepend::(args, true), + _ => general_append_and_prepend::(args, true), + } +} + +/// Array_prepend SQL function +pub(crate) fn array_prepend_inner(args: &[ArrayRef]) -> Result { + if args.len() != 2 { + return exec_err!("array_prepend expects two arguments"); + } 
+ + match args[1].data_type() { + DataType::LargeList(_) => general_append_and_prepend::(args, false), + _ => general_append_and_prepend::(args, false), + } +} + +fn general_append_and_prepend( + args: &[ArrayRef], + is_append: bool, +) -> Result +where + i64: TryInto, +{ + let (list_array, element_array) = if is_append { + let list_array = as_generic_list_array::(&args[0])?; + let element_array = &args[1]; + check_datatypes("array_append", &[element_array, list_array.values()])?; + (list_array, element_array) + } else { + let list_array = as_generic_list_array::(&args[1])?; + let element_array = &args[0]; + check_datatypes("array_prepend", &[list_array.values(), element_array])?; + (list_array, element_array) + }; + + let res = match list_array.value_type() { + DataType::List(_) => concat_internal::(args)?, + DataType::LargeList(_) => concat_internal::(args)?, + data_type => { + return generic_append_and_prepend::( + list_array, + element_array, + &data_type, + is_append, + ); + } + }; + + Ok(res) +} + +/// Appends or prepends elements to a ListArray. +/// +/// This function takes a ListArray, an ArrayRef, a FieldRef, and a boolean flag +/// indicating whether to append or prepend the elements. It returns a `Result` +/// representing the resulting ListArray after the operation. +/// +/// # Arguments +/// +/// * `list_array` - A reference to the ListArray to which elements will be appended/prepended. +/// * `element_array` - A reference to the Array containing elements to be appended/prepended. +/// * `field` - A reference to the Field describing the data type of the arrays. +/// * `is_append` - A boolean flag indicating whether to append (`true`) or prepend (`false`) elements. +/// +/// # Examples +/// +/// generic_append_and_prepend( +/// [1, 2, 3], 4, append => [1, 2, 3, 4] +/// 5, [6, 7, 8], prepend => [5, 6, 7, 8] +/// ) +fn generic_append_and_prepend( + list_array: &GenericListArray, + element_array: &ArrayRef, + data_type: &DataType, + is_append: bool, +) -> Result +where + i64: TryInto, +{ + let mut offsets = vec![O::usize_as(0)]; + let values = list_array.values(); + let original_data = values.to_data(); + let element_data = element_array.to_data(); + let capacity = Capacities::Array(original_data.len() + element_data.len()); + + let mut mutable = MutableArrayData::with_capacities( + vec![&original_data, &element_data], + false, + capacity, + ); + + let values_index = 0; + let element_index = 1; + + for (row_index, offset_window) in list_array.offsets().windows(2).enumerate() { + let start = offset_window[0].to_usize().unwrap(); + let end = offset_window[1].to_usize().unwrap(); + if is_append { + mutable.extend(values_index, start, end); + mutable.extend(element_index, row_index, row_index + 1); + } else { + mutable.extend(element_index, row_index, row_index + 1); + mutable.extend(values_index, start, end); + } + offsets.push(offsets[row_index] + O::usize_as(end - start + 1)); + } + + let data = mutable.freeze(); + + Ok(Arc::new(GenericListArray::::try_new( + Arc::new(Field::new("item", data_type.to_owned(), true)), + OffsetBuffer::new(offsets.into()), + arrow_array::make_array(data), + None, + )?)) +} diff --git a/datafusion/functions-array/src/kernels.rs b/datafusion/functions-array/src/kernels.rs index ad96d232aa4a..1a6ebdd9029d 100644 --- a/datafusion/functions-array/src/kernels.rs +++ b/datafusion/functions-array/src/kernels.rs @@ -18,22 +18,33 @@ //! 
implementation kernels for array functions use arrow::array::{ - Array, ArrayRef, BooleanArray, Date32Array, Float32Array, Float64Array, - GenericListArray, Int16Array, Int32Array, Int64Array, Int8Array, LargeStringArray, - OffsetSizeTrait, StringArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array, + Array, ArrayRef, BooleanArray, Capacities, Date32Array, Float32Array, Float64Array, + GenericListArray, Int16Array, Int32Array, Int64Array, Int8Array, LargeListArray, + LargeStringArray, ListArray, ListBuilder, MutableArrayData, OffsetSizeTrait, + StringArray, StringBuilder, UInt16Array, UInt32Array, UInt64Array, UInt8Array, }; -use arrow::array::{LargeListArray, ListArray}; -use arrow::buffer::OffsetBuffer; -use arrow::datatypes::Field; -use arrow::datatypes::UInt64Type; -use arrow::datatypes::{DataType, Date32Type, IntervalMonthDayNanoType}; +use arrow::compute; +use arrow::datatypes::{ + DataType, Date32Type, Field, IntervalMonthDayNanoType, UInt64Type, +}; +use arrow::row::{RowConverter, SortField}; +use arrow_array::new_null_array; +use arrow_buffer::{BooleanBufferBuilder, NullBuffer, OffsetBuffer}; +use arrow_schema::FieldRef; +use arrow_schema::SortOptions; + use datafusion_common::cast::{ - as_date32_array, as_generic_list_array, as_int64_array, as_interval_mdn_array, - as_large_list_array, as_list_array, as_null_array, as_string_array, + as_date32_array, as_generic_list_array, as_generic_string_array, as_int64_array, + as_interval_mdn_array, as_large_list_array, as_list_array, as_null_array, + as_string_array, }; -use datafusion_common::{exec_err, not_impl_datafusion_err, DataFusionError, Result}; +use datafusion_common::{ + exec_err, internal_err, not_impl_datafusion_err, DataFusionError, Result, +}; +use itertools::Itertools; use std::any::type_name; use std::sync::Arc; + macro_rules! 
downcast_arg { ($ARG:expr, $ARRAY_TYPE:ident) => {{ $ARG.as_any().downcast_ref::<$ARRAY_TYPE>().ok_or_else(|| { @@ -259,6 +270,98 @@ pub(super) fn array_to_string(args: &[ArrayRef]) -> Result { Ok(Arc::new(string_arr)) } +/// Splits string at occurrences of delimiter and returns an array of parts +/// string_to_array('abc~@~def~@~ghi', '~@~') = '["abc", "def", "ghi"]' +pub fn string_to_array(args: &[ArrayRef]) -> Result { + if args.len() < 2 || args.len() > 3 { + return exec_err!("string_to_array expects two or three arguments"); + } + let string_array = as_generic_string_array::(&args[0])?; + let delimiter_array = as_generic_string_array::(&args[1])?; + + let mut list_builder = ListBuilder::new(StringBuilder::with_capacity( + string_array.len(), + string_array.get_buffer_memory_size(), + )); + + match args.len() { + 2 => { + string_array.iter().zip(delimiter_array.iter()).for_each( + |(string, delimiter)| { + match (string, delimiter) { + (Some(string), Some("")) => { + list_builder.values().append_value(string); + list_builder.append(true); + } + (Some(string), Some(delimiter)) => { + string.split(delimiter).for_each(|s| { + list_builder.values().append_value(s); + }); + list_builder.append(true); + } + (Some(string), None) => { + string.chars().map(|c| c.to_string()).for_each(|c| { + list_builder.values().append_value(c); + }); + list_builder.append(true); + } + _ => list_builder.append(false), // null value + } + }, + ); + } + + 3 => { + let null_value_array = as_generic_string_array::(&args[2])?; + string_array + .iter() + .zip(delimiter_array.iter()) + .zip(null_value_array.iter()) + .for_each(|((string, delimiter), null_value)| { + match (string, delimiter) { + (Some(string), Some("")) => { + if Some(string) == null_value { + list_builder.values().append_null(); + } else { + list_builder.values().append_value(string); + } + list_builder.append(true); + } + (Some(string), Some(delimiter)) => { + string.split(delimiter).for_each(|s| { + if Some(s) == null_value { + list_builder.values().append_null(); + } else { + list_builder.values().append_value(s); + } + }); + list_builder.append(true); + } + (Some(string), None) => { + string.chars().map(|c| c.to_string()).for_each(|c| { + if Some(c.as_str()) == null_value { + list_builder.values().append_null(); + } else { + list_builder.values().append_value(c); + } + }); + list_builder.append(true); + } + _ => list_builder.append(false), // null value + } + }); + } + _ => { + return exec_err!( + "Expect string_to_array function to take two or three parameters" + ) + } + } + + let list_array = list_builder.finish(); + Ok(Arc::new(list_array) as ArrayRef) +} + /// Generates an array of integers from start to stop with a given step. /// /// This function takes 1 to 3 ArrayRefs as arguments, representing start, stop, and step values. @@ -291,39 +394,66 @@ pub(super) fn gen_range(args: &[ArrayRef], include_upper: bool) -> Result { + return exec_err!( + "step can't be 0 for function {}(start [, stop, step])", + if include_upper { + "generate_series" + } else { + "range" + } + ); + } + Some((start, stop, step)) => { + // Below, we utilize `usize` to represent steps. + // On 32-bit targets, the absolute value of `i64` may fail to fit into `usize`. 
+ let step_abs = usize::try_from(step.unsigned_abs()).map_err(|_| { + not_impl_datafusion_err!("step {} can't fit into usize", step) + })?; + values.extend( + gen_range_iter(start, stop, step < 0, include_upper) + .step_by(step_abs), + ); + offsets.push(values.len() as i32); + valid.append(true); + } + // If any of the arguments is NULL, append a NULL value to the result. + None => { + offsets.push(values.len() as i32); + valid.append(false); + } + }; } let arr = Arc::new(ListArray::try_new( Arc::new(Field::new("item", DataType::Int64, true)), OffsetBuffer::new(offsets.into()), Arc::new(Int64Array::from(values)), - None, + Some(NullBuffer::new(valid.finish())), )?); Ok(arr) } +/// Get the (start, stop, step) args for the range and generate_series function. +/// If any of the arguments is NULL, returns None. +fn retrieve_range_args( + start_array: Option<&Int64Array>, + stop: Option, + step_array: Option<&Int64Array>, + idx: usize, +) -> Option<(i64, i64, i64)> { + // Default start value is 0 if not provided + let start = + start_array.map_or(Some(0), |arr| arr.is_valid(idx).then(|| arr.value(idx)))?; + let stop = stop?; + // Default step value is 1 if not provided + let step = + step_array.map_or(Some(1), |arr| arr.is_valid(idx).then(|| arr.value(idx)))?; + Some((start, stop, step)) +} + /// Returns an iterator of i64 values from start to stop fn gen_range_iter( start: i64, @@ -604,6 +734,152 @@ fn general_array_length(array: &[ArrayRef]) -> Result Result { + if args.len() != 2 { + return exec_err!("array_repeat expects two arguments"); + } + + let element = &args[0]; + let count_array = as_int64_array(&args[1])?; + + match element.data_type() { + DataType::List(_) => { + let list_array = as_list_array(element)?; + general_list_repeat::(list_array, count_array) + } + DataType::LargeList(_) => { + let list_array = as_large_list_array(element)?; + general_list_repeat::(list_array, count_array) + } + _ => general_repeat::(element, count_array), + } +} + +/// For each element of `array[i]` repeat `count_array[i]` times. +/// +/// Assumption for the input: +/// 1. `count[i] >= 0` +/// 2. `array.len() == count_array.len()` +/// +/// For example, +/// ```text +/// array_repeat( +/// [1, 2, 3], [2, 0, 1] => [[1, 1], [], [3]] +/// ) +/// ``` +fn general_repeat( + array: &ArrayRef, + count_array: &Int64Array, +) -> Result { + let data_type = array.data_type(); + let mut new_values = vec![]; + + let count_vec = count_array + .values() + .to_vec() + .iter() + .map(|x| *x as usize) + .collect::>(); + + for (row_index, &count) in count_vec.iter().enumerate() { + let repeated_array = if array.is_null(row_index) { + new_null_array(data_type, count) + } else { + let original_data = array.to_data(); + let capacity = Capacities::Array(count); + let mut mutable = + MutableArrayData::with_capacities(vec![&original_data], false, capacity); + + for _ in 0..count { + mutable.extend(0, row_index, row_index + 1); + } + + let data = mutable.freeze(); + arrow_array::make_array(data) + }; + new_values.push(repeated_array); + } + + let new_values: Vec<_> = new_values.iter().map(|a| a.as_ref()).collect(); + let values = compute::concat(&new_values)?; + + Ok(Arc::new(GenericListArray::::try_new( + Arc::new(Field::new("item", data_type.to_owned(), true)), + OffsetBuffer::from_lengths(count_vec), + values, + None, + )?)) +} + +/// Handle List version of `general_repeat` +/// +/// For each element of `list_array[i]` repeat `count_array[i]` times. 
+/// +/// For example, +/// ```text +/// array_repeat( +/// [[1, 2, 3], [4, 5], [6]], [2, 0, 1] => [[[1, 2, 3], [1, 2, 3]], [], [[6]]] +/// ) +/// ``` +fn general_list_repeat( + list_array: &GenericListArray, + count_array: &Int64Array, +) -> Result { + let data_type = list_array.data_type(); + let value_type = list_array.value_type(); + let mut new_values = vec![]; + + let count_vec = count_array + .values() + .to_vec() + .iter() + .map(|x| *x as usize) + .collect::>(); + + for (list_array_row, &count) in list_array.iter().zip(count_vec.iter()) { + let list_arr = match list_array_row { + Some(list_array_row) => { + let original_data = list_array_row.to_data(); + let capacity = Capacities::Array(original_data.len() * count); + let mut mutable = MutableArrayData::with_capacities( + vec![&original_data], + false, + capacity, + ); + + for _ in 0..count { + mutable.extend(0, 0, original_data.len()); + } + + let data = mutable.freeze(); + let repeated_array = arrow_array::make_array(data); + + let list_arr = GenericListArray::::try_new( + Arc::new(Field::new("item", value_type.clone(), true)), + OffsetBuffer::::from_lengths(vec![original_data.len(); count]), + repeated_array, + None, + )?; + Arc::new(list_arr) as ArrayRef + } + None => new_null_array(data_type, count), + }; + new_values.push(list_arr); + } + + let lengths = new_values.iter().map(|a| a.len()).collect::>(); + let new_values: Vec<_> = new_values.iter().map(|a| a.as_ref()).collect(); + let values = compute::concat(&new_values)?; + + Ok(Arc::new(ListArray::try_new( + Arc::new(Field::new("item", data_type.to_owned(), true)), + OffsetBuffer::::from_lengths(lengths), + values, + None, + )?)) +} + /// Array_length SQL function pub fn array_length(args: &[ArrayRef]) -> Result { if args.len() != 1 && args.len() != 2 { @@ -617,6 +893,89 @@ pub fn array_length(args: &[ArrayRef]) -> Result { } } +/// Array_sort SQL function +pub fn array_sort(args: &[ArrayRef]) -> Result { + if args.is_empty() || args.len() > 3 { + return exec_err!("array_sort expects one to three arguments"); + } + + let sort_option = match args.len() { + 1 => None, + 2 => { + let sort = as_string_array(&args[1])?.value(0); + Some(SortOptions { + descending: order_desc(sort)?, + nulls_first: true, + }) + } + 3 => { + let sort = as_string_array(&args[1])?.value(0); + let nulls_first = as_string_array(&args[2])?.value(0); + Some(SortOptions { + descending: order_desc(sort)?, + nulls_first: order_nulls_first(nulls_first)?, + }) + } + _ => return exec_err!("array_sort expects 1 to 3 arguments"), + }; + + let list_array = as_list_array(&args[0])?; + let row_count = list_array.len(); + + let mut array_lengths = vec![]; + let mut arrays = vec![]; + let mut valid = BooleanBufferBuilder::new(row_count); + for i in 0..row_count { + if list_array.is_null(i) { + array_lengths.push(0); + valid.append(false); + } else { + let arr_ref = list_array.value(i); + let arr_ref = arr_ref.as_ref(); + + let sorted_array = compute::sort(arr_ref, sort_option)?; + array_lengths.push(sorted_array.len()); + arrays.push(sorted_array); + valid.append(true); + } + } + + // Assume all arrays have the same data type + let data_type = list_array.value_type(); + let buffer = valid.finish(); + + let elements = arrays + .iter() + .map(|a| a.as_ref()) + .collect::>(); + + let list_arr = ListArray::new( + Arc::new(Field::new("item", data_type, true)), + OffsetBuffer::from_lengths(array_lengths), + Arc::new(compute::concat(elements.as_slice())?), + Some(NullBuffer::new(buffer)), + ); + Ok(Arc::new(list_arr)) +} + 
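// A usage sketch for the `array_sort` kernel above (assumed datafusion 36-era
// async SQL API; requires tokio; the exact printed output is the expected
// behavior, not something verified here). The optional second argument is
// parsed by `order_desc` ('ASC'/'DESC') and the optional third by
// `order_nulls_first` ('NULLS FIRST'/'NULLS LAST').
use datafusion::error::Result;
use datafusion::prelude::*;

#[tokio::main]
async fn main() -> Result<()> {
    let ctx = SessionContext::new();
    // expected output row: [3, 2, 1, NULL] (descending, nulls last)
    ctx.sql("SELECT array_sort(make_array(2, 1, 3, NULL), 'DESC', 'NULLS LAST')")
        .await?
        .show()
        .await?;
    Ok(())
}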
+fn order_desc(modifier: &str) -> Result { + match modifier.to_uppercase().as_str() { + "DESC" => Ok(true), + "ASC" => Ok(false), + _ => exec_err!("the second parameter of array_sort expects DESC or ASC"), + } +} + +fn order_nulls_first(modifier: &str) -> Result { + match modifier.to_uppercase().as_str() { + "NULLS FIRST" => Ok(true), + "NULLS LAST" => Ok(false), + _ => exec_err!( + "the third parameter of array_sort expects NULLS FIRST or NULLS LAST" + ), + } +} + // Create new offsets that are euqiavlent to `flatten` the array. fn get_offsets_for_flatten( offsets: OffsetBuffer, @@ -685,3 +1044,65 @@ pub fn flatten(args: &[ArrayRef]) -> Result { } } } + +/// array_distinct SQL function +/// example: from list [1, 3, 2, 3, 1, 2, 4] to [1, 2, 3, 4] +pub fn array_distinct(args: &[ArrayRef]) -> Result { + if args.len() != 1 { + return exec_err!("array_distinct needs one argument"); + } + + // handle null + if args[0].data_type() == &DataType::Null { + return Ok(args[0].clone()); + } + + // handle for list & largelist + match args[0].data_type() { + DataType::List(field) => { + let array = as_list_array(&args[0])?; + general_array_distinct(array, field) + } + DataType::LargeList(field) => { + let array = as_large_list_array(&args[0])?; + general_array_distinct(array, field) + } + array_type => exec_err!("array_distinct does not support type '{array_type:?}'"), + } +} + +pub fn general_array_distinct( + array: &GenericListArray, + field: &FieldRef, +) -> Result { + let dt = array.value_type(); + let mut offsets = Vec::with_capacity(array.len()); + offsets.push(OffsetSize::usize_as(0)); + let mut new_arrays = Vec::with_capacity(array.len()); + let converter = RowConverter::new(vec![SortField::new(dt)])?; + // distinct for each list in ListArray + for arr in array.iter().flatten() { + let values = converter.convert_columns(&[arr])?; + // sort elements in list and remove duplicates + let rows = values.iter().sorted().dedup().collect::>(); + let last_offset: OffsetSize = offsets.last().copied().unwrap(); + offsets.push(last_offset + OffsetSize::usize_as(rows.len())); + let arrays = converter.convert_rows(rows)?; + let array = match arrays.first() { + Some(array) => array.clone(), + None => { + return internal_err!("array_distinct: failed to get array from rows") + } + }; + new_arrays.push(array); + } + let offsets = OffsetBuffer::new(offsets.into()); + let new_arrays_ref = new_arrays.iter().map(|v| v.as_ref()).collect::>(); + let values = compute::concat(&new_arrays_ref)?; + Ok(Arc::new(GenericListArray::::try_new( + field.clone(), + offsets, + values, + None, + )?)) +} diff --git a/datafusion/functions-array/src/lib.rs b/datafusion/functions-array/src/lib.rs index 73055966ee46..31b971a42297 100644 --- a/datafusion/functions-array/src/lib.rs +++ b/datafusion/functions-array/src/lib.rs @@ -29,7 +29,9 @@ pub mod macros; mod array_has; +mod concat; mod kernels; +mod make_array; mod udf; mod utils; @@ -44,32 +46,48 @@ pub mod expr_fn { pub use super::array_has::array_has; pub use super::array_has::array_has_all; pub use super::array_has::array_has_any; + pub use super::concat::array_append; + pub use super::concat::array_concat; + pub use super::concat::array_prepend; + pub use super::make_array::make_array; pub use super::udf::array_dims; + pub use super::udf::array_distinct; pub use super::udf::array_empty; pub use super::udf::array_length; pub use super::udf::array_ndims; + pub use super::udf::array_repeat; + pub use super::udf::array_sort; pub use super::udf::array_to_string; pub use 
super::udf::cardinality; pub use super::udf::flatten; pub use super::udf::gen_series; pub use super::udf::range; + pub use super::udf::string_to_array; } /// Registers all enabled packages with a [`FunctionRegistry`] pub fn register_all(registry: &mut dyn FunctionRegistry) -> Result<()> { let functions: Vec> = vec![ udf::array_to_string_udf(), + udf::string_to_array_udf(), udf::range_udf(), udf::gen_series_udf(), udf::array_dims_udf(), udf::cardinality_udf(), udf::array_ndims_udf(), + concat::array_append_udf(), + concat::array_prepend_udf(), + concat::array_concat_udf(), + make_array::make_array_udf(), array_has::array_has_udf(), array_has::array_has_all_udf(), array_has::array_has_any_udf(), udf::array_empty_udf(), udf::array_length_udf(), udf::flatten_udf(), + udf::array_sort_udf(), + udf::array_distinct_udf(), + udf::array_repeat_udf(), ]; functions.into_iter().try_for_each(|udf| { let existing_udf = registry.register_udf(udf)?; diff --git a/datafusion/functions-array/src/macros.rs b/datafusion/functions-array/src/macros.rs index c503fde05b18..c49f5830b8d5 100644 --- a/datafusion/functions-array/src/macros.rs +++ b/datafusion/functions-array/src/macros.rs @@ -76,4 +76,34 @@ macro_rules! make_udf_function { } } }; + ($UDF:ty, $EXPR_FN:ident, $DOC:expr , $SCALAR_UDF_FN:ident) => { + paste::paste! { + // "fluent expr_fn" style function + #[doc = $DOC] + pub fn $EXPR_FN(arg: Vec) -> Expr { + Expr::ScalarFunction(ScalarFunction::new_udf( + $SCALAR_UDF_FN(), + arg, + )) + } + + /// Singleton instance of [`$UDF`], ensures the UDF is only created once + /// named STATIC_$(UDF). For example `STATIC_ArrayToString` + #[allow(non_upper_case_globals)] + static [< STATIC_ $UDF >]: std::sync::OnceLock> = + std::sync::OnceLock::new(); + /// ScalarFunction that returns a [`ScalarUDF`] for [`$UDF`] + /// + /// [`ScalarUDF`]: datafusion_expr::ScalarUDF + pub fn $SCALAR_UDF_FN() -> std::sync::Arc { + [< STATIC_ $UDF >] + .get_or_init(|| { + std::sync::Arc::new(datafusion_expr::ScalarUDF::new_from_impl( + <$UDF>::new(), + )) + }) + .clone() + } + } + }; } diff --git a/datafusion/functions-array/src/make_array.rs b/datafusion/functions-array/src/make_array.rs new file mode 100644 index 000000000000..a371ea767b15 --- /dev/null +++ b/datafusion/functions-array/src/make_array.rs @@ -0,0 +1,221 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
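// Illustrative only: how the two `make_udf_function!` arms above shape the
// generated "fluent" expr_fn API. The fixed-arity arm (used for e.g.
// `array_append`) takes one `Expr` per named argument, while the new
// doc-only arm (used for `array_concat` and `make_array`) takes a single
// `Vec<Expr>`. The column names below are made up for the example.
use datafusion_expr::{col, lit, Expr};
use datafusion_functions_array::expr_fn::{array_append, array_concat, make_array};

fn example_exprs() -> Vec<Expr> {
    // fixed-arity arm: `array_append(array, element)`
    let appended = array_append(col("tags"), lit("new-tag"));

    // variadic arm: a single Vec<Expr> argument
    let concatenated =
        array_concat(vec![col("tags"), make_array(vec![lit("a"), lit("b")])]);

    vec![appended, concatenated]
}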
+ +// core array function like `make_array` + +use std::{any::Any, sync::Arc}; + +use arrow::array::{ArrayData, Capacities, MutableArrayData}; +use arrow_array::{ + new_null_array, Array, ArrayRef, GenericListArray, NullArray, OffsetSizeTrait, +}; +use arrow_buffer::OffsetBuffer; +use arrow_schema::{DataType, Field}; +use datafusion_common::Result; +use datafusion_common::{plan_err, utils::array_into_list_array}; +use datafusion_expr::expr::ScalarFunction; +use datafusion_expr::Expr; +use datafusion_expr::{ + ColumnarValue, ScalarUDFImpl, Signature, TypeSignature, Volatility, +}; + +use crate::utils::make_scalar_function; + +make_udf_function!( + MakeArray, + make_array, + "Returns an Arrow array using the specified input expressions.", + make_array_udf +); + +#[derive(Debug)] +pub struct MakeArray { + signature: Signature, + aliases: Vec, +} + +impl MakeArray { + pub fn new() -> Self { + Self { + signature: Signature::one_of( + vec![TypeSignature::VariadicEqual, TypeSignature::Any(0)], + Volatility::Immutable, + ), + aliases: vec![String::from("make_array"), String::from("make_list")], + } + } +} + +impl ScalarUDFImpl for MakeArray { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "make_array" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> datafusion_common::Result { + match arg_types.len() { + 0 => Ok(DataType::List(Arc::new(Field::new( + "item", + DataType::Null, + true, + )))), + _ => { + let mut expr_type = DataType::Null; + for arg_type in arg_types { + if !arg_type.equals_datatype(&DataType::Null) { + expr_type = arg_type.clone(); + break; + } + } + + Ok(DataType::List(Arc::new(Field::new( + "item", expr_type, true, + )))) + } + } + } + + fn invoke(&self, args: &[ColumnarValue]) -> datafusion_common::Result { + make_scalar_function(make_array_inner)(args) + } + + fn aliases(&self) -> &[String] { + &self.aliases + } +} + +/// `make_array` SQL function +pub(crate) fn make_array_inner(arrays: &[ArrayRef]) -> Result { + let mut data_type = DataType::Null; + for arg in arrays { + let arg_data_type = arg.data_type(); + if !arg_data_type.equals_datatype(&DataType::Null) { + data_type = arg_data_type.clone(); + break; + } + } + + match data_type { + // Either an empty array or all nulls: + DataType::Null => { + let array = + new_null_array(&DataType::Null, arrays.iter().map(|a| a.len()).sum()); + Ok(Arc::new(array_into_list_array(array))) + } + DataType::LargeList(..) => array_array::(arrays, data_type), + _ => array_array::(arrays, data_type), + } +} + +/// Convert one or more [`ArrayRef`] of the same type into a +/// `ListArray` or 'LargeListArray' depending on the offset size. 
+/// +/// # Example (non nested) +/// +/// Calling `array(col1, col2)` where col1 and col2 are non nested +/// would return a single new `ListArray`, where each row was a list +/// of 2 elements: +/// +/// ```text +/// ┌─────────┐ ┌─────────┐ ┌──────────────┐ +/// │ ┌─────┐ │ │ ┌─────┐ │ │ ┌──────────┐ │ +/// │ │ A │ │ │ │ X │ │ │ │ [A, X] │ │ +/// │ ├─────┤ │ │ ├─────┤ │ │ ├──────────┤ │ +/// │ │NULL │ │ │ │ Y │ │──────────▶│ │[NULL, Y] │ │ +/// │ ├─────┤ │ │ ├─────┤ │ │ ├──────────┤ │ +/// │ │ C │ │ │ │ Z │ │ │ │ [C, Z] │ │ +/// │ └─────┘ │ │ └─────┘ │ │ └──────────┘ │ +/// └─────────┘ └─────────┘ └──────────────┘ +/// col1 col2 output +/// ``` +/// +/// # Example (nested) +/// +/// Calling `array(col1, col2)` where col1 and col2 are lists +/// would return a single new `ListArray`, where each row was a list +/// of the corresponding elements of col1 and col2. +/// +/// ``` text +/// ┌──────────────┐ ┌──────────────┐ ┌─────────────────────────────┐ +/// │ ┌──────────┐ │ │ ┌──────────┐ │ │ ┌────────────────────────┐ │ +/// │ │ [A, X] │ │ │ │ [] │ │ │ │ [[A, X], []] │ │ +/// │ ├──────────┤ │ │ ├──────────┤ │ │ ├────────────────────────┤ │ +/// │ │[NULL, Y] │ │ │ │[Q, R, S] │ │───────▶│ │ [[NULL, Y], [Q, R, S]] │ │ +/// │ ├──────────┤ │ │ ├──────────┤ │ │ ├────────────────────────│ │ +/// │ │ [C, Z] │ │ │ │ NULL │ │ │ │ [[C, Z], NULL] │ │ +/// │ └──────────┘ │ │ └──────────┘ │ │ └────────────────────────┘ │ +/// └──────────────┘ └──────────────┘ └─────────────────────────────┘ +/// col1 col2 output +/// ``` +fn array_array( + args: &[ArrayRef], + data_type: DataType, +) -> Result { + // do not accept 0 arguments. + if args.is_empty() { + return plan_err!("Array requires at least one argument"); + } + + let mut data = vec![]; + let mut total_len = 0; + for arg in args { + let arg_data = if arg.as_any().is::() { + ArrayData::new_empty(&data_type) + } else { + arg.to_data() + }; + total_len += arg_data.len(); + data.push(arg_data); + } + + let mut offsets: Vec = Vec::with_capacity(total_len); + offsets.push(O::usize_as(0)); + + let capacity = Capacities::Array(total_len); + let data_ref = data.iter().collect::>(); + let mut mutable = MutableArrayData::with_capacities(data_ref, true, capacity); + + let num_rows = args[0].len(); + for row_idx in 0..num_rows { + for (arr_idx, arg) in args.iter().enumerate() { + if !arg.as_any().is::() + && !arg.is_null(row_idx) + && arg.is_valid(row_idx) + { + mutable.extend(arr_idx, row_idx, row_idx + 1); + } else { + mutable.extend_nulls(1); + } + } + offsets.push(O::usize_as(mutable.len())); + } + let data = mutable.freeze(); + + Ok(Arc::new(GenericListArray::::try_new( + Arc::new(Field::new("item", data_type, true)), + OffsetBuffer::new(offsets.into()), + arrow_array::make_array(data), + None, + )?)) +} diff --git a/datafusion/functions-array/src/udf.rs b/datafusion/functions-array/src/udf.rs index b2c310e1701d..9fd9e0309bde 100644 --- a/datafusion/functions-array/src/udf.rs +++ b/datafusion/functions-array/src/udf.rs @@ -17,15 +17,17 @@ //! [`ScalarUDFImpl`] definitions for array functions. 
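// A small usage sketch for the `make_array` UDF above (assumed datafusion
// 36-era async SQL API; requires tokio; output shown is the expected result,
// not verified here). The nested call mirrors the second diagram: each input
// list becomes one element of the output list.
use datafusion::error::Result;
use datafusion::prelude::*;

#[tokio::main]
async fn main() -> Result<()> {
    let ctx = SessionContext::new();
    // expected output row: [[1, 2], [3]]
    ctx.sql("SELECT make_array(make_array(1, 2), make_array(3))")
        .await?
        .show()
        .await?;
    // `make_list` is expected to resolve to the same UDF via its alias list
    ctx.sql("SELECT make_list(1, 2, 3)").await?.show().await?;
    Ok(())
}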
+use arrow::array::{NullArray, StringArray}; use arrow::datatypes::DataType; use arrow::datatypes::Field; use arrow::datatypes::IntervalUnit::MonthDayNano; +use arrow_schema::DataType::{LargeUtf8, List, Utf8}; use datafusion_common::exec_err; use datafusion_common::plan_err; use datafusion_common::Result; use datafusion_expr::expr::ScalarFunction; use datafusion_expr::Expr; -use datafusion_expr::TypeSignature::Exact; +use datafusion_expr::TypeSignature; use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; use std::any::Any; use std::sync::Arc; @@ -89,6 +91,81 @@ impl ScalarUDFImpl for ArrayToString { } } +make_udf_function!(StringToArray, + string_to_array, + string delimiter null_string, // arg name + "splits a `string` based on a `delimiter` and returns an array of parts. Any parts matching the optional `null_string` will be replaced with `NULL`", // doc + string_to_array_udf // internal function name +); +#[derive(Debug)] +pub(super) struct StringToArray { + signature: Signature, + aliases: Vec, +} + +impl StringToArray { + pub fn new() -> Self { + Self { + signature: Signature::variadic_any(Volatility::Immutable), + aliases: vec![ + String::from("string_to_array"), + String::from("string_to_list"), + ], + } + } +} + +impl ScalarUDFImpl for StringToArray { + fn as_any(&self) -> &dyn Any { + self + } + fn name(&self) -> &str { + "string_to_array" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + use DataType::*; + Ok(match arg_types[0] { + Utf8 | LargeUtf8 => { + List(Arc::new(Field::new("item", arg_types[0].clone(), true))) + } + _ => { + return plan_err!( + "The string_to_array function can only accept Utf8 or LargeUtf8." + ); + } + }) + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result { + let mut args = ColumnarValue::values_to_arrays(args)?; + // Case: delimiter is NULL, needs to be handled as well. 
+ if args[1].as_any().is::() { + args[1] = Arc::new(StringArray::new_null(args[1].len())); + }; + + match args[0].data_type() { + Utf8 => { + crate::kernels::string_to_array::(&args).map(ColumnarValue::Array) + } + LargeUtf8 => { + crate::kernels::string_to_array::(&args).map(ColumnarValue::Array) + } + other => { + exec_err!("unsupported type for string_to_array function as {other}") + } + } + } + + fn aliases(&self) -> &[String] { + &self.aliases + } +} + make_udf_function!( Range, range, @@ -107,10 +184,10 @@ impl Range { Self { signature: Signature::one_of( vec![ - Exact(vec![Int64]), - Exact(vec![Int64, Int64]), - Exact(vec![Int64, Int64, Int64]), - Exact(vec![Date32, Date32, Interval(MonthDayNano)]), + TypeSignature::Exact(vec![Int64]), + TypeSignature::Exact(vec![Int64, Int64]), + TypeSignature::Exact(vec![Int64, Int64, Int64]), + TypeSignature::Exact(vec![Date32, Date32, Interval(MonthDayNano)]), ], Volatility::Immutable, ), @@ -177,10 +254,10 @@ impl GenSeries { Self { signature: Signature::one_of( vec![ - Exact(vec![Int64]), - Exact(vec![Int64, Int64]), - Exact(vec![Int64, Int64, Int64]), - Exact(vec![Date32, Date32, Interval(MonthDayNano)]), + TypeSignature::Exact(vec![Int64]), + TypeSignature::Exact(vec![Int64, Int64]), + TypeSignature::Exact(vec![Int64, Int64, Int64]), + TypeSignature::Exact(vec![Date32, Date32, Interval(MonthDayNano)]), ], Volatility::Immutable, ), @@ -286,6 +363,70 @@ impl ScalarUDFImpl for ArrayDims { } } +make_udf_function!( + ArraySort, + array_sort, + array desc null_first, + "returns sorted array.", + array_sort_udf +); + +#[derive(Debug)] +pub(super) struct ArraySort { + signature: Signature, + aliases: Vec, +} + +impl ArraySort { + pub fn new() -> Self { + Self { + signature: Signature::variadic_any(Volatility::Immutable), + aliases: vec!["array_sort".to_string(), "list_sort".to_string()], + } + } +} + +impl ScalarUDFImpl for ArraySort { + fn as_any(&self) -> &dyn Any { + self + } + fn name(&self) -> &str { + "array_sort" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + use DataType::*; + match &arg_types[0] { + List(field) | FixedSizeList(field, _) => Ok(List(Arc::new(Field::new( + "item", + field.data_type().clone(), + true, + )))), + LargeList(field) => Ok(LargeList(Arc::new(Field::new( + "item", + field.data_type().clone(), + true, + )))), + _ => exec_err!( + "Not reachable, data_type should be List, LargeList or FixedSizeList" + ), + } + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result { + let args = ColumnarValue::values_to_arrays(args)?; + crate::kernels::array_sort(&args).map(ColumnarValue::Array) + } + + fn aliases(&self) -> &[String] { + &self.aliases + } +} + make_udf_function!( Cardinality, cardinality, @@ -448,6 +589,58 @@ impl ScalarUDFImpl for ArrayEmpty { } } +make_udf_function!( + ArrayRepeat, + array_repeat, + element count, // arg name + "returns an array containing element `count` times.", // doc + array_repeat_udf // internal function name +); +#[derive(Debug)] +pub(super) struct ArrayRepeat { + signature: Signature, + aliases: Vec, +} + +impl ArrayRepeat { + pub fn new() -> Self { + Self { + signature: Signature::variadic_any(Volatility::Immutable), + aliases: vec![String::from("array_repeat"), String::from("list_repeat")], + } + } +} + +impl ScalarUDFImpl for ArrayRepeat { + fn as_any(&self) -> &dyn Any { + self + } + fn name(&self) -> &str { + "array_repeat" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn 
return_type(&self, arg_types: &[DataType]) -> Result { + Ok(List(Arc::new(Field::new( + "item", + arg_types[0].clone(), + true, + )))) + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result { + let args = ColumnarValue::values_to_arrays(args)?; + crate::kernels::array_repeat(&args).map(ColumnarValue::Array) + } + + fn aliases(&self) -> &[String] { + &self.aliases + } +} + make_udf_function!( ArrayLength, array_length, @@ -569,3 +762,67 @@ impl ScalarUDFImpl for Flatten { &self.aliases } } + +make_udf_function!( + ArrayDistinct, + array_distinct, + array, + "return distinct values from the array after removing duplicates.", + array_distinct_udf +); + +#[derive(Debug)] +pub(super) struct ArrayDistinct { + signature: Signature, + aliases: Vec, +} + +impl crate::udf::ArrayDistinct { + pub fn new() -> Self { + Self { + signature: Signature::array(Volatility::Immutable), + aliases: vec!["array_distinct".to_string(), "list_distinct".to_string()], + } + } +} + +impl ScalarUDFImpl for crate::udf::ArrayDistinct { + fn as_any(&self) -> &dyn Any { + self + } + fn name(&self) -> &str { + "array_distinct" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + use DataType::*; + match &arg_types[0] { + List(field) | FixedSizeList(field, _) => Ok(List(Arc::new(Field::new( + "item", + field.data_type().clone(), + true, + )))), + LargeList(field) => Ok(LargeList(Arc::new(Field::new( + "item", + field.data_type().clone(), + true, + )))), + _ => exec_err!( + "Not reachable, data_type should be List, LargeList or FixedSizeList" + ), + } + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result { + let args = ColumnarValue::values_to_arrays(args)?; + crate::kernels::array_distinct(&args).map(ColumnarValue::Array) + } + + fn aliases(&self) -> &[String] { + &self.aliases + } +} diff --git a/datafusion/functions-array/src/utils.rs b/datafusion/functions-array/src/utils.rs index d374a9f66be0..3a6bb723c1fa 100644 --- a/datafusion/functions-array/src/utils.rs +++ b/datafusion/functions-array/src/utils.rs @@ -17,8 +17,14 @@ //! array function utils +use std::sync::Arc; + use arrow::{array::ArrayRef, datatypes::DataType}; -use datafusion_common::{plan_err, Result}; +use arrow_array::{GenericListArray, OffsetSizeTrait}; +use arrow_buffer::OffsetBuffer; +use arrow_schema::Field; +use datafusion_common::{plan_err, Result, ScalarValue}; +use datafusion_expr::{ColumnarValue, ScalarFunctionImplementation}; pub(crate) fn check_datatypes(name: &str, args: &[&ArrayRef]) -> Result<()> { let data_type = args[0].data_type(); @@ -32,3 +38,124 @@ pub(crate) fn check_datatypes(name: &str, args: &[&ArrayRef]) -> Result<()> { Ok(()) } + +pub(crate) fn make_scalar_function(inner: F) -> ScalarFunctionImplementation +where + F: Fn(&[ArrayRef]) -> Result + Sync + Send + 'static, +{ + Arc::new(move |args: &[ColumnarValue]| { + // first, identify if any of the arguments is an Array. If yes, store its `len`, + // as any scalar will need to be converted to an array of len `len`. 
+ let len = args + .iter() + .fold(Option::::None, |acc, arg| match arg { + ColumnarValue::Scalar(_) => acc, + ColumnarValue::Array(a) => Some(a.len()), + }); + + let is_scalar = len.is_none(); + + let args = ColumnarValue::values_to_arrays(args)?; + + let result = (inner)(&args); + + if is_scalar { + // If all inputs are scalar, keeps output as scalar + let result = result.and_then(|arr| ScalarValue::try_from_array(&arr, 0)); + result.map(ColumnarValue::Scalar) + } else { + result.map(ColumnarValue::Array) + } + }) +} + +pub(crate) fn align_array_dimensions( + args: Vec, +) -> Result> { + let args_ndim = args + .iter() + .map(|arg| datafusion_common::utils::list_ndims(arg.data_type())) + .collect::>(); + let max_ndim = args_ndim.iter().max().unwrap_or(&0); + + // Align the dimensions of the arrays + let aligned_args: Result> = args + .into_iter() + .zip(args_ndim.iter()) + .map(|(array, ndim)| { + if ndim < max_ndim { + let mut aligned_array = array.clone(); + for _ in 0..(max_ndim - ndim) { + let data_type = aligned_array.data_type().to_owned(); + let array_lengths = vec![1; aligned_array.len()]; + let offsets = OffsetBuffer::::from_lengths(array_lengths); + + aligned_array = Arc::new(GenericListArray::::try_new( + Arc::new(Field::new("item", data_type, true)), + offsets, + aligned_array, + None, + )?) + } + Ok(aligned_array) + } else { + Ok(array.clone()) + } + }) + .collect(); + + aligned_args +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow::datatypes::Int64Type; + use arrow_array::ListArray; + use datafusion_common::{cast::as_list_array, utils::array_into_list_array}; + + /// Only test internal functions, array-related sql functions will be tested in sqllogictest `array.slt` + #[test] + fn test_align_array_dimensions() { + let array1d_1 = + Arc::new(ListArray::from_iter_primitive::(vec![ + Some(vec![Some(1), Some(2), Some(3)]), + Some(vec![Some(4), Some(5)]), + ])); + let array1d_2 = + Arc::new(ListArray::from_iter_primitive::(vec![ + Some(vec![Some(6), Some(7), Some(8)]), + ])); + + let array2d_1 = Arc::new(array_into_list_array(array1d_1.clone())) as ArrayRef; + let array2d_2 = Arc::new(array_into_list_array(array1d_2.clone())) as ArrayRef; + + let res = align_array_dimensions::(vec![ + array1d_1.to_owned(), + array2d_2.to_owned(), + ]) + .unwrap(); + + let expected = as_list_array(&array2d_1).unwrap(); + let expected_dim = datafusion_common::utils::list_ndims(array2d_1.data_type()); + assert_ne!(as_list_array(&res[0]).unwrap(), expected); + assert_eq!( + datafusion_common::utils::list_ndims(res[0].data_type()), + expected_dim + ); + + let array3d_1 = Arc::new(array_into_list_array(array2d_1)) as ArrayRef; + let array3d_2 = array_into_list_array(array2d_2.to_owned()); + let res = + align_array_dimensions::(vec![array1d_1, Arc::new(array3d_2.clone())]) + .unwrap(); + + let expected = as_list_array(&array3d_1).unwrap(); + let expected_dim = datafusion_common::utils::list_ndims(array3d_1.data_type()); + assert_ne!(as_list_array(&res[0]).unwrap(), expected); + assert_eq!( + datafusion_common::utils::list_ndims(res[0].data_type()), + expected_dim + ); + } +} diff --git a/datafusion/functions/src/core/arrowtypeof.rs b/datafusion/functions/src/core/arrowtypeof.rs new file mode 100644 index 000000000000..89702d3267ec --- /dev/null +++ b/datafusion/functions/src/core/arrowtypeof.rs @@ -0,0 +1,66 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::datatypes::DataType; +use datafusion_common::{exec_err, Result, ScalarValue}; +use datafusion_expr::ColumnarValue; +use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; +use std::any::Any; + +#[derive(Debug)] +pub(super) struct ArrowTypeOfFunc { + signature: Signature, +} + +impl ArrowTypeOfFunc { + pub fn new() -> Self { + Self { + signature: Signature::any(1, Volatility::Immutable), + } + } +} + +impl ScalarUDFImpl for ArrowTypeOfFunc { + fn as_any(&self) -> &dyn Any { + self + } + fn name(&self) -> &str { + "arrow_typeof" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::Utf8) + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result { + if args.len() != 1 { + return exec_err!( + "arrow_typeof function requires 1 arguments, got {}", + args.len() + ); + } + + let input_data_type = args[0].data_type(); + Ok(ColumnarValue::Scalar(ScalarValue::from(format!( + "{input_data_type}" + )))) + } +} diff --git a/datafusion/functions/src/core/mod.rs b/datafusion/functions/src/core/mod.rs index 842a1db3e0d0..3f13067a4a07 100644 --- a/datafusion/functions/src/core/mod.rs +++ b/datafusion/functions/src/core/mod.rs @@ -17,18 +17,24 @@ //! "core" DataFusion functions +mod arrowtypeof; mod nullif; mod nvl; mod nvl2; +pub mod r#struct; // create UDFs make_udf_function!(nullif::NullIfFunc, NULLIF, nullif); make_udf_function!(nvl::NVLFunc, NVL, nvl); make_udf_function!(nvl2::NVL2Func, NVL2, nvl2); +make_udf_function!(arrowtypeof::ArrowTypeOfFunc, ARROWTYPEOF, arrow_typeof); +make_udf_function!(r#struct::StructFunc, STRUCT, r#struct); // Export the functions out of this package, both as expr_fn as well as a list of functions export_functions!( (nullif, arg_1 arg_2, "returns NULL if value1 equals value2; otherwise it returns value1. This can be used to perform the inverse operation of the COALESCE expression."), (nvl, arg_1 arg_2, "returns value2 if value1 is NULL; otherwise it returns value1"), - (nvl2, arg_1 arg_2 arg_3, "Returns value2 if value1 is not NULL; otherwise, it returns value3.") + (nvl2, arg_1 arg_2 arg_3, "Returns value2 if value1 is not NULL; otherwise, it returns value3."), + (arrow_typeof, arg_1, "Returns the Arrow type of the input expression."), + (r#struct, args, "Returns a struct with the given arguments") ); diff --git a/datafusion/functions/src/core/nullif.rs b/datafusion/functions/src/core/nullif.rs index 3ff8dbd942ff..1e903d7a881d 100644 --- a/datafusion/functions/src/core/nullif.rs +++ b/datafusion/functions/src/core/nullif.rs @@ -15,8 +15,6 @@ // specific language governing permissions and limitations // under the License. -//! 
Encoding expressions - use arrow::datatypes::DataType; use datafusion_common::{exec_err, Result}; use datafusion_expr::ColumnarValue; diff --git a/datafusion/physical-expr/src/struct_expressions.rs b/datafusion/functions/src/core/struct.rs similarity index 73% rename from datafusion/physical-expr/src/struct_expressions.rs rename to datafusion/functions/src/core/struct.rs index f420e062ef91..6236f98794bb 100644 --- a/datafusion/physical-expr/src/struct_expressions.rs +++ b/datafusion/functions/src/core/struct.rs @@ -15,12 +15,12 @@ // specific language governing permissions and limitations // under the License. -//! Struct expressions - -use arrow::array::*; -use arrow::datatypes::Field; +use arrow::datatypes::{DataType, Field, Fields}; +use arrow_array::{ArrayRef, StructArray}; use datafusion_common::{exec_err, Result}; use datafusion_expr::ColumnarValue; +use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; +use std::any::Any; use std::sync::Arc; fn array_struct(args: &[ArrayRef]) -> Result { @@ -47,10 +47,9 @@ fn array_struct(args: &[ArrayRef]) -> Result { Ok(Arc::new(StructArray::from(vec))) } - /// put values in a struct array. -pub fn struct_expr(values: &[ColumnarValue]) -> Result { - let arrays = values +fn struct_expr(args: &[ColumnarValue]) -> Result { + let arrays = args .iter() .map(|x| { Ok(match x { @@ -61,10 +60,55 @@ pub fn struct_expr(values: &[ColumnarValue]) -> Result { .collect::>>()?; Ok(ColumnarValue::Array(array_struct(arrays.as_slice())?)) } +#[derive(Debug)] +pub struct StructFunc { + signature: Signature, +} + +impl StructFunc { + pub fn new() -> Self { + Self { + signature: Signature::variadic_any(Volatility::Immutable), + } + } +} + +impl Default for StructFunc { + fn default() -> Self { + Self::new() + } +} + +impl ScalarUDFImpl for StructFunc { + fn as_any(&self) -> &dyn Any { + self + } + fn name(&self) -> &str { + "struct" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + let return_fields = arg_types + .iter() + .enumerate() + .map(|(pos, dt)| Field::new(format!("c{pos}"), dt.clone(), true)) + .collect::>(); + Ok(DataType::Struct(Fields::from(return_fields))) + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result { + struct_expr(args) + } +} #[cfg(test)] mod tests { use super::*; + use arrow_array::Int64Array; use datafusion_common::cast::as_struct_array; use datafusion_common::ScalarValue; diff --git a/datafusion/functions/src/macros.rs b/datafusion/functions/src/macros.rs index 1984ae659a81..859964b5b8d5 100644 --- a/datafusion/functions/src/macros.rs +++ b/datafusion/functions/src/macros.rs @@ -72,7 +72,7 @@ macro_rules! 
make_udf_function { /// Return a [`ScalarUDF`] for [`$UDF`] /// /// [`ScalarUDF`]: datafusion_expr::ScalarUDF - fn $NAME() -> std::sync::Arc { + pub fn $NAME() -> std::sync::Arc { $GNAME .get_or_init(|| { std::sync::Arc::new(datafusion_expr::ScalarUDF::new_from_impl( diff --git a/datafusion/functions/src/math/mod.rs b/datafusion/functions/src/math/mod.rs index 3741cc2802bb..e7ede6043a59 100644 --- a/datafusion/functions/src/math/mod.rs +++ b/datafusion/functions/src/math/mod.rs @@ -21,12 +21,16 @@ mod abs; mod acos; mod asin; mod nans; +mod tan; +mod tanh; // create UDFs make_udf_function!(nans::IsNanFunc, ISNAN, isnan); make_udf_function!(abs::AbsFunc, ABS, abs); make_udf_function!(acos::AcosFunc, ACOS, acos); make_udf_function!(asin::AsinFunc, ASIN, asin); +make_udf_function!(tan::TanFunc, TAN, tan); +make_udf_function!(tanh::TanhFunc, TANH, tanh); // Export the functions out of this package, both as expr_fn as well as a list of functions export_functions!( @@ -45,5 +49,7 @@ export_functions!( asin, num, "returns the arc sine or inverse sine of a number" - ) + ), + (tan, num, "returns the tangent of a number"), + (tanh, num, "returns the hyperbolic tangent of a number") ); diff --git a/datafusion/functions/src/math/tan.rs b/datafusion/functions/src/math/tan.rs new file mode 100644 index 000000000000..ea3e002f8489 --- /dev/null +++ b/datafusion/functions/src/math/tan.rs @@ -0,0 +1,98 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Math function: `tan()`. 
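
For orientation (illustrative, not from the diff): once the default `SessionContext` registers the `datafusion-functions` package, the relocated `tan`/`tanh` behave exactly as the old built-ins did from SQL. A rough sketch, assuming the `datafusion` crate with default features and a tokio runtime:

use datafusion::error::Result;
use datafusion::prelude::*;

#[tokio::main]
async fn main() -> Result<()> {
    let ctx = SessionContext::new();
    // `tan` and `tanh` now resolve to the new ScalarUDF implementations.
    let df = ctx.sql("SELECT tan(0.0), tanh(0.0)").await?;
    df.show().await?;
    Ok(())
}
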
+ +use std::any::Any; +use std::sync::Arc; + +use arrow::datatypes::DataType; +use arrow_array::{ArrayRef, Float32Array, Float64Array}; +use datafusion_common::{exec_err, DataFusionError, Result}; +use datafusion_expr::Volatility; +use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature}; + +#[derive(Debug)] +pub struct TanFunc { + signature: Signature, +} + +impl TanFunc { + pub fn new() -> Self { + Self { + signature: Signature::uniform( + 1, + vec![DataType::Float64, DataType::Float32], + Volatility::Immutable, + ), + } + } +} + +impl ScalarUDFImpl for TanFunc { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "tan" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + let arg_type = &arg_types[0]; + + match arg_type { + DataType::Float64 => Ok(DataType::Float64), + DataType::Float32 => Ok(DataType::Float32), + + // For other types (possible values null/int), use Float 64 + _ => Ok(DataType::Float64), + } + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result { + let args = ColumnarValue::values_to_arrays(args)?; + + let arr: ArrayRef = match args[0].data_type() { + DataType::Float64 => Arc::new(make_function_scalar_inputs_return_type!( + &args[0], + self.name(), + Float64Array, + Float64Array, + { f64::tan } + )), + DataType::Float32 => Arc::new(make_function_scalar_inputs_return_type!( + &args[0], + self.name(), + Float32Array, + Float32Array, + { f32::tan } + )), + other => { + return exec_err!( + "Unsupported data type {other:?} for function {}", + self.name() + ) + } + }; + Ok(ColumnarValue::Array(arr)) + } +} diff --git a/datafusion/functions/src/math/tanh.rs b/datafusion/functions/src/math/tanh.rs new file mode 100644 index 000000000000..af34681919ab --- /dev/null +++ b/datafusion/functions/src/math/tanh.rs @@ -0,0 +1,98 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Math function: `tanh()`. 
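
The return-type rule shared by `TanFunc` and `TanhFunc` can be stated on its own; the helper name below is mine, not part of the API, and this is only a restatement of the match in `return_type`:

use arrow::datatypes::DataType;

// Float32 is preserved; every other accepted input (Float64, integers, Null)
// maps to Float64.
fn unary_float_return_type(arg: &DataType) -> DataType {
    match arg {
        DataType::Float32 => DataType::Float32,
        _ => DataType::Float64,
    }
}

fn main() {
    assert_eq!(unary_float_return_type(&DataType::Float32), DataType::Float32);
    assert_eq!(unary_float_return_type(&DataType::Int64), DataType::Float64);
}
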
+ +use std::any::Any; +use std::sync::Arc; + +use arrow::datatypes::DataType; +use arrow_array::{ArrayRef, Float32Array, Float64Array}; +use datafusion_common::{exec_err, DataFusionError, Result}; +use datafusion_expr::Volatility; +use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature}; + +#[derive(Debug)] +pub struct TanhFunc { + signature: Signature, +} + +impl TanhFunc { + pub fn new() -> Self { + Self { + signature: Signature::uniform( + 1, + vec![DataType::Float64, DataType::Float32], + Volatility::Immutable, + ), + } + } +} + +impl ScalarUDFImpl for TanhFunc { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "tanh" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + let arg_type = &arg_types[0]; + + match arg_type { + DataType::Float64 => Ok(DataType::Float64), + DataType::Float32 => Ok(DataType::Float32), + + // For other types (possible values null/int), use Float 64 + _ => Ok(DataType::Float64), + } + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result { + let args = ColumnarValue::values_to_arrays(args)?; + + let arr: ArrayRef = match args[0].data_type() { + DataType::Float64 => Arc::new(make_function_scalar_inputs_return_type!( + &args[0], + self.name(), + Float64Array, + Float64Array, + { f64::tanh } + )), + DataType::Float32 => Arc::new(make_function_scalar_inputs_return_type!( + &args[0], + self.name(), + Float32Array, + Float32Array, + { f32::tanh } + )), + other => { + return exec_err!( + "Unsupported data type {other:?} for function {}", + self.name() + ) + } + }; + Ok(ColumnarValue::Array(arr)) + } +} diff --git a/datafusion/optimizer/src/analyzer/rewrite_expr.rs b/datafusion/optimizer/src/analyzer/rewrite_expr.rs index 6f856fa9bd16..99578e91183c 100644 --- a/datafusion/optimizer/src/analyzer/rewrite_expr.rs +++ b/datafusion/optimizer/src/analyzer/rewrite_expr.rs @@ -17,23 +17,27 @@ //! 
Analyzer rule for to replace operators with function calls (e.g `||` to array_concat`) +#[cfg(feature = "array_expressions")] use std::sync::Arc; use super::AnalyzerRule; use datafusion_common::config::ConfigOptions; use datafusion_common::tree_node::{Transformed, TreeNodeRewriter}; -use datafusion_common::utils::list_ndims; -use datafusion_common::{DFSchema, DFSchemaRef, Result}; +#[cfg(feature = "array_expressions")] +use datafusion_common::{utils::list_ndims, DFSchemaRef}; +use datafusion_common::{DFSchema, Result}; use datafusion_expr::expr::ScalarFunction; use datafusion_expr::expr_rewriter::rewrite_preserving_name; use datafusion_expr::utils::merge_schema; use datafusion_expr::BuiltinScalarFunction; use datafusion_expr::GetFieldAccess; use datafusion_expr::GetIndexedField; -use datafusion_expr::Operator; -use datafusion_expr::ScalarFunctionDefinition; -use datafusion_expr::{BinaryExpr, Expr, LogicalPlan}; +#[cfg(feature = "array_expressions")] +use datafusion_expr::{BinaryExpr, Operator, ScalarFunctionDefinition}; +use datafusion_expr::{Expr, LogicalPlan}; +#[cfg(feature = "array_expressions")] +use datafusion_functions_array::expr_fn::{array_append, array_concat, array_prepend}; #[derive(Default)] pub struct OperatorToFunction {} @@ -73,6 +77,7 @@ fn analyze_internal(plan: &LogicalPlan) -> Result { } let mut expr_rewrite = OperatorToFunctionRewriter { + #[cfg(feature = "array_expressions")] schema: Arc::new(schema), }; @@ -90,6 +95,7 @@ fn analyze_internal(plan: &LogicalPlan) -> Result { } pub(crate) struct OperatorToFunctionRewriter { + #[cfg(feature = "array_expressions")] pub(crate) schema: DFSchemaRef, } @@ -97,13 +103,14 @@ impl TreeNodeRewriter for OperatorToFunctionRewriter { type Node = Expr; fn f_up(&mut self, expr: Expr) -> Result> { + #[cfg(feature = "array_expressions")] if let Expr::BinaryExpr(BinaryExpr { ref left, op, ref right, }) = expr { - if let Some(fun) = rewrite_array_concat_operator_to_func_for_column( + if let Some(expr) = rewrite_array_concat_operator_to_func_for_column( left.as_ref(), op, right.as_ref(), @@ -113,12 +120,7 @@ impl TreeNodeRewriter for OperatorToFunctionRewriter { rewrite_array_concat_operator_to_func(left.as_ref(), op, right.as_ref()) }) { // Convert &Box -> Expr - let left = (**left).clone(); - let right = (**right).clone(); - return Ok(Transformed::yes(Expr::ScalarFunction(ScalarFunction { - func_def: ScalarFunctionDefinition::BuiltIn(fun), - args: vec![left, right], - }))); + return Ok(Transformed::yes(expr)); } // TODO: change OperatorToFunction to OperatoToArrayFunction and configure it with array_expressions feature @@ -185,16 +187,14 @@ fn rewrite_array_has_all_operator_to_func( // array1 <@ array2 -> array_has_all(array2, array1) ( Expr::ScalarFunction(ScalarFunction { - func_def: - ScalarFunctionDefinition::BuiltIn(BuiltinScalarFunction::MakeArray), + func_def: ScalarFunctionDefinition::UDF(left_fun), args: _left_args, }), Expr::ScalarFunction(ScalarFunction { - func_def: - ScalarFunctionDefinition::BuiltIn(BuiltinScalarFunction::MakeArray), + func_def: ScalarFunctionDefinition::UDF(right_fun), args: _right_args, }), - ) => { + ) if left_fun.name() == "make_array" && right_fun.name() == "make_array" => { let left = left.clone(); let right = right.clone(); @@ -220,11 +220,12 @@ fn rewrite_array_has_all_operator_to_func( /// 4) (arry concat, array append, array prepend) || array -> array concat /// /// 5) (arry concat, array append, array prepend) || scalar -> array append +#[cfg(feature = "array_expressions")] fn 
rewrite_array_concat_operator_to_func( left: &Expr, op: Operator, right: &Expr, -) -> Option { +) -> Option { // Convert `Array StringConcat Array` to ScalarFunction::ArrayConcat if op != Operator::StringConcat { @@ -236,97 +237,65 @@ fn rewrite_array_concat_operator_to_func( // (arry concat, array append, array prepend) || array -> array concat ( Expr::ScalarFunction(ScalarFunction { - func_def: - ScalarFunctionDefinition::BuiltIn(BuiltinScalarFunction::ArrayConcat), - args: _left_args, - }), - Expr::ScalarFunction(ScalarFunction { - func_def: - ScalarFunctionDefinition::BuiltIn(BuiltinScalarFunction::MakeArray), - args: _right_args, - }), - ) - | ( - Expr::ScalarFunction(ScalarFunction { - func_def: - ScalarFunctionDefinition::BuiltIn(BuiltinScalarFunction::ArrayAppend), + func_def: ScalarFunctionDefinition::UDF(left_fun), args: _left_args, }), Expr::ScalarFunction(ScalarFunction { - func_def: - ScalarFunctionDefinition::BuiltIn(BuiltinScalarFunction::MakeArray), + func_def: ScalarFunctionDefinition::UDF(right_fun), args: _right_args, }), - ) - | ( - Expr::ScalarFunction(ScalarFunction { - func_def: - ScalarFunctionDefinition::BuiltIn(BuiltinScalarFunction::ArrayPrepend), - args: _left_args, - }), - Expr::ScalarFunction(ScalarFunction { - func_def: - ScalarFunctionDefinition::BuiltIn(BuiltinScalarFunction::MakeArray), - args: _right_args, - }), - ) => Some(BuiltinScalarFunction::ArrayConcat), + ) if ["array_append", "array_prepend", "array_concat"] + .contains(&left_fun.name()) + && right_fun.name() == "make_array" => + { + Some(array_concat(vec![left.clone(), right.clone()])) + } // Chain concat operator (a || b) || scalar, // (arry concat, array append, array prepend) || scalar -> array append ( Expr::ScalarFunction(ScalarFunction { - func_def: - ScalarFunctionDefinition::BuiltIn(BuiltinScalarFunction::ArrayConcat), - args: _left_args, - }), - _scalar, - ) - | ( - Expr::ScalarFunction(ScalarFunction { - func_def: - ScalarFunctionDefinition::BuiltIn(BuiltinScalarFunction::ArrayAppend), + func_def: ScalarFunctionDefinition::UDF(left_fun), args: _left_args, }), _scalar, - ) - | ( - Expr::ScalarFunction(ScalarFunction { - func_def: - ScalarFunctionDefinition::BuiltIn(BuiltinScalarFunction::ArrayPrepend), - args: _left_args, - }), - _scalar, - ) => Some(BuiltinScalarFunction::ArrayAppend), + ) if ["array_append", "array_prepend", "array_concat"] + .contains(&left_fun.name()) => + { + Some(array_append(left.clone(), right.clone())) + } // array || array -> array concat ( Expr::ScalarFunction(ScalarFunction { - func_def: - ScalarFunctionDefinition::BuiltIn(BuiltinScalarFunction::MakeArray), + func_def: ScalarFunctionDefinition::UDF(left_fun), args: _left_args, }), Expr::ScalarFunction(ScalarFunction { - func_def: - ScalarFunctionDefinition::BuiltIn(BuiltinScalarFunction::MakeArray), + func_def: ScalarFunctionDefinition::UDF(right_fun), args: _right_args, }), - ) => Some(BuiltinScalarFunction::ArrayConcat), + ) if left_fun.name() == "make_array" && right_fun.name() == "make_array" => { + Some(array_concat(vec![left.clone(), right.clone()])) + } // array || scalar -> array append ( Expr::ScalarFunction(ScalarFunction { - func_def: - ScalarFunctionDefinition::BuiltIn(BuiltinScalarFunction::MakeArray), + func_def: ScalarFunctionDefinition::UDF(left_fun), args: _left_args, }), _right_scalar, - ) => Some(BuiltinScalarFunction::ArrayAppend), + ) if left_fun.name() == "make_array" => { + Some(array_append(left.clone(), right.clone())) + } // scalar || array -> array prepend ( _left_scalar, 
Expr::ScalarFunction(ScalarFunction { - func_def: - ScalarFunctionDefinition::BuiltIn(BuiltinScalarFunction::MakeArray), + func_def: ScalarFunctionDefinition::UDF(right_fun), args: _right_args, }), - ) => Some(BuiltinScalarFunction::ArrayPrepend), + ) if right_fun.name() == "make_array" => { + Some(array_prepend(left.clone(), right.clone())) + } _ => None, } @@ -337,12 +306,13 @@ fn rewrite_array_concat_operator_to_func( /// 1) (arry concat, array append, array prepend) || column -> (array append, array concat) /// /// 2) column1 || column2 -> (array prepend, array append, array concat) +#[cfg(feature = "array_expressions")] fn rewrite_array_concat_operator_to_func_for_column( left: &Expr, op: Operator, right: &Expr, schema: &DFSchema, -) -> Result> { +) -> Result> { if op != Operator::StringConcat { return Ok(None); } @@ -352,33 +322,18 @@ fn rewrite_array_concat_operator_to_func_for_column( // 1) array_prepend/append/concat || column ( Expr::ScalarFunction(ScalarFunction { - func_def: - ScalarFunctionDefinition::BuiltIn(BuiltinScalarFunction::ArrayPrepend), - args: _left_args, - }), - Expr::Column(c), - ) - | ( - Expr::ScalarFunction(ScalarFunction { - func_def: - ScalarFunctionDefinition::BuiltIn(BuiltinScalarFunction::ArrayAppend), + func_def: ScalarFunctionDefinition::UDF(left_fun), args: _left_args, }), Expr::Column(c), - ) - | ( - Expr::ScalarFunction(ScalarFunction { - func_def: - ScalarFunctionDefinition::BuiltIn(BuiltinScalarFunction::ArrayConcat), - args: _left_args, - }), - Expr::Column(c), - ) => { + ) if ["array_append", "array_prepend", "array_concat"] + .contains(&left_fun.name()) => + { let d = schema.field_from_column(c)?.data_type(); let ndim = list_ndims(d); match ndim { - 0 => Ok(Some(BuiltinScalarFunction::ArrayAppend)), - _ => Ok(Some(BuiltinScalarFunction::ArrayConcat)), + 0 => Ok(Some(array_append(left.clone(), right.clone()))), + _ => Ok(Some(array_concat(vec![left.clone(), right.clone()]))), } } // 2) select column1 || column2 @@ -388,9 +343,9 @@ fn rewrite_array_concat_operator_to_func_for_column( let ndim1 = list_ndims(d1); let ndim2 = list_ndims(d2); match (ndim1, ndim2) { - (0, _) => Ok(Some(BuiltinScalarFunction::ArrayPrepend)), - (_, 0) => Ok(Some(BuiltinScalarFunction::ArrayAppend)), - _ => Ok(Some(BuiltinScalarFunction::ArrayConcat)), + (0, _) => Ok(Some(array_prepend(left.clone(), right.clone()))), + (_, 0) => Ok(Some(array_append(left.clone(), right.clone()))), + _ => Ok(Some(array_concat(vec![left.clone(), right.clone()]))), } } _ => Ok(None), diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index 496def95e1bc..fabeba439370 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -46,8 +46,8 @@ use datafusion_expr::type_coercion::{is_datetime, is_utf8_or_large_utf8}; use datafusion_expr::utils::merge_schema; use datafusion_expr::{ is_false, is_not_false, is_not_true, is_not_unknown, is_true, is_unknown, not, - type_coercion, AggregateFunction, BuiltinScalarFunction, Expr, ExprSchemable, - LogicalPlan, Operator, Projection, ScalarFunctionDefinition, Signature, WindowFrame, + type_coercion, AggregateFunction, Expr, ExprSchemable, LogicalPlan, Operator, + Projection, ScalarFunctionDefinition, ScalarUDF, Signature, WindowFrame, WindowFrameBound, WindowFrameUnits, }; @@ -316,11 +316,6 @@ impl TreeNodeRewriter for TypeCoercionRewriter { &self.schema, &fun.signature(), )?; - let new_args = coerce_arguments_for_fun( - 
new_args.as_slice(), - &self.schema, - &fun, - )?; Ok(Transformed::yes(Expr::ScalarFunction(ScalarFunction::new( fun, new_args, )))) @@ -331,6 +326,11 @@ impl TreeNodeRewriter for TypeCoercionRewriter { &self.schema, fun.signature(), )?; + let new_expr = coerce_arguments_for_fun( + new_expr.as_slice(), + &self.schema, + &fun, + )?; Ok(Transformed::yes(Expr::ScalarFunction( ScalarFunction::new_udf(fun, new_expr), ))) @@ -583,7 +583,7 @@ fn coerce_arguments_for_signature( fn coerce_arguments_for_fun( expressions: &[Expr], schema: &DFSchema, - fun: &BuiltinScalarFunction, + fun: &Arc, ) -> Result> { if expressions.is_empty() { return Ok(vec![]); @@ -591,7 +591,7 @@ fn coerce_arguments_for_fun( let mut expressions: Vec = expressions.to_vec(); // Cast Fixedsizelist to List for array functions - if *fun == BuiltinScalarFunction::MakeArray { + if fun.name() == "make_array" { expressions = expressions .into_iter() .map(|expr| { @@ -776,6 +776,7 @@ mod test { LogicalPlan, Operator, ScalarUDF, ScalarUDFImpl, Signature, SimpleAggregateUDF, Subquery, Volatility, }; + use datafusion_functions_array::expr_fn::make_array; use datafusion_physical_expr::expressions::AvgAccumulator; fn empty() -> Arc { @@ -1266,10 +1267,7 @@ mod test { None, ), ))); - let expr = Expr::ScalarFunction(ScalarFunction::new( - BuiltinScalarFunction::MakeArray, - vec![val.clone()], - )); + let expr = make_array(vec![val.clone()]); let schema = Arc::new(DFSchema::new_with_metadata( vec![DFField::new_unqualified( "item", @@ -1298,10 +1296,7 @@ mod test { &schema, )?; - let expected = Expr::ScalarFunction(ScalarFunction::new( - BuiltinScalarFunction::MakeArray, - vec![expected_casted_expr], - )); + let expected = make_array(vec![expected_casted_expr]); assert_eq!(result, expected); Ok(()) diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs index 30c184a28e33..7b8eccad5133 100644 --- a/datafusion/optimizer/src/common_subexpr_eliminate.rs +++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs @@ -33,9 +33,7 @@ use datafusion_common::{ DataFusionError, Result, }; use datafusion_expr::expr::Alias; -use datafusion_expr::logical_plan::{ - Aggregate, Filter, LogicalPlan, Projection, Sort, Window, -}; +use datafusion_expr::logical_plan::{Aggregate, LogicalPlan, Projection, Window}; use datafusion_expr::{col, Expr, ExprSchemable}; /// A map from expression's identifier to tuple including @@ -44,13 +42,13 @@ use datafusion_expr::{col, Expr, ExprSchemable}; /// - DataType of this expression. type ExprSet = HashMap; -/// Identifier type. Current implementation use describe of a expression (type String) as +/// Identifier type. Current implementation use describe of an expression (type String) as /// Identifier. /// -/// A Identifier should (ideally) be able to "hash", "accumulate", "equal" and "have no +/// An identifier should (ideally) be able to "hash", "accumulate", "equal" and "have no /// collision (as low as possible)" /// -/// Since a identifier is likely to be copied many times, it is better that a identifier +/// Since an identifier is likely to be copied many times, it is better that an identifier /// is small or "copy". otherwise some kinds of reference count is needed. String description /// here is not such a good choose. 
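
A tiny, illustrative aside (not from the patch) on what this identifier is in practice: it is derived from the expression's textual form, so structurally equal sub-expressions land on the same `ExprSet` key, at the cost of building and cloning `String`s:

use datafusion_expr::{col, lit, Expr};

fn main() {
    // Two structurally equal sub-expressions produce the same description,
    // so they collapse onto the same ExprSet entry.
    let e1: Expr = lit(1) + col("a");
    let e2: Expr = lit(1) + col("a");
    assert_eq!(format!("{e1}"), format!("{e2}"));
}
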
type Identifier = String; @@ -108,61 +106,6 @@ impl CommonSubexprEliminate { Ok((rewrite_exprs, new_input)) } - fn try_optimize_projection( - &self, - projection: &Projection, - config: &dyn OptimizerConfig, - ) -> Result { - let Projection { expr, input, .. } = projection; - let input_schema = Arc::clone(input.schema()); - let mut expr_set = ExprSet::new(); - - // Visit expr list and build expr identifier to occuring count map (`expr_set`). - let arrays = to_arrays(expr, input_schema, &mut expr_set, ExprMask::Normal)?; - - let (mut new_expr, new_input) = - self.rewrite_expr(&[expr], &[&arrays], input, &expr_set, config)?; - - // Since projection expr changes, schema changes also. Use try_new method. - Projection::try_new(pop_expr(&mut new_expr)?, Arc::new(new_input)) - .map(LogicalPlan::Projection) - } - - fn try_optimize_filter( - &self, - filter: &Filter, - config: &dyn OptimizerConfig, - ) -> Result { - let mut expr_set = ExprSet::new(); - let predicate = &filter.predicate; - let input_schema = Arc::clone(filter.input.schema()); - let mut id_array = vec![]; - expr_to_identifier( - predicate, - &mut expr_set, - &mut id_array, - input_schema, - ExprMask::Normal, - )?; - - let (mut new_expr, new_input) = self.rewrite_expr( - &[&[predicate.clone()]], - &[&[id_array]], - &filter.input, - &expr_set, - config, - )?; - - if let Some(predicate) = pop_expr(&mut new_expr)?.pop() { - Ok(LogicalPlan::Filter(Filter::try_new( - predicate, - Arc::new(new_input), - )?)) - } else { - internal_err!("Failed to pop predicate expr") - } - } - fn try_optimize_window( &self, window: &Window, @@ -354,25 +297,24 @@ impl CommonSubexprEliminate { } } - fn try_optimize_sort( + fn try_unary_plan( &self, - sort: &Sort, + plan: &LogicalPlan, config: &dyn OptimizerConfig, ) -> Result { - let Sort { expr, input, fetch } = sort; + let expr = plan.expressions(); + let inputs = plan.inputs(); + let input = inputs[0]; + let input_schema = Arc::clone(input.schema()); let mut expr_set = ExprSet::new(); - let input_schema = Arc::clone(input.schema()); - let arrays = to_arrays(expr, input_schema, &mut expr_set, ExprMask::Normal)?; + // Visit expr list and build expr identifier to occuring count map (`expr_set`). + let arrays = to_arrays(&expr, input_schema, &mut expr_set, ExprMask::Normal)?; let (mut new_expr, new_input) = - self.rewrite_expr(&[expr], &[&arrays], input, &expr_set, config)?; + self.rewrite_expr(&[&expr], &[&arrays], input, &expr_set, config)?; - Ok(LogicalPlan::Sort(Sort { - expr: pop_expr(&mut new_expr)?, - input: Arc::new(new_input), - fetch: *fetch, - })) + plan.with_new_exprs(pop_expr(&mut new_expr)?, vec![new_input]) } } @@ -383,19 +325,15 @@ impl OptimizerRule for CommonSubexprEliminate { config: &dyn OptimizerConfig, ) -> Result> { let optimized_plan = match plan { - LogicalPlan::Projection(projection) => { - Some(self.try_optimize_projection(projection, config)?) - } - LogicalPlan::Filter(filter) => { - Some(self.try_optimize_filter(filter, config)?) - } + LogicalPlan::Projection(_) + | LogicalPlan::Sort(_) + | LogicalPlan::Filter(_) => Some(self.try_unary_plan(plan, config)?), LogicalPlan::Window(window) => { Some(self.try_optimize_window(window, config)?) } LogicalPlan::Aggregate(aggregate) => { Some(self.try_optimize_aggregate(aggregate, config)?) 
} - LogicalPlan::Sort(sort) => Some(self.try_optimize_sort(sort, config)?), LogicalPlan::Join(_) | LogicalPlan::CrossJoin(_) | LogicalPlan::Repartition(_) @@ -1321,7 +1259,8 @@ mod test { .build()?; let expected = "Projection: test.a, test.b, test.c\ - \n Filter: Int32(1) + test.atest.aInt32(1) AS Int32(1) + test.a - Int32(10) > Int32(1) + test.atest.aInt32(1) AS Int32(1) + test.a\n Projection: Int32(1) + test.a AS Int32(1) + test.atest.aInt32(1), test.a, test.b, test.c\ + \n Filter: Int32(1) + test.atest.aInt32(1) - Int32(10) > Int32(1) + test.atest.aInt32(1)\ + \n Projection: Int32(1) + test.a AS Int32(1) + test.atest.aInt32(1), test.a, test.b, test.c\ \n TableScan: test"; assert_optimized_plan_eq(expected, &plan); diff --git a/datafusion/optimizer/src/push_down_filter.rs b/datafusion/optimizer/src/push_down_filter.rs index a63133c5166f..e93e171e0324 100644 --- a/datafusion/optimizer/src/push_down_filter.rs +++ b/datafusion/optimizer/src/push_down_filter.rs @@ -859,6 +859,13 @@ impl OptimizerRule for PushDownFilter { let results = scan .source .supports_filters_pushdown(filter_predicates.as_slice())?; + if filter_predicates.len() != results.len() { + return internal_err!( + "Vec returned length: {} from supports_filters_pushdown is not the same size as the filters passed, which length is: {}", + results.len(), + filter_predicates.len()); + } + let zip = filter_predicates.iter().zip(results); let new_scan_filters = zip diff --git a/datafusion/physical-expr/src/aggregate/approx_percentile_cont.rs b/datafusion/physical-expr/src/aggregate/approx_percentile_cont.rs index b3de7b0b4d36..3dbf1679e230 100644 --- a/datafusion/physical-expr/src/aggregate/approx_percentile_cont.rs +++ b/datafusion/physical-expr/src/aggregate/approx_percentile_cont.rs @@ -30,8 +30,8 @@ use arrow::{ use arrow_array::RecordBatch; use arrow_schema::Schema; use datafusion_common::{ - downcast_value, exec_err, internal_err, not_impl_err, plan_err, DataFusionError, - Result, ScalarValue, + downcast_value, internal_err, not_impl_err, plan_err, DataFusionError, Result, + ScalarValue, }; use datafusion_expr::{Accumulator, ColumnarValue}; use std::{any::Any, iter, sync::Arc}; @@ -391,7 +391,7 @@ impl Accumulator for ApproxPercentileAccumulator { fn evaluate(&mut self) -> Result { if self.digest.count() == 0.0 { - return exec_err!("aggregate function needs at least one non-null element"); + return ScalarValue::try_from(self.return_type.clone()); } let q = self.digest.estimate_quantile(self.percentile); diff --git a/datafusion/physical-expr/src/array_expressions.rs b/datafusion/physical-expr/src/array_expressions.rs index 5be72b0559d3..c846674e752f 100644 --- a/datafusion/physical-expr/src/array_expressions.rs +++ b/datafusion/physical-expr/src/array_expressions.rs @@ -28,15 +28,14 @@ use arrow::datatypes::{DataType, Field, UInt64Type}; use arrow::row::{RowConverter, SortField}; use arrow_buffer::{ArrowNativeType, NullBuffer}; -use arrow_schema::{FieldRef, SortOptions}; +use arrow_schema::FieldRef; use datafusion_common::cast::{ - as_generic_list_array, as_generic_string_array, as_int64_array, as_large_list_array, - as_list_array, as_string_array, + as_generic_list_array, as_int64_array, as_large_list_array, as_list_array, }; -use datafusion_common::utils::{array_into_list_array, list_ndims}; +use datafusion_common::utils::array_into_list_array; use datafusion_common::{ - exec_err, internal_datafusion_err, internal_err, not_impl_err, plan_err, - DataFusionError, Result, ScalarValue, + exec_err, internal_datafusion_err, 
internal_err, plan_err, DataFusionError, Result, + ScalarValue, }; use itertools::Itertools; @@ -746,484 +745,6 @@ pub fn array_pop_back(args: &[ArrayRef]) -> Result { } } -/// Appends or prepends elements to a ListArray. -/// -/// This function takes a ListArray, an ArrayRef, a FieldRef, and a boolean flag -/// indicating whether to append or prepend the elements. It returns a `Result` -/// representing the resulting ListArray after the operation. -/// -/// # Arguments -/// -/// * `list_array` - A reference to the ListArray to which elements will be appended/prepended. -/// * `element_array` - A reference to the Array containing elements to be appended/prepended. -/// * `field` - A reference to the Field describing the data type of the arrays. -/// * `is_append` - A boolean flag indicating whether to append (`true`) or prepend (`false`) elements. -/// -/// # Examples -/// -/// generic_append_and_prepend( -/// [1, 2, 3], 4, append => [1, 2, 3, 4] -/// 5, [6, 7, 8], prepend => [5, 6, 7, 8] -/// ) -fn generic_append_and_prepend( - list_array: &GenericListArray, - element_array: &ArrayRef, - data_type: &DataType, - is_append: bool, -) -> Result -where - i64: TryInto, -{ - let mut offsets = vec![O::usize_as(0)]; - let values = list_array.values(); - let original_data = values.to_data(); - let element_data = element_array.to_data(); - let capacity = Capacities::Array(original_data.len() + element_data.len()); - - let mut mutable = MutableArrayData::with_capacities( - vec![&original_data, &element_data], - false, - capacity, - ); - - let values_index = 0; - let element_index = 1; - - for (row_index, offset_window) in list_array.offsets().windows(2).enumerate() { - let start = offset_window[0].to_usize().unwrap(); - let end = offset_window[1].to_usize().unwrap(); - if is_append { - mutable.extend(values_index, start, end); - mutable.extend(element_index, row_index, row_index + 1); - } else { - mutable.extend(element_index, row_index, row_index + 1); - mutable.extend(values_index, start, end); - } - offsets.push(offsets[row_index] + O::usize_as(end - start + 1)); - } - - let data = mutable.freeze(); - - Ok(Arc::new(GenericListArray::::try_new( - Arc::new(Field::new("item", data_type.to_owned(), true)), - OffsetBuffer::new(offsets.into()), - arrow_array::make_array(data), - None, - )?)) -} - -/// Array_sort SQL function -pub fn array_sort(args: &[ArrayRef]) -> Result { - if args.is_empty() || args.len() > 3 { - return exec_err!("array_sort expects one to three arguments"); - } - - let sort_option = match args.len() { - 1 => None, - 2 => { - let sort = as_string_array(&args[1])?.value(0); - Some(SortOptions { - descending: order_desc(sort)?, - nulls_first: true, - }) - } - 3 => { - let sort = as_string_array(&args[1])?.value(0); - let nulls_first = as_string_array(&args[2])?.value(0); - Some(SortOptions { - descending: order_desc(sort)?, - nulls_first: order_nulls_first(nulls_first)?, - }) - } - _ => return exec_err!("array_sort expects 1 to 3 arguments"), - }; - - let list_array = as_list_array(&args[0])?; - let row_count = list_array.len(); - - let mut array_lengths = vec![]; - let mut arrays = vec![]; - let mut valid = BooleanBufferBuilder::new(row_count); - for i in 0..row_count { - if list_array.is_null(i) { - array_lengths.push(0); - valid.append(false); - } else { - let arr_ref = list_array.value(i); - let arr_ref = arr_ref.as_ref(); - - let sorted_array = compute::sort(arr_ref, sort_option)?; - array_lengths.push(sorted_array.len()); - arrays.push(sorted_array); - valid.append(true); - } - } 
- - // Assume all arrays have the same data type - let data_type = list_array.value_type(); - let buffer = valid.finish(); - - let elements = arrays - .iter() - .map(|a| a.as_ref()) - .collect::>(); - - let list_arr = ListArray::new( - Arc::new(Field::new("item", data_type, true)), - OffsetBuffer::from_lengths(array_lengths), - Arc::new(compute::concat(elements.as_slice())?), - Some(NullBuffer::new(buffer)), - ); - Ok(Arc::new(list_arr)) -} - -fn order_desc(modifier: &str) -> Result { - match modifier.to_uppercase().as_str() { - "DESC" => Ok(true), - "ASC" => Ok(false), - _ => exec_err!("the second parameter of array_sort expects DESC or ASC"), - } -} - -fn order_nulls_first(modifier: &str) -> Result { - match modifier.to_uppercase().as_str() { - "NULLS FIRST" => Ok(true), - "NULLS LAST" => Ok(false), - _ => exec_err!( - "the third parameter of array_sort expects NULLS FIRST or NULLS LAST" - ), - } -} - -fn general_append_and_prepend( - args: &[ArrayRef], - is_append: bool, -) -> Result -where - i64: TryInto, -{ - let (list_array, element_array) = if is_append { - let list_array = as_generic_list_array::(&args[0])?; - let element_array = &args[1]; - check_datatypes("array_append", &[element_array, list_array.values()])?; - (list_array, element_array) - } else { - let list_array = as_generic_list_array::(&args[1])?; - let element_array = &args[0]; - check_datatypes("array_prepend", &[list_array.values(), element_array])?; - (list_array, element_array) - }; - - let res = match list_array.value_type() { - DataType::List(_) => concat_internal::(args)?, - DataType::LargeList(_) => concat_internal::(args)?, - data_type => { - return generic_append_and_prepend::( - list_array, - element_array, - &data_type, - is_append, - ); - } - }; - - Ok(res) -} - -/// Array_append SQL function -pub fn array_append(args: &[ArrayRef]) -> Result { - if args.len() != 2 { - return exec_err!("array_append expects two arguments"); - } - - match args[0].data_type() { - DataType::LargeList(_) => general_append_and_prepend::(args, true), - _ => general_append_and_prepend::(args, true), - } -} - -/// Array_prepend SQL function -pub fn array_prepend(args: &[ArrayRef]) -> Result { - if args.len() != 2 { - return exec_err!("array_prepend expects two arguments"); - } - - match args[1].data_type() { - DataType::LargeList(_) => general_append_and_prepend::(args, false), - _ => general_append_and_prepend::(args, false), - } -} - -fn align_array_dimensions( - args: Vec, -) -> Result> { - let args_ndim = args - .iter() - .map(|arg| datafusion_common::utils::list_ndims(arg.data_type())) - .collect::>(); - let max_ndim = args_ndim.iter().max().unwrap_or(&0); - - // Align the dimensions of the arrays - let aligned_args: Result> = args - .into_iter() - .zip(args_ndim.iter()) - .map(|(array, ndim)| { - if ndim < max_ndim { - let mut aligned_array = array.clone(); - for _ in 0..(max_ndim - ndim) { - let data_type = aligned_array.data_type().to_owned(); - let array_lengths = vec![1; aligned_array.len()]; - let offsets = OffsetBuffer::::from_lengths(array_lengths); - - aligned_array = Arc::new(GenericListArray::::try_new( - Arc::new(Field::new("item", data_type, true)), - offsets, - aligned_array, - None, - )?) - } - Ok(aligned_array) - } else { - Ok(array.clone()) - } - }) - .collect(); - - aligned_args -} - -// Concatenate arrays on the same row. 
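
For readers following the relocation, a rough end-to-end sketch (assuming the `datafusion` crate with its default array support and a tokio runtime) of the row-wise concatenation the removed `concat_internal` implemented and its replacement still implements:

use datafusion::error::Result;
use datafusion::prelude::*;

#[tokio::main]
async fn main() -> Result<()> {
    let ctx = SessionContext::new();
    // Arrays are concatenated row by row; a single literal row is the
    // simplest illustration.
    let df = ctx
        .sql("SELECT array_concat(make_array(1, 2), make_array(3, 4)) AS c")
        .await?;
    df.show().await?; // c = [1, 2, 3, 4]
    Ok(())
}
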
-fn concat_internal(args: &[ArrayRef]) -> Result { - let args = align_array_dimensions::(args.to_vec())?; - - let list_arrays = args - .iter() - .map(|arg| as_generic_list_array::(arg)) - .collect::>>()?; - // Assume number of rows is the same for all arrays - let row_count = list_arrays[0].len(); - - let mut array_lengths = vec![]; - let mut arrays = vec![]; - let mut valid = BooleanBufferBuilder::new(row_count); - for i in 0..row_count { - let nulls = list_arrays - .iter() - .map(|arr| arr.is_null(i)) - .collect::>(); - - // If all the arrays are null, the concatenated array is null - let is_null = nulls.iter().all(|&x| x); - if is_null { - array_lengths.push(0); - valid.append(false); - } else { - // Get all the arrays on i-th row - let values = list_arrays - .iter() - .map(|arr| arr.value(i)) - .collect::>(); - - let elements = values - .iter() - .map(|a| a.as_ref()) - .collect::>(); - - // Concatenated array on i-th row - let concated_array = compute::concat(elements.as_slice())?; - array_lengths.push(concated_array.len()); - arrays.push(concated_array); - valid.append(true); - } - } - // Assume all arrays have the same data type - let data_type = list_arrays[0].value_type(); - let buffer = valid.finish(); - - let elements = arrays - .iter() - .map(|a| a.as_ref()) - .collect::>(); - - let list_arr = GenericListArray::::new( - Arc::new(Field::new("item", data_type, true)), - OffsetBuffer::from_lengths(array_lengths), - Arc::new(compute::concat(elements.as_slice())?), - Some(NullBuffer::new(buffer)), - ); - - Ok(Arc::new(list_arr)) -} - -/// Array_concat/Array_cat SQL function -pub fn array_concat(args: &[ArrayRef]) -> Result { - if args.is_empty() { - return exec_err!("array_concat expects at least one arguments"); - } - - let mut new_args = vec![]; - for arg in args { - let ndim = list_ndims(arg.data_type()); - let base_type = datafusion_common::utils::base_type(arg.data_type()); - if ndim == 0 { - return not_impl_err!("Array is not type '{base_type:?}'."); - } else if !base_type.eq(&DataType::Null) { - new_args.push(arg.clone()); - } - } - - match &args[0].data_type() { - DataType::LargeList(_) => concat_internal::(new_args.as_slice()), - _ => concat_internal::(new_args.as_slice()), - } -} - -/// Array_repeat SQL function -pub fn array_repeat(args: &[ArrayRef]) -> Result { - if args.len() != 2 { - return exec_err!("array_repeat expects two arguments"); - } - - let element = &args[0]; - let count_array = as_int64_array(&args[1])?; - - match element.data_type() { - DataType::List(_) => { - let list_array = as_list_array(element)?; - general_list_repeat::(list_array, count_array) - } - DataType::LargeList(_) => { - let list_array = as_large_list_array(element)?; - general_list_repeat::(list_array, count_array) - } - _ => general_repeat::(element, count_array), - } -} - -/// For each element of `array[i]` repeat `count_array[i]` times. -/// -/// Assumption for the input: -/// 1. `count[i] >= 0` -/// 2. 
`array.len() == count_array.len()` -/// -/// For example, -/// ```text -/// array_repeat( -/// [1, 2, 3], [2, 0, 1] => [[1, 1], [], [3]] -/// ) -/// ``` -fn general_repeat( - array: &ArrayRef, - count_array: &Int64Array, -) -> Result { - let data_type = array.data_type(); - let mut new_values = vec![]; - - let count_vec = count_array - .values() - .to_vec() - .iter() - .map(|x| *x as usize) - .collect::>(); - - for (row_index, &count) in count_vec.iter().enumerate() { - let repeated_array = if array.is_null(row_index) { - new_null_array(data_type, count) - } else { - let original_data = array.to_data(); - let capacity = Capacities::Array(count); - let mut mutable = - MutableArrayData::with_capacities(vec![&original_data], false, capacity); - - for _ in 0..count { - mutable.extend(0, row_index, row_index + 1); - } - - let data = mutable.freeze(); - arrow_array::make_array(data) - }; - new_values.push(repeated_array); - } - - let new_values: Vec<_> = new_values.iter().map(|a| a.as_ref()).collect(); - let values = compute::concat(&new_values)?; - - Ok(Arc::new(GenericListArray::::try_new( - Arc::new(Field::new("item", data_type.to_owned(), true)), - OffsetBuffer::from_lengths(count_vec), - values, - None, - )?)) -} - -/// Handle List version of `general_repeat` -/// -/// For each element of `list_array[i]` repeat `count_array[i]` times. -/// -/// For example, -/// ```text -/// array_repeat( -/// [[1, 2, 3], [4, 5], [6]], [2, 0, 1] => [[[1, 2, 3], [1, 2, 3]], [], [[6]]] -/// ) -/// ``` -fn general_list_repeat( - list_array: &GenericListArray, - count_array: &Int64Array, -) -> Result { - let data_type = list_array.data_type(); - let value_type = list_array.value_type(); - let mut new_values = vec![]; - - let count_vec = count_array - .values() - .to_vec() - .iter() - .map(|x| *x as usize) - .collect::>(); - - for (list_array_row, &count) in list_array.iter().zip(count_vec.iter()) { - let list_arr = match list_array_row { - Some(list_array_row) => { - let original_data = list_array_row.to_data(); - let capacity = Capacities::Array(original_data.len() * count); - let mut mutable = MutableArrayData::with_capacities( - vec![&original_data], - false, - capacity, - ); - - for _ in 0..count { - mutable.extend(0, 0, original_data.len()); - } - - let data = mutable.freeze(); - let repeated_array = arrow_array::make_array(data); - - let list_arr = GenericListArray::::try_new( - Arc::new(Field::new("item", value_type.clone(), true)), - OffsetBuffer::::from_lengths(vec![original_data.len(); count]), - repeated_array, - None, - )?; - Arc::new(list_arr) as ArrayRef - } - None => new_null_array(data_type, count), - }; - new_values.push(list_arr); - } - - let lengths = new_values.iter().map(|a| a.len()).collect::>(); - let new_values: Vec<_> = new_values.iter().map(|a| a.as_ref()).collect(); - let values = compute::concat(&new_values)?; - - Ok(Arc::new(ListArray::try_new( - Arc::new(Field::new("item", data_type.to_owned(), true)), - OffsetBuffer::::from_lengths(lengths), - values, - None, - )?)) -} - /// Array_position SQL function pub fn array_position(args: &[ArrayRef]) -> Result { if args.len() < 2 || args.len() > 3 { @@ -1836,95 +1357,6 @@ pub fn array_intersect(args: &[ArrayRef]) -> Result { general_set_op(array1, array2, SetOp::Intersect) } -/// Splits string at occurrences of delimiter and returns an array of parts -/// string_to_array('abc~@~def~@~ghi', '~@~') = '["abc", "def", "ghi"]' -pub fn string_to_array(args: &[ArrayRef]) -> Result { - let string_array = as_generic_string_array::(&args[0])?; - 
let delimiter_array = as_generic_string_array::(&args[1])?; - - let mut list_builder = ListBuilder::new(StringBuilder::with_capacity( - string_array.len(), - string_array.get_buffer_memory_size(), - )); - - match args.len() { - 2 => { - string_array.iter().zip(delimiter_array.iter()).for_each( - |(string, delimiter)| { - match (string, delimiter) { - (Some(string), Some("")) => { - list_builder.values().append_value(string); - list_builder.append(true); - } - (Some(string), Some(delimiter)) => { - string.split(delimiter).for_each(|s| { - list_builder.values().append_value(s); - }); - list_builder.append(true); - } - (Some(string), None) => { - string.chars().map(|c| c.to_string()).for_each(|c| { - list_builder.values().append_value(c); - }); - list_builder.append(true); - } - _ => list_builder.append(false), // null value - } - }, - ); - } - - 3 => { - let null_value_array = as_generic_string_array::(&args[2])?; - string_array - .iter() - .zip(delimiter_array.iter()) - .zip(null_value_array.iter()) - .for_each(|((string, delimiter), null_value)| { - match (string, delimiter) { - (Some(string), Some("")) => { - if Some(string) == null_value { - list_builder.values().append_null(); - } else { - list_builder.values().append_value(string); - } - list_builder.append(true); - } - (Some(string), Some(delimiter)) => { - string.split(delimiter).for_each(|s| { - if Some(s) == null_value { - list_builder.values().append_null(); - } else { - list_builder.values().append_value(s); - } - }); - list_builder.append(true); - } - (Some(string), None) => { - string.chars().map(|c| c.to_string()).for_each(|c| { - if Some(c.as_str()) == null_value { - list_builder.values().append_null(); - } else { - list_builder.values().append_value(c); - } - }); - list_builder.append(true); - } - _ => list_builder.append(false), // null value - } - }); - } - _ => { - return exec_err!( - "Expect string_to_array function to take two or three parameters" - ) - } - } - - let list_array = list_builder.finish(); - Ok(Arc::new(list_array) as ArrayRef) -} - pub fn general_array_distinct( array: &GenericListArray, field: &FieldRef, @@ -1961,32 +1393,6 @@ pub fn general_array_distinct( )?)) } -/// array_distinct SQL function -/// example: from list [1, 3, 2, 3, 1, 2, 4] to [1, 2, 3, 4] -pub fn array_distinct(args: &[ArrayRef]) -> Result { - if args.len() != 1 { - return exec_err!("array_distinct needs one argument"); - } - - // handle null - if args[0].data_type() == &DataType::Null { - return Ok(args[0].clone()); - } - - // handle for list & largelist - match args[0].data_type() { - DataType::List(field) => { - let array = as_list_array(&args[0])?; - general_array_distinct(array, field) - } - DataType::LargeList(field) => { - let array = as_large_list_array(&args[0])?; - general_array_distinct(array, field) - } - array_type => exec_err!("array_distinct does not support type '{array_type:?}'"), - } -} - /// array_resize SQL function pub fn array_resize(arg: &[ArrayRef]) -> Result { if arg.len() < 2 || arg.len() > 3 { @@ -2149,54 +1555,3 @@ where Some(nulls.into()), )?)) } - -#[cfg(test)] -mod tests { - use super::*; - use arrow::datatypes::Int64Type; - - /// Only test internal functions, array-related sql functions will be tested in sqllogictest `array.slt` - #[test] - fn test_align_array_dimensions() { - let array1d_1 = - Arc::new(ListArray::from_iter_primitive::(vec![ - Some(vec![Some(1), Some(2), Some(3)]), - Some(vec![Some(4), Some(5)]), - ])); - let array1d_2 = - Arc::new(ListArray::from_iter_primitive::(vec![ - 
Some(vec![Some(6), Some(7), Some(8)]), - ])); - - let array2d_1 = Arc::new(array_into_list_array(array1d_1.clone())) as ArrayRef; - let array2d_2 = Arc::new(array_into_list_array(array1d_2.clone())) as ArrayRef; - - let res = align_array_dimensions::(vec![ - array1d_1.to_owned(), - array2d_2.to_owned(), - ]) - .unwrap(); - - let expected = as_list_array(&array2d_1).unwrap(); - let expected_dim = datafusion_common::utils::list_ndims(array2d_1.data_type()); - assert_ne!(as_list_array(&res[0]).unwrap(), expected); - assert_eq!( - datafusion_common::utils::list_ndims(res[0].data_type()), - expected_dim - ); - - let array3d_1 = Arc::new(array_into_list_array(array2d_1)) as ArrayRef; - let array3d_2 = array_into_list_array(array2d_2.to_owned()); - let res = - align_array_dimensions::(vec![array1d_1, Arc::new(array3d_2.clone())]) - .unwrap(); - - let expected = as_list_array(&array3d_1).unwrap(); - let expected_dim = datafusion_common::utils::list_ndims(array3d_1.data_type()); - assert_ne!(as_list_array(&res[0]).unwrap(), expected); - assert_eq!( - datafusion_common::utils::list_ndims(res[0].data_type()), - expected_dim - ); - } -} diff --git a/datafusion/physical-expr/src/functions.rs b/datafusion/physical-expr/src/functions.rs index 84aa0c94a22d..072e4ba47e24 100644 --- a/datafusion/physical-expr/src/functions.rs +++ b/datafusion/physical-expr/src/functions.rs @@ -33,7 +33,7 @@ use crate::sort_properties::SortProperties; use crate::{ array_expressions, conditional_expressions, datetime_expressions, math_expressions, - string_expressions, struct_expressions, PhysicalExpr, ScalarFunctionExpr, + string_expressions, PhysicalExpr, ScalarFunctionExpr, }; use arrow::{ array::ArrayRef, @@ -282,8 +282,6 @@ pub fn create_physical_fun( BuiltinScalarFunction::Sinh => Arc::new(math_expressions::sinh), BuiltinScalarFunction::Sqrt => Arc::new(math_expressions::sqrt), BuiltinScalarFunction::Cbrt => Arc::new(math_expressions::cbrt), - BuiltinScalarFunction::Tan => Arc::new(math_expressions::tan), - BuiltinScalarFunction::Tanh => Arc::new(math_expressions::tanh), BuiltinScalarFunction::Trunc => { Arc::new(|args| make_scalar_function_inner(math_expressions::trunc)(args)) } @@ -302,18 +300,6 @@ pub fn create_physical_fun( } // array functions - BuiltinScalarFunction::ArrayAppend => Arc::new(|args| { - make_scalar_function_inner(array_expressions::array_append)(args) - }), - BuiltinScalarFunction::ArraySort => Arc::new(|args| { - make_scalar_function_inner(array_expressions::array_sort)(args) - }), - BuiltinScalarFunction::ArrayConcat => Arc::new(|args| { - make_scalar_function_inner(array_expressions::array_concat)(args) - }), - BuiltinScalarFunction::ArrayDistinct => Arc::new(|args| { - make_scalar_function_inner(array_expressions::array_distinct)(args) - }), BuiltinScalarFunction::ArrayElement => Arc::new(|args| { make_scalar_function_inner(array_expressions::array_element)(args) }), @@ -332,12 +318,6 @@ pub fn create_physical_fun( BuiltinScalarFunction::ArrayPositions => Arc::new(|args| { make_scalar_function_inner(array_expressions::array_positions)(args) }), - BuiltinScalarFunction::ArrayPrepend => Arc::new(|args| { - make_scalar_function_inner(array_expressions::array_prepend)(args) - }), - BuiltinScalarFunction::ArrayRepeat => Arc::new(|args| { - make_scalar_function_inner(array_expressions::array_repeat)(args) - }), BuiltinScalarFunction::ArrayRemove => Arc::new(|args| { make_scalar_function_inner(array_expressions::array_remove)(args) }), @@ -368,14 +348,9 @@ pub fn create_physical_fun( 
BuiltinScalarFunction::ArrayResize => Arc::new(|args| { make_scalar_function_inner(array_expressions::array_resize)(args) }), - BuiltinScalarFunction::MakeArray => Arc::new(|args| { - make_scalar_function_inner(array_expressions::make_array)(args) - }), BuiltinScalarFunction::ArrayUnion => Arc::new(|args| { make_scalar_function_inner(array_expressions::array_union)(args) }), - // struct functions - BuiltinScalarFunction::Struct => Arc::new(struct_expressions::struct_expr), // string functions BuiltinScalarFunction::Ascii => Arc::new(|args| match args[0].data_type() { @@ -593,21 +568,6 @@ pub fn create_physical_fun( exec_err!("Unsupported data type {other:?} for function split_part") } }), - BuiltinScalarFunction::StringToArray => { - Arc::new(|args| match args[0].data_type() { - DataType::Utf8 => make_scalar_function_inner( - array_expressions::string_to_array::, - )(args), - DataType::LargeUtf8 => make_scalar_function_inner( - array_expressions::string_to_array::, - )(args), - other => { - exec_err!( - "Unsupported data type {other:?} for function string_to_array" - ) - } - }) - } BuiltinScalarFunction::StartsWith => Arc::new(|args| match args[0].data_type() { DataType::Utf8 => { make_scalar_function_inner(string_expressions::starts_with::)(args) @@ -699,19 +659,6 @@ pub fn create_physical_fun( }), BuiltinScalarFunction::Upper => Arc::new(string_expressions::upper), BuiltinScalarFunction::Uuid => Arc::new(string_expressions::uuid), - BuiltinScalarFunction::ArrowTypeof => Arc::new(move |args| { - if args.len() != 1 { - return exec_err!( - "arrow_typeof function requires 1 arguments, got {}", - args.len() - ); - } - - let input_data_type = args[0].data_type(); - Ok(ColumnarValue::Scalar(ScalarValue::from(format!( - "{input_data_type}" - )))) - }), BuiltinScalarFunction::OverLay => Arc::new(|args| match args[0].data_type() { DataType::Utf8 => { make_scalar_function_inner(string_expressions::overlay::)(args) diff --git a/datafusion/physical-expr/src/lib.rs b/datafusion/physical-expr/src/lib.rs index b36e5d79bb44..07bccf25c86a 100644 --- a/datafusion/physical-expr/src/lib.rs +++ b/datafusion/physical-expr/src/lib.rs @@ -35,7 +35,6 @@ mod scalar_function; mod sort_expr; pub mod sort_properties; pub mod string_expressions; -pub mod struct_expressions; pub mod tree_node; pub mod udf; #[cfg(feature = "unicode_expressions")] diff --git a/datafusion/physical-expr/src/math_expressions.rs b/datafusion/physical-expr/src/math_expressions.rs index a8c115ba3a82..db8855cb5400 100644 --- a/datafusion/physical-expr/src/math_expressions.rs +++ b/datafusion/physical-expr/src/math_expressions.rs @@ -159,10 +159,8 @@ math_unary_function!("sqrt", sqrt); math_unary_function!("cbrt", cbrt); math_unary_function!("sin", sin); math_unary_function!("cos", cos); -math_unary_function!("tan", tan); math_unary_function!("sinh", sinh); math_unary_function!("cosh", cosh); -math_unary_function!("tanh", tanh); math_unary_function!("asin", asin); math_unary_function!("acos", acos); math_unary_function!("atan", atan); diff --git a/datafusion/physical-expr/src/scalar_function.rs b/datafusion/physical-expr/src/scalar_function.rs index bfe0fdb279f5..1c9f0e609c3c 100644 --- a/datafusion/physical-expr/src/scalar_function.rs +++ b/datafusion/physical-expr/src/scalar_function.rs @@ -153,14 +153,15 @@ impl PhysicalExpr for ScalarFunctionExpr { if scalar_fun .signature() .type_signature - .supports_zero_argument() - && scalar_fun != BuiltinScalarFunction::MakeArray => + .supports_zero_argument() => { 
vec![ColumnarValue::create_null_array(batch.num_rows())] } // If the function supports zero argument, we pass in a null array indicating the batch size. // This is for user-defined functions. - (true, Err(_)) if self.supports_zero_argument => { + (true, Err(_)) + if self.supports_zero_argument && self.name != "make_array" => + { vec![ColumnarValue::create_null_array(batch.num_rows())] } _ => self diff --git a/datafusion/physical-expr/src/window/nth_value.rs b/datafusion/physical-expr/src/window/nth_value.rs index a7bb31b6e109..5c7c891f92d2 100644 --- a/datafusion/physical-expr/src/window/nth_value.rs +++ b/datafusion/physical-expr/src/window/nth_value.rs @@ -42,6 +42,7 @@ pub struct NthValue { /// Output data type data_type: DataType, kind: NthValueKind, + ignore_nulls: bool, } impl NthValue { @@ -50,12 +51,14 @@ impl NthValue { name: impl Into, expr: Arc, data_type: DataType, + ignore_nulls: bool, ) -> Self { Self { name: name.into(), expr, data_type, kind: NthValueKind::First, + ignore_nulls, } } @@ -64,12 +67,14 @@ impl NthValue { name: impl Into, expr: Arc, data_type: DataType, + ignore_nulls: bool, ) -> Self { Self { name: name.into(), expr, data_type, kind: NthValueKind::Last, + ignore_nulls, } } @@ -79,7 +84,11 @@ impl NthValue { expr: Arc, data_type: DataType, n: u32, + ignore_nulls: bool, ) -> Result { + if ignore_nulls { + return exec_err!("NTH_VALUE ignore_nulls is not supported yet"); + } match n { 0 => exec_err!("NTH_VALUE expects n to be non-zero"), _ => Ok(Self { @@ -87,6 +96,7 @@ impl NthValue { expr, data_type, kind: NthValueKind::Nth(n as i64), + ignore_nulls, }), } } @@ -122,7 +132,10 @@ impl BuiltInWindowFunctionExpr for NthValue { finalized_result: None, kind: self.kind, }; - Ok(Box::new(NthValueEvaluator { state })) + Ok(Box::new(NthValueEvaluator { + state, + ignore_nulls: self.ignore_nulls, + })) } fn reverse_expr(&self) -> Option> { @@ -136,6 +149,7 @@ impl BuiltInWindowFunctionExpr for NthValue { expr: self.expr.clone(), data_type: self.data_type.clone(), kind: reversed_kind, + ignore_nulls: self.ignore_nulls, })) } } @@ -144,6 +158,7 @@ impl BuiltInWindowFunctionExpr for NthValue { #[derive(Debug)] pub(crate) struct NthValueEvaluator { state: NthValueState, + ignore_nulls: bool, } impl PartitionEvaluator for NthValueEvaluator { @@ -184,7 +199,8 @@ impl PartitionEvaluator for NthValueEvaluator { } } }; - if is_prunable { + // Do not memoize results when nulls are ignored. + if is_prunable && !self.ignore_nulls { if self.state.finalized_result.is_none() && !is_reverse_direction { let result = ScalarValue::try_from_array(out, size - 1)?; self.state.finalized_result = Some(result); @@ -210,9 +226,39 @@ impl PartitionEvaluator for NthValueEvaluator { // We produce None if the window is empty. return ScalarValue::try_from(arr.data_type()); } + + // Extract valid indices if ignoring nulls. 
+ let (slice, valid_indices) = if self.ignore_nulls { + let slice = arr.slice(range.start, n_range); + let valid_indices = + slice.nulls().unwrap().valid_indices().collect::>(); + if valid_indices.is_empty() { + return ScalarValue::try_from(arr.data_type()); + } + (Some(slice), Some(valid_indices)) + } else { + (None, None) + }; match self.state.kind { - NthValueKind::First => ScalarValue::try_from_array(arr, range.start), - NthValueKind::Last => ScalarValue::try_from_array(arr, range.end - 1), + NthValueKind::First => { + if let Some(slice) = &slice { + let valid_indices = valid_indices.unwrap(); + ScalarValue::try_from_array(slice, valid_indices[0]) + } else { + ScalarValue::try_from_array(arr, range.start) + } + } + NthValueKind::Last => { + if let Some(slice) = &slice { + let valid_indices = valid_indices.unwrap(); + ScalarValue::try_from_array( + slice, + valid_indices[valid_indices.len() - 1], + ) + } else { + ScalarValue::try_from_array(arr, range.end - 1) + } + } NthValueKind::Nth(n) => { match n.cmp(&0) { Ordering::Greater => { @@ -295,6 +341,7 @@ mod tests { "first_value".to_owned(), Arc::new(Column::new("arr", 0)), DataType::Int32, + false, ); test_i32_result(first_value, Int32Array::from(vec![1; 8]))?; Ok(()) @@ -306,6 +353,7 @@ mod tests { "last_value".to_owned(), Arc::new(Column::new("arr", 0)), DataType::Int32, + false, ); test_i32_result( last_value, @@ -330,6 +378,7 @@ mod tests { Arc::new(Column::new("arr", 0)), DataType::Int32, 1, + false, )?; test_i32_result(nth_value, Int32Array::from(vec![1; 8]))?; Ok(()) @@ -342,6 +391,7 @@ mod tests { Arc::new(Column::new("arr", 0)), DataType::Int32, 2, + false, )?; test_i32_result( nth_value, diff --git a/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs b/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs index 4cba571054de..0349f8f1eeec 100644 --- a/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs +++ b/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs @@ -1179,15 +1179,19 @@ mod tests { .map(|e| Arc::new(e) as Arc)?; let col_a = col("a", &schema)?; let nth_value_func1 = - NthValue::nth("nth_value(-1)", col_a.clone(), DataType::Int32, 1)? + NthValue::nth("nth_value(-1)", col_a.clone(), DataType::Int32, 1, false)? .reverse_expr() .unwrap(); let nth_value_func2 = - NthValue::nth("nth_value(-2)", col_a.clone(), DataType::Int32, 2)? + NthValue::nth("nth_value(-2)", col_a.clone(), DataType::Int32, 2, false)? .reverse_expr() .unwrap(); - let last_value_func = - Arc::new(NthValue::last("last", col_a.clone(), DataType::Int32)) as _; + let last_value_func = Arc::new(NthValue::last( + "last", + col_a.clone(), + DataType::Int32, + false, + )) as _; let window_exprs = vec![ // LAST_VALUE(a) Arc::new(BuiltInWindowExpr::new( diff --git a/datafusion/physical-plan/src/windows/mod.rs b/datafusion/physical-plan/src/windows/mod.rs index f91b525d6090..6712bc855ffd 100644 --- a/datafusion/physical-plan/src/windows/mod.rs +++ b/datafusion/physical-plan/src/windows/mod.rs @@ -250,15 +250,21 @@ fn create_built_in_window_expr( .try_into() .map_err(|e| DataFusionError::Execution(format!("{e:?}")))?; let n: u32 = n as u32; - Arc::new(NthValue::nth(name, arg, data_type.clone(), n)?) + Arc::new(NthValue::nth( + name, + arg, + data_type.clone(), + n, + ignore_nulls, + )?) 
} BuiltInWindowFunction::FirstValue => { let arg = args[0].clone(); - Arc::new(NthValue::first(name, arg, data_type.clone())) + Arc::new(NthValue::first(name, arg, data_type.clone(), ignore_nulls)) } BuiltInWindowFunction::LastValue => { let arg = args[0].clone(); - Arc::new(NthValue::last(name, arg, data_type.clone())) + Arc::new(NthValue::last(name, arg, data_type.clone(), ignore_nulls)) } }) } diff --git a/datafusion/proto/proto/datafusion.proto b/datafusion/proto/proto/datafusion.proto index dfdf6bd68276..e6ee41fadb9f 100644 --- a/datafusion/proto/proto/datafusion.proto +++ b/datafusion/proto/proto/datafusion.proto @@ -90,17 +90,13 @@ message ProjectionColumns { } message CsvFormat { - bool has_header = 1; - string delimiter = 2; - string quote = 3; - oneof optional_escape { - string escape = 4; - } + CsvOptions options = 5; } message ParquetFormat { // Used to be bool enable_pruning = 1; reserved 1; + TableParquetOptions options = 2; } message AvroFormat {} @@ -324,22 +320,18 @@ message DistinctOnNode { message CopyToNode { LogicalPlanNode input = 1; string output_url = 2; - oneof CopyOptions { - SQLOptions sql_options = 4; - FileTypeWriterOptions writer_options = 5; + oneof format_options { + CsvOptions csv = 8; + JsonOptions json = 9; + TableParquetOptions parquet = 10; + AvroOptions avro = 11; + ArrowOptions arrow = 12; } - string file_type = 6; repeated string partition_by = 7; } -message SQLOptions { - repeated SQLOption option = 1; -} - -message SQLOption { - string key = 1; - string value = 2; -} +message AvroOptions {} +message ArrowOptions {} message UnionNode { repeated LogicalPlanNode inputs = 1; @@ -567,9 +559,9 @@ enum ScalarFunction { Signum = 15; Sin = 16; Sqrt = 17; - Tan = 18; + // Tan = 18; Trunc = 19; - Array = 20; + // 20 was Array // RegexpMatch = 21; BitLength = 22; Btrim = 23; @@ -614,11 +606,11 @@ enum ScalarFunction { Upper = 62; Coalesce = 63; Power = 64; - StructFun = 65; + // 65 was StructFun // 66 was FromUnixtime Atan2 = 67; // 68 was DateBin - ArrowTypeof = 69; + // 69 was ArrowTypeof // 70 was CurrentDate // 71 was CurrentTime Uuid = 72; @@ -628,22 +620,22 @@ enum ScalarFunction { Atanh = 76; Sinh = 77; Cosh = 78; - Tanh = 79; + // Tanh = 79; Pi = 80; Degrees = 81; Radians = 82; Factorial = 83; Lcm = 84; Gcd = 85; - ArrayAppend = 86; - ArrayConcat = 87; + // 86 was ArrayAppend + // 87 was ArrayConcat // 88 was ArrayDims - ArrayRepeat = 89; + // 89 was ArrayRepeat // 90 was ArrayLength // 91 was ArrayNdims ArrayPosition = 92; ArrayPositions = 93; - ArrayPrepend = 94; + // 94 was ArrayPrepend ArrayRemove = 95; ArrayReplace = 96; // 97 was ArrayToString @@ -664,7 +656,7 @@ enum ScalarFunction { Iszero = 114; // 115 was ArrayEmpty ArrayPopBack = 116; - StringToArray = 117; + // 117 was StringToArray // 118 was ToTimestampNanos ArrayIntersect = 119; ArrayUnion = 120; @@ -675,8 +667,8 @@ enum ScalarFunction { Levenshtein = 125; SubstrIndex = 126; FindInSet = 127; - ArraySort = 128; - ArrayDistinct = 129; + /// 128 was ArraySort + /// 129 was ArrayDistinct ArrayResize = 130; EndsWith = 131; /// 132 was InStr @@ -1219,22 +1211,11 @@ message PartitionColumn { ArrowType arrow_type = 2; } -message FileTypeWriterOptions { - oneof FileType { - JsonWriterOptions json_options = 1; - ParquetWriterOptions parquet_options = 2; - CsvWriterOptions csv_options = 3; - ArrowWriterOptions arrow_options = 4; - } -} message JsonWriterOptions { CompressionTypeVariant compression = 1; } -message ParquetWriterOptions { - WriterProperties writer_properties = 1; -} message 
CsvWriterOptions { // Compression type @@ -1255,16 +1236,26 @@ message CsvWriterOptions { string null_value = 8; } -message ArrowWriterOptions {} +// Options controlling CSV format +message CsvOptions { + bool has_header = 1; // Indicates if the CSV has a header row + bytes delimiter = 2; // Delimiter character as a byte + bytes quote = 3; // Quote character as a byte + bytes escape = 4; // Optional escape character as a byte + CompressionTypeVariant compression = 5; // Compression type + uint64 schema_infer_max_rec = 6; // Max records for schema inference + string date_format = 7; // Optional date format + string datetime_format = 8; // Optional datetime format + string timestamp_format = 9; // Optional timestamp format + string timestamp_tz_format = 10; // Optional timestamp with timezone format + string time_format = 11; // Optional time format + string null_value = 12; // Optional representation of null value +} -message WriterProperties { - uint64 data_page_size_limit = 1; - uint64 dictionary_page_size_limit = 2; - uint64 data_page_row_count_limit = 3; - uint64 write_batch_size = 4; - uint64 max_row_group_size = 5; - string writer_version = 6; - string created_by = 7; +// Options controlling CSV format +message JsonOptions { + CompressionTypeVariant compression = 1; // Compression type + uint64 schema_infer_max_rec = 2; // Max records for schema inference } message FileSinkConfig { @@ -1276,11 +1267,11 @@ message FileSinkConfig { Schema output_schema = 4; repeated PartitionColumn table_partition_cols = 5; bool overwrite = 8; - FileTypeWriterOptions file_type_writer_options = 9; } message JsonSink { FileSinkConfig config = 1; + JsonWriterOptions writer_options = 2; } message JsonSinkExecNode { @@ -1292,6 +1283,7 @@ message JsonSinkExecNode { message CsvSink { FileSinkConfig config = 1; + CsvWriterOptions writer_options = 2; } message CsvSinkExecNode { @@ -1301,8 +1293,115 @@ message CsvSinkExecNode { PhysicalSortExprNodeCollection sort_order = 4; } +message TableParquetOptions { + ParquetOptions global = 1; + repeated ColumnSpecificOptions column_specific_options = 2; +} + +message ColumnSpecificOptions { + string column_name = 1; + ColumnOptions options = 2; +} + +message ColumnOptions { + oneof bloom_filter_enabled_opt { + bool bloom_filter_enabled = 1; + } + + oneof encoding_opt { + string encoding = 2; + } + + oneof dictionary_enabled_opt { + bool dictionary_enabled = 3; + } + + oneof compression_opt { + string compression = 4; + } + + oneof statistics_enabled_opt { + string statistics_enabled = 5; + } + + oneof bloom_filter_fpp_opt { + double bloom_filter_fpp = 6; + } + + oneof bloom_filter_ndv_opt { + uint64 bloom_filter_ndv = 7; + } + + oneof max_statistics_size_opt { + uint32 max_statistics_size = 8; + } +} + +message ParquetOptions { + // Regular fields + bool enable_page_index = 1; // default = true + bool pruning = 2; // default = true + bool skip_metadata = 3; // default = true + bool pushdown_filters = 5; // default = false + bool reorder_filters = 6; // default = false + uint64 data_pagesize_limit = 7; // default = 1024 * 1024 + uint64 write_batch_size = 8; // default = 1024 + string writer_version = 9; // default = "1.0" + bool bloom_filter_enabled = 20; // default = false + bool allow_single_file_parallelism = 23; // default = true + uint64 maximum_parallel_row_group_writers = 24; // default = 1 + uint64 maximum_buffered_record_batches_per_stream = 25; // default = 2 + + oneof metadata_size_hint_opt { + uint64 metadata_size_hint = 4; + } + + oneof compression_opt { + 
string compression = 10; + } + + oneof dictionary_enabled_opt { + bool dictionary_enabled = 11; + } + + oneof statistics_enabled_opt { + string statistics_enabled = 13; + } + + oneof max_statistics_size_opt { + uint64 max_statistics_size = 14; + } + + oneof column_index_truncate_length_opt { + uint64 column_index_truncate_length = 17; + } + + oneof encoding_opt { + string encoding = 19; + } + + oneof bloom_filter_fpp_opt { + double bloom_filter_fpp = 21; + } + + oneof bloom_filter_ndv_opt { + uint64 bloom_filter_ndv = 22; + } + + uint64 dictionary_page_size_limit = 12; + + uint64 data_page_row_count_limit = 18; + + uint64 max_row_group_size = 15; + + string created_by = 16; +} + + + message ParquetSink { FileSinkConfig config = 1; + TableParquetOptions parquet_options = 2; } message ParquetSinkExecNode { diff --git a/datafusion/proto/src/generated/pbjson.rs b/datafusion/proto/src/generated/pbjson.rs index 83b0c6813b4b..37cc1a45785b 100644 --- a/datafusion/proto/src/generated/pbjson.rs +++ b/datafusion/proto/src/generated/pbjson.rs @@ -1409,6 +1409,77 @@ impl<'de> serde::Deserialize<'de> for AnalyzedLogicalPlanType { deserializer.deserialize_struct("datafusion.AnalyzedLogicalPlanType", FIELDS, GeneratedVisitor) } } +impl serde::Serialize for ArrowOptions { + #[allow(deprecated)] + fn serialize(&self, serializer: S) -> std::result::Result + where + S: serde::Serializer, + { + use serde::ser::SerializeStruct; + let len = 0; + let struct_ser = serializer.serialize_struct("datafusion.ArrowOptions", len)?; + struct_ser.end() + } +} +impl<'de> serde::Deserialize<'de> for ArrowOptions { + #[allow(deprecated)] + fn deserialize(deserializer: D) -> std::result::Result + where + D: serde::Deserializer<'de>, + { + const FIELDS: &[&str] = &[ + ]; + + #[allow(clippy::enum_variant_names)] + enum GeneratedField { + } + impl<'de> serde::Deserialize<'de> for GeneratedField { + fn deserialize(deserializer: D) -> std::result::Result + where + D: serde::Deserializer<'de>, + { + struct GeneratedVisitor; + + impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { + type Value = GeneratedField; + + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(formatter, "expected one of: {:?}", &FIELDS) + } + + #[allow(unused_variables)] + fn visit_str(self, value: &str) -> std::result::Result + where + E: serde::de::Error, + { + Err(serde::de::Error::unknown_field(value, FIELDS)) + } + } + deserializer.deserialize_identifier(GeneratedVisitor) + } + } + struct GeneratedVisitor; + impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { + type Value = ArrowOptions; + + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + formatter.write_str("struct datafusion.ArrowOptions") + } + + fn visit_map(self, mut map_: V) -> std::result::Result + where + V: serde::de::MapAccess<'de>, + { + while map_.next_key::()?.is_some() { + let _ = map_.next_value::()?; + } + Ok(ArrowOptions { + }) + } + } + deserializer.deserialize_struct("datafusion.ArrowOptions", FIELDS, GeneratedVisitor) + } +} impl serde::Serialize for ArrowType { #[allow(deprecated)] fn serialize(&self, serializer: S) -> std::result::Result @@ -1929,7 +2000,7 @@ impl<'de> serde::Deserialize<'de> for ArrowType { deserializer.deserialize_struct("datafusion.ArrowType", FIELDS, GeneratedVisitor) } } -impl serde::Serialize for ArrowWriterOptions { +impl serde::Serialize for AvroFormat { #[allow(deprecated)] fn serialize(&self, serializer: S) -> std::result::Result where @@ -1937,11 +2008,11 @@ impl 
serde::Serialize for ArrowWriterOptions { { use serde::ser::SerializeStruct; let len = 0; - let struct_ser = serializer.serialize_struct("datafusion.ArrowWriterOptions", len)?; + let struct_ser = serializer.serialize_struct("datafusion.AvroFormat", len)?; struct_ser.end() } } -impl<'de> serde::Deserialize<'de> for ArrowWriterOptions { +impl<'de> serde::Deserialize<'de> for AvroFormat { #[allow(deprecated)] fn deserialize(deserializer: D) -> std::result::Result where @@ -1980,27 +2051,27 @@ impl<'de> serde::Deserialize<'de> for ArrowWriterOptions { } struct GeneratedVisitor; impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { - type Value = ArrowWriterOptions; + type Value = AvroFormat; fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - formatter.write_str("struct datafusion.ArrowWriterOptions") + formatter.write_str("struct datafusion.AvroFormat") } - fn visit_map(self, mut map_: V) -> std::result::Result + fn visit_map(self, mut map_: V) -> std::result::Result where V: serde::de::MapAccess<'de>, { while map_.next_key::()?.is_some() { let _ = map_.next_value::()?; } - Ok(ArrowWriterOptions { + Ok(AvroFormat { }) } } - deserializer.deserialize_struct("datafusion.ArrowWriterOptions", FIELDS, GeneratedVisitor) + deserializer.deserialize_struct("datafusion.AvroFormat", FIELDS, GeneratedVisitor) } } -impl serde::Serialize for AvroFormat { +impl serde::Serialize for AvroOptions { #[allow(deprecated)] fn serialize(&self, serializer: S) -> std::result::Result where @@ -2008,11 +2079,11 @@ impl serde::Serialize for AvroFormat { { use serde::ser::SerializeStruct; let len = 0; - let struct_ser = serializer.serialize_struct("datafusion.AvroFormat", len)?; + let struct_ser = serializer.serialize_struct("datafusion.AvroOptions", len)?; struct_ser.end() } } -impl<'de> serde::Deserialize<'de> for AvroFormat { +impl<'de> serde::Deserialize<'de> for AvroOptions { #[allow(deprecated)] fn deserialize(deserializer: D) -> std::result::Result where @@ -2051,24 +2122,24 @@ impl<'de> serde::Deserialize<'de> for AvroFormat { } struct GeneratedVisitor; impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { - type Value = AvroFormat; + type Value = AvroOptions; fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - formatter.write_str("struct datafusion.AvroFormat") + formatter.write_str("struct datafusion.AvroOptions") } - fn visit_map(self, mut map_: V) -> std::result::Result + fn visit_map(self, mut map_: V) -> std::result::Result where V: serde::de::MapAccess<'de>, { while map_.next_key::()?.is_some() { let _ = map_.next_value::()?; } - Ok(AvroFormat { + Ok(AvroOptions { }) } } - deserializer.deserialize_struct("datafusion.AvroFormat", FIELDS, GeneratedVisitor) + deserializer.deserialize_struct("datafusion.AvroOptions", FIELDS, GeneratedVisitor) } } impl serde::Serialize for AvroScanExecNode { @@ -3260,6 +3331,255 @@ impl<'de> serde::Deserialize<'de> for ColumnIndex { deserializer.deserialize_struct("datafusion.ColumnIndex", FIELDS, GeneratedVisitor) } } +impl serde::Serialize for ColumnOptions { + #[allow(deprecated)] + fn serialize(&self, serializer: S) -> std::result::Result + where + S: serde::Serializer, + { + use serde::ser::SerializeStruct; + let mut len = 0; + if self.bloom_filter_enabled_opt.is_some() { + len += 1; + } + if self.encoding_opt.is_some() { + len += 1; + } + if self.dictionary_enabled_opt.is_some() { + len += 1; + } + if self.compression_opt.is_some() { + len += 1; + } + if self.statistics_enabled_opt.is_some() { + len += 
1; + } + if self.bloom_filter_fpp_opt.is_some() { + len += 1; + } + if self.bloom_filter_ndv_opt.is_some() { + len += 1; + } + if self.max_statistics_size_opt.is_some() { + len += 1; + } + let mut struct_ser = serializer.serialize_struct("datafusion.ColumnOptions", len)?; + if let Some(v) = self.bloom_filter_enabled_opt.as_ref() { + match v { + column_options::BloomFilterEnabledOpt::BloomFilterEnabled(v) => { + struct_ser.serialize_field("bloomFilterEnabled", v)?; + } + } + } + if let Some(v) = self.encoding_opt.as_ref() { + match v { + column_options::EncodingOpt::Encoding(v) => { + struct_ser.serialize_field("encoding", v)?; + } + } + } + if let Some(v) = self.dictionary_enabled_opt.as_ref() { + match v { + column_options::DictionaryEnabledOpt::DictionaryEnabled(v) => { + struct_ser.serialize_field("dictionaryEnabled", v)?; + } + } + } + if let Some(v) = self.compression_opt.as_ref() { + match v { + column_options::CompressionOpt::Compression(v) => { + struct_ser.serialize_field("compression", v)?; + } + } + } + if let Some(v) = self.statistics_enabled_opt.as_ref() { + match v { + column_options::StatisticsEnabledOpt::StatisticsEnabled(v) => { + struct_ser.serialize_field("statisticsEnabled", v)?; + } + } + } + if let Some(v) = self.bloom_filter_fpp_opt.as_ref() { + match v { + column_options::BloomFilterFppOpt::BloomFilterFpp(v) => { + struct_ser.serialize_field("bloomFilterFpp", v)?; + } + } + } + if let Some(v) = self.bloom_filter_ndv_opt.as_ref() { + match v { + column_options::BloomFilterNdvOpt::BloomFilterNdv(v) => { + #[allow(clippy::needless_borrow)] + struct_ser.serialize_field("bloomFilterNdv", ToString::to_string(&v).as_str())?; + } + } + } + if let Some(v) = self.max_statistics_size_opt.as_ref() { + match v { + column_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v) => { + struct_ser.serialize_field("maxStatisticsSize", v)?; + } + } + } + struct_ser.end() + } +} +impl<'de> serde::Deserialize<'de> for ColumnOptions { + #[allow(deprecated)] + fn deserialize(deserializer: D) -> std::result::Result + where + D: serde::Deserializer<'de>, + { + const FIELDS: &[&str] = &[ + "bloom_filter_enabled", + "bloomFilterEnabled", + "encoding", + "dictionary_enabled", + "dictionaryEnabled", + "compression", + "statistics_enabled", + "statisticsEnabled", + "bloom_filter_fpp", + "bloomFilterFpp", + "bloom_filter_ndv", + "bloomFilterNdv", + "max_statistics_size", + "maxStatisticsSize", + ]; + + #[allow(clippy::enum_variant_names)] + enum GeneratedField { + BloomFilterEnabled, + Encoding, + DictionaryEnabled, + Compression, + StatisticsEnabled, + BloomFilterFpp, + BloomFilterNdv, + MaxStatisticsSize, + } + impl<'de> serde::Deserialize<'de> for GeneratedField { + fn deserialize(deserializer: D) -> std::result::Result + where + D: serde::Deserializer<'de>, + { + struct GeneratedVisitor; + + impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { + type Value = GeneratedField; + + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(formatter, "expected one of: {:?}", &FIELDS) + } + + #[allow(unused_variables)] + fn visit_str(self, value: &str) -> std::result::Result + where + E: serde::de::Error, + { + match value { + "bloomFilterEnabled" | "bloom_filter_enabled" => Ok(GeneratedField::BloomFilterEnabled), + "encoding" => Ok(GeneratedField::Encoding), + "dictionaryEnabled" | "dictionary_enabled" => Ok(GeneratedField::DictionaryEnabled), + "compression" => Ok(GeneratedField::Compression), + "statisticsEnabled" | "statistics_enabled" => 
Ok(GeneratedField::StatisticsEnabled), + "bloomFilterFpp" | "bloom_filter_fpp" => Ok(GeneratedField::BloomFilterFpp), + "bloomFilterNdv" | "bloom_filter_ndv" => Ok(GeneratedField::BloomFilterNdv), + "maxStatisticsSize" | "max_statistics_size" => Ok(GeneratedField::MaxStatisticsSize), + _ => Err(serde::de::Error::unknown_field(value, FIELDS)), + } + } + } + deserializer.deserialize_identifier(GeneratedVisitor) + } + } + struct GeneratedVisitor; + impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { + type Value = ColumnOptions; + + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + formatter.write_str("struct datafusion.ColumnOptions") + } + + fn visit_map(self, mut map_: V) -> std::result::Result + where + V: serde::de::MapAccess<'de>, + { + let mut bloom_filter_enabled_opt__ = None; + let mut encoding_opt__ = None; + let mut dictionary_enabled_opt__ = None; + let mut compression_opt__ = None; + let mut statistics_enabled_opt__ = None; + let mut bloom_filter_fpp_opt__ = None; + let mut bloom_filter_ndv_opt__ = None; + let mut max_statistics_size_opt__ = None; + while let Some(k) = map_.next_key()? { + match k { + GeneratedField::BloomFilterEnabled => { + if bloom_filter_enabled_opt__.is_some() { + return Err(serde::de::Error::duplicate_field("bloomFilterEnabled")); + } + bloom_filter_enabled_opt__ = map_.next_value::<::std::option::Option<_>>()?.map(column_options::BloomFilterEnabledOpt::BloomFilterEnabled); + } + GeneratedField::Encoding => { + if encoding_opt__.is_some() { + return Err(serde::de::Error::duplicate_field("encoding")); + } + encoding_opt__ = map_.next_value::<::std::option::Option<_>>()?.map(column_options::EncodingOpt::Encoding); + } + GeneratedField::DictionaryEnabled => { + if dictionary_enabled_opt__.is_some() { + return Err(serde::de::Error::duplicate_field("dictionaryEnabled")); + } + dictionary_enabled_opt__ = map_.next_value::<::std::option::Option<_>>()?.map(column_options::DictionaryEnabledOpt::DictionaryEnabled); + } + GeneratedField::Compression => { + if compression_opt__.is_some() { + return Err(serde::de::Error::duplicate_field("compression")); + } + compression_opt__ = map_.next_value::<::std::option::Option<_>>()?.map(column_options::CompressionOpt::Compression); + } + GeneratedField::StatisticsEnabled => { + if statistics_enabled_opt__.is_some() { + return Err(serde::de::Error::duplicate_field("statisticsEnabled")); + } + statistics_enabled_opt__ = map_.next_value::<::std::option::Option<_>>()?.map(column_options::StatisticsEnabledOpt::StatisticsEnabled); + } + GeneratedField::BloomFilterFpp => { + if bloom_filter_fpp_opt__.is_some() { + return Err(serde::de::Error::duplicate_field("bloomFilterFpp")); + } + bloom_filter_fpp_opt__ = map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| column_options::BloomFilterFppOpt::BloomFilterFpp(x.0)); + } + GeneratedField::BloomFilterNdv => { + if bloom_filter_ndv_opt__.is_some() { + return Err(serde::de::Error::duplicate_field("bloomFilterNdv")); + } + bloom_filter_ndv_opt__ = map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| column_options::BloomFilterNdvOpt::BloomFilterNdv(x.0)); + } + GeneratedField::MaxStatisticsSize => { + if max_statistics_size_opt__.is_some() { + return Err(serde::de::Error::duplicate_field("maxStatisticsSize")); + } + max_statistics_size_opt__ = map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| 
column_options::MaxStatisticsSizeOpt::MaxStatisticsSize(x.0)); + } + } + } + Ok(ColumnOptions { + bloom_filter_enabled_opt: bloom_filter_enabled_opt__, + encoding_opt: encoding_opt__, + dictionary_enabled_opt: dictionary_enabled_opt__, + compression_opt: compression_opt__, + statistics_enabled_opt: statistics_enabled_opt__, + bloom_filter_fpp_opt: bloom_filter_fpp_opt__, + bloom_filter_ndv_opt: bloom_filter_ndv_opt__, + max_statistics_size_opt: max_statistics_size_opt__, + }) + } + } + deserializer.deserialize_struct("datafusion.ColumnOptions", FIELDS, GeneratedVisitor) + } +} impl serde::Serialize for ColumnRelation { #[allow(deprecated)] fn serialize(&self, serializer: S) -> std::result::Result @@ -3351,7 +3671,7 @@ impl<'de> serde::Deserialize<'de> for ColumnRelation { deserializer.deserialize_struct("datafusion.ColumnRelation", FIELDS, GeneratedVisitor) } } -impl serde::Serialize for ColumnStats { +impl serde::Serialize for ColumnSpecificOptions { #[allow(deprecated)] fn serialize(&self, serializer: S) -> std::result::Result where @@ -3359,57 +3679,38 @@ impl serde::Serialize for ColumnStats { { use serde::ser::SerializeStruct; let mut len = 0; - if self.min_value.is_some() { - len += 1; - } - if self.max_value.is_some() { + if !self.column_name.is_empty() { len += 1; } - if self.null_count.is_some() { - len += 1; - } - if self.distinct_count.is_some() { + if self.options.is_some() { len += 1; } - let mut struct_ser = serializer.serialize_struct("datafusion.ColumnStats", len)?; - if let Some(v) = self.min_value.as_ref() { - struct_ser.serialize_field("minValue", v)?; - } - if let Some(v) = self.max_value.as_ref() { - struct_ser.serialize_field("maxValue", v)?; - } - if let Some(v) = self.null_count.as_ref() { - struct_ser.serialize_field("nullCount", v)?; + let mut struct_ser = serializer.serialize_struct("datafusion.ColumnSpecificOptions", len)?; + if !self.column_name.is_empty() { + struct_ser.serialize_field("columnName", &self.column_name)?; } - if let Some(v) = self.distinct_count.as_ref() { - struct_ser.serialize_field("distinctCount", v)?; + if let Some(v) = self.options.as_ref() { + struct_ser.serialize_field("options", v)?; } struct_ser.end() } } -impl<'de> serde::Deserialize<'de> for ColumnStats { +impl<'de> serde::Deserialize<'de> for ColumnSpecificOptions { #[allow(deprecated)] fn deserialize(deserializer: D) -> std::result::Result where D: serde::Deserializer<'de>, { const FIELDS: &[&str] = &[ - "min_value", - "minValue", - "max_value", - "maxValue", - "null_count", - "nullCount", - "distinct_count", - "distinctCount", + "column_name", + "columnName", + "options", ]; #[allow(clippy::enum_variant_names)] enum GeneratedField { - MinValue, - MaxValue, - NullCount, - DistinctCount, + ColumnName, + Options, } impl<'de> serde::Deserialize<'de> for GeneratedField { fn deserialize(deserializer: D) -> std::result::Result @@ -3431,10 +3732,8 @@ impl<'de> serde::Deserialize<'de> for ColumnStats { E: serde::de::Error, { match value { - "minValue" | "min_value" => Ok(GeneratedField::MinValue), - "maxValue" | "max_value" => Ok(GeneratedField::MaxValue), - "nullCount" | "null_count" => Ok(GeneratedField::NullCount), - "distinctCount" | "distinct_count" => Ok(GeneratedField::DistinctCount), + "columnName" | "column_name" => Ok(GeneratedField::ColumnName), + "options" => Ok(GeneratedField::Options), _ => Err(serde::de::Error::unknown_field(value, FIELDS)), } } @@ -3444,7 +3743,137 @@ impl<'de> serde::Deserialize<'de> for ColumnStats { } struct GeneratedVisitor; impl<'de> 
serde::de::Visitor<'de> for GeneratedVisitor { - type Value = ColumnStats; + type Value = ColumnSpecificOptions; + + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + formatter.write_str("struct datafusion.ColumnSpecificOptions") + } + + fn visit_map(self, mut map_: V) -> std::result::Result + where + V: serde::de::MapAccess<'de>, + { + let mut column_name__ = None; + let mut options__ = None; + while let Some(k) = map_.next_key()? { + match k { + GeneratedField::ColumnName => { + if column_name__.is_some() { + return Err(serde::de::Error::duplicate_field("columnName")); + } + column_name__ = Some(map_.next_value()?); + } + GeneratedField::Options => { + if options__.is_some() { + return Err(serde::de::Error::duplicate_field("options")); + } + options__ = map_.next_value()?; + } + } + } + Ok(ColumnSpecificOptions { + column_name: column_name__.unwrap_or_default(), + options: options__, + }) + } + } + deserializer.deserialize_struct("datafusion.ColumnSpecificOptions", FIELDS, GeneratedVisitor) + } +} +impl serde::Serialize for ColumnStats { + #[allow(deprecated)] + fn serialize(&self, serializer: S) -> std::result::Result + where + S: serde::Serializer, + { + use serde::ser::SerializeStruct; + let mut len = 0; + if self.min_value.is_some() { + len += 1; + } + if self.max_value.is_some() { + len += 1; + } + if self.null_count.is_some() { + len += 1; + } + if self.distinct_count.is_some() { + len += 1; + } + let mut struct_ser = serializer.serialize_struct("datafusion.ColumnStats", len)?; + if let Some(v) = self.min_value.as_ref() { + struct_ser.serialize_field("minValue", v)?; + } + if let Some(v) = self.max_value.as_ref() { + struct_ser.serialize_field("maxValue", v)?; + } + if let Some(v) = self.null_count.as_ref() { + struct_ser.serialize_field("nullCount", v)?; + } + if let Some(v) = self.distinct_count.as_ref() { + struct_ser.serialize_field("distinctCount", v)?; + } + struct_ser.end() + } +} +impl<'de> serde::Deserialize<'de> for ColumnStats { + #[allow(deprecated)] + fn deserialize(deserializer: D) -> std::result::Result + where + D: serde::Deserializer<'de>, + { + const FIELDS: &[&str] = &[ + "min_value", + "minValue", + "max_value", + "maxValue", + "null_count", + "nullCount", + "distinct_count", + "distinctCount", + ]; + + #[allow(clippy::enum_variant_names)] + enum GeneratedField { + MinValue, + MaxValue, + NullCount, + DistinctCount, + } + impl<'de> serde::Deserialize<'de> for GeneratedField { + fn deserialize(deserializer: D) -> std::result::Result + where + D: serde::Deserializer<'de>, + { + struct GeneratedVisitor; + + impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { + type Value = GeneratedField; + + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(formatter, "expected one of: {:?}", &FIELDS) + } + + #[allow(unused_variables)] + fn visit_str(self, value: &str) -> std::result::Result + where + E: serde::de::Error, + { + match value { + "minValue" | "min_value" => Ok(GeneratedField::MinValue), + "maxValue" | "max_value" => Ok(GeneratedField::MaxValue), + "nullCount" | "null_count" => Ok(GeneratedField::NullCount), + "distinctCount" | "distinct_count" => Ok(GeneratedField::DistinctCount), + _ => Err(serde::de::Error::unknown_field(value, FIELDS)), + } + } + } + deserializer.deserialize_identifier(GeneratedVisitor) + } + } + struct GeneratedVisitor; + impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { + type Value = ColumnStats; fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) 
-> std::fmt::Result { formatter.write_str("struct datafusion.ColumnStats") @@ -3792,13 +4221,10 @@ impl serde::Serialize for CopyToNode { if !self.output_url.is_empty() { len += 1; } - if !self.file_type.is_empty() { - len += 1; - } if !self.partition_by.is_empty() { len += 1; } - if self.copy_options.is_some() { + if self.format_options.is_some() { len += 1; } let mut struct_ser = serializer.serialize_struct("datafusion.CopyToNode", len)?; @@ -3808,19 +4234,25 @@ impl serde::Serialize for CopyToNode { if !self.output_url.is_empty() { struct_ser.serialize_field("outputUrl", &self.output_url)?; } - if !self.file_type.is_empty() { - struct_ser.serialize_field("fileType", &self.file_type)?; - } if !self.partition_by.is_empty() { struct_ser.serialize_field("partitionBy", &self.partition_by)?; } - if let Some(v) = self.copy_options.as_ref() { + if let Some(v) = self.format_options.as_ref() { match v { - copy_to_node::CopyOptions::SqlOptions(v) => { - struct_ser.serialize_field("sqlOptions", v)?; + copy_to_node::FormatOptions::Csv(v) => { + struct_ser.serialize_field("csv", v)?; + } + copy_to_node::FormatOptions::Json(v) => { + struct_ser.serialize_field("json", v)?; + } + copy_to_node::FormatOptions::Parquet(v) => { + struct_ser.serialize_field("parquet", v)?; } - copy_to_node::CopyOptions::WriterOptions(v) => { - struct_ser.serialize_field("writerOptions", v)?; + copy_to_node::FormatOptions::Avro(v) => { + struct_ser.serialize_field("avro", v)?; + } + copy_to_node::FormatOptions::Arrow(v) => { + struct_ser.serialize_field("arrow", v)?; } } } @@ -3837,24 +4269,25 @@ impl<'de> serde::Deserialize<'de> for CopyToNode { "input", "output_url", "outputUrl", - "file_type", - "fileType", "partition_by", "partitionBy", - "sql_options", - "sqlOptions", - "writer_options", - "writerOptions", + "csv", + "json", + "parquet", + "avro", + "arrow", ]; #[allow(clippy::enum_variant_names)] enum GeneratedField { Input, OutputUrl, - FileType, PartitionBy, - SqlOptions, - WriterOptions, + Csv, + Json, + Parquet, + Avro, + Arrow, } impl<'de> serde::Deserialize<'de> for GeneratedField { fn deserialize(deserializer: D) -> std::result::Result @@ -3878,10 +4311,12 @@ impl<'de> serde::Deserialize<'de> for CopyToNode { match value { "input" => Ok(GeneratedField::Input), "outputUrl" | "output_url" => Ok(GeneratedField::OutputUrl), - "fileType" | "file_type" => Ok(GeneratedField::FileType), "partitionBy" | "partition_by" => Ok(GeneratedField::PartitionBy), - "sqlOptions" | "sql_options" => Ok(GeneratedField::SqlOptions), - "writerOptions" | "writer_options" => Ok(GeneratedField::WriterOptions), + "csv" => Ok(GeneratedField::Csv), + "json" => Ok(GeneratedField::Json), + "parquet" => Ok(GeneratedField::Parquet), + "avro" => Ok(GeneratedField::Avro), + "arrow" => Ok(GeneratedField::Arrow), _ => Err(serde::de::Error::unknown_field(value, FIELDS)), } } @@ -3903,9 +4338,8 @@ impl<'de> serde::Deserialize<'de> for CopyToNode { { let mut input__ = None; let mut output_url__ = None; - let mut file_type__ = None; let mut partition_by__ = None; - let mut copy_options__ = None; + let mut format_options__ = None; while let Some(k) = map_.next_key()? 
{ match k { GeneratedField::Input => { @@ -3920,30 +4354,45 @@ impl<'de> serde::Deserialize<'de> for CopyToNode { } output_url__ = Some(map_.next_value()?); } - GeneratedField::FileType => { - if file_type__.is_some() { - return Err(serde::de::Error::duplicate_field("fileType")); - } - file_type__ = Some(map_.next_value()?); - } GeneratedField::PartitionBy => { if partition_by__.is_some() { return Err(serde::de::Error::duplicate_field("partitionBy")); } partition_by__ = Some(map_.next_value()?); } - GeneratedField::SqlOptions => { - if copy_options__.is_some() { - return Err(serde::de::Error::duplicate_field("sqlOptions")); + GeneratedField::Csv => { + if format_options__.is_some() { + return Err(serde::de::Error::duplicate_field("csv")); } - copy_options__ = map_.next_value::<::std::option::Option<_>>()?.map(copy_to_node::CopyOptions::SqlOptions) + format_options__ = map_.next_value::<::std::option::Option<_>>()?.map(copy_to_node::FormatOptions::Csv) ; } - GeneratedField::WriterOptions => { - if copy_options__.is_some() { - return Err(serde::de::Error::duplicate_field("writerOptions")); + GeneratedField::Json => { + if format_options__.is_some() { + return Err(serde::de::Error::duplicate_field("json")); + } + format_options__ = map_.next_value::<::std::option::Option<_>>()?.map(copy_to_node::FormatOptions::Json) +; + } + GeneratedField::Parquet => { + if format_options__.is_some() { + return Err(serde::de::Error::duplicate_field("parquet")); + } + format_options__ = map_.next_value::<::std::option::Option<_>>()?.map(copy_to_node::FormatOptions::Parquet) +; + } + GeneratedField::Avro => { + if format_options__.is_some() { + return Err(serde::de::Error::duplicate_field("avro")); } - copy_options__ = map_.next_value::<::std::option::Option<_>>()?.map(copy_to_node::CopyOptions::WriterOptions) + format_options__ = map_.next_value::<::std::option::Option<_>>()?.map(copy_to_node::FormatOptions::Avro) +; + } + GeneratedField::Arrow => { + if format_options__.is_some() { + return Err(serde::de::Error::duplicate_field("arrow")); + } + format_options__ = map_.next_value::<::std::option::Option<_>>()?.map(copy_to_node::FormatOptions::Arrow) ; } } @@ -3951,9 +4400,8 @@ impl<'de> serde::Deserialize<'de> for CopyToNode { Ok(CopyToNode { input: input__, output_url: output_url__.unwrap_or_default(), - file_type: file_type__.unwrap_or_default(), partition_by: partition_by__.unwrap_or_default(), - copy_options: copy_options__, + format_options: format_options__, }) } } @@ -4923,34 +5371,12 @@ impl serde::Serialize for CsvFormat { { use serde::ser::SerializeStruct; let mut len = 0; - if self.has_header { - len += 1; - } - if !self.delimiter.is_empty() { - len += 1; - } - if !self.quote.is_empty() { - len += 1; - } - if self.optional_escape.is_some() { + if self.options.is_some() { len += 1; } let mut struct_ser = serializer.serialize_struct("datafusion.CsvFormat", len)?; - if self.has_header { - struct_ser.serialize_field("hasHeader", &self.has_header)?; - } - if !self.delimiter.is_empty() { - struct_ser.serialize_field("delimiter", &self.delimiter)?; - } - if !self.quote.is_empty() { - struct_ser.serialize_field("quote", &self.quote)?; - } - if let Some(v) = self.optional_escape.as_ref() { - match v { - csv_format::OptionalEscape::Escape(v) => { - struct_ser.serialize_field("escape", v)?; - } - } + if let Some(v) = self.options.as_ref() { + struct_ser.serialize_field("options", v)?; } struct_ser.end() } @@ -4962,19 +5388,12 @@ impl<'de> serde::Deserialize<'de> for CsvFormat { D: serde::Deserializer<'de>, { 
const FIELDS: &[&str] = &[ - "has_header", - "hasHeader", - "delimiter", - "quote", - "escape", + "options", ]; #[allow(clippy::enum_variant_names)] enum GeneratedField { - HasHeader, - Delimiter, - Quote, - Escape, + Options, } impl<'de> serde::Deserialize<'de> for GeneratedField { fn deserialize(deserializer: D) -> std::result::Result @@ -4996,10 +5415,7 @@ impl<'de> serde::Deserialize<'de> for CsvFormat { E: serde::de::Error, { match value { - "hasHeader" | "has_header" => Ok(GeneratedField::HasHeader), - "delimiter" => Ok(GeneratedField::Delimiter), - "quote" => Ok(GeneratedField::Quote), - "escape" => Ok(GeneratedField::Escape), + "options" => Ok(GeneratedField::Options), _ => Err(serde::de::Error::unknown_field(value, FIELDS)), } } @@ -5019,50 +5435,26 @@ impl<'de> serde::Deserialize<'de> for CsvFormat { where V: serde::de::MapAccess<'de>, { - let mut has_header__ = None; - let mut delimiter__ = None; - let mut quote__ = None; - let mut optional_escape__ = None; + let mut options__ = None; while let Some(k) = map_.next_key()? { match k { - GeneratedField::HasHeader => { - if has_header__.is_some() { - return Err(serde::de::Error::duplicate_field("hasHeader")); - } - has_header__ = Some(map_.next_value()?); - } - GeneratedField::Delimiter => { - if delimiter__.is_some() { - return Err(serde::de::Error::duplicate_field("delimiter")); - } - delimiter__ = Some(map_.next_value()?); - } - GeneratedField::Quote => { - if quote__.is_some() { - return Err(serde::de::Error::duplicate_field("quote")); - } - quote__ = Some(map_.next_value()?); - } - GeneratedField::Escape => { - if optional_escape__.is_some() { - return Err(serde::de::Error::duplicate_field("escape")); + GeneratedField::Options => { + if options__.is_some() { + return Err(serde::de::Error::duplicate_field("options")); } - optional_escape__ = map_.next_value::<::std::option::Option<_>>()?.map(csv_format::OptionalEscape::Escape); + options__ = map_.next_value()?; } } } Ok(CsvFormat { - has_header: has_header__.unwrap_or_default(), - delimiter: delimiter__.unwrap_or_default(), - quote: quote__.unwrap_or_default(), - optional_escape: optional_escape__, + options: options__, }) } } deserializer.deserialize_struct("datafusion.CsvFormat", FIELDS, GeneratedVisitor) } } -impl serde::Serialize for CsvScanExecNode { +impl serde::Serialize for CsvOptions { #[allow(deprecated)] fn serialize(&self, serializer: S) -> std::result::Result where @@ -5070,9 +5462,6 @@ impl serde::Serialize for CsvScanExecNode { { use serde::ser::SerializeStruct; let mut len = 0; - if self.base_conf.is_some() { - len += 1; - } if self.has_header { len += 1; } @@ -5082,14 +5471,317 @@ impl serde::Serialize for CsvScanExecNode { if !self.quote.is_empty() { len += 1; } - if self.optional_escape.is_some() { + if !self.escape.is_empty() { len += 1; } - let mut struct_ser = serializer.serialize_struct("datafusion.CsvScanExecNode", len)?; - if let Some(v) = self.base_conf.as_ref() { - struct_ser.serialize_field("baseConf", v)?; + if self.compression != 0 { + len += 1; } - if self.has_header { + if self.schema_infer_max_rec != 0 { + len += 1; + } + if !self.date_format.is_empty() { + len += 1; + } + if !self.datetime_format.is_empty() { + len += 1; + } + if !self.timestamp_format.is_empty() { + len += 1; + } + if !self.timestamp_tz_format.is_empty() { + len += 1; + } + if !self.time_format.is_empty() { + len += 1; + } + if !self.null_value.is_empty() { + len += 1; + } + let mut struct_ser = serializer.serialize_struct("datafusion.CsvOptions", len)?; + if self.has_header { 
+ struct_ser.serialize_field("hasHeader", &self.has_header)?; + } + if !self.delimiter.is_empty() { + #[allow(clippy::needless_borrow)] + struct_ser.serialize_field("delimiter", pbjson::private::base64::encode(&self.delimiter).as_str())?; + } + if !self.quote.is_empty() { + #[allow(clippy::needless_borrow)] + struct_ser.serialize_field("quote", pbjson::private::base64::encode(&self.quote).as_str())?; + } + if !self.escape.is_empty() { + #[allow(clippy::needless_borrow)] + struct_ser.serialize_field("escape", pbjson::private::base64::encode(&self.escape).as_str())?; + } + if self.compression != 0 { + let v = CompressionTypeVariant::try_from(self.compression) + .map_err(|_| serde::ser::Error::custom(format!("Invalid variant {}", self.compression)))?; + struct_ser.serialize_field("compression", &v)?; + } + if self.schema_infer_max_rec != 0 { + #[allow(clippy::needless_borrow)] + struct_ser.serialize_field("schemaInferMaxRec", ToString::to_string(&self.schema_infer_max_rec).as_str())?; + } + if !self.date_format.is_empty() { + struct_ser.serialize_field("dateFormat", &self.date_format)?; + } + if !self.datetime_format.is_empty() { + struct_ser.serialize_field("datetimeFormat", &self.datetime_format)?; + } + if !self.timestamp_format.is_empty() { + struct_ser.serialize_field("timestampFormat", &self.timestamp_format)?; + } + if !self.timestamp_tz_format.is_empty() { + struct_ser.serialize_field("timestampTzFormat", &self.timestamp_tz_format)?; + } + if !self.time_format.is_empty() { + struct_ser.serialize_field("timeFormat", &self.time_format)?; + } + if !self.null_value.is_empty() { + struct_ser.serialize_field("nullValue", &self.null_value)?; + } + struct_ser.end() + } +} +impl<'de> serde::Deserialize<'de> for CsvOptions { + #[allow(deprecated)] + fn deserialize(deserializer: D) -> std::result::Result + where + D: serde::Deserializer<'de>, + { + const FIELDS: &[&str] = &[ + "has_header", + "hasHeader", + "delimiter", + "quote", + "escape", + "compression", + "schema_infer_max_rec", + "schemaInferMaxRec", + "date_format", + "dateFormat", + "datetime_format", + "datetimeFormat", + "timestamp_format", + "timestampFormat", + "timestamp_tz_format", + "timestampTzFormat", + "time_format", + "timeFormat", + "null_value", + "nullValue", + ]; + + #[allow(clippy::enum_variant_names)] + enum GeneratedField { + HasHeader, + Delimiter, + Quote, + Escape, + Compression, + SchemaInferMaxRec, + DateFormat, + DatetimeFormat, + TimestampFormat, + TimestampTzFormat, + TimeFormat, + NullValue, + } + impl<'de> serde::Deserialize<'de> for GeneratedField { + fn deserialize(deserializer: D) -> std::result::Result + where + D: serde::Deserializer<'de>, + { + struct GeneratedVisitor; + + impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { + type Value = GeneratedField; + + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(formatter, "expected one of: {:?}", &FIELDS) + } + + #[allow(unused_variables)] + fn visit_str(self, value: &str) -> std::result::Result + where + E: serde::de::Error, + { + match value { + "hasHeader" | "has_header" => Ok(GeneratedField::HasHeader), + "delimiter" => Ok(GeneratedField::Delimiter), + "quote" => Ok(GeneratedField::Quote), + "escape" => Ok(GeneratedField::Escape), + "compression" => Ok(GeneratedField::Compression), + "schemaInferMaxRec" | "schema_infer_max_rec" => Ok(GeneratedField::SchemaInferMaxRec), + "dateFormat" | "date_format" => Ok(GeneratedField::DateFormat), + "datetimeFormat" | "datetime_format" => 
Ok(GeneratedField::DatetimeFormat), + "timestampFormat" | "timestamp_format" => Ok(GeneratedField::TimestampFormat), + "timestampTzFormat" | "timestamp_tz_format" => Ok(GeneratedField::TimestampTzFormat), + "timeFormat" | "time_format" => Ok(GeneratedField::TimeFormat), + "nullValue" | "null_value" => Ok(GeneratedField::NullValue), + _ => Err(serde::de::Error::unknown_field(value, FIELDS)), + } + } + } + deserializer.deserialize_identifier(GeneratedVisitor) + } + } + struct GeneratedVisitor; + impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { + type Value = CsvOptions; + + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + formatter.write_str("struct datafusion.CsvOptions") + } + + fn visit_map(self, mut map_: V) -> std::result::Result + where + V: serde::de::MapAccess<'de>, + { + let mut has_header__ = None; + let mut delimiter__ = None; + let mut quote__ = None; + let mut escape__ = None; + let mut compression__ = None; + let mut schema_infer_max_rec__ = None; + let mut date_format__ = None; + let mut datetime_format__ = None; + let mut timestamp_format__ = None; + let mut timestamp_tz_format__ = None; + let mut time_format__ = None; + let mut null_value__ = None; + while let Some(k) = map_.next_key()? { + match k { + GeneratedField::HasHeader => { + if has_header__.is_some() { + return Err(serde::de::Error::duplicate_field("hasHeader")); + } + has_header__ = Some(map_.next_value()?); + } + GeneratedField::Delimiter => { + if delimiter__.is_some() { + return Err(serde::de::Error::duplicate_field("delimiter")); + } + delimiter__ = + Some(map_.next_value::<::pbjson::private::BytesDeserialize<_>>()?.0) + ; + } + GeneratedField::Quote => { + if quote__.is_some() { + return Err(serde::de::Error::duplicate_field("quote")); + } + quote__ = + Some(map_.next_value::<::pbjson::private::BytesDeserialize<_>>()?.0) + ; + } + GeneratedField::Escape => { + if escape__.is_some() { + return Err(serde::de::Error::duplicate_field("escape")); + } + escape__ = + Some(map_.next_value::<::pbjson::private::BytesDeserialize<_>>()?.0) + ; + } + GeneratedField::Compression => { + if compression__.is_some() { + return Err(serde::de::Error::duplicate_field("compression")); + } + compression__ = Some(map_.next_value::()? 
as i32); + } + GeneratedField::SchemaInferMaxRec => { + if schema_infer_max_rec__.is_some() { + return Err(serde::de::Error::duplicate_field("schemaInferMaxRec")); + } + schema_infer_max_rec__ = + Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0) + ; + } + GeneratedField::DateFormat => { + if date_format__.is_some() { + return Err(serde::de::Error::duplicate_field("dateFormat")); + } + date_format__ = Some(map_.next_value()?); + } + GeneratedField::DatetimeFormat => { + if datetime_format__.is_some() { + return Err(serde::de::Error::duplicate_field("datetimeFormat")); + } + datetime_format__ = Some(map_.next_value()?); + } + GeneratedField::TimestampFormat => { + if timestamp_format__.is_some() { + return Err(serde::de::Error::duplicate_field("timestampFormat")); + } + timestamp_format__ = Some(map_.next_value()?); + } + GeneratedField::TimestampTzFormat => { + if timestamp_tz_format__.is_some() { + return Err(serde::de::Error::duplicate_field("timestampTzFormat")); + } + timestamp_tz_format__ = Some(map_.next_value()?); + } + GeneratedField::TimeFormat => { + if time_format__.is_some() { + return Err(serde::de::Error::duplicate_field("timeFormat")); + } + time_format__ = Some(map_.next_value()?); + } + GeneratedField::NullValue => { + if null_value__.is_some() { + return Err(serde::de::Error::duplicate_field("nullValue")); + } + null_value__ = Some(map_.next_value()?); + } + } + } + Ok(CsvOptions { + has_header: has_header__.unwrap_or_default(), + delimiter: delimiter__.unwrap_or_default(), + quote: quote__.unwrap_or_default(), + escape: escape__.unwrap_or_default(), + compression: compression__.unwrap_or_default(), + schema_infer_max_rec: schema_infer_max_rec__.unwrap_or_default(), + date_format: date_format__.unwrap_or_default(), + datetime_format: datetime_format__.unwrap_or_default(), + timestamp_format: timestamp_format__.unwrap_or_default(), + timestamp_tz_format: timestamp_tz_format__.unwrap_or_default(), + time_format: time_format__.unwrap_or_default(), + null_value: null_value__.unwrap_or_default(), + }) + } + } + deserializer.deserialize_struct("datafusion.CsvOptions", FIELDS, GeneratedVisitor) + } +} +impl serde::Serialize for CsvScanExecNode { + #[allow(deprecated)] + fn serialize(&self, serializer: S) -> std::result::Result + where + S: serde::Serializer, + { + use serde::ser::SerializeStruct; + let mut len = 0; + if self.base_conf.is_some() { + len += 1; + } + if self.has_header { + len += 1; + } + if !self.delimiter.is_empty() { + len += 1; + } + if !self.quote.is_empty() { + len += 1; + } + if self.optional_escape.is_some() { + len += 1; + } + let mut struct_ser = serializer.serialize_struct("datafusion.CsvScanExecNode", len)?; + if let Some(v) = self.base_conf.as_ref() { + struct_ser.serialize_field("baseConf", v)?; + } + if self.has_header { struct_ser.serialize_field("hasHeader", &self.has_header)?; } if !self.delimiter.is_empty() { @@ -5238,10 +5930,16 @@ impl serde::Serialize for CsvSink { if self.config.is_some() { len += 1; } + if self.writer_options.is_some() { + len += 1; + } let mut struct_ser = serializer.serialize_struct("datafusion.CsvSink", len)?; if let Some(v) = self.config.as_ref() { struct_ser.serialize_field("config", v)?; } + if let Some(v) = self.writer_options.as_ref() { + struct_ser.serialize_field("writerOptions", v)?; + } struct_ser.end() } } @@ -5253,11 +5951,14 @@ impl<'de> serde::Deserialize<'de> for CsvSink { { const FIELDS: &[&str] = &[ "config", + "writer_options", + "writerOptions", ]; #[allow(clippy::enum_variant_names)] 
enum GeneratedField { Config, + WriterOptions, } impl<'de> serde::Deserialize<'de> for GeneratedField { fn deserialize(deserializer: D) -> std::result::Result @@ -5280,6 +5981,7 @@ impl<'de> serde::Deserialize<'de> for CsvSink { { match value { "config" => Ok(GeneratedField::Config), + "writerOptions" | "writer_options" => Ok(GeneratedField::WriterOptions), _ => Err(serde::de::Error::unknown_field(value, FIELDS)), } } @@ -5300,6 +6002,7 @@ impl<'de> serde::Deserialize<'de> for CsvSink { V: serde::de::MapAccess<'de>, { let mut config__ = None; + let mut writer_options__ = None; while let Some(k) = map_.next_key()? { match k { GeneratedField::Config => { @@ -5308,10 +6011,17 @@ impl<'de> serde::Deserialize<'de> for CsvSink { } config__ = map_.next_value()?; } + GeneratedField::WriterOptions => { + if writer_options__.is_some() { + return Err(serde::de::Error::duplicate_field("writerOptions")); + } + writer_options__ = map_.next_value()?; + } } } Ok(CsvSink { config: config__, + writer_options: writer_options__, }) } } @@ -8211,9 +8921,6 @@ impl serde::Serialize for FileSinkConfig { if self.overwrite { len += 1; } - if self.file_type_writer_options.is_some() { - len += 1; - } let mut struct_ser = serializer.serialize_struct("datafusion.FileSinkConfig", len)?; if !self.object_store_url.is_empty() { struct_ser.serialize_field("objectStoreUrl", &self.object_store_url)?; @@ -8233,9 +8940,6 @@ impl serde::Serialize for FileSinkConfig { if self.overwrite { struct_ser.serialize_field("overwrite", &self.overwrite)?; } - if let Some(v) = self.file_type_writer_options.as_ref() { - struct_ser.serialize_field("fileTypeWriterOptions", v)?; - } struct_ser.end() } } @@ -8257,8 +8961,6 @@ impl<'de> serde::Deserialize<'de> for FileSinkConfig { "table_partition_cols", "tablePartitionCols", "overwrite", - "file_type_writer_options", - "fileTypeWriterOptions", ]; #[allow(clippy::enum_variant_names)] @@ -8269,7 +8971,6 @@ impl<'de> serde::Deserialize<'de> for FileSinkConfig { OutputSchema, TablePartitionCols, Overwrite, - FileTypeWriterOptions, } impl<'de> serde::Deserialize<'de> for GeneratedField { fn deserialize(deserializer: D) -> std::result::Result @@ -8297,7 +8998,6 @@ impl<'de> serde::Deserialize<'de> for FileSinkConfig { "outputSchema" | "output_schema" => Ok(GeneratedField::OutputSchema), "tablePartitionCols" | "table_partition_cols" => Ok(GeneratedField::TablePartitionCols), "overwrite" => Ok(GeneratedField::Overwrite), - "fileTypeWriterOptions" | "file_type_writer_options" => Ok(GeneratedField::FileTypeWriterOptions), _ => Err(serde::de::Error::unknown_field(value, FIELDS)), } } @@ -8323,7 +9023,6 @@ impl<'de> serde::Deserialize<'de> for FileSinkConfig { let mut output_schema__ = None; let mut table_partition_cols__ = None; let mut overwrite__ = None; - let mut file_type_writer_options__ = None; while let Some(k) = map_.next_key()? 
{ match k { GeneratedField::ObjectStoreUrl => { @@ -8362,12 +9061,6 @@ impl<'de> serde::Deserialize<'de> for FileSinkConfig { } overwrite__ = Some(map_.next_value()?); } - GeneratedField::FileTypeWriterOptions => { - if file_type_writer_options__.is_some() { - return Err(serde::de::Error::duplicate_field("fileTypeWriterOptions")); - } - file_type_writer_options__ = map_.next_value()?; - } } } Ok(FileSinkConfig { @@ -8377,152 +9070,12 @@ impl<'de> serde::Deserialize<'de> for FileSinkConfig { output_schema: output_schema__, table_partition_cols: table_partition_cols__.unwrap_or_default(), overwrite: overwrite__.unwrap_or_default(), - file_type_writer_options: file_type_writer_options__, }) } } deserializer.deserialize_struct("datafusion.FileSinkConfig", FIELDS, GeneratedVisitor) } } -impl serde::Serialize for FileTypeWriterOptions { - #[allow(deprecated)] - fn serialize(&self, serializer: S) -> std::result::Result - where - S: serde::Serializer, - { - use serde::ser::SerializeStruct; - let mut len = 0; - if self.file_type.is_some() { - len += 1; - } - let mut struct_ser = serializer.serialize_struct("datafusion.FileTypeWriterOptions", len)?; - if let Some(v) = self.file_type.as_ref() { - match v { - file_type_writer_options::FileType::JsonOptions(v) => { - struct_ser.serialize_field("jsonOptions", v)?; - } - file_type_writer_options::FileType::ParquetOptions(v) => { - struct_ser.serialize_field("parquetOptions", v)?; - } - file_type_writer_options::FileType::CsvOptions(v) => { - struct_ser.serialize_field("csvOptions", v)?; - } - file_type_writer_options::FileType::ArrowOptions(v) => { - struct_ser.serialize_field("arrowOptions", v)?; - } - } - } - struct_ser.end() - } -} -impl<'de> serde::Deserialize<'de> for FileTypeWriterOptions { - #[allow(deprecated)] - fn deserialize(deserializer: D) -> std::result::Result - where - D: serde::Deserializer<'de>, - { - const FIELDS: &[&str] = &[ - "json_options", - "jsonOptions", - "parquet_options", - "parquetOptions", - "csv_options", - "csvOptions", - "arrow_options", - "arrowOptions", - ]; - - #[allow(clippy::enum_variant_names)] - enum GeneratedField { - JsonOptions, - ParquetOptions, - CsvOptions, - ArrowOptions, - } - impl<'de> serde::Deserialize<'de> for GeneratedField { - fn deserialize(deserializer: D) -> std::result::Result - where - D: serde::Deserializer<'de>, - { - struct GeneratedVisitor; - - impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { - type Value = GeneratedField; - - fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(formatter, "expected one of: {:?}", &FIELDS) - } - - #[allow(unused_variables)] - fn visit_str(self, value: &str) -> std::result::Result - where - E: serde::de::Error, - { - match value { - "jsonOptions" | "json_options" => Ok(GeneratedField::JsonOptions), - "parquetOptions" | "parquet_options" => Ok(GeneratedField::ParquetOptions), - "csvOptions" | "csv_options" => Ok(GeneratedField::CsvOptions), - "arrowOptions" | "arrow_options" => Ok(GeneratedField::ArrowOptions), - _ => Err(serde::de::Error::unknown_field(value, FIELDS)), - } - } - } - deserializer.deserialize_identifier(GeneratedVisitor) - } - } - struct GeneratedVisitor; - impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { - type Value = FileTypeWriterOptions; - - fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - formatter.write_str("struct datafusion.FileTypeWriterOptions") - } - - fn visit_map(self, mut map_: V) -> std::result::Result - where - V: 
serde::de::MapAccess<'de>, - { - let mut file_type__ = None; - while let Some(k) = map_.next_key()? { - match k { - GeneratedField::JsonOptions => { - if file_type__.is_some() { - return Err(serde::de::Error::duplicate_field("jsonOptions")); - } - file_type__ = map_.next_value::<::std::option::Option<_>>()?.map(file_type_writer_options::FileType::JsonOptions) -; - } - GeneratedField::ParquetOptions => { - if file_type__.is_some() { - return Err(serde::de::Error::duplicate_field("parquetOptions")); - } - file_type__ = map_.next_value::<::std::option::Option<_>>()?.map(file_type_writer_options::FileType::ParquetOptions) -; - } - GeneratedField::CsvOptions => { - if file_type__.is_some() { - return Err(serde::de::Error::duplicate_field("csvOptions")); - } - file_type__ = map_.next_value::<::std::option::Option<_>>()?.map(file_type_writer_options::FileType::CsvOptions) -; - } - GeneratedField::ArrowOptions => { - if file_type__.is_some() { - return Err(serde::de::Error::duplicate_field("arrowOptions")); - } - file_type__ = map_.next_value::<::std::option::Option<_>>()?.map(file_type_writer_options::FileType::ArrowOptions) -; - } - } - } - Ok(FileTypeWriterOptions { - file_type: file_type__, - }) - } - } - deserializer.deserialize_struct("datafusion.FileTypeWriterOptions", FIELDS, GeneratedVisitor) - } -} impl serde::Serialize for FilterExecNode { #[allow(deprecated)] fn serialize(&self, serializer: S) -> std::result::Result @@ -11651,7 +12204,121 @@ impl<'de> serde::Deserialize<'de> for JoinType { } } } - deserializer.deserialize_any(GeneratedVisitor) + deserializer.deserialize_any(GeneratedVisitor) + } +} +impl serde::Serialize for JsonOptions { + #[allow(deprecated)] + fn serialize(&self, serializer: S) -> std::result::Result + where + S: serde::Serializer, + { + use serde::ser::SerializeStruct; + let mut len = 0; + if self.compression != 0 { + len += 1; + } + if self.schema_infer_max_rec != 0 { + len += 1; + } + let mut struct_ser = serializer.serialize_struct("datafusion.JsonOptions", len)?; + if self.compression != 0 { + let v = CompressionTypeVariant::try_from(self.compression) + .map_err(|_| serde::ser::Error::custom(format!("Invalid variant {}", self.compression)))?; + struct_ser.serialize_field("compression", &v)?; + } + if self.schema_infer_max_rec != 0 { + #[allow(clippy::needless_borrow)] + struct_ser.serialize_field("schemaInferMaxRec", ToString::to_string(&self.schema_infer_max_rec).as_str())?; + } + struct_ser.end() + } +} +impl<'de> serde::Deserialize<'de> for JsonOptions { + #[allow(deprecated)] + fn deserialize(deserializer: D) -> std::result::Result + where + D: serde::Deserializer<'de>, + { + const FIELDS: &[&str] = &[ + "compression", + "schema_infer_max_rec", + "schemaInferMaxRec", + ]; + + #[allow(clippy::enum_variant_names)] + enum GeneratedField { + Compression, + SchemaInferMaxRec, + } + impl<'de> serde::Deserialize<'de> for GeneratedField { + fn deserialize(deserializer: D) -> std::result::Result + where + D: serde::Deserializer<'de>, + { + struct GeneratedVisitor; + + impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { + type Value = GeneratedField; + + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(formatter, "expected one of: {:?}", &FIELDS) + } + + #[allow(unused_variables)] + fn visit_str(self, value: &str) -> std::result::Result + where + E: serde::de::Error, + { + match value { + "compression" => Ok(GeneratedField::Compression), + "schemaInferMaxRec" | "schema_infer_max_rec" => 
Ok(GeneratedField::SchemaInferMaxRec), + _ => Err(serde::de::Error::unknown_field(value, FIELDS)), + } + } + } + deserializer.deserialize_identifier(GeneratedVisitor) + } + } + struct GeneratedVisitor; + impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { + type Value = JsonOptions; + + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + formatter.write_str("struct datafusion.JsonOptions") + } + + fn visit_map(self, mut map_: V) -> std::result::Result + where + V: serde::de::MapAccess<'de>, + { + let mut compression__ = None; + let mut schema_infer_max_rec__ = None; + while let Some(k) = map_.next_key()? { + match k { + GeneratedField::Compression => { + if compression__.is_some() { + return Err(serde::de::Error::duplicate_field("compression")); + } + compression__ = Some(map_.next_value::()? as i32); + } + GeneratedField::SchemaInferMaxRec => { + if schema_infer_max_rec__.is_some() { + return Err(serde::de::Error::duplicate_field("schemaInferMaxRec")); + } + schema_infer_max_rec__ = + Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0) + ; + } + } + } + Ok(JsonOptions { + compression: compression__.unwrap_or_default(), + schema_infer_max_rec: schema_infer_max_rec__.unwrap_or_default(), + }) + } + } + deserializer.deserialize_struct("datafusion.JsonOptions", FIELDS, GeneratedVisitor) } } impl serde::Serialize for JsonSink { @@ -11665,10 +12332,16 @@ impl serde::Serialize for JsonSink { if self.config.is_some() { len += 1; } + if self.writer_options.is_some() { + len += 1; + } let mut struct_ser = serializer.serialize_struct("datafusion.JsonSink", len)?; if let Some(v) = self.config.as_ref() { struct_ser.serialize_field("config", v)?; } + if let Some(v) = self.writer_options.as_ref() { + struct_ser.serialize_field("writerOptions", v)?; + } struct_ser.end() } } @@ -11680,11 +12353,14 @@ impl<'de> serde::Deserialize<'de> for JsonSink { { const FIELDS: &[&str] = &[ "config", + "writer_options", + "writerOptions", ]; #[allow(clippy::enum_variant_names)] enum GeneratedField { Config, + WriterOptions, } impl<'de> serde::Deserialize<'de> for GeneratedField { fn deserialize(deserializer: D) -> std::result::Result @@ -11707,6 +12383,7 @@ impl<'de> serde::Deserialize<'de> for JsonSink { { match value { "config" => Ok(GeneratedField::Config), + "writerOptions" | "writer_options" => Ok(GeneratedField::WriterOptions), _ => Err(serde::de::Error::unknown_field(value, FIELDS)), } } @@ -11727,6 +12404,7 @@ impl<'de> serde::Deserialize<'de> for JsonSink { V: serde::de::MapAccess<'de>, { let mut config__ = None; + let mut writer_options__ = None; while let Some(k) = map_.next_key()? 
{ match k { GeneratedField::Config => { @@ -11735,10 +12413,17 @@ impl<'de> serde::Deserialize<'de> for JsonSink { } config__ = map_.next_value()?; } + GeneratedField::WriterOptions => { + if writer_options__.is_some() { + return Err(serde::de::Error::duplicate_field("writerOptions")); + } + writer_options__ = map_.next_value()?; + } } } Ok(JsonSink { config: config__, + writer_options: writer_options__, }) } } @@ -15387,37 +16072,397 @@ impl<'de> serde::Deserialize<'de> for OwnedTableReference { } } } - Ok(OwnedTableReference { - table_reference_enum: table_reference_enum__, - }) + Ok(OwnedTableReference { + table_reference_enum: table_reference_enum__, + }) + } + } + deserializer.deserialize_struct("datafusion.OwnedTableReference", FIELDS, GeneratedVisitor) + } +} +impl serde::Serialize for ParquetFormat { + #[allow(deprecated)] + fn serialize(&self, serializer: S) -> std::result::Result + where + S: serde::Serializer, + { + use serde::ser::SerializeStruct; + let mut len = 0; + if self.options.is_some() { + len += 1; + } + let mut struct_ser = serializer.serialize_struct("datafusion.ParquetFormat", len)?; + if let Some(v) = self.options.as_ref() { + struct_ser.serialize_field("options", v)?; + } + struct_ser.end() + } +} +impl<'de> serde::Deserialize<'de> for ParquetFormat { + #[allow(deprecated)] + fn deserialize(deserializer: D) -> std::result::Result + where + D: serde::Deserializer<'de>, + { + const FIELDS: &[&str] = &[ + "options", + ]; + + #[allow(clippy::enum_variant_names)] + enum GeneratedField { + Options, + } + impl<'de> serde::Deserialize<'de> for GeneratedField { + fn deserialize(deserializer: D) -> std::result::Result + where + D: serde::Deserializer<'de>, + { + struct GeneratedVisitor; + + impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { + type Value = GeneratedField; + + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(formatter, "expected one of: {:?}", &FIELDS) + } + + #[allow(unused_variables)] + fn visit_str(self, value: &str) -> std::result::Result + where + E: serde::de::Error, + { + match value { + "options" => Ok(GeneratedField::Options), + _ => Err(serde::de::Error::unknown_field(value, FIELDS)), + } + } + } + deserializer.deserialize_identifier(GeneratedVisitor) + } + } + struct GeneratedVisitor; + impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { + type Value = ParquetFormat; + + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + formatter.write_str("struct datafusion.ParquetFormat") + } + + fn visit_map(self, mut map_: V) -> std::result::Result + where + V: serde::de::MapAccess<'de>, + { + let mut options__ = None; + while let Some(k) = map_.next_key()? 
{ + match k { + GeneratedField::Options => { + if options__.is_some() { + return Err(serde::de::Error::duplicate_field("options")); + } + options__ = map_.next_value()?; + } + } + } + Ok(ParquetFormat { + options: options__, + }) + } + } + deserializer.deserialize_struct("datafusion.ParquetFormat", FIELDS, GeneratedVisitor) + } +} +impl serde::Serialize for ParquetOptions { + #[allow(deprecated)] + fn serialize(&self, serializer: S) -> std::result::Result + where + S: serde::Serializer, + { + use serde::ser::SerializeStruct; + let mut len = 0; + if self.enable_page_index { + len += 1; + } + if self.pruning { + len += 1; + } + if self.skip_metadata { + len += 1; + } + if self.pushdown_filters { + len += 1; + } + if self.reorder_filters { + len += 1; + } + if self.data_pagesize_limit != 0 { + len += 1; + } + if self.write_batch_size != 0 { + len += 1; + } + if !self.writer_version.is_empty() { + len += 1; + } + if self.bloom_filter_enabled { + len += 1; + } + if self.allow_single_file_parallelism { + len += 1; + } + if self.maximum_parallel_row_group_writers != 0 { + len += 1; + } + if self.maximum_buffered_record_batches_per_stream != 0 { + len += 1; + } + if self.dictionary_page_size_limit != 0 { + len += 1; + } + if self.data_page_row_count_limit != 0 { + len += 1; + } + if self.max_row_group_size != 0 { + len += 1; + } + if !self.created_by.is_empty() { + len += 1; + } + if self.metadata_size_hint_opt.is_some() { + len += 1; + } + if self.compression_opt.is_some() { + len += 1; + } + if self.dictionary_enabled_opt.is_some() { + len += 1; + } + if self.statistics_enabled_opt.is_some() { + len += 1; + } + if self.max_statistics_size_opt.is_some() { + len += 1; + } + if self.column_index_truncate_length_opt.is_some() { + len += 1; + } + if self.encoding_opt.is_some() { + len += 1; + } + if self.bloom_filter_fpp_opt.is_some() { + len += 1; + } + if self.bloom_filter_ndv_opt.is_some() { + len += 1; + } + let mut struct_ser = serializer.serialize_struct("datafusion.ParquetOptions", len)?; + if self.enable_page_index { + struct_ser.serialize_field("enablePageIndex", &self.enable_page_index)?; + } + if self.pruning { + struct_ser.serialize_field("pruning", &self.pruning)?; + } + if self.skip_metadata { + struct_ser.serialize_field("skipMetadata", &self.skip_metadata)?; + } + if self.pushdown_filters { + struct_ser.serialize_field("pushdownFilters", &self.pushdown_filters)?; + } + if self.reorder_filters { + struct_ser.serialize_field("reorderFilters", &self.reorder_filters)?; + } + if self.data_pagesize_limit != 0 { + #[allow(clippy::needless_borrow)] + struct_ser.serialize_field("dataPagesizeLimit", ToString::to_string(&self.data_pagesize_limit).as_str())?; + } + if self.write_batch_size != 0 { + #[allow(clippy::needless_borrow)] + struct_ser.serialize_field("writeBatchSize", ToString::to_string(&self.write_batch_size).as_str())?; + } + if !self.writer_version.is_empty() { + struct_ser.serialize_field("writerVersion", &self.writer_version)?; + } + if self.bloom_filter_enabled { + struct_ser.serialize_field("bloomFilterEnabled", &self.bloom_filter_enabled)?; + } + if self.allow_single_file_parallelism { + struct_ser.serialize_field("allowSingleFileParallelism", &self.allow_single_file_parallelism)?; + } + if self.maximum_parallel_row_group_writers != 0 { + #[allow(clippy::needless_borrow)] + struct_ser.serialize_field("maximumParallelRowGroupWriters", ToString::to_string(&self.maximum_parallel_row_group_writers).as_str())?; + } + if self.maximum_buffered_record_batches_per_stream != 0 { + 
#[allow(clippy::needless_borrow)] + struct_ser.serialize_field("maximumBufferedRecordBatchesPerStream", ToString::to_string(&self.maximum_buffered_record_batches_per_stream).as_str())?; + } + if self.dictionary_page_size_limit != 0 { + #[allow(clippy::needless_borrow)] + struct_ser.serialize_field("dictionaryPageSizeLimit", ToString::to_string(&self.dictionary_page_size_limit).as_str())?; + } + if self.data_page_row_count_limit != 0 { + #[allow(clippy::needless_borrow)] + struct_ser.serialize_field("dataPageRowCountLimit", ToString::to_string(&self.data_page_row_count_limit).as_str())?; + } + if self.max_row_group_size != 0 { + #[allow(clippy::needless_borrow)] + struct_ser.serialize_field("maxRowGroupSize", ToString::to_string(&self.max_row_group_size).as_str())?; + } + if !self.created_by.is_empty() { + struct_ser.serialize_field("createdBy", &self.created_by)?; + } + if let Some(v) = self.metadata_size_hint_opt.as_ref() { + match v { + parquet_options::MetadataSizeHintOpt::MetadataSizeHint(v) => { + #[allow(clippy::needless_borrow)] + struct_ser.serialize_field("metadataSizeHint", ToString::to_string(&v).as_str())?; + } + } + } + if let Some(v) = self.compression_opt.as_ref() { + match v { + parquet_options::CompressionOpt::Compression(v) => { + struct_ser.serialize_field("compression", v)?; + } + } + } + if let Some(v) = self.dictionary_enabled_opt.as_ref() { + match v { + parquet_options::DictionaryEnabledOpt::DictionaryEnabled(v) => { + struct_ser.serialize_field("dictionaryEnabled", v)?; + } + } + } + if let Some(v) = self.statistics_enabled_opt.as_ref() { + match v { + parquet_options::StatisticsEnabledOpt::StatisticsEnabled(v) => { + struct_ser.serialize_field("statisticsEnabled", v)?; + } + } + } + if let Some(v) = self.max_statistics_size_opt.as_ref() { + match v { + parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v) => { + #[allow(clippy::needless_borrow)] + struct_ser.serialize_field("maxStatisticsSize", ToString::to_string(&v).as_str())?; + } + } + } + if let Some(v) = self.column_index_truncate_length_opt.as_ref() { + match v { + parquet_options::ColumnIndexTruncateLengthOpt::ColumnIndexTruncateLength(v) => { + #[allow(clippy::needless_borrow)] + struct_ser.serialize_field("columnIndexTruncateLength", ToString::to_string(&v).as_str())?; + } + } + } + if let Some(v) = self.encoding_opt.as_ref() { + match v { + parquet_options::EncodingOpt::Encoding(v) => { + struct_ser.serialize_field("encoding", v)?; + } + } + } + if let Some(v) = self.bloom_filter_fpp_opt.as_ref() { + match v { + parquet_options::BloomFilterFppOpt::BloomFilterFpp(v) => { + struct_ser.serialize_field("bloomFilterFpp", v)?; + } + } + } + if let Some(v) = self.bloom_filter_ndv_opt.as_ref() { + match v { + parquet_options::BloomFilterNdvOpt::BloomFilterNdv(v) => { + #[allow(clippy::needless_borrow)] + struct_ser.serialize_field("bloomFilterNdv", ToString::to_string(&v).as_str())?; + } } } - deserializer.deserialize_struct("datafusion.OwnedTableReference", FIELDS, GeneratedVisitor) - } -} -impl serde::Serialize for ParquetFormat { - #[allow(deprecated)] - fn serialize(&self, serializer: S) -> std::result::Result - where - S: serde::Serializer, - { - use serde::ser::SerializeStruct; - let len = 0; - let struct_ser = serializer.serialize_struct("datafusion.ParquetFormat", len)?; struct_ser.end() } } -impl<'de> serde::Deserialize<'de> for ParquetFormat { +impl<'de> serde::Deserialize<'de> for ParquetOptions { #[allow(deprecated)] fn deserialize(deserializer: D) -> std::result::Result where D: 
serde::Deserializer<'de>, { const FIELDS: &[&str] = &[ + "enable_page_index", + "enablePageIndex", + "pruning", + "skip_metadata", + "skipMetadata", + "pushdown_filters", + "pushdownFilters", + "reorder_filters", + "reorderFilters", + "data_pagesize_limit", + "dataPagesizeLimit", + "write_batch_size", + "writeBatchSize", + "writer_version", + "writerVersion", + "bloom_filter_enabled", + "bloomFilterEnabled", + "allow_single_file_parallelism", + "allowSingleFileParallelism", + "maximum_parallel_row_group_writers", + "maximumParallelRowGroupWriters", + "maximum_buffered_record_batches_per_stream", + "maximumBufferedRecordBatchesPerStream", + "dictionary_page_size_limit", + "dictionaryPageSizeLimit", + "data_page_row_count_limit", + "dataPageRowCountLimit", + "max_row_group_size", + "maxRowGroupSize", + "created_by", + "createdBy", + "metadata_size_hint", + "metadataSizeHint", + "compression", + "dictionary_enabled", + "dictionaryEnabled", + "statistics_enabled", + "statisticsEnabled", + "max_statistics_size", + "maxStatisticsSize", + "column_index_truncate_length", + "columnIndexTruncateLength", + "encoding", + "bloom_filter_fpp", + "bloomFilterFpp", + "bloom_filter_ndv", + "bloomFilterNdv", ]; #[allow(clippy::enum_variant_names)] enum GeneratedField { + EnablePageIndex, + Pruning, + SkipMetadata, + PushdownFilters, + ReorderFilters, + DataPagesizeLimit, + WriteBatchSize, + WriterVersion, + BloomFilterEnabled, + AllowSingleFileParallelism, + MaximumParallelRowGroupWriters, + MaximumBufferedRecordBatchesPerStream, + DictionaryPageSizeLimit, + DataPageRowCountLimit, + MaxRowGroupSize, + CreatedBy, + MetadataSizeHint, + Compression, + DictionaryEnabled, + StatisticsEnabled, + MaxStatisticsSize, + ColumnIndexTruncateLength, + Encoding, + BloomFilterFpp, + BloomFilterNdv, } impl<'de> serde::Deserialize<'de> for GeneratedField { fn deserialize(deserializer: D) -> std::result::Result @@ -15438,7 +16483,34 @@ impl<'de> serde::Deserialize<'de> for ParquetFormat { where E: serde::de::Error, { - Err(serde::de::Error::unknown_field(value, FIELDS)) + match value { + "enablePageIndex" | "enable_page_index" => Ok(GeneratedField::EnablePageIndex), + "pruning" => Ok(GeneratedField::Pruning), + "skipMetadata" | "skip_metadata" => Ok(GeneratedField::SkipMetadata), + "pushdownFilters" | "pushdown_filters" => Ok(GeneratedField::PushdownFilters), + "reorderFilters" | "reorder_filters" => Ok(GeneratedField::ReorderFilters), + "dataPagesizeLimit" | "data_pagesize_limit" => Ok(GeneratedField::DataPagesizeLimit), + "writeBatchSize" | "write_batch_size" => Ok(GeneratedField::WriteBatchSize), + "writerVersion" | "writer_version" => Ok(GeneratedField::WriterVersion), + "bloomFilterEnabled" | "bloom_filter_enabled" => Ok(GeneratedField::BloomFilterEnabled), + "allowSingleFileParallelism" | "allow_single_file_parallelism" => Ok(GeneratedField::AllowSingleFileParallelism), + "maximumParallelRowGroupWriters" | "maximum_parallel_row_group_writers" => Ok(GeneratedField::MaximumParallelRowGroupWriters), + "maximumBufferedRecordBatchesPerStream" | "maximum_buffered_record_batches_per_stream" => Ok(GeneratedField::MaximumBufferedRecordBatchesPerStream), + "dictionaryPageSizeLimit" | "dictionary_page_size_limit" => Ok(GeneratedField::DictionaryPageSizeLimit), + "dataPageRowCountLimit" | "data_page_row_count_limit" => Ok(GeneratedField::DataPageRowCountLimit), + "maxRowGroupSize" | "max_row_group_size" => Ok(GeneratedField::MaxRowGroupSize), + "createdBy" | "created_by" => Ok(GeneratedField::CreatedBy), + "metadataSizeHint" | 
"metadata_size_hint" => Ok(GeneratedField::MetadataSizeHint), + "compression" => Ok(GeneratedField::Compression), + "dictionaryEnabled" | "dictionary_enabled" => Ok(GeneratedField::DictionaryEnabled), + "statisticsEnabled" | "statistics_enabled" => Ok(GeneratedField::StatisticsEnabled), + "maxStatisticsSize" | "max_statistics_size" => Ok(GeneratedField::MaxStatisticsSize), + "columnIndexTruncateLength" | "column_index_truncate_length" => Ok(GeneratedField::ColumnIndexTruncateLength), + "encoding" => Ok(GeneratedField::Encoding), + "bloomFilterFpp" | "bloom_filter_fpp" => Ok(GeneratedField::BloomFilterFpp), + "bloomFilterNdv" | "bloom_filter_ndv" => Ok(GeneratedField::BloomFilterNdv), + _ => Err(serde::de::Error::unknown_field(value, FIELDS)), + } } } deserializer.deserialize_identifier(GeneratedVisitor) @@ -15446,24 +16518,239 @@ impl<'de> serde::Deserialize<'de> for ParquetFormat { } struct GeneratedVisitor; impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { - type Value = ParquetFormat; + type Value = ParquetOptions; fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - formatter.write_str("struct datafusion.ParquetFormat") + formatter.write_str("struct datafusion.ParquetOptions") } - fn visit_map(self, mut map_: V) -> std::result::Result + fn visit_map(self, mut map_: V) -> std::result::Result where V: serde::de::MapAccess<'de>, { - while map_.next_key::()?.is_some() { - let _ = map_.next_value::()?; + let mut enable_page_index__ = None; + let mut pruning__ = None; + let mut skip_metadata__ = None; + let mut pushdown_filters__ = None; + let mut reorder_filters__ = None; + let mut data_pagesize_limit__ = None; + let mut write_batch_size__ = None; + let mut writer_version__ = None; + let mut bloom_filter_enabled__ = None; + let mut allow_single_file_parallelism__ = None; + let mut maximum_parallel_row_group_writers__ = None; + let mut maximum_buffered_record_batches_per_stream__ = None; + let mut dictionary_page_size_limit__ = None; + let mut data_page_row_count_limit__ = None; + let mut max_row_group_size__ = None; + let mut created_by__ = None; + let mut metadata_size_hint_opt__ = None; + let mut compression_opt__ = None; + let mut dictionary_enabled_opt__ = None; + let mut statistics_enabled_opt__ = None; + let mut max_statistics_size_opt__ = None; + let mut column_index_truncate_length_opt__ = None; + let mut encoding_opt__ = None; + let mut bloom_filter_fpp_opt__ = None; + let mut bloom_filter_ndv_opt__ = None; + while let Some(k) = map_.next_key()? 
{ + match k { + GeneratedField::EnablePageIndex => { + if enable_page_index__.is_some() { + return Err(serde::de::Error::duplicate_field("enablePageIndex")); + } + enable_page_index__ = Some(map_.next_value()?); + } + GeneratedField::Pruning => { + if pruning__.is_some() { + return Err(serde::de::Error::duplicate_field("pruning")); + } + pruning__ = Some(map_.next_value()?); + } + GeneratedField::SkipMetadata => { + if skip_metadata__.is_some() { + return Err(serde::de::Error::duplicate_field("skipMetadata")); + } + skip_metadata__ = Some(map_.next_value()?); + } + GeneratedField::PushdownFilters => { + if pushdown_filters__.is_some() { + return Err(serde::de::Error::duplicate_field("pushdownFilters")); + } + pushdown_filters__ = Some(map_.next_value()?); + } + GeneratedField::ReorderFilters => { + if reorder_filters__.is_some() { + return Err(serde::de::Error::duplicate_field("reorderFilters")); + } + reorder_filters__ = Some(map_.next_value()?); + } + GeneratedField::DataPagesizeLimit => { + if data_pagesize_limit__.is_some() { + return Err(serde::de::Error::duplicate_field("dataPagesizeLimit")); + } + data_pagesize_limit__ = + Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0) + ; + } + GeneratedField::WriteBatchSize => { + if write_batch_size__.is_some() { + return Err(serde::de::Error::duplicate_field("writeBatchSize")); + } + write_batch_size__ = + Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0) + ; + } + GeneratedField::WriterVersion => { + if writer_version__.is_some() { + return Err(serde::de::Error::duplicate_field("writerVersion")); + } + writer_version__ = Some(map_.next_value()?); + } + GeneratedField::BloomFilterEnabled => { + if bloom_filter_enabled__.is_some() { + return Err(serde::de::Error::duplicate_field("bloomFilterEnabled")); + } + bloom_filter_enabled__ = Some(map_.next_value()?); + } + GeneratedField::AllowSingleFileParallelism => { + if allow_single_file_parallelism__.is_some() { + return Err(serde::de::Error::duplicate_field("allowSingleFileParallelism")); + } + allow_single_file_parallelism__ = Some(map_.next_value()?); + } + GeneratedField::MaximumParallelRowGroupWriters => { + if maximum_parallel_row_group_writers__.is_some() { + return Err(serde::de::Error::duplicate_field("maximumParallelRowGroupWriters")); + } + maximum_parallel_row_group_writers__ = + Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0) + ; + } + GeneratedField::MaximumBufferedRecordBatchesPerStream => { + if maximum_buffered_record_batches_per_stream__.is_some() { + return Err(serde::de::Error::duplicate_field("maximumBufferedRecordBatchesPerStream")); + } + maximum_buffered_record_batches_per_stream__ = + Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0) + ; + } + GeneratedField::DictionaryPageSizeLimit => { + if dictionary_page_size_limit__.is_some() { + return Err(serde::de::Error::duplicate_field("dictionaryPageSizeLimit")); + } + dictionary_page_size_limit__ = + Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0) + ; + } + GeneratedField::DataPageRowCountLimit => { + if data_page_row_count_limit__.is_some() { + return Err(serde::de::Error::duplicate_field("dataPageRowCountLimit")); + } + data_page_row_count_limit__ = + Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0) + ; + } + GeneratedField::MaxRowGroupSize => { + if max_row_group_size__.is_some() { + return Err(serde::de::Error::duplicate_field("maxRowGroupSize")); + } + max_row_group_size__ = + 
Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0) + ; + } + GeneratedField::CreatedBy => { + if created_by__.is_some() { + return Err(serde::de::Error::duplicate_field("createdBy")); + } + created_by__ = Some(map_.next_value()?); + } + GeneratedField::MetadataSizeHint => { + if metadata_size_hint_opt__.is_some() { + return Err(serde::de::Error::duplicate_field("metadataSizeHint")); + } + metadata_size_hint_opt__ = map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| parquet_options::MetadataSizeHintOpt::MetadataSizeHint(x.0)); + } + GeneratedField::Compression => { + if compression_opt__.is_some() { + return Err(serde::de::Error::duplicate_field("compression")); + } + compression_opt__ = map_.next_value::<::std::option::Option<_>>()?.map(parquet_options::CompressionOpt::Compression); + } + GeneratedField::DictionaryEnabled => { + if dictionary_enabled_opt__.is_some() { + return Err(serde::de::Error::duplicate_field("dictionaryEnabled")); + } + dictionary_enabled_opt__ = map_.next_value::<::std::option::Option<_>>()?.map(parquet_options::DictionaryEnabledOpt::DictionaryEnabled); + } + GeneratedField::StatisticsEnabled => { + if statistics_enabled_opt__.is_some() { + return Err(serde::de::Error::duplicate_field("statisticsEnabled")); + } + statistics_enabled_opt__ = map_.next_value::<::std::option::Option<_>>()?.map(parquet_options::StatisticsEnabledOpt::StatisticsEnabled); + } + GeneratedField::MaxStatisticsSize => { + if max_statistics_size_opt__.is_some() { + return Err(serde::de::Error::duplicate_field("maxStatisticsSize")); + } + max_statistics_size_opt__ = map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(x.0)); + } + GeneratedField::ColumnIndexTruncateLength => { + if column_index_truncate_length_opt__.is_some() { + return Err(serde::de::Error::duplicate_field("columnIndexTruncateLength")); + } + column_index_truncate_length_opt__ = map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| parquet_options::ColumnIndexTruncateLengthOpt::ColumnIndexTruncateLength(x.0)); + } + GeneratedField::Encoding => { + if encoding_opt__.is_some() { + return Err(serde::de::Error::duplicate_field("encoding")); + } + encoding_opt__ = map_.next_value::<::std::option::Option<_>>()?.map(parquet_options::EncodingOpt::Encoding); + } + GeneratedField::BloomFilterFpp => { + if bloom_filter_fpp_opt__.is_some() { + return Err(serde::de::Error::duplicate_field("bloomFilterFpp")); + } + bloom_filter_fpp_opt__ = map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| parquet_options::BloomFilterFppOpt::BloomFilterFpp(x.0)); + } + GeneratedField::BloomFilterNdv => { + if bloom_filter_ndv_opt__.is_some() { + return Err(serde::de::Error::duplicate_field("bloomFilterNdv")); + } + bloom_filter_ndv_opt__ = map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| parquet_options::BloomFilterNdvOpt::BloomFilterNdv(x.0)); + } + } } - Ok(ParquetFormat { + Ok(ParquetOptions { + enable_page_index: enable_page_index__.unwrap_or_default(), + pruning: pruning__.unwrap_or_default(), + skip_metadata: skip_metadata__.unwrap_or_default(), + pushdown_filters: pushdown_filters__.unwrap_or_default(), + reorder_filters: reorder_filters__.unwrap_or_default(), + data_pagesize_limit: data_pagesize_limit__.unwrap_or_default(), + write_batch_size: 
write_batch_size__.unwrap_or_default(), + writer_version: writer_version__.unwrap_or_default(), + bloom_filter_enabled: bloom_filter_enabled__.unwrap_or_default(), + allow_single_file_parallelism: allow_single_file_parallelism__.unwrap_or_default(), + maximum_parallel_row_group_writers: maximum_parallel_row_group_writers__.unwrap_or_default(), + maximum_buffered_record_batches_per_stream: maximum_buffered_record_batches_per_stream__.unwrap_or_default(), + dictionary_page_size_limit: dictionary_page_size_limit__.unwrap_or_default(), + data_page_row_count_limit: data_page_row_count_limit__.unwrap_or_default(), + max_row_group_size: max_row_group_size__.unwrap_or_default(), + created_by: created_by__.unwrap_or_default(), + metadata_size_hint_opt: metadata_size_hint_opt__, + compression_opt: compression_opt__, + dictionary_enabled_opt: dictionary_enabled_opt__, + statistics_enabled_opt: statistics_enabled_opt__, + max_statistics_size_opt: max_statistics_size_opt__, + column_index_truncate_length_opt: column_index_truncate_length_opt__, + encoding_opt: encoding_opt__, + bloom_filter_fpp_opt: bloom_filter_fpp_opt__, + bloom_filter_ndv_opt: bloom_filter_ndv_opt__, }) } } - deserializer.deserialize_struct("datafusion.ParquetFormat", FIELDS, GeneratedVisitor) + deserializer.deserialize_struct("datafusion.ParquetOptions", FIELDS, GeneratedVisitor) } } impl serde::Serialize for ParquetScanExecNode { @@ -15586,10 +16873,16 @@ impl serde::Serialize for ParquetSink { if self.config.is_some() { len += 1; } + if self.parquet_options.is_some() { + len += 1; + } let mut struct_ser = serializer.serialize_struct("datafusion.ParquetSink", len)?; if let Some(v) = self.config.as_ref() { struct_ser.serialize_field("config", v)?; } + if let Some(v) = self.parquet_options.as_ref() { + struct_ser.serialize_field("parquetOptions", v)?; + } struct_ser.end() } } @@ -15601,11 +16894,14 @@ impl<'de> serde::Deserialize<'de> for ParquetSink { { const FIELDS: &[&str] = &[ "config", + "parquet_options", + "parquetOptions", ]; #[allow(clippy::enum_variant_names)] enum GeneratedField { Config, + ParquetOptions, } impl<'de> serde::Deserialize<'de> for GeneratedField { fn deserialize(deserializer: D) -> std::result::Result @@ -15628,6 +16924,7 @@ impl<'de> serde::Deserialize<'de> for ParquetSink { { match value { "config" => Ok(GeneratedField::Config), + "parquetOptions" | "parquet_options" => Ok(GeneratedField::ParquetOptions), _ => Err(serde::de::Error::unknown_field(value, FIELDS)), } } @@ -15648,6 +16945,7 @@ impl<'de> serde::Deserialize<'de> for ParquetSink { V: serde::de::MapAccess<'de>, { let mut config__ = None; + let mut parquet_options__ = None; while let Some(k) = map_.next_key()? 
{ match k { GeneratedField::Config => { @@ -15656,10 +16954,17 @@ impl<'de> serde::Deserialize<'de> for ParquetSink { } config__ = map_.next_value()?; } + GeneratedField::ParquetOptions => { + if parquet_options__.is_some() { + return Err(serde::de::Error::duplicate_field("parquetOptions")); + } + parquet_options__ = map_.next_value()?; + } } } Ok(ParquetSink { config: config__, + parquet_options: parquet_options__, }) } } @@ -15787,119 +17092,27 @@ impl<'de> serde::Deserialize<'de> for ParquetSinkExecNode { } GeneratedField::SinkSchema => { if sink_schema__.is_some() { - return Err(serde::de::Error::duplicate_field("sinkSchema")); - } - sink_schema__ = map_.next_value()?; - } - GeneratedField::SortOrder => { - if sort_order__.is_some() { - return Err(serde::de::Error::duplicate_field("sortOrder")); - } - sort_order__ = map_.next_value()?; - } - } - } - Ok(ParquetSinkExecNode { - input: input__, - sink: sink__, - sink_schema: sink_schema__, - sort_order: sort_order__, - }) - } - } - deserializer.deserialize_struct("datafusion.ParquetSinkExecNode", FIELDS, GeneratedVisitor) - } -} -impl serde::Serialize for ParquetWriterOptions { - #[allow(deprecated)] - fn serialize(&self, serializer: S) -> std::result::Result - where - S: serde::Serializer, - { - use serde::ser::SerializeStruct; - let mut len = 0; - if self.writer_properties.is_some() { - len += 1; - } - let mut struct_ser = serializer.serialize_struct("datafusion.ParquetWriterOptions", len)?; - if let Some(v) = self.writer_properties.as_ref() { - struct_ser.serialize_field("writerProperties", v)?; - } - struct_ser.end() - } -} -impl<'de> serde::Deserialize<'de> for ParquetWriterOptions { - #[allow(deprecated)] - fn deserialize(deserializer: D) -> std::result::Result - where - D: serde::Deserializer<'de>, - { - const FIELDS: &[&str] = &[ - "writer_properties", - "writerProperties", - ]; - - #[allow(clippy::enum_variant_names)] - enum GeneratedField { - WriterProperties, - } - impl<'de> serde::Deserialize<'de> for GeneratedField { - fn deserialize(deserializer: D) -> std::result::Result - where - D: serde::Deserializer<'de>, - { - struct GeneratedVisitor; - - impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { - type Value = GeneratedField; - - fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(formatter, "expected one of: {:?}", &FIELDS) - } - - #[allow(unused_variables)] - fn visit_str(self, value: &str) -> std::result::Result - where - E: serde::de::Error, - { - match value { - "writerProperties" | "writer_properties" => Ok(GeneratedField::WriterProperties), - _ => Err(serde::de::Error::unknown_field(value, FIELDS)), - } - } - } - deserializer.deserialize_identifier(GeneratedVisitor) - } - } - struct GeneratedVisitor; - impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { - type Value = ParquetWriterOptions; - - fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - formatter.write_str("struct datafusion.ParquetWriterOptions") - } - - fn visit_map(self, mut map_: V) -> std::result::Result - where - V: serde::de::MapAccess<'de>, - { - let mut writer_properties__ = None; - while let Some(k) = map_.next_key()? 
{ - match k { - GeneratedField::WriterProperties => { - if writer_properties__.is_some() { - return Err(serde::de::Error::duplicate_field("writerProperties")); + return Err(serde::de::Error::duplicate_field("sinkSchema")); + } + sink_schema__ = map_.next_value()?; + } + GeneratedField::SortOrder => { + if sort_order__.is_some() { + return Err(serde::de::Error::duplicate_field("sortOrder")); } - writer_properties__ = map_.next_value()?; + sort_order__ = map_.next_value()?; } } } - Ok(ParquetWriterOptions { - writer_properties: writer_properties__, + Ok(ParquetSinkExecNode { + input: input__, + sink: sink__, + sink_schema: sink_schema__, + sort_order: sort_order__, }) } } - deserializer.deserialize_struct("datafusion.ParquetWriterOptions", FIELDS, GeneratedVisitor) + deserializer.deserialize_struct("datafusion.ParquetSinkExecNode", FIELDS, GeneratedVisitor) } } impl serde::Serialize for PartialTableReference { @@ -21414,262 +22627,44 @@ impl<'de> serde::Deserialize<'de> for RepartitionExecNode { let mut input__ = None; let mut partition_method__ = None; while let Some(k) = map_.next_key()? { - match k { - GeneratedField::Input => { - if input__.is_some() { - return Err(serde::de::Error::duplicate_field("input")); - } - input__ = map_.next_value()?; - } - GeneratedField::RoundRobin => { - if partition_method__.is_some() { - return Err(serde::de::Error::duplicate_field("roundRobin")); - } - partition_method__ = map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| repartition_exec_node::PartitionMethod::RoundRobin(x.0)); - } - GeneratedField::Hash => { - if partition_method__.is_some() { - return Err(serde::de::Error::duplicate_field("hash")); - } - partition_method__ = map_.next_value::<::std::option::Option<_>>()?.map(repartition_exec_node::PartitionMethod::Hash) -; - } - GeneratedField::Unknown => { - if partition_method__.is_some() { - return Err(serde::de::Error::duplicate_field("unknown")); - } - partition_method__ = map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| repartition_exec_node::PartitionMethod::Unknown(x.0)); - } - } - } - Ok(RepartitionExecNode { - input: input__, - partition_method: partition_method__, - }) - } - } - deserializer.deserialize_struct("datafusion.RepartitionExecNode", FIELDS, GeneratedVisitor) - } -} -impl serde::Serialize for RepartitionNode { - #[allow(deprecated)] - fn serialize(&self, serializer: S) -> std::result::Result - where - S: serde::Serializer, - { - use serde::ser::SerializeStruct; - let mut len = 0; - if self.input.is_some() { - len += 1; - } - if self.partition_method.is_some() { - len += 1; - } - let mut struct_ser = serializer.serialize_struct("datafusion.RepartitionNode", len)?; - if let Some(v) = self.input.as_ref() { - struct_ser.serialize_field("input", v)?; - } - if let Some(v) = self.partition_method.as_ref() { - match v { - repartition_node::PartitionMethod::RoundRobin(v) => { - #[allow(clippy::needless_borrow)] - struct_ser.serialize_field("roundRobin", ToString::to_string(&v).as_str())?; - } - repartition_node::PartitionMethod::Hash(v) => { - struct_ser.serialize_field("hash", v)?; - } - } - } - struct_ser.end() - } -} -impl<'de> serde::Deserialize<'de> for RepartitionNode { - #[allow(deprecated)] - fn deserialize(deserializer: D) -> std::result::Result - where - D: serde::Deserializer<'de>, - { - const FIELDS: &[&str] = &[ - "input", - "round_robin", - "roundRobin", - "hash", - ]; - - #[allow(clippy::enum_variant_names)] - enum GeneratedField { - Input, 
- RoundRobin, - Hash, - } - impl<'de> serde::Deserialize<'de> for GeneratedField { - fn deserialize(deserializer: D) -> std::result::Result - where - D: serde::Deserializer<'de>, - { - struct GeneratedVisitor; - - impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { - type Value = GeneratedField; - - fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(formatter, "expected one of: {:?}", &FIELDS) - } - - #[allow(unused_variables)] - fn visit_str(self, value: &str) -> std::result::Result - where - E: serde::de::Error, - { - match value { - "input" => Ok(GeneratedField::Input), - "roundRobin" | "round_robin" => Ok(GeneratedField::RoundRobin), - "hash" => Ok(GeneratedField::Hash), - _ => Err(serde::de::Error::unknown_field(value, FIELDS)), - } - } - } - deserializer.deserialize_identifier(GeneratedVisitor) - } - } - struct GeneratedVisitor; - impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { - type Value = RepartitionNode; - - fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - formatter.write_str("struct datafusion.RepartitionNode") - } - - fn visit_map(self, mut map_: V) -> std::result::Result - where - V: serde::de::MapAccess<'de>, - { - let mut input__ = None; - let mut partition_method__ = None; - while let Some(k) = map_.next_key()? { - match k { - GeneratedField::Input => { - if input__.is_some() { - return Err(serde::de::Error::duplicate_field("input")); - } - input__ = map_.next_value()?; - } - GeneratedField::RoundRobin => { - if partition_method__.is_some() { - return Err(serde::de::Error::duplicate_field("roundRobin")); - } - partition_method__ = map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| repartition_node::PartitionMethod::RoundRobin(x.0)); - } - GeneratedField::Hash => { - if partition_method__.is_some() { - return Err(serde::de::Error::duplicate_field("hash")); - } - partition_method__ = map_.next_value::<::std::option::Option<_>>()?.map(repartition_node::PartitionMethod::Hash) -; - } - } - } - Ok(RepartitionNode { - input: input__, - partition_method: partition_method__, - }) - } - } - deserializer.deserialize_struct("datafusion.RepartitionNode", FIELDS, GeneratedVisitor) - } -} -impl serde::Serialize for RollupNode { - #[allow(deprecated)] - fn serialize(&self, serializer: S) -> std::result::Result - where - S: serde::Serializer, - { - use serde::ser::SerializeStruct; - let mut len = 0; - if !self.expr.is_empty() { - len += 1; - } - let mut struct_ser = serializer.serialize_struct("datafusion.RollupNode", len)?; - if !self.expr.is_empty() { - struct_ser.serialize_field("expr", &self.expr)?; - } - struct_ser.end() - } -} -impl<'de> serde::Deserialize<'de> for RollupNode { - #[allow(deprecated)] - fn deserialize(deserializer: D) -> std::result::Result - where - D: serde::Deserializer<'de>, - { - const FIELDS: &[&str] = &[ - "expr", - ]; - - #[allow(clippy::enum_variant_names)] - enum GeneratedField { - Expr, - } - impl<'de> serde::Deserialize<'de> for GeneratedField { - fn deserialize(deserializer: D) -> std::result::Result - where - D: serde::Deserializer<'de>, - { - struct GeneratedVisitor; - - impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { - type Value = GeneratedField; - - fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(formatter, "expected one of: {:?}", &FIELDS) - } - - #[allow(unused_variables)] - fn visit_str(self, value: &str) -> std::result::Result - where - E: serde::de::Error, - { - 
match value { - "expr" => Ok(GeneratedField::Expr), - _ => Err(serde::de::Error::unknown_field(value, FIELDS)), - } - } - } - deserializer.deserialize_identifier(GeneratedVisitor) - } - } - struct GeneratedVisitor; - impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { - type Value = RollupNode; - - fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - formatter.write_str("struct datafusion.RollupNode") - } - - fn visit_map(self, mut map_: V) -> std::result::Result - where - V: serde::de::MapAccess<'de>, - { - let mut expr__ = None; - while let Some(k) = map_.next_key()? { - match k { - GeneratedField::Expr => { - if expr__.is_some() { - return Err(serde::de::Error::duplicate_field("expr")); + match k { + GeneratedField::Input => { + if input__.is_some() { + return Err(serde::de::Error::duplicate_field("input")); } - expr__ = Some(map_.next_value()?); + input__ = map_.next_value()?; + } + GeneratedField::RoundRobin => { + if partition_method__.is_some() { + return Err(serde::de::Error::duplicate_field("roundRobin")); + } + partition_method__ = map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| repartition_exec_node::PartitionMethod::RoundRobin(x.0)); + } + GeneratedField::Hash => { + if partition_method__.is_some() { + return Err(serde::de::Error::duplicate_field("hash")); + } + partition_method__ = map_.next_value::<::std::option::Option<_>>()?.map(repartition_exec_node::PartitionMethod::Hash) +; + } + GeneratedField::Unknown => { + if partition_method__.is_some() { + return Err(serde::de::Error::duplicate_field("unknown")); + } + partition_method__ = map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| repartition_exec_node::PartitionMethod::Unknown(x.0)); } } } - Ok(RollupNode { - expr: expr__.unwrap_or_default(), + Ok(RepartitionExecNode { + input: input__, + partition_method: partition_method__, }) } } - deserializer.deserialize_struct("datafusion.RollupNode", FIELDS, GeneratedVisitor) + deserializer.deserialize_struct("datafusion.RepartitionExecNode", FIELDS, GeneratedVisitor) } } -impl serde::Serialize for SqlOption { +impl serde::Serialize for RepartitionNode { #[allow(deprecated)] fn serialize(&self, serializer: S) -> std::result::Result where @@ -21677,37 +22672,48 @@ impl serde::Serialize for SqlOption { { use serde::ser::SerializeStruct; let mut len = 0; - if !self.key.is_empty() { + if self.input.is_some() { len += 1; } - if !self.value.is_empty() { + if self.partition_method.is_some() { len += 1; } - let mut struct_ser = serializer.serialize_struct("datafusion.SQLOption", len)?; - if !self.key.is_empty() { - struct_ser.serialize_field("key", &self.key)?; + let mut struct_ser = serializer.serialize_struct("datafusion.RepartitionNode", len)?; + if let Some(v) = self.input.as_ref() { + struct_ser.serialize_field("input", v)?; } - if !self.value.is_empty() { - struct_ser.serialize_field("value", &self.value)?; + if let Some(v) = self.partition_method.as_ref() { + match v { + repartition_node::PartitionMethod::RoundRobin(v) => { + #[allow(clippy::needless_borrow)] + struct_ser.serialize_field("roundRobin", ToString::to_string(&v).as_str())?; + } + repartition_node::PartitionMethod::Hash(v) => { + struct_ser.serialize_field("hash", v)?; + } + } } struct_ser.end() } } -impl<'de> serde::Deserialize<'de> for SqlOption { +impl<'de> serde::Deserialize<'de> for RepartitionNode { #[allow(deprecated)] fn deserialize(deserializer: D) -> std::result::Result where D: 
serde::Deserializer<'de>, { const FIELDS: &[&str] = &[ - "key", - "value", + "input", + "round_robin", + "roundRobin", + "hash", ]; #[allow(clippy::enum_variant_names)] enum GeneratedField { - Key, - Value, + Input, + RoundRobin, + Hash, } impl<'de> serde::Deserialize<'de> for GeneratedField { fn deserialize(deserializer: D) -> std::result::Result @@ -21729,8 +22735,9 @@ impl<'de> serde::Deserialize<'de> for SqlOption { E: serde::de::Error, { match value { - "key" => Ok(GeneratedField::Key), - "value" => Ok(GeneratedField::Value), + "input" => Ok(GeneratedField::Input), + "roundRobin" | "round_robin" => Ok(GeneratedField::RoundRobin), + "hash" => Ok(GeneratedField::Hash), _ => Err(serde::de::Error::unknown_field(value, FIELDS)), } } @@ -21740,44 +22747,51 @@ impl<'de> serde::Deserialize<'de> for SqlOption { } struct GeneratedVisitor; impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { - type Value = SqlOption; + type Value = RepartitionNode; fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - formatter.write_str("struct datafusion.SQLOption") + formatter.write_str("struct datafusion.RepartitionNode") } - fn visit_map(self, mut map_: V) -> std::result::Result + fn visit_map(self, mut map_: V) -> std::result::Result where V: serde::de::MapAccess<'de>, { - let mut key__ = None; - let mut value__ = None; + let mut input__ = None; + let mut partition_method__ = None; while let Some(k) = map_.next_key()? { match k { - GeneratedField::Key => { - if key__.is_some() { - return Err(serde::de::Error::duplicate_field("key")); + GeneratedField::Input => { + if input__.is_some() { + return Err(serde::de::Error::duplicate_field("input")); } - key__ = Some(map_.next_value()?); + input__ = map_.next_value()?; } - GeneratedField::Value => { - if value__.is_some() { - return Err(serde::de::Error::duplicate_field("value")); + GeneratedField::RoundRobin => { + if partition_method__.is_some() { + return Err(serde::de::Error::duplicate_field("roundRobin")); + } + partition_method__ = map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| repartition_node::PartitionMethod::RoundRobin(x.0)); + } + GeneratedField::Hash => { + if partition_method__.is_some() { + return Err(serde::de::Error::duplicate_field("hash")); } - value__ = Some(map_.next_value()?); + partition_method__ = map_.next_value::<::std::option::Option<_>>()?.map(repartition_node::PartitionMethod::Hash) +; } } } - Ok(SqlOption { - key: key__.unwrap_or_default(), - value: value__.unwrap_or_default(), + Ok(RepartitionNode { + input: input__, + partition_method: partition_method__, }) } } - deserializer.deserialize_struct("datafusion.SQLOption", FIELDS, GeneratedVisitor) + deserializer.deserialize_struct("datafusion.RepartitionNode", FIELDS, GeneratedVisitor) } } -impl serde::Serialize for SqlOptions { +impl serde::Serialize for RollupNode { #[allow(deprecated)] fn serialize(&self, serializer: S) -> std::result::Result where @@ -21785,29 +22799,29 @@ impl serde::Serialize for SqlOptions { { use serde::ser::SerializeStruct; let mut len = 0; - if !self.option.is_empty() { + if !self.expr.is_empty() { len += 1; } - let mut struct_ser = serializer.serialize_struct("datafusion.SQLOptions", len)?; - if !self.option.is_empty() { - struct_ser.serialize_field("option", &self.option)?; + let mut struct_ser = serializer.serialize_struct("datafusion.RollupNode", len)?; + if !self.expr.is_empty() { + struct_ser.serialize_field("expr", &self.expr)?; } struct_ser.end() } } -impl<'de> 
serde::Deserialize<'de> for SqlOptions { +impl<'de> serde::Deserialize<'de> for RollupNode { #[allow(deprecated)] fn deserialize(deserializer: D) -> std::result::Result where D: serde::Deserializer<'de>, { const FIELDS: &[&str] = &[ - "option", + "expr", ]; #[allow(clippy::enum_variant_names)] enum GeneratedField { - Option, + Expr, } impl<'de> serde::Deserialize<'de> for GeneratedField { fn deserialize(deserializer: D) -> std::result::Result @@ -21829,7 +22843,7 @@ impl<'de> serde::Deserialize<'de> for SqlOptions { E: serde::de::Error, { match value { - "option" => Ok(GeneratedField::Option), + "expr" => Ok(GeneratedField::Expr), _ => Err(serde::de::Error::unknown_field(value, FIELDS)), } } @@ -21839,33 +22853,33 @@ impl<'de> serde::Deserialize<'de> for SqlOptions { } struct GeneratedVisitor; impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { - type Value = SqlOptions; + type Value = RollupNode; fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - formatter.write_str("struct datafusion.SQLOptions") + formatter.write_str("struct datafusion.RollupNode") } - fn visit_map(self, mut map_: V) -> std::result::Result + fn visit_map(self, mut map_: V) -> std::result::Result where V: serde::de::MapAccess<'de>, { - let mut option__ = None; + let mut expr__ = None; while let Some(k) = map_.next_key()? { match k { - GeneratedField::Option => { - if option__.is_some() { - return Err(serde::de::Error::duplicate_field("option")); + GeneratedField::Expr => { + if expr__.is_some() { + return Err(serde::de::Error::duplicate_field("expr")); } - option__ = Some(map_.next_value()?); + expr__ = Some(map_.next_value()?); } } } - Ok(SqlOptions { - option: option__.unwrap_or_default(), + Ok(RollupNode { + expr: expr__.unwrap_or_default(), }) } } - deserializer.deserialize_struct("datafusion.SQLOptions", FIELDS, GeneratedVisitor) + deserializer.deserialize_struct("datafusion.RollupNode", FIELDS, GeneratedVisitor) } } impl serde::Serialize for ScalarDictionaryValue { @@ -22113,9 +23127,7 @@ impl serde::Serialize for ScalarFunction { Self::Signum => "Signum", Self::Sin => "Sin", Self::Sqrt => "Sqrt", - Self::Tan => "Tan", Self::Trunc => "Trunc", - Self::Array => "Array", Self::BitLength => "BitLength", Self::Btrim => "Btrim", Self::CharacterLength => "CharacterLength", @@ -22145,14 +23157,16 @@ impl serde::Serialize for ScalarFunction { Self::Strpos => "Strpos", Self::Substr => "Substr", Self::ToHex => "ToHex", + Self::Now => "Now", Self::Translate => "Translate", Self::Trim => "Trim", Self::Upper => "Upper", Self::Coalesce => "Coalesce", Self::Power => "Power", - Self::StructFun => "StructFun", + Self::FromUnixtime => "FromUnixtime", Self::Atan2 => "Atan2", - Self::ArrowTypeof => "ArrowTypeof", + Self::CurrentDate => "CurrentDate", + Self::CurrentTime => "CurrentTime", Self::Uuid => "Uuid", Self::Cbrt => "Cbrt", Self::Acosh => "Acosh", @@ -22160,19 +23174,14 @@ impl serde::Serialize for ScalarFunction { Self::Atanh => "Atanh", Self::Sinh => "Sinh", Self::Cosh => "Cosh", - Self::Tanh => "Tanh", Self::Pi => "Pi", Self::Degrees => "Degrees", Self::Radians => "Radians", Self::Factorial => "Factorial", Self::Lcm => "Lcm", Self::Gcd => "Gcd", - Self::ArrayAppend => "ArrayAppend", - Self::ArrayConcat => "ArrayConcat", - Self::ArrayRepeat => "ArrayRepeat", Self::ArrayPosition => "ArrayPosition", Self::ArrayPositions => "ArrayPositions", - Self::ArrayPrepend => "ArrayPrepend", Self::ArrayRemove => "ArrayRemove", Self::ArrayReplace => "ArrayReplace", Self::ArrayElement => "ArrayElement", 
@@ -22185,7 +23194,6 @@ impl serde::Serialize for ScalarFunction { Self::Nanvl => "Nanvl", Self::Iszero => "Iszero", Self::ArrayPopBack => "ArrayPopBack", - Self::StringToArray => "StringToArray", Self::ArrayIntersect => "ArrayIntersect", Self::ArrayUnion => "ArrayUnion", Self::OverLay => "OverLay", @@ -22194,8 +23202,6 @@ impl serde::Serialize for ScalarFunction { Self::Levenshtein => "Levenshtein", Self::SubstrIndex => "SubstrIndex", Self::FindInSet => "FindInSet", - Self::ArraySort => "ArraySort", - Self::ArrayDistinct => "ArrayDistinct", Self::ArrayResize => "ArrayResize", Self::EndsWith => "EndsWith", Self::MakeDate => "MakeDate", @@ -22228,9 +23234,7 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "Signum", "Sin", "Sqrt", - "Tan", "Trunc", - "Array", "BitLength", "Btrim", "CharacterLength", @@ -22260,14 +23264,16 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "Strpos", "Substr", "ToHex", + "Now", "Translate", "Trim", "Upper", "Coalesce", "Power", - "StructFun", + "FromUnixtime", "Atan2", - "ArrowTypeof", + "CurrentDate", + "CurrentTime", "Uuid", "Cbrt", "Acosh", @@ -22275,19 +23281,14 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "Atanh", "Sinh", "Cosh", - "Tanh", "Pi", "Degrees", "Radians", "Factorial", "Lcm", "Gcd", - "ArrayAppend", - "ArrayConcat", - "ArrayRepeat", "ArrayPosition", "ArrayPositions", - "ArrayPrepend", "ArrayRemove", "ArrayReplace", "ArrayElement", @@ -22300,7 +23301,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "Nanvl", "Iszero", "ArrayPopBack", - "StringToArray", "ArrayIntersect", "ArrayUnion", "OverLay", @@ -22309,8 +23309,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "Levenshtein", "SubstrIndex", "FindInSet", - "ArraySort", - "ArrayDistinct", "ArrayResize", "EndsWith", "MakeDate", @@ -22372,9 +23370,7 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "Signum" => Ok(ScalarFunction::Signum), "Sin" => Ok(ScalarFunction::Sin), "Sqrt" => Ok(ScalarFunction::Sqrt), - "Tan" => Ok(ScalarFunction::Tan), "Trunc" => Ok(ScalarFunction::Trunc), - "Array" => Ok(ScalarFunction::Array), "BitLength" => Ok(ScalarFunction::BitLength), "Btrim" => Ok(ScalarFunction::Btrim), "CharacterLength" => Ok(ScalarFunction::CharacterLength), @@ -22404,14 +23400,16 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "Strpos" => Ok(ScalarFunction::Strpos), "Substr" => Ok(ScalarFunction::Substr), "ToHex" => Ok(ScalarFunction::ToHex), + "Now" => Ok(ScalarFunction::Now), "Translate" => Ok(ScalarFunction::Translate), "Trim" => Ok(ScalarFunction::Trim), "Upper" => Ok(ScalarFunction::Upper), "Coalesce" => Ok(ScalarFunction::Coalesce), "Power" => Ok(ScalarFunction::Power), - "StructFun" => Ok(ScalarFunction::StructFun), + "FromUnixtime" => Ok(ScalarFunction::FromUnixtime), "Atan2" => Ok(ScalarFunction::Atan2), - "ArrowTypeof" => Ok(ScalarFunction::ArrowTypeof), + "CurrentDate" => Ok(ScalarFunction::CurrentDate), + "CurrentTime" => Ok(ScalarFunction::CurrentTime), "Uuid" => Ok(ScalarFunction::Uuid), "Cbrt" => Ok(ScalarFunction::Cbrt), "Acosh" => Ok(ScalarFunction::Acosh), @@ -22419,19 +23417,14 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "Atanh" => Ok(ScalarFunction::Atanh), "Sinh" => Ok(ScalarFunction::Sinh), "Cosh" => Ok(ScalarFunction::Cosh), - "Tanh" => Ok(ScalarFunction::Tanh), "Pi" => Ok(ScalarFunction::Pi), "Degrees" => Ok(ScalarFunction::Degrees), "Radians" => Ok(ScalarFunction::Radians), "Factorial" => Ok(ScalarFunction::Factorial), "Lcm" => Ok(ScalarFunction::Lcm), "Gcd" => Ok(ScalarFunction::Gcd), - 
"ArrayAppend" => Ok(ScalarFunction::ArrayAppend), - "ArrayConcat" => Ok(ScalarFunction::ArrayConcat), - "ArrayRepeat" => Ok(ScalarFunction::ArrayRepeat), "ArrayPosition" => Ok(ScalarFunction::ArrayPosition), "ArrayPositions" => Ok(ScalarFunction::ArrayPositions), - "ArrayPrepend" => Ok(ScalarFunction::ArrayPrepend), "ArrayRemove" => Ok(ScalarFunction::ArrayRemove), "ArrayReplace" => Ok(ScalarFunction::ArrayReplace), "ArrayElement" => Ok(ScalarFunction::ArrayElement), @@ -22444,7 +23437,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "Nanvl" => Ok(ScalarFunction::Nanvl), "Iszero" => Ok(ScalarFunction::Iszero), "ArrayPopBack" => Ok(ScalarFunction::ArrayPopBack), - "StringToArray" => Ok(ScalarFunction::StringToArray), "ArrayIntersect" => Ok(ScalarFunction::ArrayIntersect), "ArrayUnion" => Ok(ScalarFunction::ArrayUnion), "OverLay" => Ok(ScalarFunction::OverLay), @@ -22453,8 +23445,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "Levenshtein" => Ok(ScalarFunction::Levenshtein), "SubstrIndex" => Ok(ScalarFunction::SubstrIndex), "FindInSet" => Ok(ScalarFunction::FindInSet), - "ArraySort" => Ok(ScalarFunction::ArraySort), - "ArrayDistinct" => Ok(ScalarFunction::ArrayDistinct), "ArrayResize" => Ok(ScalarFunction::ArrayResize), "EndsWith" => Ok(ScalarFunction::EndsWith), "MakeDate" => Ok(ScalarFunction::MakeDate), @@ -25535,76 +26525,185 @@ impl<'de> serde::Deserialize<'de> for SymmetricHashJoinExecNode { let mut right_sort_exprs__ = None; while let Some(k) = map_.next_key()? { match k { - GeneratedField::Left => { - if left__.is_some() { - return Err(serde::de::Error::duplicate_field("left")); - } - left__ = map_.next_value()?; - } - GeneratedField::Right => { - if right__.is_some() { - return Err(serde::de::Error::duplicate_field("right")); - } - right__ = map_.next_value()?; - } - GeneratedField::On => { - if on__.is_some() { - return Err(serde::de::Error::duplicate_field("on")); - } - on__ = Some(map_.next_value()?); - } - GeneratedField::JoinType => { - if join_type__.is_some() { - return Err(serde::de::Error::duplicate_field("joinType")); - } - join_type__ = Some(map_.next_value::()? as i32); - } - GeneratedField::PartitionMode => { - if partition_mode__.is_some() { - return Err(serde::de::Error::duplicate_field("partitionMode")); - } - partition_mode__ = Some(map_.next_value::()? as i32); - } - GeneratedField::NullEqualsNull => { - if null_equals_null__.is_some() { - return Err(serde::de::Error::duplicate_field("nullEqualsNull")); - } - null_equals_null__ = Some(map_.next_value()?); - } - GeneratedField::Filter => { - if filter__.is_some() { - return Err(serde::de::Error::duplicate_field("filter")); + GeneratedField::Left => { + if left__.is_some() { + return Err(serde::de::Error::duplicate_field("left")); + } + left__ = map_.next_value()?; + } + GeneratedField::Right => { + if right__.is_some() { + return Err(serde::de::Error::duplicate_field("right")); + } + right__ = map_.next_value()?; + } + GeneratedField::On => { + if on__.is_some() { + return Err(serde::de::Error::duplicate_field("on")); + } + on__ = Some(map_.next_value()?); + } + GeneratedField::JoinType => { + if join_type__.is_some() { + return Err(serde::de::Error::duplicate_field("joinType")); + } + join_type__ = Some(map_.next_value::()? as i32); + } + GeneratedField::PartitionMode => { + if partition_mode__.is_some() { + return Err(serde::de::Error::duplicate_field("partitionMode")); + } + partition_mode__ = Some(map_.next_value::()? 
as i32); + } + GeneratedField::NullEqualsNull => { + if null_equals_null__.is_some() { + return Err(serde::de::Error::duplicate_field("nullEqualsNull")); + } + null_equals_null__ = Some(map_.next_value()?); + } + GeneratedField::Filter => { + if filter__.is_some() { + return Err(serde::de::Error::duplicate_field("filter")); + } + filter__ = map_.next_value()?; + } + GeneratedField::LeftSortExprs => { + if left_sort_exprs__.is_some() { + return Err(serde::de::Error::duplicate_field("leftSortExprs")); + } + left_sort_exprs__ = Some(map_.next_value()?); + } + GeneratedField::RightSortExprs => { + if right_sort_exprs__.is_some() { + return Err(serde::de::Error::duplicate_field("rightSortExprs")); + } + right_sort_exprs__ = Some(map_.next_value()?); + } + } + } + Ok(SymmetricHashJoinExecNode { + left: left__, + right: right__, + on: on__.unwrap_or_default(), + join_type: join_type__.unwrap_or_default(), + partition_mode: partition_mode__.unwrap_or_default(), + null_equals_null: null_equals_null__.unwrap_or_default(), + filter: filter__, + left_sort_exprs: left_sort_exprs__.unwrap_or_default(), + right_sort_exprs: right_sort_exprs__.unwrap_or_default(), + }) + } + } + deserializer.deserialize_struct("datafusion.SymmetricHashJoinExecNode", FIELDS, GeneratedVisitor) + } +} +impl serde::Serialize for TableParquetOptions { + #[allow(deprecated)] + fn serialize(&self, serializer: S) -> std::result::Result + where + S: serde::Serializer, + { + use serde::ser::SerializeStruct; + let mut len = 0; + if self.global.is_some() { + len += 1; + } + if !self.column_specific_options.is_empty() { + len += 1; + } + let mut struct_ser = serializer.serialize_struct("datafusion.TableParquetOptions", len)?; + if let Some(v) = self.global.as_ref() { + struct_ser.serialize_field("global", v)?; + } + if !self.column_specific_options.is_empty() { + struct_ser.serialize_field("columnSpecificOptions", &self.column_specific_options)?; + } + struct_ser.end() + } +} +impl<'de> serde::Deserialize<'de> for TableParquetOptions { + #[allow(deprecated)] + fn deserialize(deserializer: D) -> std::result::Result + where + D: serde::Deserializer<'de>, + { + const FIELDS: &[&str] = &[ + "global", + "column_specific_options", + "columnSpecificOptions", + ]; + + #[allow(clippy::enum_variant_names)] + enum GeneratedField { + Global, + ColumnSpecificOptions, + } + impl<'de> serde::Deserialize<'de> for GeneratedField { + fn deserialize(deserializer: D) -> std::result::Result + where + D: serde::Deserializer<'de>, + { + struct GeneratedVisitor; + + impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { + type Value = GeneratedField; + + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(formatter, "expected one of: {:?}", &FIELDS) + } + + #[allow(unused_variables)] + fn visit_str(self, value: &str) -> std::result::Result + where + E: serde::de::Error, + { + match value { + "global" => Ok(GeneratedField::Global), + "columnSpecificOptions" | "column_specific_options" => Ok(GeneratedField::ColumnSpecificOptions), + _ => Err(serde::de::Error::unknown_field(value, FIELDS)), + } + } + } + deserializer.deserialize_identifier(GeneratedVisitor) + } + } + struct GeneratedVisitor; + impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { + type Value = TableParquetOptions; + + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + formatter.write_str("struct datafusion.TableParquetOptions") + } + + fn visit_map(self, mut map_: V) -> std::result::Result + where + V: 
serde::de::MapAccess<'de>, + { + let mut global__ = None; + let mut column_specific_options__ = None; + while let Some(k) = map_.next_key()? { + match k { + GeneratedField::Global => { + if global__.is_some() { + return Err(serde::de::Error::duplicate_field("global")); } - filter__ = map_.next_value()?; - } - GeneratedField::LeftSortExprs => { - if left_sort_exprs__.is_some() { - return Err(serde::de::Error::duplicate_field("leftSortExprs")); - } - left_sort_exprs__ = Some(map_.next_value()?); + global__ = map_.next_value()?; } - GeneratedField::RightSortExprs => { - if right_sort_exprs__.is_some() { - return Err(serde::de::Error::duplicate_field("rightSortExprs")); + GeneratedField::ColumnSpecificOptions => { + if column_specific_options__.is_some() { + return Err(serde::de::Error::duplicate_field("columnSpecificOptions")); } - right_sort_exprs__ = Some(map_.next_value()?); + column_specific_options__ = Some(map_.next_value()?); } } } - Ok(SymmetricHashJoinExecNode { - left: left__, - right: right__, - on: on__.unwrap_or_default(), - join_type: join_type__.unwrap_or_default(), - partition_mode: partition_mode__.unwrap_or_default(), - null_equals_null: null_equals_null__.unwrap_or_default(), - filter: filter__, - left_sort_exprs: left_sort_exprs__.unwrap_or_default(), - right_sort_exprs: right_sort_exprs__.unwrap_or_default(), + Ok(TableParquetOptions { + global: global__, + column_specific_options: column_specific_options__.unwrap_or_default(), }) } } - deserializer.deserialize_struct("datafusion.SymmetricHashJoinExecNode", FIELDS, GeneratedVisitor) + deserializer.deserialize_struct("datafusion.TableParquetOptions", FIELDS, GeneratedVisitor) } } impl serde::Serialize for TimeUnit { @@ -27836,218 +28935,3 @@ impl<'de> serde::Deserialize<'de> for WindowNode { deserializer.deserialize_struct("datafusion.WindowNode", FIELDS, GeneratedVisitor) } } -impl serde::Serialize for WriterProperties { - #[allow(deprecated)] - fn serialize(&self, serializer: S) -> std::result::Result - where - S: serde::Serializer, - { - use serde::ser::SerializeStruct; - let mut len = 0; - if self.data_page_size_limit != 0 { - len += 1; - } - if self.dictionary_page_size_limit != 0 { - len += 1; - } - if self.data_page_row_count_limit != 0 { - len += 1; - } - if self.write_batch_size != 0 { - len += 1; - } - if self.max_row_group_size != 0 { - len += 1; - } - if !self.writer_version.is_empty() { - len += 1; - } - if !self.created_by.is_empty() { - len += 1; - } - let mut struct_ser = serializer.serialize_struct("datafusion.WriterProperties", len)?; - if self.data_page_size_limit != 0 { - #[allow(clippy::needless_borrow)] - struct_ser.serialize_field("dataPageSizeLimit", ToString::to_string(&self.data_page_size_limit).as_str())?; - } - if self.dictionary_page_size_limit != 0 { - #[allow(clippy::needless_borrow)] - struct_ser.serialize_field("dictionaryPageSizeLimit", ToString::to_string(&self.dictionary_page_size_limit).as_str())?; - } - if self.data_page_row_count_limit != 0 { - #[allow(clippy::needless_borrow)] - struct_ser.serialize_field("dataPageRowCountLimit", ToString::to_string(&self.data_page_row_count_limit).as_str())?; - } - if self.write_batch_size != 0 { - #[allow(clippy::needless_borrow)] - struct_ser.serialize_field("writeBatchSize", ToString::to_string(&self.write_batch_size).as_str())?; - } - if self.max_row_group_size != 0 { - #[allow(clippy::needless_borrow)] - struct_ser.serialize_field("maxRowGroupSize", ToString::to_string(&self.max_row_group_size).as_str())?; - } - if 
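The new TableParquetOptions impls above follow the usual pbjson conventions: defaulted fields are skipped on serialize, and both the snake_case and camelCase spellings of column_specific_options deserialize to the same field. A small sketch, assuming the generated type is exposed as datafusion_proto::protobuf::TableParquetOptions and using serde_json only for illustration:

use datafusion_proto::protobuf::TableParquetOptions;

fn main() -> Result<(), serde_json::Error> {
    // With `global` unset and no per-column overrides, every field is skipped,
    // so the serialized form is an empty JSON object.
    let empty = TableParquetOptions { global: None, column_specific_options: vec![] };
    assert_eq!(serde_json::to_string(&empty)?, "{}");

    // Both key spellings map to GeneratedField::ColumnSpecificOptions.
    let parsed: TableParquetOptions =
        serde_json::from_str(r#"{"columnSpecificOptions": []}"#)?;
    assert!(parsed.column_specific_options.is_empty());
    Ok(())
}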
!self.writer_version.is_empty() { - struct_ser.serialize_field("writerVersion", &self.writer_version)?; - } - if !self.created_by.is_empty() { - struct_ser.serialize_field("createdBy", &self.created_by)?; - } - struct_ser.end() - } -} -impl<'de> serde::Deserialize<'de> for WriterProperties { - #[allow(deprecated)] - fn deserialize(deserializer: D) -> std::result::Result - where - D: serde::Deserializer<'de>, - { - const FIELDS: &[&str] = &[ - "data_page_size_limit", - "dataPageSizeLimit", - "dictionary_page_size_limit", - "dictionaryPageSizeLimit", - "data_page_row_count_limit", - "dataPageRowCountLimit", - "write_batch_size", - "writeBatchSize", - "max_row_group_size", - "maxRowGroupSize", - "writer_version", - "writerVersion", - "created_by", - "createdBy", - ]; - - #[allow(clippy::enum_variant_names)] - enum GeneratedField { - DataPageSizeLimit, - DictionaryPageSizeLimit, - DataPageRowCountLimit, - WriteBatchSize, - MaxRowGroupSize, - WriterVersion, - CreatedBy, - } - impl<'de> serde::Deserialize<'de> for GeneratedField { - fn deserialize(deserializer: D) -> std::result::Result - where - D: serde::Deserializer<'de>, - { - struct GeneratedVisitor; - - impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { - type Value = GeneratedField; - - fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(formatter, "expected one of: {:?}", &FIELDS) - } - - #[allow(unused_variables)] - fn visit_str(self, value: &str) -> std::result::Result - where - E: serde::de::Error, - { - match value { - "dataPageSizeLimit" | "data_page_size_limit" => Ok(GeneratedField::DataPageSizeLimit), - "dictionaryPageSizeLimit" | "dictionary_page_size_limit" => Ok(GeneratedField::DictionaryPageSizeLimit), - "dataPageRowCountLimit" | "data_page_row_count_limit" => Ok(GeneratedField::DataPageRowCountLimit), - "writeBatchSize" | "write_batch_size" => Ok(GeneratedField::WriteBatchSize), - "maxRowGroupSize" | "max_row_group_size" => Ok(GeneratedField::MaxRowGroupSize), - "writerVersion" | "writer_version" => Ok(GeneratedField::WriterVersion), - "createdBy" | "created_by" => Ok(GeneratedField::CreatedBy), - _ => Err(serde::de::Error::unknown_field(value, FIELDS)), - } - } - } - deserializer.deserialize_identifier(GeneratedVisitor) - } - } - struct GeneratedVisitor; - impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { - type Value = WriterProperties; - - fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - formatter.write_str("struct datafusion.WriterProperties") - } - - fn visit_map(self, mut map_: V) -> std::result::Result - where - V: serde::de::MapAccess<'de>, - { - let mut data_page_size_limit__ = None; - let mut dictionary_page_size_limit__ = None; - let mut data_page_row_count_limit__ = None; - let mut write_batch_size__ = None; - let mut max_row_group_size__ = None; - let mut writer_version__ = None; - let mut created_by__ = None; - while let Some(k) = map_.next_key()? 
{ - match k { - GeneratedField::DataPageSizeLimit => { - if data_page_size_limit__.is_some() { - return Err(serde::de::Error::duplicate_field("dataPageSizeLimit")); - } - data_page_size_limit__ = - Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0) - ; - } - GeneratedField::DictionaryPageSizeLimit => { - if dictionary_page_size_limit__.is_some() { - return Err(serde::de::Error::duplicate_field("dictionaryPageSizeLimit")); - } - dictionary_page_size_limit__ = - Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0) - ; - } - GeneratedField::DataPageRowCountLimit => { - if data_page_row_count_limit__.is_some() { - return Err(serde::de::Error::duplicate_field("dataPageRowCountLimit")); - } - data_page_row_count_limit__ = - Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0) - ; - } - GeneratedField::WriteBatchSize => { - if write_batch_size__.is_some() { - return Err(serde::de::Error::duplicate_field("writeBatchSize")); - } - write_batch_size__ = - Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0) - ; - } - GeneratedField::MaxRowGroupSize => { - if max_row_group_size__.is_some() { - return Err(serde::de::Error::duplicate_field("maxRowGroupSize")); - } - max_row_group_size__ = - Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0) - ; - } - GeneratedField::WriterVersion => { - if writer_version__.is_some() { - return Err(serde::de::Error::duplicate_field("writerVersion")); - } - writer_version__ = Some(map_.next_value()?); - } - GeneratedField::CreatedBy => { - if created_by__.is_some() { - return Err(serde::de::Error::duplicate_field("createdBy")); - } - created_by__ = Some(map_.next_value()?); - } - } - } - Ok(WriterProperties { - data_page_size_limit: data_page_size_limit__.unwrap_or_default(), - dictionary_page_size_limit: dictionary_page_size_limit__.unwrap_or_default(), - data_page_row_count_limit: data_page_row_count_limit__.unwrap_or_default(), - write_batch_size: write_batch_size__.unwrap_or_default(), - max_row_group_size: max_row_group_size__.unwrap_or_default(), - writer_version: writer_version__.unwrap_or_default(), - created_by: created_by__.unwrap_or_default(), - }) - } - } - deserializer.deserialize_struct("datafusion.WriterProperties", FIELDS, GeneratedVisitor) - } -} diff --git a/datafusion/proto/src/generated/prost.rs b/datafusion/proto/src/generated/prost.rs index d4f911585bb9..c557fb48b191 100644 --- a/datafusion/proto/src/generated/prost.rs +++ b/datafusion/proto/src/generated/prost.rs @@ -122,27 +122,15 @@ pub struct ProjectionColumns { #[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct CsvFormat { - #[prost(bool, tag = "1")] - pub has_header: bool, - #[prost(string, tag = "2")] - pub delimiter: ::prost::alloc::string::String, - #[prost(string, tag = "3")] - pub quote: ::prost::alloc::string::String, - #[prost(oneof = "csv_format::OptionalEscape", tags = "4")] - pub optional_escape: ::core::option::Option, -} -/// Nested message and enum types in `CsvFormat`. 
-pub mod csv_format { - #[allow(clippy::derive_partial_eq_without_eq)] - #[derive(Clone, PartialEq, ::prost::Oneof)] - pub enum OptionalEscape { - #[prost(string, tag = "4")] - Escape(::prost::alloc::string::String), - } + #[prost(message, optional, tag = "5")] + pub options: ::core::option::Option, } #[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] -pub struct ParquetFormat {} +pub struct ParquetFormat { + #[prost(message, optional, tag = "2")] + pub options: ::core::option::Option, +} #[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct AvroFormat {} @@ -509,38 +497,34 @@ pub struct CopyToNode { pub input: ::core::option::Option<::prost::alloc::boxed::Box>, #[prost(string, tag = "2")] pub output_url: ::prost::alloc::string::String, - #[prost(string, tag = "6")] - pub file_type: ::prost::alloc::string::String, #[prost(string, repeated, tag = "7")] pub partition_by: ::prost::alloc::vec::Vec<::prost::alloc::string::String>, - #[prost(oneof = "copy_to_node::CopyOptions", tags = "4, 5")] - pub copy_options: ::core::option::Option, + #[prost(oneof = "copy_to_node::FormatOptions", tags = "8, 9, 10, 11, 12")] + pub format_options: ::core::option::Option, } /// Nested message and enum types in `CopyToNode`. pub mod copy_to_node { #[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Oneof)] - pub enum CopyOptions { - #[prost(message, tag = "4")] - SqlOptions(super::SqlOptions), - #[prost(message, tag = "5")] - WriterOptions(super::FileTypeWriterOptions), + pub enum FormatOptions { + #[prost(message, tag = "8")] + Csv(super::CsvOptions), + #[prost(message, tag = "9")] + Json(super::JsonOptions), + #[prost(message, tag = "10")] + Parquet(super::TableParquetOptions), + #[prost(message, tag = "11")] + Avro(super::AvroOptions), + #[prost(message, tag = "12")] + Arrow(super::ArrowOptions), } } #[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] -pub struct SqlOptions { - #[prost(message, repeated, tag = "1")] - pub option: ::prost::alloc::vec::Vec, -} +pub struct AvroOptions {} #[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] -pub struct SqlOption { - #[prost(string, tag = "1")] - pub key: ::prost::alloc::string::String, - #[prost(string, tag = "2")] - pub value: ::prost::alloc::string::String, -} +pub struct ArrowOptions {} #[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct UnionNode { @@ -1647,39 +1631,12 @@ pub struct PartitionColumn { } #[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] -pub struct FileTypeWriterOptions { - #[prost(oneof = "file_type_writer_options::FileType", tags = "1, 2, 3, 4")] - pub file_type: ::core::option::Option, -} -/// Nested message and enum types in `FileTypeWriterOptions`. 
-pub mod file_type_writer_options { - #[allow(clippy::derive_partial_eq_without_eq)] - #[derive(Clone, PartialEq, ::prost::Oneof)] - pub enum FileType { - #[prost(message, tag = "1")] - JsonOptions(super::JsonWriterOptions), - #[prost(message, tag = "2")] - ParquetOptions(super::ParquetWriterOptions), - #[prost(message, tag = "3")] - CsvOptions(super::CsvWriterOptions), - #[prost(message, tag = "4")] - ArrowOptions(super::ArrowWriterOptions), - } -} -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] pub struct JsonWriterOptions { #[prost(enumeration = "CompressionTypeVariant", tag = "1")] pub compression: i32, } #[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] -pub struct ParquetWriterOptions { - #[prost(message, optional, tag = "1")] - pub writer_properties: ::core::option::Option, -} -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] pub struct CsvWriterOptions { /// Compression type #[prost(enumeration = "CompressionTypeVariant", tag = "1")] @@ -1706,26 +1663,57 @@ pub struct CsvWriterOptions { #[prost(string, tag = "8")] pub null_value: ::prost::alloc::string::String, } +/// Options controlling CSV format #[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] -pub struct ArrowWriterOptions {} +pub struct CsvOptions { + /// Indicates if the CSV has a header row + #[prost(bool, tag = "1")] + pub has_header: bool, + /// Delimiter character as a byte + #[prost(bytes = "vec", tag = "2")] + pub delimiter: ::prost::alloc::vec::Vec, + /// Quote character as a byte + #[prost(bytes = "vec", tag = "3")] + pub quote: ::prost::alloc::vec::Vec, + /// Optional escape character as a byte + #[prost(bytes = "vec", tag = "4")] + pub escape: ::prost::alloc::vec::Vec, + /// Compression type + #[prost(enumeration = "CompressionTypeVariant", tag = "5")] + pub compression: i32, + /// Max records for schema inference + #[prost(uint64, tag = "6")] + pub schema_infer_max_rec: u64, + /// Optional date format + #[prost(string, tag = "7")] + pub date_format: ::prost::alloc::string::String, + /// Optional datetime format + #[prost(string, tag = "8")] + pub datetime_format: ::prost::alloc::string::String, + /// Optional timestamp format + #[prost(string, tag = "9")] + pub timestamp_format: ::prost::alloc::string::String, + /// Optional timestamp with timezone format + #[prost(string, tag = "10")] + pub timestamp_tz_format: ::prost::alloc::string::String, + /// Optional time format + #[prost(string, tag = "11")] + pub time_format: ::prost::alloc::string::String, + /// Optional representation of null value + #[prost(string, tag = "12")] + pub null_value: ::prost::alloc::string::String, +} +/// Options controlling CSV format #[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] -pub struct WriterProperties { - #[prost(uint64, tag = "1")] - pub data_page_size_limit: u64, +pub struct JsonOptions { + /// Compression type + #[prost(enumeration = "CompressionTypeVariant", tag = "1")] + pub compression: i32, + /// Max records for schema inference #[prost(uint64, tag = "2")] - pub dictionary_page_size_limit: u64, - #[prost(uint64, tag = "3")] - pub data_page_row_count_limit: u64, - #[prost(uint64, tag = "4")] - pub write_batch_size: u64, - #[prost(uint64, tag = "5")] - pub max_row_group_size: u64, - #[prost(string, tag = "6")] - pub writer_version: ::prost::alloc::string::String, - #[prost(string, tag = "7")] - pub 
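With FileTypeWriterOptions removed, CSV and JSON settings now travel as the CsvOptions/JsonOptions messages above and plug directly into the CopyToNode format_options oneof. A sketch of building the CSV variant, assuming the generated module path datafusion_proto::protobuf; note that delimiter, quote and escape are now byte vectors rather than single-character strings:

use datafusion_proto::protobuf::{copy_to_node, CsvOptions};

fn csv_format_options() -> copy_to_node::FormatOptions {
    copy_to_node::FormatOptions::Csv(CsvOptions {
        has_header: true,
        delimiter: vec![b'|'],   // a raw byte, not a one-character string
        quote: vec![b'"'],
        // remaining formats/compression fall back to the prost defaults
        ..Default::default()
    })
}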
created_by: ::prost::alloc::string::String, + pub schema_infer_max_rec: u64, } #[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] @@ -1742,14 +1730,14 @@ pub struct FileSinkConfig { pub table_partition_cols: ::prost::alloc::vec::Vec, #[prost(bool, tag = "8")] pub overwrite: bool, - #[prost(message, optional, tag = "9")] - pub file_type_writer_options: ::core::option::Option, } #[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct JsonSink { #[prost(message, optional, tag = "1")] pub config: ::core::option::Option, + #[prost(message, optional, tag = "2")] + pub writer_options: ::core::option::Option, } #[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] @@ -1768,6 +1756,8 @@ pub struct JsonSinkExecNode { pub struct CsvSink { #[prost(message, optional, tag = "1")] pub config: ::core::option::Option, + #[prost(message, optional, tag = "2")] + pub writer_options: ::core::option::Option, } #[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] @@ -1783,9 +1773,241 @@ pub struct CsvSinkExecNode { } #[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] +pub struct TableParquetOptions { + #[prost(message, optional, tag = "1")] + pub global: ::core::option::Option, + #[prost(message, repeated, tag = "2")] + pub column_specific_options: ::prost::alloc::vec::Vec, +} +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct ColumnSpecificOptions { + #[prost(string, tag = "1")] + pub column_name: ::prost::alloc::string::String, + #[prost(message, optional, tag = "2")] + pub options: ::core::option::Option, +} +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct ColumnOptions { + #[prost(oneof = "column_options::BloomFilterEnabledOpt", tags = "1")] + pub bloom_filter_enabled_opt: ::core::option::Option< + column_options::BloomFilterEnabledOpt, + >, + #[prost(oneof = "column_options::EncodingOpt", tags = "2")] + pub encoding_opt: ::core::option::Option, + #[prost(oneof = "column_options::DictionaryEnabledOpt", tags = "3")] + pub dictionary_enabled_opt: ::core::option::Option< + column_options::DictionaryEnabledOpt, + >, + #[prost(oneof = "column_options::CompressionOpt", tags = "4")] + pub compression_opt: ::core::option::Option, + #[prost(oneof = "column_options::StatisticsEnabledOpt", tags = "5")] + pub statistics_enabled_opt: ::core::option::Option< + column_options::StatisticsEnabledOpt, + >, + #[prost(oneof = "column_options::BloomFilterFppOpt", tags = "6")] + pub bloom_filter_fpp_opt: ::core::option::Option, + #[prost(oneof = "column_options::BloomFilterNdvOpt", tags = "7")] + pub bloom_filter_ndv_opt: ::core::option::Option, + #[prost(oneof = "column_options::MaxStatisticsSizeOpt", tags = "8")] + pub max_statistics_size_opt: ::core::option::Option< + column_options::MaxStatisticsSizeOpt, + >, +} +/// Nested message and enum types in `ColumnOptions`. 
+pub mod column_options { + #[allow(clippy::derive_partial_eq_without_eq)] + #[derive(Clone, PartialEq, ::prost::Oneof)] + pub enum BloomFilterEnabledOpt { + #[prost(bool, tag = "1")] + BloomFilterEnabled(bool), + } + #[allow(clippy::derive_partial_eq_without_eq)] + #[derive(Clone, PartialEq, ::prost::Oneof)] + pub enum EncodingOpt { + #[prost(string, tag = "2")] + Encoding(::prost::alloc::string::String), + } + #[allow(clippy::derive_partial_eq_without_eq)] + #[derive(Clone, PartialEq, ::prost::Oneof)] + pub enum DictionaryEnabledOpt { + #[prost(bool, tag = "3")] + DictionaryEnabled(bool), + } + #[allow(clippy::derive_partial_eq_without_eq)] + #[derive(Clone, PartialEq, ::prost::Oneof)] + pub enum CompressionOpt { + #[prost(string, tag = "4")] + Compression(::prost::alloc::string::String), + } + #[allow(clippy::derive_partial_eq_without_eq)] + #[derive(Clone, PartialEq, ::prost::Oneof)] + pub enum StatisticsEnabledOpt { + #[prost(string, tag = "5")] + StatisticsEnabled(::prost::alloc::string::String), + } + #[allow(clippy::derive_partial_eq_without_eq)] + #[derive(Clone, PartialEq, ::prost::Oneof)] + pub enum BloomFilterFppOpt { + #[prost(double, tag = "6")] + BloomFilterFpp(f64), + } + #[allow(clippy::derive_partial_eq_without_eq)] + #[derive(Clone, PartialEq, ::prost::Oneof)] + pub enum BloomFilterNdvOpt { + #[prost(uint64, tag = "7")] + BloomFilterNdv(u64), + } + #[allow(clippy::derive_partial_eq_without_eq)] + #[derive(Clone, PartialEq, ::prost::Oneof)] + pub enum MaxStatisticsSizeOpt { + #[prost(uint32, tag = "8")] + MaxStatisticsSize(u32), + } +} +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct ParquetOptions { + /// Regular fields + /// + /// default = true + #[prost(bool, tag = "1")] + pub enable_page_index: bool, + /// default = true + #[prost(bool, tag = "2")] + pub pruning: bool, + /// default = true + #[prost(bool, tag = "3")] + pub skip_metadata: bool, + /// default = false + #[prost(bool, tag = "5")] + pub pushdown_filters: bool, + /// default = false + #[prost(bool, tag = "6")] + pub reorder_filters: bool, + /// default = 1024 * 1024 + #[prost(uint64, tag = "7")] + pub data_pagesize_limit: u64, + /// default = 1024 + #[prost(uint64, tag = "8")] + pub write_batch_size: u64, + /// default = "1.0" + #[prost(string, tag = "9")] + pub writer_version: ::prost::alloc::string::String, + /// default = false + #[prost(bool, tag = "20")] + pub bloom_filter_enabled: bool, + /// default = true + #[prost(bool, tag = "23")] + pub allow_single_file_parallelism: bool, + /// default = 1 + #[prost(uint64, tag = "24")] + pub maximum_parallel_row_group_writers: u64, + /// default = 2 + #[prost(uint64, tag = "25")] + pub maximum_buffered_record_batches_per_stream: u64, + #[prost(uint64, tag = "12")] + pub dictionary_page_size_limit: u64, + #[prost(uint64, tag = "18")] + pub data_page_row_count_limit: u64, + #[prost(uint64, tag = "15")] + pub max_row_group_size: u64, + #[prost(string, tag = "16")] + pub created_by: ::prost::alloc::string::String, + #[prost(oneof = "parquet_options::MetadataSizeHintOpt", tags = "4")] + pub metadata_size_hint_opt: ::core::option::Option< + parquet_options::MetadataSizeHintOpt, + >, + #[prost(oneof = "parquet_options::CompressionOpt", tags = "10")] + pub compression_opt: ::core::option::Option, + #[prost(oneof = "parquet_options::DictionaryEnabledOpt", tags = "11")] + pub dictionary_enabled_opt: ::core::option::Option< + parquet_options::DictionaryEnabledOpt, + >, + #[prost(oneof = 
"parquet_options::StatisticsEnabledOpt", tags = "13")] + pub statistics_enabled_opt: ::core::option::Option< + parquet_options::StatisticsEnabledOpt, + >, + #[prost(oneof = "parquet_options::MaxStatisticsSizeOpt", tags = "14")] + pub max_statistics_size_opt: ::core::option::Option< + parquet_options::MaxStatisticsSizeOpt, + >, + #[prost(oneof = "parquet_options::ColumnIndexTruncateLengthOpt", tags = "17")] + pub column_index_truncate_length_opt: ::core::option::Option< + parquet_options::ColumnIndexTruncateLengthOpt, + >, + #[prost(oneof = "parquet_options::EncodingOpt", tags = "19")] + pub encoding_opt: ::core::option::Option, + #[prost(oneof = "parquet_options::BloomFilterFppOpt", tags = "21")] + pub bloom_filter_fpp_opt: ::core::option::Option, + #[prost(oneof = "parquet_options::BloomFilterNdvOpt", tags = "22")] + pub bloom_filter_ndv_opt: ::core::option::Option, +} +/// Nested message and enum types in `ParquetOptions`. +pub mod parquet_options { + #[allow(clippy::derive_partial_eq_without_eq)] + #[derive(Clone, PartialEq, ::prost::Oneof)] + pub enum MetadataSizeHintOpt { + #[prost(uint64, tag = "4")] + MetadataSizeHint(u64), + } + #[allow(clippy::derive_partial_eq_without_eq)] + #[derive(Clone, PartialEq, ::prost::Oneof)] + pub enum CompressionOpt { + #[prost(string, tag = "10")] + Compression(::prost::alloc::string::String), + } + #[allow(clippy::derive_partial_eq_without_eq)] + #[derive(Clone, PartialEq, ::prost::Oneof)] + pub enum DictionaryEnabledOpt { + #[prost(bool, tag = "11")] + DictionaryEnabled(bool), + } + #[allow(clippy::derive_partial_eq_without_eq)] + #[derive(Clone, PartialEq, ::prost::Oneof)] + pub enum StatisticsEnabledOpt { + #[prost(string, tag = "13")] + StatisticsEnabled(::prost::alloc::string::String), + } + #[allow(clippy::derive_partial_eq_without_eq)] + #[derive(Clone, PartialEq, ::prost::Oneof)] + pub enum MaxStatisticsSizeOpt { + #[prost(uint64, tag = "14")] + MaxStatisticsSize(u64), + } + #[allow(clippy::derive_partial_eq_without_eq)] + #[derive(Clone, PartialEq, ::prost::Oneof)] + pub enum ColumnIndexTruncateLengthOpt { + #[prost(uint64, tag = "17")] + ColumnIndexTruncateLength(u64), + } + #[allow(clippy::derive_partial_eq_without_eq)] + #[derive(Clone, PartialEq, ::prost::Oneof)] + pub enum EncodingOpt { + #[prost(string, tag = "19")] + Encoding(::prost::alloc::string::String), + } + #[allow(clippy::derive_partial_eq_without_eq)] + #[derive(Clone, PartialEq, ::prost::Oneof)] + pub enum BloomFilterFppOpt { + #[prost(double, tag = "21")] + BloomFilterFpp(f64), + } + #[allow(clippy::derive_partial_eq_without_eq)] + #[derive(Clone, PartialEq, ::prost::Oneof)] + pub enum BloomFilterNdvOpt { + #[prost(uint64, tag = "22")] + BloomFilterNdv(u64), + } +} +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] pub struct ParquetSink { #[prost(message, optional, tag = "1")] pub config: ::core::option::Option, + #[prost(message, optional, tag = "2")] + pub parquet_options: ::core::option::Option, } #[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] @@ -2639,9 +2861,9 @@ pub enum ScalarFunction { Signum = 15, Sin = 16, Sqrt = 17, - Tan = 18, + /// Tan = 18; Trunc = 19, - Array = 20, + /// 20 was Array /// RegexpMatch = 21; BitLength = 22, Btrim = 23, @@ -2680,19 +2902,19 @@ pub enum ScalarFunction { /// 56 was ToTimestampMillis /// 57 was ToTimestampMicros /// 58 was ToTimestampSeconds - /// 59 was Now + Now = 59, Translate = 60, Trim = 61, Upper = 62, Coalesce = 63, Power = 64, - 
StructFun = 65, - /// 66 was FromUnixtime + /// 65 was StructFun + FromUnixtime = 66, Atan2 = 67, /// 68 was DateBin - ArrowTypeof = 69, - /// 70 was CurrentDate - /// 71 was CurrentTime + /// 69 was ArrowTypeof + CurrentDate = 70, + CurrentTime = 71, Uuid = 72, Cbrt = 73, Acosh = 74, @@ -2700,22 +2922,22 @@ pub enum ScalarFunction { Atanh = 76, Sinh = 77, Cosh = 78, - Tanh = 79, + /// Tanh = 79; Pi = 80, Degrees = 81, Radians = 82, Factorial = 83, Lcm = 84, Gcd = 85, - ArrayAppend = 86, - ArrayConcat = 87, + /// 86 was ArrayAppend + /// 87 was ArrayConcat /// 88 was ArrayDims - ArrayRepeat = 89, + /// 89 was ArrayRepeat /// 90 was ArrayLength /// 91 was ArrayNdims ArrayPosition = 92, ArrayPositions = 93, - ArrayPrepend = 94, + /// 94 was ArrayPrepend ArrayRemove = 95, ArrayReplace = 96, /// 97 was ArrayToString @@ -2736,7 +2958,7 @@ pub enum ScalarFunction { Iszero = 114, /// 115 was ArrayEmpty ArrayPopBack = 116, - StringToArray = 117, + /// 117 was StringToArray /// 118 was ToTimestampNanos ArrayIntersect = 119, ArrayUnion = 120, @@ -2747,8 +2969,8 @@ pub enum ScalarFunction { Levenshtein = 125, SubstrIndex = 126, FindInSet = 127, - ArraySort = 128, - ArrayDistinct = 129, + /// / 128 was ArraySort + /// / 129 was ArrayDistinct ArrayResize = 130, EndsWith = 131, /// / 132 was InStr @@ -2783,9 +3005,7 @@ impl ScalarFunction { ScalarFunction::Signum => "Signum", ScalarFunction::Sin => "Sin", ScalarFunction::Sqrt => "Sqrt", - ScalarFunction::Tan => "Tan", ScalarFunction::Trunc => "Trunc", - ScalarFunction::Array => "Array", ScalarFunction::BitLength => "BitLength", ScalarFunction::Btrim => "Btrim", ScalarFunction::CharacterLength => "CharacterLength", @@ -2815,14 +3035,16 @@ impl ScalarFunction { ScalarFunction::Strpos => "Strpos", ScalarFunction::Substr => "Substr", ScalarFunction::ToHex => "ToHex", + ScalarFunction::Now => "Now", ScalarFunction::Translate => "Translate", ScalarFunction::Trim => "Trim", ScalarFunction::Upper => "Upper", ScalarFunction::Coalesce => "Coalesce", ScalarFunction::Power => "Power", - ScalarFunction::StructFun => "StructFun", + ScalarFunction::FromUnixtime => "FromUnixtime", ScalarFunction::Atan2 => "Atan2", - ScalarFunction::ArrowTypeof => "ArrowTypeof", + ScalarFunction::CurrentDate => "CurrentDate", + ScalarFunction::CurrentTime => "CurrentTime", ScalarFunction::Uuid => "Uuid", ScalarFunction::Cbrt => "Cbrt", ScalarFunction::Acosh => "Acosh", @@ -2830,19 +3052,14 @@ impl ScalarFunction { ScalarFunction::Atanh => "Atanh", ScalarFunction::Sinh => "Sinh", ScalarFunction::Cosh => "Cosh", - ScalarFunction::Tanh => "Tanh", ScalarFunction::Pi => "Pi", ScalarFunction::Degrees => "Degrees", ScalarFunction::Radians => "Radians", ScalarFunction::Factorial => "Factorial", ScalarFunction::Lcm => "Lcm", ScalarFunction::Gcd => "Gcd", - ScalarFunction::ArrayAppend => "ArrayAppend", - ScalarFunction::ArrayConcat => "ArrayConcat", - ScalarFunction::ArrayRepeat => "ArrayRepeat", ScalarFunction::ArrayPosition => "ArrayPosition", ScalarFunction::ArrayPositions => "ArrayPositions", - ScalarFunction::ArrayPrepend => "ArrayPrepend", ScalarFunction::ArrayRemove => "ArrayRemove", ScalarFunction::ArrayReplace => "ArrayReplace", ScalarFunction::ArrayElement => "ArrayElement", @@ -2855,7 +3072,6 @@ impl ScalarFunction { ScalarFunction::Nanvl => "Nanvl", ScalarFunction::Iszero => "Iszero", ScalarFunction::ArrayPopBack => "ArrayPopBack", - ScalarFunction::StringToArray => "StringToArray", ScalarFunction::ArrayIntersect => "ArrayIntersect", ScalarFunction::ArrayUnion => "ArrayUnion", 
ScalarFunction::OverLay => "OverLay", @@ -2864,8 +3080,6 @@ impl ScalarFunction { ScalarFunction::Levenshtein => "Levenshtein", ScalarFunction::SubstrIndex => "SubstrIndex", ScalarFunction::FindInSet => "FindInSet", - ScalarFunction::ArraySort => "ArraySort", - ScalarFunction::ArrayDistinct => "ArrayDistinct", ScalarFunction::ArrayResize => "ArrayResize", ScalarFunction::EndsWith => "EndsWith", ScalarFunction::MakeDate => "MakeDate", @@ -2892,9 +3106,7 @@ impl ScalarFunction { "Signum" => Some(Self::Signum), "Sin" => Some(Self::Sin), "Sqrt" => Some(Self::Sqrt), - "Tan" => Some(Self::Tan), "Trunc" => Some(Self::Trunc), - "Array" => Some(Self::Array), "BitLength" => Some(Self::BitLength), "Btrim" => Some(Self::Btrim), "CharacterLength" => Some(Self::CharacterLength), @@ -2924,14 +3136,16 @@ impl ScalarFunction { "Strpos" => Some(Self::Strpos), "Substr" => Some(Self::Substr), "ToHex" => Some(Self::ToHex), + "Now" => Some(Self::Now), "Translate" => Some(Self::Translate), "Trim" => Some(Self::Trim), "Upper" => Some(Self::Upper), "Coalesce" => Some(Self::Coalesce), "Power" => Some(Self::Power), - "StructFun" => Some(Self::StructFun), + "FromUnixtime" => Some(Self::FromUnixtime), "Atan2" => Some(Self::Atan2), - "ArrowTypeof" => Some(Self::ArrowTypeof), + "CurrentDate" => Some(Self::CurrentDate), + "CurrentTime" => Some(Self::CurrentTime), "Uuid" => Some(Self::Uuid), "Cbrt" => Some(Self::Cbrt), "Acosh" => Some(Self::Acosh), @@ -2939,19 +3153,14 @@ impl ScalarFunction { "Atanh" => Some(Self::Atanh), "Sinh" => Some(Self::Sinh), "Cosh" => Some(Self::Cosh), - "Tanh" => Some(Self::Tanh), "Pi" => Some(Self::Pi), "Degrees" => Some(Self::Degrees), "Radians" => Some(Self::Radians), "Factorial" => Some(Self::Factorial), "Lcm" => Some(Self::Lcm), "Gcd" => Some(Self::Gcd), - "ArrayAppend" => Some(Self::ArrayAppend), - "ArrayConcat" => Some(Self::ArrayConcat), - "ArrayRepeat" => Some(Self::ArrayRepeat), "ArrayPosition" => Some(Self::ArrayPosition), "ArrayPositions" => Some(Self::ArrayPositions), - "ArrayPrepend" => Some(Self::ArrayPrepend), "ArrayRemove" => Some(Self::ArrayRemove), "ArrayReplace" => Some(Self::ArrayReplace), "ArrayElement" => Some(Self::ArrayElement), @@ -2964,7 +3173,6 @@ impl ScalarFunction { "Nanvl" => Some(Self::Nanvl), "Iszero" => Some(Self::Iszero), "ArrayPopBack" => Some(Self::ArrayPopBack), - "StringToArray" => Some(Self::StringToArray), "ArrayIntersect" => Some(Self::ArrayIntersect), "ArrayUnion" => Some(Self::ArrayUnion), "OverLay" => Some(Self::OverLay), @@ -2973,8 +3181,6 @@ impl ScalarFunction { "Levenshtein" => Some(Self::Levenshtein), "SubstrIndex" => Some(Self::SubstrIndex), "FindInSet" => Some(Self::FindInSet), - "ArraySort" => Some(Self::ArraySort), - "ArrayDistinct" => Some(Self::ArrayDistinct), "ArrayResize" => Some(Self::ArrayResize), "EndsWith" => Some(Self::EndsWith), "MakeDate" => Some(Self::MakeDate), diff --git a/datafusion/proto/src/logical_plan/from_proto.rs b/datafusion/proto/src/logical_plan/from_proto.rs index 1af661ad8e5f..4b9bd45fd55b 100644 --- a/datafusion/proto/src/logical_plan/from_proto.rs +++ b/datafusion/proto/src/logical_plan/from_proto.rs @@ -47,11 +47,10 @@ use datafusion_common::{ use datafusion_expr::expr::Unnest; use datafusion_expr::window_frame::{check_window_frame, regularize_window_order_by}; use datafusion_expr::{ - acosh, array, array_append, array_concat, array_distinct, array_element, - array_except, array_intersect, array_pop_back, array_pop_front, array_position, - array_positions, array_prepend, array_remove, array_remove_all, 
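Net effect on the prost ScalarFunction enum: Now, FromUnixtime, CurrentDate and CurrentTime are (re)introduced, while Tan, Tanh, Array, the Array* constructors, StringToArray, StructFun and ArrowTypeof are dropped and their tags kept only as comments. A quick check via the prost-generated from_str_name helper (crate path assumed to be datafusion_proto::protobuf):

use datafusion_proto::protobuf::ScalarFunction;

fn main() {
    // Newly (re)added variants resolve by name...
    assert_eq!(ScalarFunction::from_str_name("Now"), Some(ScalarFunction::Now));
    assert_eq!(
        ScalarFunction::from_str_name("CurrentDate"),
        Some(ScalarFunction::CurrentDate)
    );
    // ...while removed ones no longer map to anything.
    assert_eq!(ScalarFunction::from_str_name("Tan"), None);
    assert_eq!(ScalarFunction::from_str_name("ArrayAppend"), None);
}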
array_remove_n, - array_repeat, array_replace, array_replace_all, array_replace_n, array_resize, - array_slice, array_sort, array_union, arrow_typeof, ascii, asinh, atan, atan2, atanh, + acosh, array_element, array_except, array_intersect, array_pop_back, array_pop_front, + array_position, array_positions, array_remove, array_remove_all, array_remove_n, + array_replace, array_replace_all, array_replace_n, array_resize, array_slice, + array_union, ascii, asinh, atan, atan2, atanh, bit_length, btrim, cbrt, ceil, character_length, chr, coalesce, concat_expr, concat_ws_expr, cos, cosh, cot, degrees, digest, ends_with, exp, expr::{self, InList, Sort, WindowFunction}, @@ -60,11 +59,10 @@ use datafusion_expr::{ logical_plan::{PlanType, StringifiedPlan}, lower, lpad, ltrim, md5, nanvl, octet_length, overlay, pi, power, radians, random, repeat, replace, reverse, right, round, rpad, rtrim, sha224, sha256, sha384, sha512, - signum, sin, sinh, split_part, sqrt, starts_with, string_to_array, strpos, - struct_fun, substr, substr_index, substring, tan, tanh, to_hex, translate, trim, - trunc, upper, uuid, AggregateFunction, Between, BinaryExpr, BuiltInWindowFunction, - BuiltinScalarFunction, Case, Cast, Expr, GetFieldAccess, GetIndexedField, - GroupingSet, + signum, sin, sinh, split_part, sqrt, starts_with, strpos, substr, + substr_index, substring, to_hex, translate, trim, trunc, upper, uuid, + AggregateFunction, Between, BinaryExpr, BuiltInWindowFunction, BuiltinScalarFunction, + Case, Cast, Expr, GetFieldAccess, GetIndexedField, GroupingSet, GroupingSet::GroupingSets, JoinConstraint, JoinType, Like, Operator, TryCast, WindowFrame, WindowFrameBound, WindowFrameUnits, @@ -447,12 +445,10 @@ impl From<&protobuf::ScalarFunction> for BuiltinScalarFunction { ScalarFunction::Cbrt => Self::Cbrt, ScalarFunction::Sin => Self::Sin, ScalarFunction::Cos => Self::Cos, - ScalarFunction::Tan => Self::Tan, ScalarFunction::Cot => Self::Cot, ScalarFunction::Atan => Self::Atan, ScalarFunction::Sinh => Self::Sinh, ScalarFunction::Cosh => Self::Cosh, - ScalarFunction::Tanh => Self::Tanh, ScalarFunction::Asinh => Self::Asinh, ScalarFunction::Acosh => Self::Acosh, ScalarFunction::Atanh => Self::Atanh, @@ -476,18 +472,12 @@ impl From<&protobuf::ScalarFunction> for BuiltinScalarFunction { ScalarFunction::Trim => Self::Trim, ScalarFunction::Ltrim => Self::Ltrim, ScalarFunction::Rtrim => Self::Rtrim, - ScalarFunction::ArrayAppend => Self::ArrayAppend, - ScalarFunction::ArraySort => Self::ArraySort, - ScalarFunction::ArrayConcat => Self::ArrayConcat, ScalarFunction::ArrayExcept => Self::ArrayExcept, - ScalarFunction::ArrayDistinct => Self::ArrayDistinct, ScalarFunction::ArrayElement => Self::ArrayElement, ScalarFunction::ArrayPopFront => Self::ArrayPopFront, ScalarFunction::ArrayPopBack => Self::ArrayPopBack, ScalarFunction::ArrayPosition => Self::ArrayPosition, ScalarFunction::ArrayPositions => Self::ArrayPositions, - ScalarFunction::ArrayPrepend => Self::ArrayPrepend, - ScalarFunction::ArrayRepeat => Self::ArrayRepeat, ScalarFunction::ArrayRemove => Self::ArrayRemove, ScalarFunction::ArrayRemoveN => Self::ArrayRemoveN, ScalarFunction::ArrayRemoveAll => Self::ArrayRemoveAll, @@ -499,7 +489,6 @@ impl From<&protobuf::ScalarFunction> for BuiltinScalarFunction { ScalarFunction::ArrayIntersect => Self::ArrayIntersect, ScalarFunction::ArrayUnion => Self::ArrayUnion, ScalarFunction::ArrayResize => Self::ArrayResize, - ScalarFunction::Array => Self::MakeArray, ScalarFunction::Md5 => Self::MD5, ScalarFunction::Sha224 => Self::SHA224, 
ScalarFunction::Sha256 => Self::SHA256, @@ -525,7 +514,6 @@ impl From<&protobuf::ScalarFunction> for BuiltinScalarFunction { ScalarFunction::Right => Self::Right, ScalarFunction::Rpad => Self::Rpad, ScalarFunction::SplitPart => Self::SplitPart, - ScalarFunction::StringToArray => Self::StringToArray, ScalarFunction::StartsWith => Self::StartsWith, ScalarFunction::Strpos => Self::Strpos, ScalarFunction::Substr => Self::Substr, @@ -537,11 +525,9 @@ impl From<&protobuf::ScalarFunction> for BuiltinScalarFunction { ScalarFunction::Coalesce => Self::Coalesce, ScalarFunction::Pi => Self::Pi, ScalarFunction::Power => Self::Power, - ScalarFunction::StructFun => Self::Struct, - ScalarFunction::Atan2 => Self::Atan2, + ScalarFunction::Atan2 => Self::Atan2, ScalarFunction::Nanvl => Self::Nanvl, ScalarFunction::Iszero => Self::Iszero, - ScalarFunction::ArrowTypeof => Self::ArrowTypeof, ScalarFunction::OverLay => Self::OverLay, ScalarFunction::Levenshtein => Self::Levenshtein, ScalarFunction::SubstrIndex => Self::SubstrIndex, @@ -1404,37 +1390,12 @@ pub fn parse_expr( ScalarFunction::Acosh => { Ok(acosh(parse_expr(&args[0], registry, codec)?)) } - ScalarFunction::Array => Ok(array( - args.to_owned() - .iter() - .map(|expr| parse_expr(expr, registry, codec)) - .collect::, _>>()?, - )), - ScalarFunction::ArrayAppend => Ok(array_append( - parse_expr(&args[0], registry, codec)?, - parse_expr(&args[1], registry, codec)?, - )), - ScalarFunction::ArraySort => Ok(array_sort( - parse_expr(&args[0], registry, codec)?, - parse_expr(&args[1], registry, codec)?, - parse_expr(&args[2], registry, codec)?, - )), ScalarFunction::ArrayPopFront => { Ok(array_pop_front(parse_expr(&args[0], registry, codec)?)) } ScalarFunction::ArrayPopBack => { Ok(array_pop_back(parse_expr(&args[0], registry, codec)?)) } - ScalarFunction::ArrayPrepend => Ok(array_prepend( - parse_expr(&args[0], registry, codec)?, - parse_expr(&args[1], registry, codec)?, - )), - ScalarFunction::ArrayConcat => Ok(array_concat( - args.to_owned() - .iter() - .map(|expr| parse_expr(expr, registry, codec)) - .collect::, _>>()?, - )), ScalarFunction::ArrayExcept => Ok(array_except( parse_expr(&args[0], registry, codec)?, parse_expr(&args[1], registry, codec)?, @@ -1452,10 +1413,6 @@ pub fn parse_expr( parse_expr(&args[0], registry, codec)?, parse_expr(&args[1], registry, codec)?, )), - ScalarFunction::ArrayRepeat => Ok(array_repeat( - parse_expr(&args[0], registry, codec)?, - parse_expr(&args[1], registry, codec)?, - )), ScalarFunction::ArrayRemove => Ok(array_remove( parse_expr(&args[0], registry, codec)?, parse_expr(&args[1], registry, codec)?, @@ -1494,9 +1451,6 @@ pub fn parse_expr( parse_expr(&args[2], registry, codec)?, parse_expr(&args[3], registry, codec)?, )), - ScalarFunction::ArrayDistinct => { - Ok(array_distinct(parse_expr(&args[0], registry, codec)?)) - } ScalarFunction::ArrayElement => Ok(array_element( parse_expr(&args[0], registry, codec)?, parse_expr(&args[1], registry, codec)?, @@ -1514,11 +1468,9 @@ pub fn parse_expr( ScalarFunction::Cbrt => Ok(cbrt(parse_expr(&args[0], registry, codec)?)), ScalarFunction::Sin => Ok(sin(parse_expr(&args[0], registry, codec)?)), ScalarFunction::Cos => Ok(cos(parse_expr(&args[0], registry, codec)?)), - ScalarFunction::Tan => Ok(tan(parse_expr(&args[0], registry, codec)?)), ScalarFunction::Atan => Ok(atan(parse_expr(&args[0], registry, codec)?)), ScalarFunction::Sinh => Ok(sinh(parse_expr(&args[0], registry, codec)?)), ScalarFunction::Cosh => Ok(cosh(parse_expr(&args[0], registry, codec)?)), - ScalarFunction::Tanh 
=> Ok(tanh(parse_expr(&args[0], registry, codec)?)), ScalarFunction::Atanh => { Ok(atanh(parse_expr(&args[0], registry, codec)?)) } @@ -1753,14 +1705,6 @@ pub fn parse_expr( ScalarFunction::Iszero => { Ok(iszero(parse_expr(&args[0], registry, codec)?)) } - ScalarFunction::ArrowTypeof => { - Ok(arrow_typeof(parse_expr(&args[0], registry, codec)?)) - } - ScalarFunction::StringToArray => Ok(string_to_array( - parse_expr(&args[0], registry, codec)?, - parse_expr(&args[1], registry, codec)?, - parse_expr(&args[2], registry, codec)?, - )), ScalarFunction::OverLay => Ok(overlay( args.to_owned() .iter() @@ -1776,9 +1720,6 @@ pub fn parse_expr( parse_expr(&args[0], registry, codec)?, parse_expr(&args[1], registry, codec)?, )), - ScalarFunction::StructFun => { - Ok(struct_fun(parse_expr(&args[0], registry, codec)?)) - } } } ExprType::ScalarUdfExpr(protobuf::ScalarUdfExprNode { diff --git a/datafusion/proto/src/logical_plan/mod.rs b/datafusion/proto/src/logical_plan/mod.rs index 7acad1844d48..9b3b677e3c0a 100644 --- a/datafusion/proto/src/logical_plan/mod.rs +++ b/datafusion/proto/src/logical_plan/mod.rs @@ -15,20 +15,13 @@ // specific language governing permissions and limitations // under the License. -use arrow::csv::WriterBuilder; -use datafusion_common::file_options::arrow_writer::ArrowWriterOptions; -use datafusion_expr::ScalarUDF; use std::collections::HashMap; use std::fmt::Debug; -use std::str::FromStr; use std::sync::Arc; -use crate::common::{byte_to_string, proto_error, str_to_byte}; +use crate::common::proto_error; use crate::protobuf::logical_plan_node::LogicalPlanType::CustomScan; -use crate::protobuf::{ - copy_to_node, file_type_writer_options, CustomTableScanNode, - LogicalExprNodeCollection, SqlOption, -}; +use crate::protobuf::{CustomTableScanNode, LogicalExprNodeCollection}; use crate::{ convert_required, protobuf::{ @@ -37,6 +30,7 @@ use crate::{ }, }; +use arrow::csv::WriterBuilder; use arrow::datatypes::{DataType, Schema, SchemaRef}; #[cfg(feature = "parquet")] use datafusion::datasource::file_format::parquet::ParquetFormat; @@ -51,9 +45,8 @@ use datafusion::{ prelude::SessionContext, }; use datafusion_common::{ - context, file_options::StatementOptions, internal_err, not_impl_err, - parsers::CompressionTypeVariant, plan_datafusion_err, DataFusionError, FileType, - FileTypeWriterOptions, OwnedTableReference, Result, + context, internal_err, not_impl_err, parsers::CompressionTypeVariant, + plan_datafusion_err, DataFusionError, OwnedTableReference, Result, }; use datafusion_expr::{ dml, @@ -63,13 +56,9 @@ use datafusion_expr::{ EmptyRelation, Extension, Join, JoinConstraint, Limit, Prepare, Projection, Repartition, Sort, SubqueryAlias, TableScan, Values, Window, }, - DistinctOn, DropView, Expr, LogicalPlan, LogicalPlanBuilder, + DistinctOn, DropView, Expr, LogicalPlan, LogicalPlanBuilder, ScalarUDF, }; -use datafusion::parquet::file::properties::{WriterProperties, WriterVersion}; -use datafusion_common::file_options::csv_writer::CsvWriterOptions; -use datafusion_common::file_options::parquet_writer::ParquetWriterOptions; -use datafusion_expr::dml::CopyOptions; use prost::bytes::BufMut; use prost::Message; @@ -361,21 +350,19 @@ impl AsLogicalPlan for LogicalPlanNode { )) })? 
{ #[cfg(feature = "parquet")] - &FileFormatType::Parquet(protobuf::ParquetFormat {}) => { - Arc::new(ParquetFormat::default()) + FileFormatType::Parquet(protobuf::ParquetFormat {options}) => { + let mut parquet = ParquetFormat::default(); + if let Some(options) = options { + parquet = parquet.with_options(options.try_into()?) + } + Arc::new(parquet) } FileFormatType::Csv(protobuf::CsvFormat { - has_header, - delimiter, - quote, - optional_escape + options }) => { - let mut csv = CsvFormat::default() - .with_has_header(*has_header) - .with_delimiter(str_to_byte(delimiter, "delimiter")?) - .with_quote(str_to_byte(quote, "quote")?); - if let Some(protobuf::csv_format::OptionalEscape::Escape(escape)) = optional_escape { - csv = csv.with_quote(str_to_byte(escape, "escape")?); + let mut csv = CsvFormat::default(); + if let Some(options) = options { + csv = csv.with_options(options.try_into()?) } Arc::new(csv)}, FileFormatType::Avro(..) => Arc::new(AvroFormat), @@ -864,80 +851,13 @@ impl AsLogicalPlan for LogicalPlanNode { let input: LogicalPlan = into_logical_plan!(copy.input, ctx, extension_codec)?; - let copy_options = match ©.copy_options { - Some(copy_to_node::CopyOptions::SqlOptions(opt)) => { - let options = opt - .option - .iter() - .map(|o| (o.key.clone(), o.value.clone())) - .collect(); - CopyOptions::SQLOptions(StatementOptions::from(&options)) - } - Some(copy_to_node::CopyOptions::WriterOptions(opt)) => { - match &opt.file_type { - Some(ft) => match ft { - file_type_writer_options::FileType::ArrowOptions(_) => { - CopyOptions::WriterOptions(Box::new( - FileTypeWriterOptions::Arrow( - ArrowWriterOptions::new(), - ), - )) - } - file_type_writer_options::FileType::CsvOptions( - writer_options, - ) => { - let writer_builder = - csv_writer_options_from_proto(writer_options)?; - CopyOptions::WriterOptions(Box::new( - FileTypeWriterOptions::CSV( - CsvWriterOptions::new( - writer_builder, - CompressionTypeVariant::UNCOMPRESSED, - ), - ), - )) - } - file_type_writer_options::FileType::ParquetOptions( - writer_options, - ) => { - let writer_properties = - match &writer_options.writer_properties { - Some(serialized_writer_options) => { - writer_properties_from_proto( - serialized_writer_options, - )? 
- } - _ => WriterProperties::default(), - }; - CopyOptions::WriterOptions(Box::new( - FileTypeWriterOptions::Parquet( - ParquetWriterOptions::new(writer_properties), - ), - )) - } - _ => { - return Err(proto_error( - "WriterOptions unsupported file_type", - )) - } - }, - None => { - return Err(proto_error( - "WriterOptions missing file_type", - )) - } - } - } - None => return Err(proto_error("CopyTo missing CopyOptions")), - }; - Ok(datafusion_expr::LogicalPlan::Copy( datafusion_expr::dml::CopyTo { input: Arc::new(input), output_url: copy.output_url.clone(), - file_format: FileType::from_str(©.file_type)?, partition_by: copy.partition_by.clone(), - copy_options, + format_options: convert_required!(copy.format_options)?, + options: Default::default(), }, )) } @@ -1008,30 +928,20 @@ impl AsLogicalPlan for LogicalPlanNode { let mut maybe_some_type = None; #[cfg(feature = "parquet")] - if any.is::() { + if let Some(parquet) = any.downcast_ref::() { + let options = parquet.options(); maybe_some_type = - Some(FileFormatType::Parquet(protobuf::ParquetFormat {})) + Some(FileFormatType::Parquet(protobuf::ParquetFormat { + options: Some(options.try_into()?), + })); }; if let Some(csv) = any.downcast_ref::() { + let options = csv.options(); maybe_some_type = Some(FileFormatType::Csv(protobuf::CsvFormat { - delimiter: byte_to_string( - csv.delimiter(), - "delimiter", - )?, - has_header: csv.has_header(), - quote: byte_to_string(csv.quote(), "quote")?, - optional_escape: if let Some(escape) = csv.escape() { - Some( - protobuf::csv_format::OptionalEscape::Escape( - byte_to_string(escape, "escape")?, - ), - ) - } else { - None - }, - })) + options: Some(options.try_into()?), + })); } if any.is::() { @@ -1672,92 +1582,21 @@ impl AsLogicalPlan for LogicalPlanNode { LogicalPlan::Copy(dml::CopyTo { input, output_url, - file_format, - copy_options, + format_options, partition_by, + .. 
}) => { let input = protobuf::LogicalPlanNode::try_from_logical_plan( input, extension_codec, )?; - let copy_options_proto: Option = - match copy_options { - CopyOptions::SQLOptions(opt) => { - let options: Vec = opt - .clone() - .into_inner() - .iter() - .map(|(k, v)| SqlOption { - key: k.to_string(), - value: v.to_string(), - }) - .collect(); - Some(copy_to_node::CopyOptions::SqlOptions( - protobuf::SqlOptions { option: options }, - )) - } - CopyOptions::WriterOptions(opt) => { - match opt.as_ref() { - FileTypeWriterOptions::Arrow(_) => { - let arrow_writer_options = - file_type_writer_options::FileType::ArrowOptions( - protobuf::ArrowWriterOptions {}, - ); - Some(copy_to_node::CopyOptions::WriterOptions( - protobuf::FileTypeWriterOptions { - file_type: Some(arrow_writer_options), - }, - )) - } - FileTypeWriterOptions::CSV(csv_opts) => { - let csv_options = &csv_opts.writer_options; - let csv_writer_options = csv_writer_options_to_proto( - csv_options, - &csv_opts.compression, - ); - let csv_options = - file_type_writer_options::FileType::CsvOptions( - csv_writer_options, - ); - Some(copy_to_node::CopyOptions::WriterOptions( - protobuf::FileTypeWriterOptions { - file_type: Some(csv_options), - }, - )) - } - FileTypeWriterOptions::Parquet(parquet_opts) => { - let parquet_writer_options = - protobuf::ParquetWriterOptions { - writer_properties: Some( - writer_properties_to_proto( - &parquet_opts.writer_options, - ), - ), - }; - let parquet_options = file_type_writer_options::FileType::ParquetOptions(parquet_writer_options); - Some(copy_to_node::CopyOptions::WriterOptions( - protobuf::FileTypeWriterOptions { - file_type: Some(parquet_options), - }, - )) - } - _ => { - return Err(proto_error( - "Unsupported FileTypeWriterOptions in CopyTo", - )) - } - } - } - }; - Ok(protobuf::LogicalPlanNode { logical_plan_type: Some(LogicalPlanType::CopyTo(Box::new( protobuf::CopyToNode { input: Some(Box::new(input)), output_url: output_url.to_string(), - file_type: file_format.to_string(), - copy_options: copy_options_proto, + format_options: Some(format_options.try_into()?), partition_by: partition_by.clone(), }, ))), @@ -1813,33 +1652,3 @@ pub(crate) fn csv_writer_options_from_proto( .with_time_format(writer_options.time_format.clone()) .with_null(writer_options.null_value.clone())) } - -pub(crate) fn writer_properties_to_proto( - props: &WriterProperties, -) -> protobuf::WriterProperties { - protobuf::WriterProperties { - data_page_size_limit: props.data_page_size_limit() as u64, - dictionary_page_size_limit: props.dictionary_page_size_limit() as u64, - data_page_row_count_limit: props.data_page_row_count_limit() as u64, - write_batch_size: props.write_batch_size() as u64, - max_row_group_size: props.max_row_group_size() as u64, - writer_version: format!("{:?}", props.writer_version()), - created_by: props.created_by().to_string(), - } -} - -pub(crate) fn writer_properties_from_proto( - props: &protobuf::WriterProperties, -) -> Result { - let writer_version = - WriterVersion::from_str(&props.writer_version).map_err(proto_error)?; - Ok(WriterProperties::builder() - .set_created_by(props.created_by.clone()) - .set_writer_version(writer_version) - .set_dictionary_page_size_limit(props.dictionary_page_size_limit as usize) - .set_data_page_row_count_limit(props.data_page_row_count_limit as usize) - .set_data_page_size_limit(props.data_page_size_limit as usize) - .set_write_batch_size(props.write_batch_size as usize) - .set_max_row_group_size(props.max_row_group_size as usize) - .build()) -} diff --git 
a/datafusion/proto/src/logical_plan/to_proto.rs b/datafusion/proto/src/logical_plan/to_proto.rs index 7024a9fab3f9..65b4c8ba0445 100644 --- a/datafusion/proto/src/logical_plan/to_proto.rs +++ b/datafusion/proto/src/logical_plan/to_proto.rs @@ -1426,11 +1426,9 @@ impl TryFrom<&BuiltinScalarFunction> for protobuf::ScalarFunction { BuiltinScalarFunction::Cbrt => Self::Cbrt, BuiltinScalarFunction::Sin => Self::Sin, BuiltinScalarFunction::Cos => Self::Cos, - BuiltinScalarFunction::Tan => Self::Tan, BuiltinScalarFunction::Cot => Self::Cot, BuiltinScalarFunction::Sinh => Self::Sinh, BuiltinScalarFunction::Cosh => Self::Cosh, - BuiltinScalarFunction::Tanh => Self::Tanh, BuiltinScalarFunction::Atan => Self::Atan, BuiltinScalarFunction::Asinh => Self::Asinh, BuiltinScalarFunction::Acosh => Self::Acosh, @@ -1456,18 +1454,12 @@ impl TryFrom<&BuiltinScalarFunction> for protobuf::ScalarFunction { BuiltinScalarFunction::Ltrim => Self::Ltrim, BuiltinScalarFunction::Rtrim => Self::Rtrim, BuiltinScalarFunction::ToChar => Self::ToChar, - BuiltinScalarFunction::ArrayAppend => Self::ArrayAppend, - BuiltinScalarFunction::ArraySort => Self::ArraySort, - BuiltinScalarFunction::ArrayConcat => Self::ArrayConcat, BuiltinScalarFunction::ArrayExcept => Self::ArrayExcept, - BuiltinScalarFunction::ArrayDistinct => Self::ArrayDistinct, BuiltinScalarFunction::ArrayElement => Self::ArrayElement, BuiltinScalarFunction::ArrayPopFront => Self::ArrayPopFront, BuiltinScalarFunction::ArrayPopBack => Self::ArrayPopBack, BuiltinScalarFunction::ArrayPosition => Self::ArrayPosition, BuiltinScalarFunction::ArrayPositions => Self::ArrayPositions, - BuiltinScalarFunction::ArrayPrepend => Self::ArrayPrepend, - BuiltinScalarFunction::ArrayRepeat => Self::ArrayRepeat, BuiltinScalarFunction::ArrayResize => Self::ArrayResize, BuiltinScalarFunction::ArrayRemove => Self::ArrayRemove, BuiltinScalarFunction::ArrayRemoveN => Self::ArrayRemoveN, @@ -1479,7 +1471,6 @@ impl TryFrom<&BuiltinScalarFunction> for protobuf::ScalarFunction { BuiltinScalarFunction::ArraySlice => Self::ArraySlice, BuiltinScalarFunction::ArrayIntersect => Self::ArrayIntersect, BuiltinScalarFunction::ArrayUnion => Self::ArrayUnion, - BuiltinScalarFunction::MakeArray => Self::Array, BuiltinScalarFunction::MD5 => Self::Md5, BuiltinScalarFunction::SHA224 => Self::Sha224, BuiltinScalarFunction::SHA256 => Self::Sha256, @@ -1506,7 +1497,6 @@ impl TryFrom<&BuiltinScalarFunction> for protobuf::ScalarFunction { BuiltinScalarFunction::Right => Self::Right, BuiltinScalarFunction::Rpad => Self::Rpad, BuiltinScalarFunction::SplitPart => Self::SplitPart, - BuiltinScalarFunction::StringToArray => Self::StringToArray, BuiltinScalarFunction::StartsWith => Self::StartsWith, BuiltinScalarFunction::Strpos => Self::Strpos, BuiltinScalarFunction::Substr => Self::Substr, @@ -1516,11 +1506,9 @@ impl TryFrom<&BuiltinScalarFunction> for protobuf::ScalarFunction { BuiltinScalarFunction::Coalesce => Self::Coalesce, BuiltinScalarFunction::Pi => Self::Pi, BuiltinScalarFunction::Power => Self::Power, - BuiltinScalarFunction::Struct => Self::StructFun, BuiltinScalarFunction::Atan2 => Self::Atan2, BuiltinScalarFunction::Nanvl => Self::Nanvl, BuiltinScalarFunction::Iszero => Self::Iszero, - BuiltinScalarFunction::ArrowTypeof => Self::ArrowTypeof, BuiltinScalarFunction::OverLay => Self::OverLay, BuiltinScalarFunction::Levenshtein => Self::Levenshtein, BuiltinScalarFunction::SubstrIndex => Self::SubstrIndex, diff --git a/datafusion/proto/src/physical_plan/from_proto.rs 
b/datafusion/proto/src/physical_plan/from_proto.rs index d3b41f114fba..16f0e94cad83 100644 --- a/datafusion/proto/src/physical_plan/from_proto.rs +++ b/datafusion/proto/src/physical_plan/from_proto.rs @@ -17,9 +17,16 @@ //! Serde code to convert from protocol buffers to Rust data structures. +use std::collections::HashMap; use std::convert::{TryFrom, TryInto}; use std::sync::Arc; +use crate::common::proto_error; +use crate::convert_required; +use crate::logical_plan::{self, csv_writer_options_from_proto}; +use crate::protobuf::physical_expr_node::ExprType; +use crate::protobuf::{self, copy_to_node}; + use arrow::compute::SortOptions; use datafusion::arrow::datatypes::Schema; use datafusion::datasource::file_format::csv::CsvSink; @@ -34,31 +41,24 @@ use datafusion::execution::FunctionRegistry; use datafusion::logical_expr::WindowFunctionDefinition; use datafusion::physical_expr::{PhysicalSortExpr, ScalarFunctionExpr}; use datafusion::physical_plan::expressions::{ - in_list, BinaryExpr, CaseExpr, CastExpr, Column, IsNotNullExpr, IsNullExpr, LikeExpr, - Literal, NegativeExpr, NotExpr, TryCastExpr, + in_list, BinaryExpr, CaseExpr, CastExpr, Column, GetFieldAccessExpr, + GetIndexedFieldExpr, IsNotNullExpr, IsNullExpr, LikeExpr, Literal, NegativeExpr, + NotExpr, TryCastExpr, }; -use datafusion::physical_plan::expressions::{GetFieldAccessExpr, GetIndexedFieldExpr}; use datafusion::physical_plan::windows::create_window_expr; use datafusion::physical_plan::{ functions, ColumnStatistics, Partitioning, PhysicalExpr, Statistics, WindowExpr, }; -use datafusion_common::file_options::arrow_writer::ArrowWriterOptions; +use datafusion_common::config::{ + ColumnOptions, CsvOptions, FormatOptions, JsonOptions, ParquetOptions, + TableParquetOptions, +}; use datafusion_common::file_options::csv_writer::CsvWriterOptions; use datafusion_common::file_options::json_writer::JsonWriterOptions; -use datafusion_common::file_options::parquet_writer::ParquetWriterOptions; use datafusion_common::parsers::CompressionTypeVariant; use datafusion_common::stats::Precision; -use datafusion_common::{ - not_impl_err, DataFusionError, FileTypeWriterOptions, JoinSide, Result, ScalarValue, -}; - -use crate::common::proto_error; -use crate::convert_required; -use crate::logical_plan; -use crate::protobuf; -use crate::protobuf::physical_expr_node::ExprType; +use datafusion_common::{not_impl_err, DataFusionError, JoinSide, Result, ScalarValue}; -use crate::logical_plan::{csv_writer_options_from_proto, writer_properties_from_proto}; use chrono::{TimeZone, Utc}; use object_store::path::Path; use object_store::ObjectMeta; @@ -735,7 +735,10 @@ impl TryFrom<&protobuf::JsonSink> for JsonSink { type Error = DataFusionError; fn try_from(value: &protobuf::JsonSink) -> Result { - Ok(Self::new(convert_required!(value.config)?)) + Ok(Self::new( + convert_required!(value.config)?, + convert_required!(value.writer_options)?, + )) } } @@ -744,7 +747,10 @@ impl TryFrom<&protobuf::ParquetSink> for ParquetSink { type Error = DataFusionError; fn try_from(value: &protobuf::ParquetSink) -> Result { - Ok(Self::new(convert_required!(value.config)?)) + Ok(Self::new( + convert_required!(value.config)?, + convert_required!(value.parquet_options)?, + )) } } @@ -752,7 +758,10 @@ impl TryFrom<&protobuf::CsvSink> for CsvSink { type Error = DataFusionError; fn try_from(value: &protobuf::CsvSink) -> Result { - Ok(Self::new(convert_required!(value.config)?)) + Ok(Self::new( + convert_required!(value.config)?, + convert_required!(value.writer_options)?, + )) } } @@ 
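// Illustrative sketch, not part of this diff: exercising the new writer-option
// conversions added in this file and in to_proto.rs. The function name and the
// sample delimiter/compression values are ours; the paths follow the imports above.
fn csv_writer_options_proto_roundtrip() -> datafusion_common::Result<()> {
    use arrow::csv::WriterBuilder;
    use datafusion_common::file_options::csv_writer::CsvWriterOptions;
    use datafusion_common::parsers::CompressionTypeVariant;

    // Build logical CSV writer options, convert to the protobuf message and back.
    let opts = CsvWriterOptions::new(
        WriterBuilder::new().with_delimiter(b'|'),
        CompressionTypeVariant::GZIP,
    );
    let proto: protobuf::CsvWriterOptions = (&opts).try_into()?;
    let back: CsvWriterOptions = (&proto).try_into()?;
    assert_eq!(back.compression, CompressionTypeVariant::GZIP);
    Ok(())
}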
-785,7 +794,6 @@ impl TryFrom<&protobuf::FileSinkConfig> for FileSinkConfig { output_schema: Arc::new(convert_required!(conf.output_schema)?), table_partition_cols, overwrite: conf.overwrite, - file_type_writer_options: convert_required!(conf.file_type_writer_options)?, }) } } @@ -814,34 +822,223 @@ impl From for protobuf::CompressionTypeVariant { } } -impl TryFrom<&protobuf::FileTypeWriterOptions> for FileTypeWriterOptions { +impl TryFrom<&protobuf::CsvWriterOptions> for CsvWriterOptions { + type Error = DataFusionError; + + fn try_from(opts: &protobuf::CsvWriterOptions) -> Result { + let write_options = csv_writer_options_from_proto(opts)?; + let compression: CompressionTypeVariant = opts.compression().into(); + Ok(CsvWriterOptions::new(write_options, compression)) + } +} + +impl TryFrom<&protobuf::JsonWriterOptions> for JsonWriterOptions { + type Error = DataFusionError; + + fn try_from(opts: &protobuf::JsonWriterOptions) -> Result { + let compression: CompressionTypeVariant = opts.compression().into(); + Ok(JsonWriterOptions::new(compression)) + } +} + +impl TryFrom<&protobuf::CsvOptions> for CsvOptions { + type Error = DataFusionError; + + fn try_from(proto_opts: &protobuf::CsvOptions) -> Result { + Ok(CsvOptions { + has_header: proto_opts.has_header, + delimiter: proto_opts.delimiter[0], + quote: proto_opts.quote[0], + escape: proto_opts.escape.first().copied(), + compression: proto_opts.compression().into(), + schema_infer_max_rec: proto_opts.schema_infer_max_rec as usize, + date_format: (!proto_opts.date_format.is_empty()) + .then(|| proto_opts.date_format.clone()), + datetime_format: (!proto_opts.datetime_format.is_empty()) + .then(|| proto_opts.datetime_format.clone()), + timestamp_format: (!proto_opts.timestamp_format.is_empty()) + .then(|| proto_opts.timestamp_format.clone()), + timestamp_tz_format: (!proto_opts.timestamp_tz_format.is_empty()) + .then(|| proto_opts.timestamp_tz_format.clone()), + time_format: (!proto_opts.time_format.is_empty()) + .then(|| proto_opts.time_format.clone()), + null_value: (!proto_opts.null_value.is_empty()) + .then(|| proto_opts.null_value.clone()), + }) + } +} + +impl TryFrom<&protobuf::ParquetOptions> for ParquetOptions { type Error = DataFusionError; - fn try_from(value: &protobuf::FileTypeWriterOptions) -> Result { - let file_type = value - .file_type - .as_ref() - .ok_or_else(|| proto_error("Missing required file_type field in protobuf"))?; + fn try_from(value: &protobuf::ParquetOptions) -> Result { + Ok(ParquetOptions { + enable_page_index: value.enable_page_index, + pruning: value.pruning, + skip_metadata: value.skip_metadata, + metadata_size_hint: value + .metadata_size_hint_opt.clone() + .map(|opt| match opt { + protobuf::parquet_options::MetadataSizeHintOpt::MetadataSizeHint(v) => Some(v as usize), + }) + .unwrap_or(None), + pushdown_filters: value.pushdown_filters, + reorder_filters: value.reorder_filters, + data_pagesize_limit: value.data_pagesize_limit as usize, + write_batch_size: value.write_batch_size as usize, + writer_version: value.writer_version.clone(), + compression: value.compression_opt.clone().map(|opt| match opt { + protobuf::parquet_options::CompressionOpt::Compression(v) => Some(v), + }).unwrap_or(None), + dictionary_enabled: value.dictionary_enabled_opt.as_ref().map(|protobuf::parquet_options::DictionaryEnabledOpt::DictionaryEnabled(v)| *v), + // Continuing from where we left off in the TryFrom implementation + dictionary_page_size_limit: value.dictionary_page_size_limit as usize, + statistics_enabled: value + 
.statistics_enabled_opt.clone() + .map(|opt| match opt { + protobuf::parquet_options::StatisticsEnabledOpt::StatisticsEnabled(v) => Some(v), + }) + .unwrap_or(None), + max_statistics_size: value + .max_statistics_size_opt.as_ref() + .map(|opt| match opt { + protobuf::parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v) => Some(*v as usize), + }) + .unwrap_or(None), + max_row_group_size: value.max_row_group_size as usize, + created_by: value.created_by.clone(), + column_index_truncate_length: value + .column_index_truncate_length_opt.as_ref() + .map(|opt| match opt { + protobuf::parquet_options::ColumnIndexTruncateLengthOpt::ColumnIndexTruncateLength(v) => Some(*v as usize), + }) + .unwrap_or(None), + data_page_row_count_limit: value.data_page_row_count_limit as usize, + encoding: value + .encoding_opt.clone() + .map(|opt| match opt { + protobuf::parquet_options::EncodingOpt::Encoding(v) => Some(v), + }) + .unwrap_or(None), + bloom_filter_enabled: value.bloom_filter_enabled, + bloom_filter_fpp: value.clone() + .bloom_filter_fpp_opt + .map(|opt| match opt { + protobuf::parquet_options::BloomFilterFppOpt::BloomFilterFpp(v) => Some(v), + }) + .unwrap_or(None), + bloom_filter_ndv: value.clone() + .bloom_filter_ndv_opt + .map(|opt| match opt { + protobuf::parquet_options::BloomFilterNdvOpt::BloomFilterNdv(v) => Some(v), + }) + .unwrap_or(None), + allow_single_file_parallelism: value.allow_single_file_parallelism, + maximum_parallel_row_group_writers: value.maximum_parallel_row_group_writers as usize, + maximum_buffered_record_batches_per_stream: value.maximum_buffered_record_batches_per_stream as usize, + + }) + } +} + +impl TryFrom<&protobuf::ColumnOptions> for ColumnOptions { + type Error = DataFusionError; + fn try_from(value: &protobuf::ColumnOptions) -> Result { + Ok(ColumnOptions { + compression: value.compression_opt.clone().map(|opt| match opt { + protobuf::column_options::CompressionOpt::Compression(v) => Some(v), + }).unwrap_or(None), + dictionary_enabled: value.dictionary_enabled_opt.as_ref().map(|protobuf::column_options::DictionaryEnabledOpt::DictionaryEnabled(v)| *v), + statistics_enabled: value + .statistics_enabled_opt.clone() + .map(|opt| match opt { + protobuf::column_options::StatisticsEnabledOpt::StatisticsEnabled(v) => Some(v), + }) + .unwrap_or(None), + max_statistics_size: value + .max_statistics_size_opt.clone() + .map(|opt| match opt { + protobuf::column_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v) => Some(v as usize), + }) + .unwrap_or(None), + encoding: value + .encoding_opt.clone() + .map(|opt| match opt { + protobuf::column_options::EncodingOpt::Encoding(v) => Some(v), + }) + .unwrap_or(None), + bloom_filter_enabled: value.bloom_filter_enabled_opt.clone().map(|opt| match opt { + protobuf::column_options::BloomFilterEnabledOpt::BloomFilterEnabled(v) => Some(v), + }) + .unwrap_or(None), + bloom_filter_fpp: value + .bloom_filter_fpp_opt.clone() + .map(|opt| match opt { + protobuf::column_options::BloomFilterFppOpt::BloomFilterFpp(v) => Some(v), + }) + .unwrap_or(None), + bloom_filter_ndv: value + .bloom_filter_ndv_opt.clone() + .map(|opt| match opt { + protobuf::column_options::BloomFilterNdvOpt::BloomFilterNdv(v) => Some(v), + }) + .unwrap_or(None), + }) + } +} - match file_type { - protobuf::file_type_writer_options::FileType::ArrowOptions(_) => { - Ok(Self::Arrow(ArrowWriterOptions::new())) +impl TryFrom<&protobuf::TableParquetOptions> for TableParquetOptions { + type Error = DataFusionError; + fn try_from(value: &protobuf::TableParquetOptions) -> 
Result { + let mut column_specific_options: HashMap = HashMap::new(); + for protobuf::ColumnSpecificOptions { + column_name, + options: maybe_options, + } in &value.column_specific_options + { + if let Some(options) = maybe_options { + column_specific_options.insert(column_name.clone(), options.try_into()?); } + } + Ok(TableParquetOptions { + global: value + .global + .as_ref() + .map(|v| v.try_into()) + .unwrap() + .unwrap(), + column_specific_options, + }) + } +} + +impl TryFrom<&protobuf::JsonOptions> for JsonOptions { + type Error = DataFusionError; + + fn try_from(proto_opts: &protobuf::JsonOptions) -> Result { + let compression: protobuf::CompressionTypeVariant = proto_opts.compression(); + Ok(JsonOptions { + compression: compression.into(), + schema_infer_max_rec: proto_opts.schema_infer_max_rec as usize, + }) + } +} - protobuf::file_type_writer_options::FileType::JsonOptions(opts) => { - let compression: CompressionTypeVariant = opts.compression().into(); - Ok(Self::JSON(JsonWriterOptions::new(compression))) +impl TryFrom<©_to_node::FormatOptions> for FormatOptions { + type Error = DataFusionError; + fn try_from(value: ©_to_node::FormatOptions) -> Result { + Ok(match value { + copy_to_node::FormatOptions::Csv(options) => { + FormatOptions::CSV(options.try_into()?) } - protobuf::file_type_writer_options::FileType::CsvOptions(opts) => { - let write_options = csv_writer_options_from_proto(opts)?; - let compression: CompressionTypeVariant = opts.compression().into(); - Ok(Self::CSV(CsvWriterOptions::new(write_options, compression))) + copy_to_node::FormatOptions::Json(options) => { + FormatOptions::JSON(options.try_into()?) } - protobuf::file_type_writer_options::FileType::ParquetOptions(opt) => { - let props = opt.writer_properties.clone().unwrap_or_default(); - let writer_properties = writer_properties_from_proto(&props)?; - Ok(Self::Parquet(ParquetWriterOptions::new(writer_properties))) + copy_to_node::FormatOptions::Parquet(options) => { + FormatOptions::PARQUET(options.try_into()?) 
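// Illustrative sketch, not part of this diff: together with the matching TryFrom
// impls in to_proto.rs, the conversions above let COPY TO format options survive a
// protobuf round trip. The function name and sample delimiter are ours.
fn copy_format_options_proto_roundtrip() -> datafusion_common::Result<()> {
    use datafusion_common::config::{CsvOptions, FormatOptions};

    let mut csv = CsvOptions::default();
    csv.delimiter = b';';
    let logical = FormatOptions::CSV(csv);

    // FormatOptions -> copy_to_node::FormatOptions -> FormatOptions
    let proto: copy_to_node::FormatOptions = (&logical).try_into()?;
    let back: FormatOptions = (&proto).try_into()?;
    assert_eq!(back, logical);
    Ok(())
}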
} - } + copy_to_node::FormatOptions::Avro(_) => FormatOptions::AVRO, + copy_to_node::FormatOptions::Arrow(_) => FormatOptions::ARROW, + }) } } diff --git a/datafusion/proto/src/physical_plan/mod.rs b/datafusion/proto/src/physical_plan/mod.rs index 9622b8ab51d8..004948da938f 100644 --- a/datafusion/proto/src/physical_plan/mod.rs +++ b/datafusion/proto/src/physical_plan/mod.rs @@ -19,6 +19,22 @@ use std::convert::TryInto; use std::fmt::Debug; use std::sync::Arc; +use self::from_proto::parse_physical_window_expr; + +use crate::common::{byte_to_string, proto_error, str_to_byte}; +use crate::convert_required; +use crate::physical_plan::from_proto::{ + parse_physical_expr, parse_physical_sort_expr, parse_physical_sort_exprs, + parse_protobuf_file_scan_config, +}; +use crate::protobuf::physical_aggregate_expr_node::AggregateFunction; +use crate::protobuf::physical_expr_node::ExprType; +use crate::protobuf::physical_plan_node::PhysicalPlanType; +use crate::protobuf::repartition_exec_node::PartitionMethod; +use crate::protobuf::{ + self, window_agg_exec_node, PhysicalPlanNode, PhysicalSortExprNodeCollection, +}; + use datafusion::arrow::compute::SortOptions; use datafusion::arrow::datatypes::SchemaRef; use datafusion::datasource::file_format::csv::CsvSink; @@ -61,26 +77,10 @@ use datafusion::physical_plan::{ }; use datafusion_common::{internal_err, not_impl_err, DataFusionError, Result}; use datafusion_expr::ScalarUDF; + use prost::bytes::BufMut; use prost::Message; -use crate::common::str_to_byte; -use crate::common::{byte_to_string, proto_error}; -use crate::convert_required; -use crate::physical_plan::from_proto::{ - parse_physical_expr, parse_physical_sort_expr, parse_physical_sort_exprs, - parse_protobuf_file_scan_config, -}; -use crate::protobuf::physical_aggregate_expr_node::AggregateFunction; -use crate::protobuf::physical_expr_node::ExprType; -use crate::protobuf::physical_plan_node::PhysicalPlanType; -use crate::protobuf::repartition_exec_node::PartitionMethod; -use crate::protobuf::{ - self, window_agg_exec_node, PhysicalPlanNode, PhysicalSortExprNodeCollection, -}; - -use self::from_proto::parse_physical_window_expr; - pub mod from_proto; pub mod to_proto; @@ -211,7 +211,12 @@ impl AsExecutionPlan for PhysicalPlanNode { ) }) .transpose()?; - Ok(Arc::new(ParquetExec::new(base_config, predicate, None))) + Ok(Arc::new(ParquetExec::new( + base_config, + predicate, + None, + Default::default(), + ))) } PhysicalPlanType::AvroScan(scan) => { Ok(Arc::new(AvroExec::new(parse_protobuf_file_scan_config( diff --git a/datafusion/proto/src/physical_plan/to_proto.rs b/datafusion/proto/src/physical_plan/to_proto.rs index da4e87b7a853..bdb6cc668708 100644 --- a/datafusion/proto/src/physical_plan/to_proto.rs +++ b/datafusion/proto/src/physical_plan/to_proto.rs @@ -22,16 +22,15 @@ use std::{ sync::Arc, }; -use crate::protobuf::{self, physical_window_expr_node, scalar_value::Value}; +use crate::logical_plan::csv_writer_options_to_proto; use crate::protobuf::{ - physical_aggregate_expr_node, PhysicalSortExprNode, PhysicalSortExprNodeCollection, - ScalarValue, + self, copy_to_node, physical_aggregate_expr_node, physical_window_expr_node, + scalar_value::Value, ArrowOptions, AvroOptions, PhysicalSortExprNode, + PhysicalSortExprNodeCollection, ScalarValue, }; #[cfg(feature = "parquet")] use datafusion::datasource::file_format::parquet::ParquetSink; - -use crate::logical_plan::{csv_writer_options_to_proto, writer_properties_to_proto}; use datafusion::datasource::{ file_format::csv::CsvSink, 
file_format::json::JsonSink, @@ -58,16 +57,16 @@ use datafusion::physical_plan::windows::{BuiltInWindowExpr, PlainAggregateWindow use datafusion::physical_plan::{ AggregateExpr, ColumnStatistics, PhysicalExpr, Statistics, WindowExpr, }; +use datafusion_common::config::{ + ColumnOptions, CsvOptions, FormatOptions, JsonOptions, ParquetOptions, + TableParquetOptions, +}; use datafusion_common::{ - file_options::{ - arrow_writer::ArrowWriterOptions, avro_writer::AvroWriterOptions, - csv_writer::CsvWriterOptions, json_writer::JsonWriterOptions, - parquet_writer::ParquetWriterOptions, - }, + file_options::{csv_writer::CsvWriterOptions, json_writer::JsonWriterOptions}, internal_err, not_impl_err, parsers::CompressionTypeVariant, stats::Precision, - DataFusionError, FileTypeWriterOptions, JoinSide, Result, + DataFusionError, JoinSide, Result, }; impl TryFrom> for protobuf::PhysicalExprNode { @@ -821,6 +820,7 @@ impl TryFrom<&JsonSink> for protobuf::JsonSink { fn try_from(value: &JsonSink) -> Result { Ok(Self { config: Some(value.config().try_into()?), + writer_options: Some(value.writer_options().try_into()?), }) } } @@ -831,6 +831,7 @@ impl TryFrom<&CsvSink> for protobuf::CsvSink { fn try_from(value: &CsvSink) -> Result { Ok(Self { config: Some(value.config().try_into()?), + writer_options: Some(value.writer_options().try_into()?), }) } } @@ -842,6 +843,7 @@ impl TryFrom<&ParquetSink> for protobuf::ParquetSink { fn try_from(value: &ParquetSink) -> Result { Ok(Self { config: Some(value.config().try_into()?), + parquet_options: Some(value.parquet_options().try_into()?), }) } } @@ -870,7 +872,6 @@ impl TryFrom<&FileSinkConfig> for protobuf::FileSinkConfig { }) }) .collect::>>()?; - let file_type_writer_options = &conf.file_type_writer_options; Ok(Self { object_store_url: conf.object_store_url.to_string(), file_groups, @@ -878,7 +879,6 @@ impl TryFrom<&FileSinkConfig> for protobuf::FileSinkConfig { output_schema: Some(conf.output_schema.as_ref().try_into()?), table_partition_cols, overwrite: conf.overwrite, - file_type_writer_options: Some(file_type_writer_options.try_into()?), }) } } @@ -895,44 +895,169 @@ impl From<&CompressionTypeVariant> for protobuf::CompressionTypeVariant { } } -impl TryFrom<&FileTypeWriterOptions> for protobuf::FileTypeWriterOptions { +impl TryFrom<&CsvWriterOptions> for protobuf::CsvWriterOptions { type Error = DataFusionError; - fn try_from(opts: &FileTypeWriterOptions) -> Result { - let file_type = match opts { - #[cfg(feature = "parquet")] - FileTypeWriterOptions::Parquet(ParquetWriterOptions { writer_options }) => { - protobuf::file_type_writer_options::FileType::ParquetOptions( - protobuf::ParquetWriterOptions { - writer_properties: Some(writer_properties_to_proto( - writer_options, - )), - }, - ) - } - FileTypeWriterOptions::CSV(CsvWriterOptions { - writer_options, - compression, - }) => protobuf::file_type_writer_options::FileType::CsvOptions( - csv_writer_options_to_proto(writer_options, compression), - ), - FileTypeWriterOptions::JSON(JsonWriterOptions { compression }) => { - let compression: protobuf::CompressionTypeVariant = compression.into(); - protobuf::file_type_writer_options::FileType::JsonOptions( - protobuf::JsonWriterOptions { - compression: compression.into(), - }, + fn try_from(opts: &CsvWriterOptions) -> Result { + Ok(csv_writer_options_to_proto( + &opts.writer_options, + &opts.compression, + )) + } +} + +impl TryFrom<&JsonWriterOptions> for protobuf::JsonWriterOptions { + type Error = DataFusionError; + + fn try_from(opts: &JsonWriterOptions) -> Result 
{ + let compression: protobuf::CompressionTypeVariant = opts.compression.into(); + Ok(protobuf::JsonWriterOptions { + compression: compression.into(), + }) + } +} + +impl TryFrom<&ParquetOptions> for protobuf::ParquetOptions { + type Error = DataFusionError; + + fn try_from(value: &ParquetOptions) -> Result { + Ok(protobuf::ParquetOptions { + enable_page_index: value.enable_page_index, + pruning: value.pruning, + skip_metadata: value.skip_metadata, + metadata_size_hint_opt: value.metadata_size_hint.map(|v| protobuf::parquet_options::MetadataSizeHintOpt::MetadataSizeHint(v as u64)), + pushdown_filters: value.pushdown_filters, + reorder_filters: value.reorder_filters, + data_pagesize_limit: value.data_pagesize_limit as u64, + write_batch_size: value.write_batch_size as u64, + writer_version: value.writer_version.clone(), + compression_opt: value.compression.clone().map(protobuf::parquet_options::CompressionOpt::Compression), + dictionary_enabled_opt: value.dictionary_enabled.map(protobuf::parquet_options::DictionaryEnabledOpt::DictionaryEnabled), + dictionary_page_size_limit: value.dictionary_page_size_limit as u64, + statistics_enabled_opt: value.statistics_enabled.clone().map(protobuf::parquet_options::StatisticsEnabledOpt::StatisticsEnabled), + max_statistics_size_opt: value.max_statistics_size.map(|v| protobuf::parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v as u64)), + max_row_group_size: value.max_row_group_size as u64, + created_by: value.created_by.clone(), + column_index_truncate_length_opt: value.column_index_truncate_length.map(|v| protobuf::parquet_options::ColumnIndexTruncateLengthOpt::ColumnIndexTruncateLength(v as u64)), + data_page_row_count_limit: value.data_page_row_count_limit as u64, + encoding_opt: value.encoding.clone().map(protobuf::parquet_options::EncodingOpt::Encoding), + bloom_filter_enabled: value.bloom_filter_enabled, + bloom_filter_fpp_opt: value.bloom_filter_fpp.map(protobuf::parquet_options::BloomFilterFppOpt::BloomFilterFpp), + bloom_filter_ndv_opt: value.bloom_filter_ndv.map(protobuf::parquet_options::BloomFilterNdvOpt::BloomFilterNdv), + allow_single_file_parallelism: value.allow_single_file_parallelism, + maximum_parallel_row_group_writers: value.maximum_parallel_row_group_writers as u64, + maximum_buffered_record_batches_per_stream: value.maximum_buffered_record_batches_per_stream as u64, + }) + } +} + +impl TryFrom<&ColumnOptions> for protobuf::ColumnOptions { + type Error = DataFusionError; + + fn try_from(value: &ColumnOptions) -> Result { + Ok(protobuf::ColumnOptions { + compression_opt: value + .compression + .clone() + .map(protobuf::column_options::CompressionOpt::Compression), + dictionary_enabled_opt: value + .dictionary_enabled + .map(protobuf::column_options::DictionaryEnabledOpt::DictionaryEnabled), + statistics_enabled_opt: value + .statistics_enabled + .clone() + .map(protobuf::column_options::StatisticsEnabledOpt::StatisticsEnabled), + max_statistics_size_opt: value.max_statistics_size.map(|v| { + protobuf::column_options::MaxStatisticsSizeOpt::MaxStatisticsSize( + v as u32, ) + }), + encoding_opt: value + .encoding + .clone() + .map(protobuf::column_options::EncodingOpt::Encoding), + bloom_filter_enabled_opt: value + .bloom_filter_enabled + .map(protobuf::column_options::BloomFilterEnabledOpt::BloomFilterEnabled), + bloom_filter_fpp_opt: value + .bloom_filter_fpp + .map(protobuf::column_options::BloomFilterFppOpt::BloomFilterFpp), + bloom_filter_ndv_opt: value + .bloom_filter_ndv + 
.map(protobuf::column_options::BloomFilterNdvOpt::BloomFilterNdv), + }) + } +} + +impl TryFrom<&TableParquetOptions> for protobuf::TableParquetOptions { + type Error = DataFusionError; + fn try_from(value: &TableParquetOptions) -> Result { + let column_specific_options = value + .column_specific_options + .iter() + .map(|(k, v)| { + Ok(protobuf::ColumnSpecificOptions { + column_name: k.into(), + options: Some(v.try_into()?), + }) + }) + .collect::>>()?; + Ok(protobuf::TableParquetOptions { + global: Some((&value.global).try_into()?), + column_specific_options, + }) + } +} + +impl TryFrom<&CsvOptions> for protobuf::CsvOptions { + type Error = DataFusionError; // Define or use an appropriate error type + + fn try_from(opts: &CsvOptions) -> Result { + let compression: protobuf::CompressionTypeVariant = opts.compression.into(); + Ok(protobuf::CsvOptions { + has_header: opts.has_header, + delimiter: vec![opts.delimiter], + quote: vec![opts.quote], + escape: opts.escape.map_or_else(Vec::new, |e| vec![e]), + compression: compression.into(), + schema_infer_max_rec: opts.schema_infer_max_rec as u64, + date_format: opts.date_format.clone().unwrap_or_default(), + datetime_format: opts.datetime_format.clone().unwrap_or_default(), + timestamp_format: opts.timestamp_format.clone().unwrap_or_default(), + timestamp_tz_format: opts.timestamp_tz_format.clone().unwrap_or_default(), + time_format: opts.time_format.clone().unwrap_or_default(), + null_value: opts.null_value.clone().unwrap_or_default(), + }) + } +} + +impl TryFrom<&JsonOptions> for protobuf::JsonOptions { + type Error = DataFusionError; + + fn try_from(opts: &JsonOptions) -> Result { + let compression: protobuf::CompressionTypeVariant = opts.compression.into(); + Ok(protobuf::JsonOptions { + compression: compression.into(), + schema_infer_max_rec: opts.schema_infer_max_rec as u64, + }) + } +} + +impl TryFrom<&FormatOptions> for copy_to_node::FormatOptions { + type Error = DataFusionError; + fn try_from(value: &FormatOptions) -> std::result::Result { + Ok(match value { + FormatOptions::CSV(options) => { + copy_to_node::FormatOptions::Csv(options.try_into()?) } - FileTypeWriterOptions::Avro(AvroWriterOptions {}) => { - return not_impl_err!("Avro file sink protobuf serialization") + FormatOptions::JSON(options) => { + copy_to_node::FormatOptions::Json(options.try_into()?) } - FileTypeWriterOptions::Arrow(ArrowWriterOptions {}) => { - return not_impl_err!("Arrow file sink protobuf serialization") + FormatOptions::PARQUET(options) => { + copy_to_node::FormatOptions::Parquet(options.try_into()?) 
} - }; - Ok(Self { - file_type: Some(file_type), + FormatOptions::AVRO => copy_to_node::FormatOptions::Avro(AvroOptions {}), + FormatOptions::ARROW => copy_to_node::FormatOptions::Arrow(ArrowOptions {}), }) } } diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs index fb9f2967553f..2c8cf07e9eff 100644 --- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs @@ -21,33 +21,23 @@ use std::fmt::{self, Debug, Formatter}; use std::sync::Arc; use arrow::array::{ArrayRef, FixedSizeListArray}; -use arrow::csv::WriterBuilder; use arrow::datatypes::{ DataType, Field, Fields, Int32Type, IntervalDayTimeType, IntervalMonthDayNanoType, IntervalUnit, Schema, SchemaRef, TimeUnit, UnionFields, UnionMode, }; - -use datafusion_common::file_options::arrow_writer::ArrowWriterOptions; -use datafusion_expr::{ScalarUDF, ScalarUDFImpl}; -use datafusion_proto::logical_plan::to_proto::serialize_expr; -use prost::Message; - use datafusion::datasource::provider::TableProviderFactory; use datafusion::datasource::TableProvider; use datafusion::execution::context::SessionState; use datafusion::execution::runtime_env::{RuntimeConfig, RuntimeEnv}; -use datafusion::parquet::file::properties::{WriterProperties, WriterVersion}; use datafusion::prelude::*; use datafusion::test_util::{TestTableFactory, TestTableProvider}; -use datafusion_common::file_options::csv_writer::CsvWriterOptions; -use datafusion_common::file_options::parquet_writer::ParquetWriterOptions; -use datafusion_common::file_options::StatementOptions; -use datafusion_common::parsers::CompressionTypeVariant; +use datafusion_common::config::{FormatOptions, TableOptions}; use datafusion_common::scalar::ScalarStructBuilder; -use datafusion_common::{internal_err, not_impl_err, plan_err, FileTypeWriterOptions}; -use datafusion_common::{DFField, DFSchema, DFSchemaRef, DataFusionError, ScalarValue}; -use datafusion_common::{FileType, Result}; -use datafusion_expr::dml::{CopyOptions, CopyTo}; +use datafusion_common::{ + internal_err, not_impl_err, plan_err, DFField, DFSchema, DFSchemaRef, + DataFusionError, Result, ScalarValue, +}; +use datafusion_expr::dml::CopyTo; use datafusion_expr::expr::{ self, Between, BinaryExpr, Case, Cast, GroupingSet, InList, Like, ScalarFunction, Sort, Unnest, @@ -57,17 +47,21 @@ use datafusion_expr::{ col, create_udaf, lit, Accumulator, AggregateFunction, BuiltinScalarFunction::{Sqrt, Substr}, ColumnarValue, Expr, ExprSchemable, LogicalPlan, Operator, PartitionEvaluator, - Signature, TryCast, Volatility, WindowFrame, WindowFrameBound, WindowFrameUnits, - WindowFunctionDefinition, WindowUDF, WindowUDFImpl, + ScalarUDF, ScalarUDFImpl, Signature, TryCast, Volatility, WindowFrame, + WindowFrameBound, WindowFrameUnits, WindowFunctionDefinition, WindowUDF, + WindowUDFImpl, }; use datafusion_proto::bytes::{ logical_plan_from_bytes, logical_plan_from_bytes_with_extension_codec, logical_plan_to_bytes, logical_plan_to_bytes_with_extension_codec, }; +use datafusion_proto::logical_plan::to_proto::serialize_expr; use datafusion_proto::logical_plan::LogicalExtensionCodec; use datafusion_proto::logical_plan::{from_proto, DefaultLogicalExtensionCodec}; use datafusion_proto::protobuf; +use prost::Message; + #[cfg(feature = "json")] fn roundtrip_json_test(proto: &protobuf::LogicalExprNode) { let string = serde_json::to_string(proto).unwrap(); @@ -321,15 +315,16 @@ async fn roundtrip_logical_plan_copy_to_sql_options() -> 
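// Illustrative sketch, not part of this diff: it mirrors the test that follows.
// String-keyed settings on TableOptions land in the typed per-format options that
// CopyTo now carries; "csv.delimiter" is the config key used in that test.
fn table_options_into_format_options() -> datafusion_common::Result<()> {
    use datafusion_common::config::{ConfigOptions, FormatOptions, TableOptions};

    let mut table_options =
        TableOptions::default_from_session_config(&ConfigOptions::default());
    table_options.set("csv.delimiter", ";")?;
    assert_eq!(table_options.csv.delimiter, b';');

    // This is the value a COPY plan stores in its `format_options` field.
    let _format_options = FormatOptions::CSV(table_options.csv.clone());
    Ok(())
}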
Result<()> { let input = create_csv_scan(&ctx).await?; - let mut options = HashMap::new(); - options.insert("foo".to_string(), "bar".to_string()); + let mut table_options = + TableOptions::default_from_session_config(ctx.state().config_options()); + table_options.set("csv.delimiter", ";")?; let plan = LogicalPlan::Copy(CopyTo { input: Arc::new(input), output_url: "test.csv".to_string(), - file_format: FileType::CSV, partition_by: vec!["a".to_string(), "b".to_string(), "c".to_string()], - copy_options: CopyOptions::SQLOptions(StatementOptions::from(&options)), + format_options: FormatOptions::CSV(table_options.csv.clone()), + options: Default::default(), }); let bytes = logical_plan_to_bytes(&plan)?; @@ -345,24 +340,25 @@ async fn roundtrip_logical_plan_copy_to_writer_options() -> Result<()> { let input = create_csv_scan(&ctx).await?; - let writer_properties = WriterProperties::builder() - .set_bloom_filter_enabled(true) - .set_created_by("DataFusion Test".to_string()) - .set_writer_version(WriterVersion::PARQUET_2_0) - .set_write_batch_size(111) - .set_data_page_size_limit(222) - .set_data_page_row_count_limit(333) - .set_dictionary_page_size_limit(444) - .set_max_row_group_size(555) - .build(); + let table_options = + TableOptions::default_from_session_config(ctx.state().config_options()); + let mut parquet_format = table_options.parquet; + + parquet_format.global.bloom_filter_enabled = true; + parquet_format.global.created_by = "DataFusion Test".to_string(); + parquet_format.global.writer_version = "PARQUET_2_0".to_string(); + parquet_format.global.write_batch_size = 111; + parquet_format.global.data_pagesize_limit = 222; + parquet_format.global.data_page_row_count_limit = 333; + parquet_format.global.dictionary_page_size_limit = 444; + parquet_format.global.max_row_group_size = 555; + let plan = LogicalPlan::Copy(CopyTo { input: Arc::new(input), output_url: "test.parquet".to_string(), - file_format: FileType::PARQUET, + format_options: FormatOptions::PARQUET(parquet_format.clone()), partition_by: vec!["a".to_string(), "b".to_string(), "c".to_string()], - copy_options: CopyOptions::WriterOptions(Box::new( - FileTypeWriterOptions::Parquet(ParquetWriterOptions::new(writer_properties)), - )), + options: Default::default(), }); let bytes = logical_plan_to_bytes(&plan)?; @@ -372,27 +368,11 @@ async fn roundtrip_logical_plan_copy_to_writer_options() -> Result<()> { match logical_round_trip { LogicalPlan::Copy(copy_to) => { assert_eq!("test.parquet", copy_to.output_url); - assert_eq!(FileType::PARQUET, copy_to.file_format); assert_eq!(vec!["a", "b", "c"], copy_to.partition_by); - match ©_to.copy_options { - CopyOptions::WriterOptions(y) => match y.as_ref() { - FileTypeWriterOptions::Parquet(p) => { - let props = &p.writer_options; - assert_eq!("DataFusion Test", props.created_by()); - assert_eq!( - "PARQUET_2_0", - format!("{:?}", props.writer_version()) - ); - assert_eq!(111, props.write_batch_size()); - assert_eq!(222, props.data_page_size_limit()); - assert_eq!(333, props.data_page_row_count_limit()); - assert_eq!(444, props.dictionary_page_size_limit()); - assert_eq!(555, props.max_row_group_size()); - } - _ => panic!(), - }, - _ => panic!(), - } + assert_eq!( + copy_to.format_options, + FormatOptions::PARQUET(parquet_format) + ); } _ => panic!(), } @@ -408,11 +388,9 @@ async fn roundtrip_logical_plan_copy_to_arrow() -> Result<()> { let plan = LogicalPlan::Copy(CopyTo { input: Arc::new(input), output_url: "test.arrow".to_string(), - file_format: FileType::ARROW, partition_by: 
vec!["a".to_string(), "b".to_string(), "c".to_string()], - copy_options: CopyOptions::WriterOptions(Box::new(FileTypeWriterOptions::Arrow( - ArrowWriterOptions::new(), - ))), + format_options: FormatOptions::ARROW, + options: Default::default(), }); let bytes = logical_plan_to_bytes(&plan)?; @@ -422,15 +400,8 @@ async fn roundtrip_logical_plan_copy_to_arrow() -> Result<()> { match logical_round_trip { LogicalPlan::Copy(copy_to) => { assert_eq!("test.arrow", copy_to.output_url); - assert_eq!(FileType::ARROW, copy_to.file_format); + assert_eq!(FormatOptions::ARROW, copy_to.format_options); assert_eq!(vec!["a", "b", "c"], copy_to.partition_by); - match ©_to.copy_options { - CopyOptions::WriterOptions(y) => match y.as_ref() { - FileTypeWriterOptions::Arrow(_) => {} - _ => panic!(), - }, - _ => panic!(), - } } _ => panic!(), } @@ -444,25 +415,23 @@ async fn roundtrip_logical_plan_copy_to_csv() -> Result<()> { let input = create_csv_scan(&ctx).await?; - let writer_properties = WriterBuilder::new() - .with_delimiter(b'*') - .with_date_format("dd/MM/yyyy".to_string()) - .with_datetime_format("dd/MM/yyyy HH:mm:ss".to_string()) - .with_timestamp_format("HH:mm:ss.SSSSSS".to_string()) - .with_time_format("HH:mm:ss".to_string()) - .with_null("NIL".to_string()); + let table_options = + TableOptions::default_from_session_config(ctx.state().config_options()); + let mut csv_format = table_options.csv; + + csv_format.delimiter = b'*'; + csv_format.date_format = Some("dd/MM/yyyy".to_string()); + csv_format.datetime_format = Some("dd/MM/yyyy HH:mm:ss".to_string()); + csv_format.timestamp_format = Some("HH:mm:ss.SSSSSS".to_string()); + csv_format.time_format = Some("HH:mm:ss".to_string()); + csv_format.null_value = Some("NIL".to_string()); let plan = LogicalPlan::Copy(CopyTo { input: Arc::new(input), output_url: "test.csv".to_string(), - file_format: FileType::CSV, partition_by: vec!["a".to_string(), "b".to_string(), "c".to_string()], - copy_options: CopyOptions::WriterOptions(Box::new(FileTypeWriterOptions::CSV( - CsvWriterOptions::new( - writer_properties, - CompressionTypeVariant::UNCOMPRESSED, - ), - ))), + format_options: FormatOptions::CSV(csv_format.clone()), + options: Default::default(), }); let bytes = logical_plan_to_bytes(&plan)?; @@ -472,26 +441,8 @@ async fn roundtrip_logical_plan_copy_to_csv() -> Result<()> { match logical_round_trip { LogicalPlan::Copy(copy_to) => { assert_eq!("test.csv", copy_to.output_url); - assert_eq!(FileType::CSV, copy_to.file_format); + assert_eq!(FormatOptions::CSV(csv_format), copy_to.format_options); assert_eq!(vec!["a", "b", "c"], copy_to.partition_by); - match ©_to.copy_options { - CopyOptions::WriterOptions(y) => match y.as_ref() { - FileTypeWriterOptions::CSV(p) => { - let props = &p.writer_options; - assert_eq!(b'*', props.delimiter()); - assert_eq!("dd/MM/yyyy", props.date_format().unwrap()); - assert_eq!( - "dd/MM/yyyy HH:mm:ss", - props.datetime_format().unwrap() - ); - assert_eq!("HH:mm:ss.SSSSSS", props.timestamp_format().unwrap()); - assert_eq!("HH:mm:ss", props.time_format().unwrap()); - assert_eq!("NIL", props.null()); - } - _ => panic!(), - }, - _ => panic!(), - } } _ => panic!(), } @@ -582,24 +533,39 @@ async fn roundtrip_expr_api() -> Result<()> { let expr_list = vec![ encode(col("a").cast_to(&DataType::Utf8, &schema)?, lit("hex")), decode(lit("1234"), lit("hex")), - array_to_string(array(vec![lit(1), lit(2), lit(3)]), lit(",")), - array_dims(array(vec![lit(1), lit(2), lit(3)])), - array_ndims(array(vec![lit(1), lit(2), lit(3)])), - 
cardinality(array(vec![lit(1), lit(2), lit(3)])), + array_to_string(make_array(vec![lit(1), lit(2), lit(3)]), lit(",")), + array_dims(make_array(vec![lit(1), lit(2), lit(3)])), + array_ndims(make_array(vec![lit(1), lit(2), lit(3)])), + cardinality(make_array(vec![lit(1), lit(2), lit(3)])), + string_to_array(lit("abc#def#ghl"), lit("#"), lit(",")), range(lit(1), lit(10), lit(2)), gen_series(lit(1), lit(10), lit(2)), - array_has(array(vec![lit(1), lit(2), lit(3)]), lit(1)), + array_append(make_array(vec![lit(1), lit(2), lit(3)]), lit(4)), + array_prepend(lit(1), make_array(vec![lit(2), lit(3), lit(4)])), + array_concat(vec![ + make_array(vec![lit(1), lit(2)]), + make_array(vec![lit(3), lit(4)]), + ]), + make_array(vec![lit(1), lit(2), lit(3)]), + array_has(make_array(vec![lit(1), lit(2), lit(3)]), lit(1)), array_has_all( - array(vec![lit(1), lit(2), lit(3)]), - array(vec![lit(1), lit(2)]), + make_array(vec![lit(1), lit(2), lit(3)]), + make_array(vec![lit(1), lit(2)]), ), array_has_any( - array(vec![lit(1), lit(2), lit(3)]), - array(vec![lit(1), lit(4)]), + make_array(vec![lit(1), lit(2), lit(3)]), + make_array(vec![lit(1), lit(4)]), + ), + array_empty(make_array(vec![lit(1), lit(2), lit(3)])), + array_length(make_array(vec![lit(1), lit(2), lit(3)])), + array_repeat(lit(1), lit(3)), + flatten(make_array(vec![lit(1), lit(2), lit(3)])), + array_sort( + make_array(vec![lit(3), lit(4), lit(1), lit(2)]), + lit("desc"), + lit("NULLS LAST"), ), - array_empty(array(vec![lit(1), lit(2), lit(3)])), - array_length(array(vec![lit(1), lit(2), lit(3)])), - flatten(array(vec![lit(1), lit(2), lit(3)])), + array_distinct(make_array(vec![lit(1), lit(3), lit(3), lit(2), lit(2)])), ]; // ensure expressions created with the expr api can be round tripped diff --git a/datafusion/proto/tests/cases/roundtrip_physical_plan.rs b/datafusion/proto/tests/cases/roundtrip_physical_plan.rs index a3c0b3eccd3c..3441a9f7fa11 100644 --- a/datafusion/proto/tests/cases/roundtrip_physical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_physical_plan.rs @@ -15,6 +15,10 @@ // specific language governing permissions and limitations // under the License. 
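// Illustrative sketch, not part of this diff: the scans and sinks below now take
// datafusion_common::config::TableParquetOptions instead of WriterProperties. A
// tweaked option set round-trips through protobuf::TableParquetOptions; the function
// name and sample values are ours.
fn table_parquet_options_proto_roundtrip() -> datafusion_common::Result<()> {
    use datafusion_common::config::TableParquetOptions;
    use datafusion_proto::protobuf;

    let mut opts = TableParquetOptions::default();
    opts.global.max_row_group_size = 8192;
    opts.global.created_by = "roundtrip example".to_string();

    let proto: protobuf::TableParquetOptions = (&opts).try_into()?;
    let back: TableParquetOptions = (&proto).try_into()?;
    assert_eq!(back.global.max_row_group_size, 8192);
    assert_eq!(back.global.created_by, "roundtrip example");
    Ok(())
}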
+use std::ops::Deref; +use std::sync::Arc; +use std::vec; + use arrow::csv::WriterBuilder; use datafusion::arrow::array::ArrayRef; use datafusion::arrow::compute::kernels::sort::SortOptions; @@ -32,7 +36,6 @@ use datafusion::execution::context::ExecutionProps; use datafusion::logical_expr::{ create_udf, BuiltinScalarFunction, JoinType, Operator, Volatility, }; -use datafusion::parquet::file::properties::WriterProperties; use datafusion::physical_expr::expressions::NthValueAgg; use datafusion::physical_expr::window::SlidingAggregateWindowExpr; use datafusion::physical_expr::{PhysicalSortRequirement, ScalarFunctionExpr}; @@ -66,21 +69,18 @@ use datafusion::physical_plan::{ }; use datafusion::prelude::SessionContext; use datafusion::scalar::ScalarValue; +use datafusion_common::config::TableParquetOptions; use datafusion_common::file_options::csv_writer::CsvWriterOptions; use datafusion_common::file_options::json_writer::JsonWriterOptions; -use datafusion_common::file_options::parquet_writer::ParquetWriterOptions; use datafusion_common::parsers::CompressionTypeVariant; use datafusion_common::stats::Precision; -use datafusion_common::{FileTypeWriterOptions, Result}; +use datafusion_common::Result; use datafusion_expr::{ Accumulator, AccumulatorFactoryFunction, AggregateUDF, ColumnarValue, Signature, SimpleAggregateUDF, WindowFrame, WindowFrameBound, }; use datafusion_proto::physical_plan::{AsExecutionPlan, DefaultPhysicalExtensionCodec}; use datafusion_proto::protobuf; -use std::ops::Deref; -use std::sync::Arc; -use std::vec; /// Perform a serde roundtrip and assert that the string representation of the before and after plans /// are identical. Note that this often isn't sufficient to guarantee that no information is @@ -271,6 +271,7 @@ fn roundtrip_window() -> Result<()> { "FIRST_VALUE(a) PARTITION BY [b] ORDER BY [a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", col("a", &schema)?, DataType::Int64, + false, )), &[col("b", &schema)?], &[PhysicalSortExpr { @@ -560,6 +561,7 @@ fn roundtrip_parquet_exec_with_pruning_predicate() -> Result<()> { scan_config, Some(predicate), None, + Default::default(), ))) } @@ -586,7 +588,12 @@ async fn roundtrip_parquet_exec_with_table_partition_cols() -> Result<()> { output_ordering: vec![], }; - roundtrip_test(Arc::new(ParquetExec::new(scan_config, None, None))) + roundtrip_test(Arc::new(ParquetExec::new( + scan_config, + None, + None, + Default::default(), + ))) } #[test] @@ -764,11 +771,11 @@ fn roundtrip_json_sink() -> Result<()> { output_schema: schema.clone(), table_partition_cols: vec![("plan_type".to_string(), DataType::Utf8)], overwrite: true, - file_type_writer_options: FileTypeWriterOptions::JSON(JsonWriterOptions::new( - CompressionTypeVariant::UNCOMPRESSED, - )), }; - let data_sink = Arc::new(JsonSink::new(file_sink_config)); + let data_sink = Arc::new(JsonSink::new( + file_sink_config, + JsonWriterOptions::new(CompressionTypeVariant::UNCOMPRESSED), + )); let sort_order = vec![PhysicalSortRequirement::new( Arc::new(Column::new("plan_type", 0)), Some(SortOptions { @@ -799,12 +806,11 @@ fn roundtrip_csv_sink() -> Result<()> { output_schema: schema.clone(), table_partition_cols: vec![("plan_type".to_string(), DataType::Utf8)], overwrite: true, - file_type_writer_options: FileTypeWriterOptions::CSV(CsvWriterOptions::new( - WriterBuilder::default(), - CompressionTypeVariant::ZSTD, - )), }; - let data_sink = Arc::new(CsvSink::new(file_sink_config)); + let data_sink = Arc::new(CsvSink::new( + file_sink_config, + 
CsvWriterOptions::new(WriterBuilder::default(), CompressionTypeVariant::ZSTD), + )); let sort_order = vec![PhysicalSortRequirement::new( Arc::new(Column::new("plan_type", 0)), Some(SortOptions { @@ -832,12 +838,7 @@ fn roundtrip_csv_sink() -> Result<()> { .unwrap(); assert_eq!( CompressionTypeVariant::ZSTD, - csv_sink - .config() - .file_type_writer_options - .try_into_csv() - .unwrap() - .compression + csv_sink.writer_options().compression ); Ok(()) @@ -857,11 +858,11 @@ fn roundtrip_parquet_sink() -> Result<()> { output_schema: schema.clone(), table_partition_cols: vec![("plan_type".to_string(), DataType::Utf8)], overwrite: true, - file_type_writer_options: FileTypeWriterOptions::Parquet( - ParquetWriterOptions::new(WriterProperties::default()), - ), }; - let data_sink = Arc::new(ParquetSink::new(file_sink_config)); + let data_sink = Arc::new(ParquetSink::new( + file_sink_config, + TableParquetOptions::default(), + )); let sort_order = vec![PhysicalSortRequirement::new( Arc::new(Column::new("plan_type", 0)), Some(SortOptions { diff --git a/datafusion/sql/src/expr/mod.rs b/datafusion/sql/src/expr/mod.rs index e838a4cafb2a..d45a195cb653 100644 --- a/datafusion/sql/src/expr/mod.rs +++ b/datafusion/sql/src/expr/mod.rs @@ -271,8 +271,14 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { ), SQLExpr::Cast { - expr, data_type, .. + expr, + data_type, + format, } => { + if let Some(format) = format { + return not_impl_err!("CAST with format is not supported: {format}"); + } + let dt = self.convert_data_type(&data_type)?; let expr = self.sql_expr_to_logical_expr(*expr, schema, planner_context)?; @@ -295,15 +301,23 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { } SQLExpr::TryCast { - expr, data_type, .. - } => Ok(Expr::TryCast(TryCast::new( - Box::new(self.sql_expr_to_logical_expr( - *expr, - schema, - planner_context, - )?), - self.convert_data_type(&data_type)?, - ))), + expr, + data_type, + format, + } => { + if let Some(format) = format { + return not_impl_err!("CAST with format is not supported: {format}"); + } + + Ok(Expr::TryCast(TryCast::new( + Box::new(self.sql_expr_to_logical_expr( + *expr, + schema, + planner_context, + )?), + self.convert_data_type(&data_type)?, + ))) + } SQLExpr::TypedString { data_type, value } => Ok(Expr::Cast(Cast::new( Box::new(lit(value)), @@ -478,7 +492,6 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { trim_where, trim_what, trim_characters, - .. } => self.sql_trim_to_expr( *expr, trim_where, @@ -583,8 +596,14 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { self.sql_expr_to_logical_expr(value, input_schema, planner_context) }) .collect::>>()?; - Ok(Expr::ScalarFunction(ScalarFunction::new( - BuiltinScalarFunction::Struct, + let struct_func = self + .context_provider + .get_function_meta("struct") + .ok_or_else(|| { + internal_datafusion_err!("Unable to find expected 'struct' function") + })?; + Ok(Expr::ScalarFunction(ScalarFunction::new_udf( + struct_func, args, ))) } @@ -801,7 +820,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { distinct, order_by, null_treatment, - .. + filter: None, // filter is passed in }) => Ok(Expr::AggregateFunction(expr::AggregateFunction::new( fun, args, @@ -814,7 +833,10 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { order_by, null_treatment, ))), - _ => plan_err!( + Expr::AggregateFunction(..) 
=> { + internal_err!("Expected null filter clause in aggregate function") + } + _ => internal_err!( "AggregateExpressionWithFilter expression was not an AggregateFunction" ), } diff --git a/datafusion/sql/src/expr/value.rs b/datafusion/sql/src/expr/value.rs index 15524b9ffab1..8d19b32b8e40 100644 --- a/datafusion/sql/src/expr/value.rs +++ b/datafusion/sql/src/expr/value.rs @@ -22,9 +22,7 @@ use arrow_schema::DataType; use datafusion_common::{ not_impl_err, plan_err, DFSchema, DataFusionError, Result, ScalarValue, }; -use datafusion_expr::expr::ScalarFunction; -use datafusion_expr::expr::{BinaryExpr, Placeholder}; -use datafusion_expr::BuiltinScalarFunction; +use datafusion_expr::expr::{BinaryExpr, Placeholder, ScalarFunction}; use datafusion_expr::{lit, Expr, Operator}; use log::debug; use sqlparser::ast::{BinaryOperator, Expr as SQLExpr, Interval, Value}; @@ -143,10 +141,13 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { }) .collect::>>()?; - Ok(Expr::ScalarFunction(ScalarFunction::new( - BuiltinScalarFunction::MakeArray, - values, - ))) + if let Some(udf) = self.context_provider.get_function_meta("make_array") { + Ok(Expr::ScalarFunction(ScalarFunction::new_udf(udf, values))) + } else { + not_impl_err!( + "array_expression featrue is disable, So should implement make_array UDF by yourself" + ) + } } /// Convert a SQL interval expression to a DataFusion logical plan diff --git a/datafusion/sql/src/lib.rs b/datafusion/sql/src/lib.rs index d805f61397e9..da66ee197adb 100644 --- a/datafusion/sql/src/lib.rs +++ b/datafusion/sql/src/lib.rs @@ -36,6 +36,7 @@ mod relation; mod select; mod set_expr; mod statement; +pub mod unparser; pub mod utils; mod values; diff --git a/datafusion/sql/src/statement.rs b/datafusion/sql/src/statement.rs index 35063a6cfa06..412c3b753ed5 100644 --- a/datafusion/sql/src/statement.rs +++ b/datafusion/sql/src/statement.rs @@ -16,6 +16,8 @@ // under the License. 
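// Illustrative sketch, not part of this diff: the helpers added at the bottom of
// statement.rs resolve the COPY target's file type and partition columns from the
// lowercased option map built by the planner. Function name and sample values are ours.
fn copy_option_helpers_example() -> datafusion_common::Result<()> {
    use std::collections::HashMap;

    let mut options: HashMap<String, String> = HashMap::new();
    options.insert("partition_by".to_string(), "a, b".to_string());

    // No explicit "format" option, so the target's extension decides the file type.
    let file_type = try_infer_file_type(&mut options, "s3://bucket/out.parquet")?;
    assert_eq!(file_type, datafusion_common::FileType::PARQUET);

    // "partition_by" is consumed from the map and split on commas.
    let partition_by = take_partition_by(&mut options);
    assert_eq!(partition_by, vec!["a".to_string(), "b".to_string()]);
    Ok(())
}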
use std::collections::{BTreeMap, HashMap, HashSet}; +use std::path::Path; +use std::str::FromStr; use std::sync::Arc; use crate::parser::{ @@ -28,15 +30,14 @@ use crate::planner::{ use crate::utils::normalize_ident; use arrow_schema::DataType; -use datafusion_common::file_options::StatementOptions; use datafusion_common::parsers::CompressionTypeVariant; use datafusion_common::{ exec_err, not_impl_err, plan_datafusion_err, plan_err, schema_err, unqualified_field_not_found, Column, Constraints, DFField, DFSchema, DFSchemaRef, - DataFusionError, OwnedTableReference, Result, ScalarValue, SchemaError, + DataFusionError, FileType, OwnedTableReference, Result, ScalarValue, SchemaError, SchemaReference, TableReference, ToDFSchema, }; -use datafusion_expr::dml::{CopyOptions, CopyTo}; +use datafusion_expr::dml::CopyTo; use datafusion_expr::expr_rewriter::normalize_col_with_schemas_and_ambiguity_check; use datafusion_expr::logical_plan::builder::project; use datafusion_expr::logical_plan::DdlStatement; @@ -829,25 +830,37 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { } }; - // TODO, parse options as Vec<(String, String)> to avoid this conversion - let options = statement - .options - .iter() - .map(|(s, v)| (s.to_owned(), v.to_string())) - .collect::>(); - - let mut statement_options = StatementOptions::new(options); - let file_format = statement_options.try_infer_file_type(&statement.target)?; - let partition_by = statement_options.take_partition_by(); + let mut options = HashMap::new(); + for (key, value) in statement.options { + let value_string = match value { + Value::SingleQuotedString(s) => s.to_string(), + Value::DollarQuotedString(s) => s.to_string(), + Value::UnQuotedString(s) => s.to_string(), + Value::Number(_, _) | Value::Boolean(_) => value.to_string(), + Value::DoubleQuotedString(_) + | Value::EscapedStringLiteral(_) + | Value::NationalStringLiteral(_) + | Value::SingleQuotedByteStringLiteral(_) + | Value::DoubleQuotedByteStringLiteral(_) + | Value::RawStringLiteral(_) + | Value::HexStringLiteral(_) + | Value::Null + | Value::Placeholder(_) => { + return plan_err!("Unsupported Value in COPY statement {}", value); + } + }; + options.insert(key.to_lowercase(), value_string.to_lowercase()); + } - let copy_options = CopyOptions::SQLOptions(statement_options); + let file_type = try_infer_file_type(&mut options, &statement.target)?; + let partition_by = take_partition_by(&mut options); Ok(LogicalPlan::Copy(CopyTo { input: Arc::new(input), output_url: statement.target, - file_format, + format_options: file_type.into(), partition_by, - copy_options, + options, })) } @@ -1456,3 +1469,82 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { .is_ok() } } + +/// Infers the file type for a given target based on provided options or file extension. +/// +/// This function tries to determine the file type based on the 'format' option present +/// in the provided options hashmap. If 'format' is not explicitly set, the function attempts +/// to infer the file type from the file extension of the target. It returns an error if neither +/// the format option is set nor the file extension can be determined or parsed. +/// +/// # Arguments +/// +/// * `options` - A mutable reference to a HashMap containing options where the file format +/// might be specified under the 'format' key. +/// * `target` - A string slice representing the path to the file for which the file type needs to be inferred. 
+/// +/// # Returns +/// +/// Returns `Result` which is Ok if the file type could be successfully inferred, +/// otherwise returns an error in case of failure to determine or parse the file format or extension. +/// +/// # Errors +/// +/// This function returns an error in two cases: +/// - If the 'format' option is not set and the file extension cannot be retrieved from `target`. +/// - If the file extension is found but cannot be converted into a valid string. +/// +pub fn try_infer_file_type( + options: &mut HashMap, + target: &str, +) -> Result { + let explicit_format = options.remove("format"); + let format = match explicit_format { + Some(s) => FileType::from_str(&s), + None => { + // try to infer file format from file extension + let extension: &str = &Path::new(target) + .extension() + .ok_or(DataFusionError::Configuration( + "Format not explicitly set and unable to get file extension!" + .to_string(), + ))? + .to_str() + .ok_or(DataFusionError::Configuration( + "Format not explicitly set and failed to parse file extension!" + .to_string(), + ))? + .to_lowercase(); + + FileType::from_str(extension) + } + }?; + + Ok(format) +} + +/// Extracts and parses the 'partition_by' option from a provided options hashmap. +/// +/// This function looks for a 'partition_by' key in the options hashmap. If found, +/// it splits the value by commas, trims each resulting string, and replaces double +/// single quotes with a single quote. It returns a vector of partition column names. +/// +/// # Arguments +/// +/// * `options` - A mutable reference to a HashMap containing options where 'partition_by' +/// might be specified. +/// +/// # Returns +/// +/// Returns a `Vec` containing partition column names. If the 'partition_by' option +/// is not present, returns an empty vector. +pub fn take_partition_by(options: &mut HashMap) -> Vec { + let partition_by = options.remove("partition_by"); + match partition_by { + Some(part_cols) => part_cols + .split(',') + .map(|s| s.trim().replace("''", "'")) + .collect::>(), + None => vec![], + } +} diff --git a/datafusion/sql/src/unparser/dialect.rs b/datafusion/sql/src/unparser/dialect.rs new file mode 100644 index 000000000000..3af33ad0afda --- /dev/null +++ b/datafusion/sql/src/unparser/dialect.rs @@ -0,0 +1,73 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +/// Dialect is used to capture dialect specific syntax. 
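// Illustrative sketch, not part of this diff: a downstream crate can control the
// unparser's identifier quoting by implementing the trait defined below (or by using
// the CustomDialect further down in this file). The dialect name here is a made-up example.
struct BacktickDialect;

impl Dialect for BacktickDialect {
    fn identifier_quote_style(&self) -> Option<char> {
        Some('`')
    }
}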
+/// Note: this trait will eventually be replaced by the Dialect in the SQLparser package +/// +/// See +pub trait Dialect { + fn identifier_quote_style(&self) -> Option<char>; +} +pub struct DefaultDialect {} + +impl Dialect for DefaultDialect { + fn identifier_quote_style(&self) -> Option<char> { + None + } +} + +pub struct PostgreSqlDialect {} + +impl Dialect for PostgreSqlDialect { + fn identifier_quote_style(&self) -> Option<char> { + Some('"') + } +} + +pub struct MySqlDialect {} + +impl Dialect for MySqlDialect { + fn identifier_quote_style(&self) -> Option<char> { + Some('`') + } +} + +pub struct SqliteDialect {} + +impl Dialect for SqliteDialect { + fn identifier_quote_style(&self) -> Option<char> { + Some('`') + } +} + +pub struct CustomDialect { + identifier_quote_style: Option<char>, +} + +impl CustomDialect { + pub fn new(identifier_quote_style: Option<char>) -> Self { + Self { + identifier_quote_style, + } + } +} + +impl Dialect for CustomDialect { + fn identifier_quote_style(&self) -> Option<char> { + self.identifier_quote_style + } +} diff --git a/datafusion/sql/src/unparser/expr.rs b/datafusion/sql/src/unparser/expr.rs new file mode 100644 index 000000000000..bb14c8a70739 --- /dev/null +++ b/datafusion/sql/src/unparser/expr.rs @@ -0,0 +1,355 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use datafusion_common::{not_impl_err, Column, Result, ScalarValue}; +use datafusion_expr::{ + expr::{Alias, InList, ScalarFunction, WindowFunction}, + Between, BinaryExpr, Case, Cast, Expr, Like, Operator, +}; +use sqlparser::ast; + +use super::Unparser; + +/// Convert a DataFusion [`Expr`] to `sqlparser::ast::Expr` +/// +/// This function is the opposite of `SqlToRel::sql_to_expr` and can +/// be used to, among other things, convert `Expr`s to strings. +/// +/// # Example +/// ``` +/// use datafusion_expr::{col, lit}; +/// use datafusion_sql::unparser::expr_to_sql; +/// let expr = col("a").gt(lit(4)); +/// let sql = expr_to_sql(&expr).unwrap(); +/// +/// assert_eq!(format!("{}", sql), "a > 4") +/// ``` +pub fn expr_to_sql(expr: &Expr) -> Result<ast::Expr> { + let unparser = Unparser::default(); + unparser.expr_to_sql(expr) +} + +impl Unparser<'_> { + pub fn expr_to_sql(&self, expr: &Expr) -> Result<ast::Expr> { + match expr { + Expr::InList(InList { + expr, + list: _, + negated: _, + }) => { + not_impl_err!("Unsupported expression: {expr:?}") + } + Expr::ScalarFunction(ScalarFunction { ..
}) => { + not_impl_err!("Unsupported expression: {expr:?}") + } + Expr::Between(Between { + expr, + negated: _, + low: _, + high: _, + }) => { + not_impl_err!("Unsupported expression: {expr:?}") + } + Expr::Column(col) => self.col_to_sql(col), + Expr::BinaryExpr(BinaryExpr { left, op, right }) => { + let l = self.expr_to_sql(left.as_ref())?; + let r = self.expr_to_sql(right.as_ref())?; + let op = self.op_to_sql(op)?; + + Ok(self.binary_op_to_sql(l, r, op)) + } + Expr::Case(Case { + expr, + when_then_expr: _, + else_expr: _, + }) => { + not_impl_err!("Unsupported expression: {expr:?}") + } + Expr::Cast(Cast { expr, data_type: _ }) => { + not_impl_err!("Unsupported expression: {expr:?}") + } + Expr::Literal(value) => Ok(ast::Expr::Value(self.scalar_to_sql(value)?)), + Expr::Alias(Alias { expr, name: _, .. }) => self.expr_to_sql(expr), + Expr::WindowFunction(WindowFunction { + fun: _, + args: _, + partition_by: _, + order_by: _, + window_frame: _, + null_treatment: _, + }) => { + not_impl_err!("Unsupported expression: {expr:?}") + } + Expr::Like(Like { + negated: _, + expr, + pattern: _, + escape_char: _, + case_insensitive: _, + }) => { + not_impl_err!("Unsupported expression: {expr:?}") + } + _ => not_impl_err!("Unsupported expression: {expr:?}"), + } + } + + fn col_to_sql(&self, col: &Column) -> Result { + if let Some(table_ref) = &col.relation { + let mut id = table_ref.to_vec(); + id.push(col.name.to_string()); + return Ok(ast::Expr::CompoundIdentifier( + id.iter().map(|i| self.new_ident(i.to_string())).collect(), + )); + } + Ok(ast::Expr::Identifier(self.new_ident(col.name.to_string()))) + } + + fn new_ident(&self, str: String) -> ast::Ident { + ast::Ident { + value: str, + quote_style: self.dialect.identifier_quote_style(), + } + } + + fn binary_op_to_sql( + &self, + lhs: ast::Expr, + rhs: ast::Expr, + op: ast::BinaryOperator, + ) -> ast::Expr { + ast::Expr::BinaryOp { + left: Box::new(lhs), + op, + right: Box::new(rhs), + } + } + + fn op_to_sql(&self, op: &Operator) -> Result { + match op { + Operator::Eq => Ok(ast::BinaryOperator::Eq), + Operator::NotEq => Ok(ast::BinaryOperator::NotEq), + Operator::Lt => Ok(ast::BinaryOperator::Lt), + Operator::LtEq => Ok(ast::BinaryOperator::LtEq), + Operator::Gt => Ok(ast::BinaryOperator::Gt), + Operator::GtEq => Ok(ast::BinaryOperator::GtEq), + Operator::Plus => Ok(ast::BinaryOperator::Plus), + Operator::Minus => Ok(ast::BinaryOperator::Minus), + Operator::Multiply => Ok(ast::BinaryOperator::Multiply), + Operator::Divide => Ok(ast::BinaryOperator::Divide), + Operator::Modulo => Ok(ast::BinaryOperator::Modulo), + Operator::And => Ok(ast::BinaryOperator::And), + Operator::Or => Ok(ast::BinaryOperator::Or), + Operator::IsDistinctFrom => not_impl_err!("unsupported operation: {op:?}"), + Operator::IsNotDistinctFrom => not_impl_err!("unsupported operation: {op:?}"), + Operator::RegexMatch => Ok(ast::BinaryOperator::PGRegexMatch), + Operator::RegexIMatch => Ok(ast::BinaryOperator::PGRegexIMatch), + Operator::RegexNotMatch => Ok(ast::BinaryOperator::PGRegexNotMatch), + Operator::RegexNotIMatch => Ok(ast::BinaryOperator::PGRegexNotIMatch), + Operator::ILikeMatch => Ok(ast::BinaryOperator::PGILikeMatch), + Operator::NotLikeMatch => Ok(ast::BinaryOperator::PGNotLikeMatch), + Operator::LikeMatch => Ok(ast::BinaryOperator::PGLikeMatch), + Operator::NotILikeMatch => Ok(ast::BinaryOperator::PGNotILikeMatch), + Operator::BitwiseAnd => Ok(ast::BinaryOperator::BitwiseAnd), + Operator::BitwiseOr => Ok(ast::BinaryOperator::BitwiseOr), + Operator::BitwiseXor => 
Ok(ast::BinaryOperator::BitwiseXor), + Operator::BitwiseShiftRight => Ok(ast::BinaryOperator::PGBitwiseShiftRight), + Operator::BitwiseShiftLeft => Ok(ast::BinaryOperator::PGBitwiseShiftLeft), + Operator::StringConcat => Ok(ast::BinaryOperator::StringConcat), + Operator::AtArrow => not_impl_err!("unsupported operation: {op:?}"), + Operator::ArrowAt => not_impl_err!("unsupported operation: {op:?}"), + } + } + + fn scalar_to_sql(&self, v: &ScalarValue) -> Result { + match v { + ScalarValue::Null => Ok(ast::Value::Null), + ScalarValue::Boolean(Some(b)) => Ok(ast::Value::Boolean(b.to_owned())), + ScalarValue::Boolean(None) => Ok(ast::Value::Null), + ScalarValue::Float32(Some(f)) => Ok(ast::Value::Number(f.to_string(), false)), + ScalarValue::Float32(None) => Ok(ast::Value::Null), + ScalarValue::Float64(Some(f)) => Ok(ast::Value::Number(f.to_string(), false)), + ScalarValue::Float64(None) => Ok(ast::Value::Null), + ScalarValue::Decimal128(Some(_), ..) => { + not_impl_err!("Unsupported scalar: {v:?}") + } + ScalarValue::Decimal128(None, ..) => Ok(ast::Value::Null), + ScalarValue::Decimal256(Some(_), ..) => { + not_impl_err!("Unsupported scalar: {v:?}") + } + ScalarValue::Decimal256(None, ..) => Ok(ast::Value::Null), + ScalarValue::Int8(Some(i)) => Ok(ast::Value::Number(i.to_string(), false)), + ScalarValue::Int8(None) => Ok(ast::Value::Null), + ScalarValue::Int16(Some(i)) => Ok(ast::Value::Number(i.to_string(), false)), + ScalarValue::Int16(None) => Ok(ast::Value::Null), + ScalarValue::Int32(Some(i)) => Ok(ast::Value::Number(i.to_string(), false)), + ScalarValue::Int32(None) => Ok(ast::Value::Null), + ScalarValue::Int64(Some(i)) => Ok(ast::Value::Number(i.to_string(), false)), + ScalarValue::Int64(None) => Ok(ast::Value::Null), + ScalarValue::UInt8(Some(ui)) => Ok(ast::Value::Number(ui.to_string(), false)), + ScalarValue::UInt8(None) => Ok(ast::Value::Null), + ScalarValue::UInt16(Some(ui)) => { + Ok(ast::Value::Number(ui.to_string(), false)) + } + ScalarValue::UInt16(None) => Ok(ast::Value::Null), + ScalarValue::UInt32(Some(ui)) => { + Ok(ast::Value::Number(ui.to_string(), false)) + } + ScalarValue::UInt32(None) => Ok(ast::Value::Null), + ScalarValue::UInt64(Some(ui)) => { + Ok(ast::Value::Number(ui.to_string(), false)) + } + ScalarValue::UInt64(None) => Ok(ast::Value::Null), + ScalarValue::Utf8(Some(str)) => { + Ok(ast::Value::SingleQuotedString(str.to_string())) + } + ScalarValue::Utf8(None) => Ok(ast::Value::Null), + ScalarValue::LargeUtf8(Some(str)) => { + Ok(ast::Value::SingleQuotedString(str.to_string())) + } + ScalarValue::LargeUtf8(None) => Ok(ast::Value::Null), + ScalarValue::Binary(Some(_)) => not_impl_err!("Unsupported scalar: {v:?}"), + ScalarValue::Binary(None) => Ok(ast::Value::Null), + ScalarValue::FixedSizeBinary(..) 
=> { + not_impl_err!("Unsupported scalar: {v:?}") + } + ScalarValue::LargeBinary(Some(_)) => { + not_impl_err!("Unsupported scalar: {v:?}") + } + ScalarValue::LargeBinary(None) => Ok(ast::Value::Null), + ScalarValue::FixedSizeList(_a) => not_impl_err!("Unsupported scalar: {v:?}"), + ScalarValue::List(_a) => not_impl_err!("Unsupported scalar: {v:?}"), + ScalarValue::LargeList(_a) => not_impl_err!("Unsupported scalar: {v:?}"), + ScalarValue::Date32(Some(_d)) => not_impl_err!("Unsupported scalar: {v:?}"), + ScalarValue::Date32(None) => Ok(ast::Value::Null), + ScalarValue::Date64(Some(_d)) => not_impl_err!("Unsupported scalar: {v:?}"), + ScalarValue::Date64(None) => Ok(ast::Value::Null), + ScalarValue::Time32Second(Some(_t)) => { + not_impl_err!("Unsupported scalar: {v:?}") + } + ScalarValue::Time32Second(None) => Ok(ast::Value::Null), + ScalarValue::Time32Millisecond(Some(_t)) => { + not_impl_err!("Unsupported scalar: {v:?}") + } + ScalarValue::Time32Millisecond(None) => Ok(ast::Value::Null), + ScalarValue::Time64Microsecond(Some(_t)) => { + not_impl_err!("Unsupported scalar: {v:?}") + } + ScalarValue::Time64Microsecond(None) => Ok(ast::Value::Null), + ScalarValue::Time64Nanosecond(Some(_t)) => { + not_impl_err!("Unsupported scalar: {v:?}") + } + ScalarValue::Time64Nanosecond(None) => Ok(ast::Value::Null), + ScalarValue::TimestampSecond(Some(_ts), _) => { + not_impl_err!("Unsupported scalar: {v:?}") + } + ScalarValue::TimestampSecond(None, _) => Ok(ast::Value::Null), + ScalarValue::TimestampMillisecond(Some(_ts), _) => { + not_impl_err!("Unsupported scalar: {v:?}") + } + ScalarValue::TimestampMillisecond(None, _) => Ok(ast::Value::Null), + ScalarValue::TimestampMicrosecond(Some(_ts), _) => { + not_impl_err!("Unsupported scalar: {v:?}") + } + ScalarValue::TimestampMicrosecond(None, _) => Ok(ast::Value::Null), + ScalarValue::TimestampNanosecond(Some(_ts), _) => { + not_impl_err!("Unsupported scalar: {v:?}") + } + ScalarValue::TimestampNanosecond(None, _) => Ok(ast::Value::Null), + ScalarValue::IntervalYearMonth(Some(_i)) => { + not_impl_err!("Unsupported scalar: {v:?}") + } + ScalarValue::IntervalYearMonth(None) => Ok(ast::Value::Null), + ScalarValue::IntervalDayTime(Some(_i)) => { + not_impl_err!("Unsupported scalar: {v:?}") + } + ScalarValue::IntervalDayTime(None) => Ok(ast::Value::Null), + ScalarValue::IntervalMonthDayNano(Some(_i)) => { + not_impl_err!("Unsupported scalar: {v:?}") + } + ScalarValue::IntervalMonthDayNano(None) => Ok(ast::Value::Null), + ScalarValue::DurationSecond(Some(_d)) => { + not_impl_err!("Unsupported scalar: {v:?}") + } + ScalarValue::DurationSecond(None) => Ok(ast::Value::Null), + ScalarValue::DurationMillisecond(Some(_d)) => { + not_impl_err!("Unsupported scalar: {v:?}") + } + ScalarValue::DurationMillisecond(None) => Ok(ast::Value::Null), + ScalarValue::DurationMicrosecond(Some(_d)) => { + not_impl_err!("Unsupported scalar: {v:?}") + } + ScalarValue::DurationMicrosecond(None) => Ok(ast::Value::Null), + ScalarValue::DurationNanosecond(Some(_d)) => { + not_impl_err!("Unsupported scalar: {v:?}") + } + ScalarValue::DurationNanosecond(None) => Ok(ast::Value::Null), + ScalarValue::Struct(_) => not_impl_err!("Unsupported scalar: {v:?}"), + ScalarValue::Dictionary(..) 
=> not_impl_err!("Unsupported scalar: {v:?}"), + } + } +} + +#[cfg(test)] +mod tests { + use datafusion_common::TableReference; + use datafusion_expr::{col, lit}; + + use crate::unparser::dialect::CustomDialect; + + use super::*; + + #[test] + fn expr_to_sql_ok() -> Result<()> { + let tests: Vec<(Expr, &str)> = vec![ + (col("a").gt(lit(4)), r#"a > 4"#), + ( + Expr::Column(Column { + relation: Some(TableReference::partial("a", "b")), + name: "c".to_string(), + }) + .gt(lit(4)), + r#"a.b.c > 4"#, + ), + ]; + + for (expr, expected) in tests { + let ast = expr_to_sql(&expr)?; + + let actual = format!("{}", ast); + + assert_eq!(actual, expected); + } + + Ok(()) + } + + #[test] + fn custom_dialect() -> Result<()> { + let dialect = CustomDialect::new(Some('\'')); + let unparser = Unparser::new(&dialect); + + let expr = col("a").gt(lit(4)); + let ast = unparser.expr_to_sql(&expr)?; + + let actual = format!("{}", ast); + + let expected = r#"'a' > 4"#; + assert_eq!(actual, expected); + + Ok(()) + } +} diff --git a/datafusion/common/src/file_options/parse_utils.rs b/datafusion/sql/src/unparser/mod.rs similarity index 58% rename from datafusion/common/src/file_options/parse_utils.rs rename to datafusion/sql/src/unparser/mod.rs index 38cf5eb489f7..77a9de0975ed 100644 --- a/datafusion/common/src/file_options/parse_utils.rs +++ b/datafusion/sql/src/unparser/mod.rs @@ -15,17 +15,27 @@ // specific language governing permissions and limitations // under the License. -//! Functions for parsing arbitrary passed strings to valid file_option settings -use crate::{DataFusionError, Result}; +mod expr; -/// Converts a String option to a bool, or returns an error if not a valid bool string. -pub(crate) fn parse_boolean_string(option: &str, value: String) -> Result { - match value.to_lowercase().as_str() { - "true" => Ok(true), - "false" => Ok(false), - _ => Err(DataFusionError::Configuration(format!( - "Unsupported value {value} for option {option}! \ - Valid values are true or false!" - ))), +pub use expr::expr_to_sql; + +use self::dialect::{DefaultDialect, Dialect}; +pub mod dialect; + +pub struct Unparser<'a> { + dialect: &'a dyn Dialect, +} + +impl<'a> Unparser<'a> { + pub fn new(dialect: &'a dyn Dialect) -> Self { + Self { dialect } + } +} + +impl<'a> Default for Unparser<'a> { + fn default() -> Self { + Self { + dialect: &DefaultDialect {}, + } } } diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt index 266f04580f11..19bcf6024b50 100644 --- a/datafusion/sqllogictest/test_files/aggregate.slt +++ b/datafusion/sqllogictest/test_files/aggregate.slt @@ -182,7 +182,7 @@ CREATE TABLE array_agg_distinct_list_table AS VALUES ('b', [0,1]) ; -# Apply array_sort to have determinisitic result, higher dimension nested array also works but not for array sort, +# Apply array_sort to have deterministic result, higher dimension nested array also works but not for array sort, # so they are covered in `datafusion/physical-expr/src/aggregate/array_agg_distinct.rs` query ?? 
select array_sort(c1), array_sort(c2) from ( @@ -1359,7 +1359,7 @@ NULL 4 29 1.260869565217 123 -117 23 NULL 5 -194 -13.857142857143 118 -101 14 NULL NULL 781 7.81 125 -117 100 -# TODO: array_agg_distinct output is non-determinisitic -- rewrite with array_sort(list_sort) +# TODO: array_agg_distinct output is non-deterministic -- rewrite with array_sort(list_sort) # unnest is also not available, so manually unnesting via CROSS JOIN # additional count(1) forces array_agg_distinct instead of array_agg over aggregated by c2 data # @@ -2255,9 +2255,10 @@ select median(a) from (select 1 as a where 1=0); ---- NULL -query error DataFusion error: Execution error: aggregate function needs at least one non-null element +query I select approx_median(a) from (select 1 as a where 1=0); - +---- +NULL # aggregate_decimal_sum query RT diff --git a/datafusion/sqllogictest/test_files/array.slt b/datafusion/sqllogictest/test_files/array.slt index 434fe8c959e6..b729e5c10f3d 100644 --- a/datafusion/sqllogictest/test_files/array.slt +++ b/datafusion/sqllogictest/test_files/array.slt @@ -5598,6 +5598,39 @@ select ---- [] [9223372036854775807] [] [9223372036854775806] [] [-9223372036854775807] [] [-9223372036854775808] +# Test range(start, stop, step) with NULL values +query ? +select range(start, stop, step) from + (values (1), (NULL)) as start_values(start), + (values (10), (NULL)) as stop_values(stop), + (values (3), (NULL)) as step_values(step) +where start is null or stop is null or step is null +---- +NULL +NULL +NULL +NULL +NULL +NULL +NULL + +# Test range(start, stop) with NULL values +query ? +select range(start, stop) from + (values (1), (NULL)) as start_values(start), + (values (10), (NULL)) as stop_values(stop) +where start is null or stop is null +---- +NULL +NULL +NULL + +# Test range(stop) with NULL value +query ? +select range(NULL) +---- +NULL + ## should throw error query error select range(DATE '1992-09-01', NULL, INTERVAL '1' YEAR); @@ -5678,6 +5711,39 @@ select ---- [9223372036854775807] [9223372036854775807] [-9223372036854775808] [-9223372036854775808] +# Test generate_series(start, stop, step) with NULL values +query ? +select generate_series(start, stop, step) from + (values (1), (NULL)) as start_values(start), + (values (10), (NULL)) as stop_values(stop), + (values (3), (NULL)) as step_values(step) +where start is null or stop is null or step is null +---- +NULL +NULL +NULL +NULL +NULL +NULL +NULL + +# Test generate_series(start, stop) with NULL values +query ? +select generate_series(start, stop) from + (values (1), (NULL)) as start_values(start), + (values (10), (NULL)) as stop_values(stop) +where start is null or stop is null +---- +NULL +NULL +NULL + +# Test generate_series(stop) with NULL value +query ? 
+select generate_series(NULL) +---- +NULL + ## array_except diff --git a/datafusion/sqllogictest/test_files/copy.slt b/datafusion/sqllogictest/test_files/copy.slt index 463b51c940d1..df23a993ebce 100644 --- a/datafusion/sqllogictest/test_files/copy.slt +++ b/datafusion/sqllogictest/test_files/copy.slt @@ -21,13 +21,13 @@ create table source_table(col1 integer, col2 varchar) as values (1, 'Foo'), (2, # Copy to directory as multiple files query IT -COPY source_table TO 'test_files/scratch/copy/table/' (format parquet, compression 'zstd(10)'); +COPY source_table TO 'test_files/scratch/copy/table/' (format parquet, 'parquet.compression' 'zstd(10)'); ---- 2 # Copy to directory as partitioned files query IT -COPY source_table TO 'test_files/scratch/copy/partitioned_table1/' (format parquet, compression 'zstd(10)', partition_by 'col2'); +COPY source_table TO 'test_files/scratch/copy/partitioned_table1/' (format parquet, 'parquet.compression' 'zstd(10)', partition_by 'col2'); ---- 2 @@ -55,7 +55,7 @@ select * from validate_partitioned_parquet_bar order by col1; # Copy to directory as partitioned files query ITT COPY (values (1, 'a', 'x'), (2, 'b', 'y'), (3, 'c', 'z')) TO 'test_files/scratch/copy/partitioned_table2/' -(format parquet, compression 'zstd(10)', partition_by 'column2, column3'); +(format parquet, partition_by 'column2, column3', 'parquet.compression' 'zstd(10)'); ---- 3 @@ -83,7 +83,7 @@ select * from validate_partitioned_parquet_a_x order by column1; # Copy to directory as partitioned files query TTT COPY (values ('1', 'a', 'x'), ('2', 'b', 'y'), ('3', 'c', 'z')) TO 'test_files/scratch/copy/partitioned_table3/' -(format parquet, compression 'zstd(10)', partition_by 'column1, column3'); +(format parquet, 'parquet.compression' 'zstd(10)', partition_by 'column1, column3'); ---- 3 @@ -139,10 +139,10 @@ LOCATION 'test_files/scratch/copy/escape_quote/' PARTITIONED BY ("'test2'", "'te #select * from validate_partitioned_escape_quote; query TT -EXPLAIN COPY source_table TO 'test_files/scratch/copy/table/' (format parquet, compression 'zstd(10)'); +EXPLAIN COPY source_table TO 'test_files/scratch/copy/table/' (format parquet, 'parquet.compression' 'zstd(10)'); ---- logical_plan -CopyTo: format=parquet output_url=test_files/scratch/copy/table/ options: (compression 'zstd(10)') +CopyTo: format=parquet output_url=test_files/scratch/copy/table/ options: (parquet.compression zstd(10)) --TableScan: source_table projection=[col1, col2] physical_plan FileSinkExec: sink=ParquetSink(file_groups=[]) @@ -152,10 +152,15 @@ FileSinkExec: sink=ParquetSink(file_groups=[]) query error DataFusion error: Invalid or Unsupported Configuration: Format not explicitly set and unable to get file extension! 
EXPLAIN COPY source_table to 'test_files/scratch/copy/table/' -query error DataFusion error: SQL error: ParserError\("Expected end of statement, found: query"\) -EXPLAIN COPY source_table to 'test_files/scratch/copy/table/' (format parquet) query TT -EXPLAIN COPY source_table to 'test_files/scratch/copy/table/' (format parquet, per_thread_output true) +EXPLAIN COPY source_table to 'test_files/scratch/copy/table/' (format parquet) +---- +logical_plan +CopyTo: format=parquet output_url=test_files/scratch/copy/table/ options: () +--TableScan: source_table projection=[col1, col2] +physical_plan +FileSinkExec: sink=ParquetSink(file_groups=[]) +--MemoryExec: partitions=1, partition_sizes=[1] # Copy more files to directory via query query IT @@ -251,30 +256,30 @@ query IT COPY source_table TO 'test_files/scratch/copy/table_with_options/' (format parquet, -compression snappy, -'compression::col1' 'zstd(5)', -'compression::col2' snappy, -max_row_group_size 12345, -data_pagesize_limit 1234, -write_batch_size 1234, -writer_version 2.0, -dictionary_page_size_limit 123, -created_by 'DF copy.slt', -column_index_truncate_length 123, -data_page_row_count_limit 1234, -bloom_filter_enabled true, -'bloom_filter_enabled::col1' false, -'bloom_filter_fpp::col2' 0.456, -'bloom_filter_ndv::col2' 456, -encoding plain, -'encoding::col1' DELTA_BINARY_PACKED, -'dictionary_enabled::col2' true, -dictionary_enabled false, -statistics_enabled page, -'statistics_enabled::col2' none, -max_statistics_size 123, -bloom_filter_fpp 0.001, -bloom_filter_ndv 100 +'parquet.compression' snappy, +'parquet.compression::col1' 'zstd(5)', +'parquet.compression::col2' snappy, +'parquet.max_row_group_size' 12345, +'parquet.data_pagesize_limit' 1234, +'parquet.write_batch_size' 1234, +'parquet.writer_version' 2.0, +'parquet.dictionary_page_size_limit' 123, +'parquet.created_by' 'DF copy.slt', +'parquet.column_index_truncate_length' 123, +'parquet.data_page_row_count_limit' 1234, +'parquet.bloom_filter_enabled' true, +'parquet.bloom_filter_enabled::col1' false, +'parquet.bloom_filter_fpp::col2' 0.456, +'parquet.bloom_filter_ndv::col2' 456, +'parquet.encoding' plain, +'parquet.encoding::col1' DELTA_BINARY_PACKED, +'parquet.dictionary_enabled::col2' true, +'parquet.dictionary_enabled' false, +'parquet.statistics_enabled' page, +'parquet.statistics_enabled::col2' none, +'parquet.max_statistics_size' 123, +'parquet.bloom_filter_fpp' 0.001, +'parquet.bloom_filter_ndv' 100 ) ---- 2 @@ -307,7 +312,7 @@ select * from validate_parquet_single; # copy from table to folder of compressed json files query IT -COPY source_table to 'test_files/scratch/copy/table_json_gz' (format json, compression 'gzip'); +COPY source_table to 'test_files/scratch/copy/table_json_gz' (format json, 'json.compression' gzip); ---- 2 @@ -323,7 +328,7 @@ select * from validate_json_gz; # copy from table to folder of compressed csv files query IT -COPY source_table to 'test_files/scratch/copy/table_csv' (format csv, header false, compression 'gzip'); +COPY source_table to 'test_files/scratch/copy/table_csv' (format csv, 'csv.has_header' false, 'csv.compression' gzip); ---- 2 @@ -390,11 +395,11 @@ query IT COPY source_table to 'test_files/scratch/copy/table_csv_with_options' (format csv, -header false, -compression 'uncompressed', -datetime_format '%FT%H:%M:%S.%9f', -delimiter ';', -null_value 'NULLVAL'); +'csv.has_header' false, +'csv.compression' uncompressed, +'csv.datetime_format' '%FT%H:%M:%S.%9f', +'csv.delimiter' ';', +'csv.null_value' 'NULLVAL'); ---- 2 @@ -469,8 +474,8 @@ 
select * from validate_arrow; # Error cases: # Copy from table with options -query error DataFusion error: Invalid or Unsupported Configuration: Found unsupported option row_group_size with value 55 for JSON format! -COPY source_table to 'test_files/scratch/copy/table.json' (row_group_size 55); +query error DataFusion error: Invalid or Unsupported Configuration: Config value "row_group_size" not found on JsonOptions +COPY source_table to 'test_files/scratch/copy/table.json' ('json.row_group_size' 55); # Incomplete statement query error DataFusion error: SQL error: ParserError\("Expected \), found: EOF"\) diff --git a/datafusion/sqllogictest/test_files/create_external_table.slt b/datafusion/sqllogictest/test_files/create_external_table.slt index c08d5a55c366..3b85dd9e986f 100644 --- a/datafusion/sqllogictest/test_files/create_external_table.slt +++ b/datafusion/sqllogictest/test_files/create_external_table.slt @@ -99,3 +99,10 @@ CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV WITH LOCATION 'foo.csv'; # Unrecognized random clause statement error DataFusion error: SQL error: ParserError\("Unexpected token FOOBAR"\) CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV FOOBAR BARBAR BARFOO LOCATION 'foo.csv'; + +# Conflicting options +statement error DataFusion error: Invalid or Unsupported Configuration: Key "parquet.column_index_truncate_length" is not applicable for CSV format +CREATE EXTERNAL TABLE csv_table (column1 int) +STORED AS CSV +LOCATION 'foo.csv' +OPTIONS ('csv.delimiter' ';', 'parquet.column_index_truncate_length' '123') diff --git a/datafusion/sqllogictest/test_files/csv_files.slt b/datafusion/sqllogictest/test_files/csv_files.slt index 0e8171b5a870..7b299c0cf143 100644 --- a/datafusion/sqllogictest/test_files/csv_files.slt +++ b/datafusion/sqllogictest/test_files/csv_files.slt @@ -23,7 +23,7 @@ c2 VARCHAR ) STORED AS CSV WITH HEADER ROW DELIMITER ',' -OPTIONS ('quote' '~') +OPTIONS ('csv.quote' '~') LOCATION '../core/tests/data/quote.csv'; statement ok @@ -33,7 +33,7 @@ c2 VARCHAR ) STORED AS CSV WITH HEADER ROW DELIMITER ',' -OPTIONS ('escape' '\"') +OPTIONS ('csv.escape' '\') LOCATION '../core/tests/data/escape.csv'; query TT @@ -64,6 +64,31 @@ id7 value"7 id8 value"8 id9 value"9 +statement ok +CREATE EXTERNAL TABLE csv_with_escape_2 ( +c1 VARCHAR, +c2 VARCHAR +) STORED AS CSV +WITH HEADER ROW +DELIMITER ',' +OPTIONS ('csv.escape' '"') +LOCATION '../core/tests/data/escape.csv'; + +# TODO: Validate this with better data. 
+query TT +select * from csv_with_escape_2; +---- +id0 value\0" +id1 value\1" +id2 value\2" +id3 value\3" +id4 value\4" +id5 value\5" +id6 value\6" +id7 value\7" +id8 value\8" +id9 value\9" + # Read partitioned csv statement ok diff --git a/datafusion/sqllogictest/test_files/repartition_scan.slt b/datafusion/sqllogictest/test_files/repartition_scan.slt index 9b4b449340b0..15fe670a454c 100644 --- a/datafusion/sqllogictest/test_files/repartition_scan.slt +++ b/datafusion/sqllogictest/test_files/repartition_scan.slt @@ -158,7 +158,7 @@ DROP TABLE parquet_table_with_order; # create a single csv file statement ok COPY (VALUES (1), (2), (3), (4), (5)) TO 'test_files/scratch/repartition_scan/csv_table/1.csv' -(FORMAT csv, HEADER true); +(FORMAT csv, 'csv.has_header' true); statement ok CREATE EXTERNAL TABLE csv_table(column1 int) diff --git a/datafusion/sqllogictest/test_files/set_variable.slt b/datafusion/sqllogictest/test_files/set_variable.slt index 440fb2c6ef2b..fccd144a37fb 100644 --- a/datafusion/sqllogictest/test_files/set_variable.slt +++ b/datafusion/sqllogictest/test_files/set_variable.slt @@ -65,7 +65,7 @@ SHOW datafusion.execution.batch_size datafusion.execution.batch_size 1 # set variable unknown variable -statement error DataFusion error: External error: could not find config namespace for key "aabbcc" +statement error DataFusion error: Invalid or Unsupported Configuration: could not find config namespace for key "aabbcc" SET aabbcc to '1' # set bool variable diff --git a/datafusion/sqllogictest/test_files/window.slt b/datafusion/sqllogictest/test_files/window.slt index a474da85b3d4..39c105a4dcce 100644 --- a/datafusion/sqllogictest/test_files/window.slt +++ b/datafusion/sqllogictest/test_files/window.slt @@ -4299,11 +4299,283 @@ LIMIT 5; 14 94 # Tests schema and data are in sync for mixed nulls and not nulls values for builtin window function -query T rowsort -select lag(a) over () as x1 +query T +select lag(a) over (order by a ASC NULLS FIRST) as x1 from (select 2 id, 'b' a union all select 1 id, null a union all select 3 id, null); ---- NULL NULL -b +NULL + +# Test for ignore nulls in FIRST_VALUE +statement ok +CREATE TABLE t AS VALUES (null::bigint), (3), (4); + +query I +SELECT FIRST_VALUE(column1) OVER() FROM t; +---- +NULL +NULL +NULL + +query I +SELECT FIRST_VALUE(column1) RESPECT NULLS OVER() FROM t; +---- +NULL +NULL +NULL + +query I +SELECT FIRST_VALUE(column1) IGNORE NULLS OVER() FROM t; +---- +3 +3 +3 + +statement ok +DROP TABLE t; + +# Test for ignore nulls with ORDER BY in FIRST_VALUE +statement ok +CREATE TABLE t AS VALUES (3, 4), (4, 3), (null::bigint, 1), (null::bigint, 2), (5, 5), (6, 6); + +query II +SELECT column1, column2 FROM t ORDER BY column2; +---- +NULL 1 +NULL 2 +4 3 +3 4 +5 5 +6 6 + +query II +SELECT FIRST_VALUE(column1) OVER(ORDER BY column2), column2 FROM t; +---- +NULL 1 +NULL 2 +NULL 3 +NULL 4 +NULL 5 +NULL 6 + +query II +SELECT FIRST_VALUE(column1) RESPECT NULLS OVER(ORDER BY column2), column2 FROM t; +---- +NULL 1 +NULL 2 +NULL 3 +NULL 4 +NULL 5 +NULL 6 + +query II +SELECT FIRST_VALUE(column1) IGNORE NULLS OVER(ORDER BY column2), column2 FROM t; +---- +NULL 1 +NULL 2 +4 3 +4 4 +4 5 +4 6 + +query II +SELECT FIRST_VALUE(column1)OVER(ORDER BY column2 RANGE BETWEEN 1 PRECEDING AND 1 FOLLOWING), column2 FROM t; +---- +NULL 1 +NULL 2 +NULL 3 +4 4 +3 5 +5 6 + +query II +SELECT FIRST_VALUE(column1) IGNORE NULLS OVER(ORDER BY column2 RANGE BETWEEN 1 PRECEDING AND 1 FOLLOWING), column2 FROM t; +---- +NULL 1 +4 2 +4 3 +4 4 +3 5 +5 6 + +statement ok +DROP 
TABLE t; + +# Test for ignore nulls with ORDER BY in FIRST_VALUE with all NULL values +statement ok +CREATE TABLE t AS VALUES (null::bigint, 4), (null::bigint, 3), (null::bigint, 1), (null::bigint, 2); + +query II +SELECT FIRST_VALUE(column1) OVER(ORDER BY column2), column2 FROM t; +---- +NULL 1 +NULL 2 +NULL 3 +NULL 4 + +query II +SELECT FIRST_VALUE(column1) RESPECT NULLS OVER(ORDER BY column2), column2 FROM t; +---- +NULL 1 +NULL 2 +NULL 3 +NULL 4 + +query II +SELECT FIRST_VALUE(column1) IGNORE NULLS OVER(ORDER BY column2), column2 FROM t; +---- +NULL 1 +NULL 2 +NULL 3 +NULL 4 + +statement ok +DROP TABLE t; + +# Test for ignore nulls in LAST_VALUE +statement ok +CREATE TABLE t AS VALUES (1), (3), (null::bigint); + +query I +SELECT LAST_VALUE(column1) OVER() FROM t; +---- +NULL +NULL +NULL + +query I +SELECT LAST_VALUE(column1) RESPECT NULLS OVER() FROM t; +---- +NULL +NULL +NULL + +query I +SELECT LAST_VALUE(column1) IGNORE NULLS OVER() FROM t; +---- +3 +3 +3 + +statement ok +DROP TABLE t; + +# Test for ignore nulls with ORDER BY in LAST_VALUE +statement ok +CREATE TABLE t AS VALUES (3, 4), (4, 3), (null::bigint, 1), (null::bigint, 2), (5, 5), (6, 6); + +query II +SELECT column1, column2 FROM t ORDER BY column2 DESC NULLS LAST; +---- +6 6 +5 5 +3 4 +4 3 +NULL 2 +NULL 1 + +query II +SELECT LAST_VALUE(column1) OVER(ORDER BY column2 DESC NULLS LAST), column2 FROM t; +---- +6 6 +5 5 +3 4 +4 3 +NULL 2 +NULL 1 + +query II +SELECT LAST_VALUE(column1) IGNORE NULLS OVER(ORDER BY column2 DESC NULLS LAST), column2 FROM t; +---- +6 6 +5 5 +3 4 +4 3 +4 2 +4 1 + +query II +SELECT LAST_VALUE(column1) OVER(ORDER BY column2 DESC NULLS LAST ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING), column2 FROM t; +---- +NULL 6 +NULL 5 +NULL 4 +NULL 3 +NULL 2 +NULL 1 + +query II +SELECT LAST_VALUE(column1) RESPECT NULLS OVER(ORDER BY column2 DESC NULLS LAST ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING), column2 FROM t; +---- +NULL 6 +NULL 5 +NULL 4 +NULL 3 +NULL 2 +NULL 1 + +query II +SELECT LAST_VALUE(column1) IGNORE NULLS OVER(ORDER BY column2 DESC NULLS LAST ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING), column2 FROM t; +---- +4 6 +4 5 +4 4 +4 3 +4 2 +4 1 + +query II +SELECT LAST_VALUE(column1) OVER(ORDER BY column2 DESC NULLS LAST RANGE BETWEEN 1 PRECEDING AND 1 FOLLOWING), column2 FROM t; +---- +5 6 +3 5 +4 4 +NULL 3 +NULL 2 +NULL 1 + +query II +SELECT LAST_VALUE(column1) IGNORE NULLS OVER(ORDER BY column2 DESC NULLS LAST RANGE BETWEEN 1 PRECEDING AND 1 FOLLOWING), column2 FROM t; +---- +5 6 +3 5 +4 4 +4 3 +4 2 +NULL 1 + +statement ok +DROP TABLE t; + +# Test for ignore nulls with ORDER BY in LAST_VALUE with all NULLs +statement ok +CREATE TABLE t AS VALUES (null::bigint, 4), (null::bigint, 3), (null::bigint, 1), (null::bigint, 2); + +query II +SELECT LAST_VALUE(column1) OVER(ORDER BY column2 DESC NULLS LAST ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING), column2 FROM t; +---- +NULL 4 +NULL 3 +NULL 2 +NULL 1 + +query II +SELECT LAST_VALUE(column1) RESPECT NULLS OVER(ORDER BY column2 DESC NULLS LAST ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING), column2 FROM t; +---- +NULL 4 +NULL 3 +NULL 2 +NULL 1 + +query II +SELECT LAST_VALUE(column1) IGNORE NULLS OVER(ORDER BY column2 DESC NULLS LAST ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING), column2 FROM t; +---- +NULL 4 +NULL 3 +NULL 2 +NULL 1 + +statement ok +DROP TABLE t; diff --git a/datafusion/substrait/src/physical_plan/consumer.rs b/datafusion/substrait/src/physical_plan/consumer.rs index 
3098dc386e6a..11ddb91ad391 100644 --- a/datafusion/substrait/src/physical_plan/consumer.rs +++ b/datafusion/substrait/src/physical_plan/consumer.rs @@ -125,8 +125,12 @@ pub async fn from_substrait_rel( } } - Ok(Arc::new(ParquetExec::new(base_config, None, None)) - as Arc) + Ok(Arc::new(ParquetExec::new( + base_config, + None, + None, + Default::default(), + )) as Arc) } _ => not_impl_err!( "Only LocalFile reads are supported when parsing physical" diff --git a/datafusion/substrait/tests/cases/roundtrip_physical_plan.rs b/datafusion/substrait/tests/cases/roundtrip_physical_plan.rs index e5af3f94cc05..70887e393491 100644 --- a/datafusion/substrait/tests/cases/roundtrip_physical_plan.rs +++ b/datafusion/substrait/tests/cases/roundtrip_physical_plan.rs @@ -50,8 +50,12 @@ async fn parquet_exec() -> Result<()> { table_partition_cols: vec![], output_ordering: vec![], }; - let parquet_exec: Arc = - Arc::new(ParquetExec::new(scan_config, None, None)); + let parquet_exec: Arc = Arc::new(ParquetExec::new( + scan_config, + None, + None, + Default::default(), + )); let mut extension_info: ( Vec, diff --git a/docs/source/user-guide/expressions.md b/docs/source/user-guide/expressions.md index dcb599b9b3b2..05eb063c3dc9 100644 --- a/docs/source/user-guide/expressions.md +++ b/docs/source/user-guide/expressions.md @@ -207,42 +207,44 @@ select log(-1), log(0), sqrt(-1); ## Array Expressions -| Syntax | Description | -| -------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| array_append(array, element) | Appends an element to the end of an array. `array_append([1, 2, 3], 4) -> [1, 2, 3, 4]` | -| array_concat(array[, ..., array_n]) | Concatenates arrays. `array_concat([1, 2, 3], [4, 5, 6]) -> [1, 2, 3, 4, 5, 6]` | -| array_has(array, element) | Returns true if the array contains the element `array_has([1,2,3], 1) -> true` | -| array_has_all(array, sub-array) | Returns true if all elements of sub-array exist in array `array_has_all([1,2,3], [1,3]) -> true` | -| array_has_any(array, sub-array) | Returns true if any elements exist in both arrays `array_has_any([1,2,3], [1,4]) -> true` | -| array_dims(array) | Returns an array of the array's dimensions. `array_dims([[1, 2, 3], [4, 5, 6]]) -> [2, 3]` | -| array_distinct(array) | Returns distinct values from the array after removing duplicates. `array_distinct([1, 3, 2, 3, 1, 2, 4]) -> [1, 2, 3, 4]` | -| array_element(array, index) | Extracts the element with the index n from the array `array_element([1, 2, 3, 4], 3) -> 3` | -| flatten(array) | Converts an array of arrays to a flat array `flatten([[1], [2, 3], [4, 5, 6]]) -> [1, 2, 3, 4, 5, 6]` | -| array_length(array, dimension) | Returns the length of the array dimension. `array_length([1, 2, 3, 4, 5]) -> 5` | -| array_ndims(array) | Returns the number of dimensions of the array. `array_ndims([[1, 2, 3], [4, 5, 6]]) -> 2` | -| array_pop_front(array) | Returns the array without the first element. `array_pop_front([1, 2, 3]) -> [2, 3]` | -| array_pop_back(array) | Returns the array without the last element. `array_pop_back([1, 2, 3]) -> [1, 2]` | -| array_position(array, element) | Searches for an element in the array, returns first occurrence. `array_position([1, 2, 2, 3, 4], 2) -> 2` | -| array_positions(array, element) | Searches for an element in the array, returns all occurrences. 
`array_positions([1, 2, 2, 3, 4], 2) -> [2, 3]` | -| array_prepend(array, element) | Prepends an element to the beginning of an array. `array_prepend(1, [2, 3, 4]) -> [1, 2, 3, 4]` | -| array_repeat(element, count) | Returns an array containing element `count` times. `array_repeat(1, 3) -> [1, 1, 1]` | -| array_remove(array, element) | Removes the first element from the array equal to the given value. `array_remove([1, 2, 2, 3, 2, 1, 4], 2) -> [1, 2, 3, 2, 1, 4]` | -| array_remove_n(array, element, max) | Removes the first `max` elements from the array equal to the given value. `array_remove_n([1, 2, 2, 3, 2, 1, 4], 2, 2) -> [1, 3, 2, 1, 4]` | -| array_remove_all(array, element) | Removes all elements from the array equal to the given value. `array_remove_all([1, 2, 2, 3, 2, 1, 4], 2) -> [1, 3, 1, 4]` | -| array_replace(array, from, to) | Replaces the first occurrence of the specified element with another specified element. `array_replace([1, 2, 2, 3, 2, 1, 4], 2, 5) -> [1, 5, 2, 3, 2, 1, 4]` | -| array_replace_n(array, from, to, max) | Replaces the first `max` occurrences of the specified element with another specified element. `array_replace_n([1, 2, 2, 3, 2, 1, 4], 2, 5, 2) -> [1, 5, 5, 3, 2, 1, 4]` | -| array_replace_all(array, from, to) | Replaces all occurrences of the specified element with another specified element. `array_replace_all([1, 2, 2, 3, 2, 1, 4], 2, 5) -> [1, 5, 5, 3, 5, 1, 4]` | -| array_slice(array, begin,end) | Returns a slice of the array. `array_slice([1, 2, 3, 4, 5, 6, 7, 8], 3, 6) -> [3, 4, 5, 6]` | -| array_slice(array, begin, end, stride) | Returns a slice of the array with added stride feature. `array_slice([1, 2, 3, 4, 5, 6, 7, 8], 3, 6, 2) -> [3, 5, 6]` | -| array_to_string(array, delimiter) | Converts each element to its text representation. `array_to_string([1, 2, 3, 4], ',') -> 1,2,3,4` | -| array_intersect(array1, array2) | Returns an array of the elements in the intersection of array1 and array2. `array_intersect([1, 2, 3, 4], [5, 6, 3, 4]) -> [3, 4]` | -| array_union(array1, array2) | Returns an array of the elements in the union of array1 and array2 without duplicates. `array_union([1, 2, 3, 4], [5, 6, 3, 4]) -> [1, 2, 3, 4, 5, 6]` | -| array_except(array1, array2) | Returns an array of the elements that appear in the first array but not in the second. `array_except([1, 2, 3, 4], [5, 6, 3, 4]) -> [3, 4]` | -| array_resize(array, size, value) | Resizes the list to contain size elements. Initializes new elements with value or empty if value is not set. `array_resize([1, 2, 3], 5, 0) -> [1, 2, 3, 4, 5, 6]` | -| cardinality(array) | Returns the total number of elements in the array. `cardinality([[1, 2, 3], [4, 5, 6]]) -> 6` | -| make_array(value1, [value2 [, ...]]) | Returns an Arrow array using the specified input expressions. `make_array(1, 2, 3) -> [1, 2, 3]` | -| range(start [, stop, step]) | Returns an Arrow array between start and stop with step. `SELECT range(2, 10, 3) -> [2, 5, 8]` | -| trim_array(array, n) | Deprecated | +| Syntax | Description | +| ---------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| array_append(array, element) | Appends an element to the end of an array. `array_append([1, 2, 3], 4) -> [1, 2, 3, 4]` | +| array_concat(array[, ..., array_n]) | Concatenates arrays. 
`array_concat([1, 2, 3], [4, 5, 6]) -> [1, 2, 3, 4, 5, 6]` | +| array_has(array, element) | Returns true if the array contains the element `array_has([1,2,3], 1) -> true` | +| array_has_all(array, sub-array) | Returns true if all elements of sub-array exist in array `array_has_all([1,2,3], [1,3]) -> true` | +| array_has_any(array, sub-array) | Returns true if any elements exist in both arrays `array_has_any([1,2,3], [1,4]) -> true` | +| array_dims(array) | Returns an array of the array's dimensions. `array_dims([[1, 2, 3], [4, 5, 6]]) -> [2, 3]` | +| array_distinct(array) | Returns distinct values from the array after removing duplicates. `array_distinct([1, 3, 2, 3, 1, 2, 4]) -> [1, 2, 3, 4]` | +| array_element(array, index) | Extracts the element with the index n from the array `array_element([1, 2, 3, 4], 3) -> 3` | +| flatten(array) | Converts an array of arrays to a flat array `flatten([[1], [2, 3], [4, 5, 6]]) -> [1, 2, 3, 4, 5, 6]` | +| array_length(array, dimension) | Returns the length of the array dimension. `array_length([1, 2, 3, 4, 5]) -> 5` | +| array_ndims(array) | Returns the number of dimensions of the array. `array_ndims([[1, 2, 3], [4, 5, 6]]) -> 2` | +| array_pop_front(array) | Returns the array without the first element. `array_pop_front([1, 2, 3]) -> [2, 3]` | +| array_pop_back(array) | Returns the array without the last element. `array_pop_back([1, 2, 3]) -> [1, 2]` | +| array_position(array, element) | Searches for an element in the array, returns first occurrence. `array_position([1, 2, 2, 3, 4], 2) -> 2` | +| array_positions(array, element) | Searches for an element in the array, returns all occurrences. `array_positions([1, 2, 2, 3, 4], 2) -> [2, 3]` | +| array_prepend(array, element) | Prepends an element to the beginning of an array. `array_prepend(1, [2, 3, 4]) -> [1, 2, 3, 4]` | +| array_repeat(element, count) | Returns an array containing element `count` times. `array_repeat(1, 3) -> [1, 1, 1]` | +| array_remove(array, element) | Removes the first element from the array equal to the given value. `array_remove([1, 2, 2, 3, 2, 1, 4], 2) -> [1, 2, 3, 2, 1, 4]` | +| array_remove_n(array, element, max) | Removes the first `max` elements from the array equal to the given value. `array_remove_n([1, 2, 2, 3, 2, 1, 4], 2, 2) -> [1, 3, 2, 1, 4]` | +| array_remove_all(array, element) | Removes all elements from the array equal to the given value. `array_remove_all([1, 2, 2, 3, 2, 1, 4], 2) -> [1, 3, 1, 4]` | +| array_replace(array, from, to) | Replaces the first occurrence of the specified element with another specified element. `array_replace([1, 2, 2, 3, 2, 1, 4], 2, 5) -> [1, 5, 2, 3, 2, 1, 4]` | +| array_replace_n(array, from, to, max) | Replaces the first `max` occurrences of the specified element with another specified element. `array_replace_n([1, 2, 2, 3, 2, 1, 4], 2, 5, 2) -> [1, 5, 5, 3, 2, 1, 4]` | +| array_replace_all(array, from, to) | Replaces all occurrences of the specified element with another specified element. `array_replace_all([1, 2, 2, 3, 2, 1, 4], 2, 5) -> [1, 5, 5, 3, 5, 1, 4]` | +| array_slice(array, begin,end) | Returns a slice of the array. `array_slice([1, 2, 3, 4, 5, 6, 7, 8], 3, 6) -> [3, 4, 5, 6]` | +| array_slice(array, begin, end, stride) | Returns a slice of the array with added stride feature. `array_slice([1, 2, 3, 4, 5, 6, 7, 8], 3, 6, 2) -> [3, 5, 6]` | +| array_to_string(array, delimiter) | Converts each element to its text representation. 
`array_to_string([1, 2, 3, 4], ',') -> 1,2,3,4` | +| array_intersect(array1, array2) | Returns an array of the elements in the intersection of array1 and array2. `array_intersect([1, 2, 3, 4], [5, 6, 3, 4]) -> [3, 4]` | +| array_union(array1, array2) | Returns an array of the elements in the union of array1 and array2 without duplicates. `array_union([1, 2, 3, 4], [5, 6, 3, 4]) -> [1, 2, 3, 4, 5, 6]` | +| array_except(array1, array2) | Returns an array of the elements that appear in the first array but not in the second. `array_except([1, 2, 3, 4], [5, 6, 3, 4]) -> [3, 4]` | +| array_resize(array, size, value) | Resizes the list to contain size elements. Initializes new elements with value or empty if value is not set. `array_resize([1, 2, 3], 5, 0) -> [1, 2, 3, 4, 5, 6]` | +| array_sort(array, desc, null_first) | Returns sorted array. `array_sort([3, 1, 2, 5, 4]) -> [1, 2, 3, 4, 5]` | +| cardinality(array) | Returns the total number of elements in the array. `cardinality([[1, 2, 3], [4, 5, 6]]) -> 6` | +| make_array(value1, [value2 [, ...]]) | Returns an Arrow array using the specified input expressions. `make_array(1, 2, 3) -> [1, 2, 3]` | +| range(start [, stop, step]) | Returns an Arrow array between start and stop with step. `SELECT range(2, 10, 3) -> [2, 5, 8]` | +| string_to_array(array, delimiter, null_string) | Splits a `string` based on a `delimiter` and returns an array of parts. Any parts matching the optional `null_string` will be replaced with `NULL`. `string_to_array('abc#def#ghi', '#', ' ') -> ['abc', 'def', 'ghi']` | +| trim_array(array, n) | Deprecated | ## Regular Expressions diff --git a/docs/source/user-guide/sql/data_types.md b/docs/source/user-guide/sql/data_types.md index 9f99d7bcb8ca..bfbd3433f1cf 100644 --- a/docs/source/user-guide/sql/data_types.md +++ b/docs/source/user-guide/sql/data_types.md @@ -30,11 +30,11 @@ the `arrow_typeof` function. 
For example: ```sql select arrow_typeof(interval '1 month'); -+-------------------------------------+ -| arrowtypeof(IntervalYearMonth("1")) | -+-------------------------------------+ -| Interval(YearMonth) | -+-------------------------------------+ ++---------------------------------------------------------------------+ +| arrow_typeof(IntervalMonthDayNano("79228162514264337593543950336")) | ++---------------------------------------------------------------------+ +| Interval(MonthDayNano) | ++---------------------------------------------------------------------+ ``` You can cast a SQL expression to a specific Arrow type using the `arrow_cast` function diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md index b0385b492365..420de5f3fdba 100644 --- a/docs/source/user-guide/sql/scalar_functions.md +++ b/docs/source/user-guide/sql/scalar_functions.md @@ -1949,9 +1949,10 @@ from_unixtime(expression) - [array_concat](#array_concat) - [array_contains](#array_contains) - [array_dims](#array_dims) +- [array_distinct](#array_distinct) - [array_has](#array_has) - [array_has_all](#array_has_all) -- [array_has_any](#array_has_any)] +- [array_has_any](#array_has_any) - [array_element](#array_element) - [array_except](#array_except) - [array_extract](#array_extract) @@ -1987,6 +1988,7 @@ from_unixtime(expression) - [list_cat](#list_cat) - [list_concat](#list_concat) - [list_dims](#list_dims) +- [list_distinct](#list_distinct) - [list_element](#list_element) - [list_extract](#list_extract) - [list_has](#list_has) @@ -2204,6 +2206,34 @@ array_dims(array) - list_dims +### `array_distinct` + +Returns distinct values from the array after removing duplicates. + +``` +array_distinct(array) +``` + +#### Arguments + +- **array**: Array expression. + Can be a constant, column, or function, and any combination of array operators. + +#### Example + +``` +❯ select array_distinct([1, 3, 2, 3, 1, 2, 4]); ++---------------------------------+ +| array_distinct(List([1,2,3,4])) | ++---------------------------------+ +| [1, 2, 3, 4] | ++---------------------------------+ +``` + +#### Aliases + +- list_distinct + ### `array_element` Extracts the element with the index n from the array. @@ -3113,6 +3143,7 @@ _Alias of [make_array](#make_array)._ ### `string_to_array` Splits a string in to an array of substrings based on a delimiter. Any substrings matching the optional `null_str` argument are replaced with NULL. +`SELECT string_to_array('abc##def', '##')` or `SELECT string_to_array('abc def', ' ', 'def')` ``` starts_with(str, delimiter[, null_str])