From 09b755d92bd2ebf2c92c7568d10a365b8081da6c Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Tue, 3 Jan 2023 14:16:53 +0000 Subject: [PATCH] Move default catalog and schema onto ConfigOptions (#3887) --- datafusion/core/src/config.rs | 6 ++ datafusion/core/src/execution/context.rs | 61 +++++++++--------- .../test_files/information_schema.slt | 2 + docs/source/user-guide/configs.md | 62 ++++++++++--------- 4 files changed, 72 insertions(+), 59 deletions(-) diff --git a/datafusion/core/src/config.rs b/datafusion/core/src/config.rs index 4e6cddcd5ae0..4b3df66da671 100644 --- a/datafusion/core/src/config.rs +++ b/datafusion/core/src/config.rs @@ -158,6 +158,12 @@ config_namespace! { /// concurrency. Defaults to the number of cpu cores on the system. pub create_default_catalog_and_schema: bool, default = true + /// The default catalog name - this impacts what SQL queries use if not specified + pub default_catalog: String, default = "datafusion".to_string() + + /// The default schema name - this impacts what SQL queries use if not specified + pub default_schema: String, default = "public".to_string() + /// Should DataFusion provide access to `information_schema` /// virtual tables for displaying schema information pub information_schema: bool, default = false diff --git a/datafusion/core/src/execution/context.rs b/datafusion/core/src/execution/context.rs index 1e52c84ac103..5c72f21702a3 100644 --- a/datafusion/core/src/execution/context.rs +++ b/datafusion/core/src/execution/context.rs @@ -99,11 +99,6 @@ use super::options::{ AvroReadOptions, CsvReadOptions, NdJsonReadOptions, ParquetReadOptions, }; -/// The default catalog name - this impacts what SQL queries use if not specified -const DEFAULT_CATALOG: &str = "datafusion"; -/// The default schema name - this impacts what SQL queries use if not specified -const DEFAULT_SCHEMA: &str = "public"; - /// SessionContext is the main interface for executing queries with DataFusion. 
It stands for /// the connection between user and DataFusion/Ballista cluster. /// The context provides the following functionality @@ -380,18 +375,32 @@ impl SessionContext { // so for now, we default to default catalog let tokens: Vec<&str> = schema_name.split('.').collect(); let (catalog, schema_name) = match tokens.len() { - 1 => Ok((DEFAULT_CATALOG, schema_name.as_str())), - 2 => Ok((tokens[0], tokens[1])), - _ => Err(DataFusionError::Execution(format!( - "Unable to parse catalog from {schema_name}" - ))), - }?; - let catalog = self.catalog(catalog).ok_or_else(|| { - DataFusionError::Execution(format!( - "Missing '{DEFAULT_CATALOG}' catalog" - )) - })?; - + 1 => { + let state = self.state.read(); + let name = &state.config.options.catalog.default_catalog; + let catalog = + state.catalog_list.catalog(name).ok_or_else(|| { + DataFusionError::Execution(format!( + "Missing default catalog '{name}'" + )) + })?; + (catalog, tokens[0]) + } + 2 => { + let name = &tokens[0]; + let catalog = self.catalog(name).ok_or_else(|| { + DataFusionError::Execution(format!( + "Missing catalog '{name}'" + )) + })?; + (catalog, tokens[1]) + } + _ => { + return Err(DataFusionError::Execution(format!( + "Unable to parse catalog from {schema_name}" + ))) + } + }; let schema = catalog.schema(schema_name); match (if_not_exists, schema) { @@ -1097,11 +1106,6 @@ impl Hasher for IdHasher { /// Configuration options for session context #[derive(Clone)] pub struct SessionConfig { - /// Default catalog name for table resolution - default_catalog: String, - /// Default schema name for table resolution (not in ConfigOptions - /// due to `resolve_table_ref` which passes back references) - default_schema: String, /// Configuration options options: ConfigOptions, /// Opaque extensions. 
@@ -1111,8 +1115,6 @@ pub struct SessionConfig { impl Default for SessionConfig { fn default() -> Self { Self { - default_catalog: DEFAULT_CATALOG.to_owned(), - default_schema: DEFAULT_SCHEMA.to_owned(), options: ConfigOptions::new(), // Assume no extensions by default. extensions: HashMap::with_capacity_and_hasher( @@ -1218,8 +1220,8 @@ impl SessionConfig { catalog: impl Into<String>, schema: impl Into<String>, ) -> Self { - self.default_catalog = catalog.into(); - self.default_schema = schema.into(); + self.options.catalog.default_catalog = catalog.into(); + self.options.catalog.default_schema = schema.into(); self } @@ -1434,7 +1436,7 @@ impl SessionState { default_catalog .register_schema( - &config.default_schema, + &config.config_options().catalog.default_schema, Arc::new(MemorySchemaProvider::new()), ) .expect("memory catalog provider can register schema"); @@ -1442,7 +1444,7 @@ impl SessionState { Self::register_default_schema(&config, &runtime, &default_catalog); catalog_list.register_catalog( - config.default_catalog.clone(), + config.config_options().catalog.default_catalog.clone(), Arc::new(default_catalog), ); } @@ -1564,9 +1566,10 @@ impl SessionState { &'a self, table_ref: impl Into<TableReference<'a>>, ) -> ResolvedTableReference<'a> { + let catalog = &self.config_options().catalog; table_ref .into() - .resolve(&self.config.default_catalog, &self.config.default_schema) + .resolve(&catalog.default_catalog, &catalog.default_schema) } fn schema_for_ref<'a>( diff --git a/datafusion/core/tests/sqllogictests/test_files/information_schema.slt b/datafusion/core/tests/sqllogictests/test_files/information_schema.slt index 2d663d33bf6c..24b54aa492cc 100644 --- a/datafusion/core/tests/sqllogictests/test_files/information_schema.slt +++ b/datafusion/core/tests/sqllogictests/test_files/information_schema.slt @@ -103,6 +103,8 @@ query R SHOW ALL ---- datafusion.catalog.create_default_catalog_and_schema true +datafusion.catalog.default_catalog datafusion +datafusion.catalog.default_schema public
datafusion.catalog.format NULL datafusion.catalog.has_header false datafusion.catalog.information_schema true diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index cfc2ddc18814..57d23ce69060 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -35,33 +35,35 @@ Values are parsed according to the [same rules used in casts from Utf8](https:// If the value in the environment variable cannot be cast to the type of the configuration option, the default value will be used instead and a warning emitted. Environment variables are read during `SessionConfig` initialisation so they must be set beforehand and will not affect running sessions. -| key | default | description | -| --------------------------------------------------------- | ------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| datafusion.catalog.create_default_catalog_and_schema | true | Number of partitions for query execution. Increasing partitions can increase concurrency. Defaults to the number of cpu cores on the system. 
| -| datafusion.catalog.information_schema | false | Should DataFusion provide access to `information_schema` virtual tables for displaying schema information | -| datafusion.catalog.location | NULL | Location scanned to load tables for `default` schema | -| datafusion.catalog.format | NULL | Type of `TableProvider` to use when loading `default` schema | -| datafusion.catalog.has_header | false | If the file has a header | -| datafusion.execution.batch_size | 8192 | Default batch size while creating new batches, it's especially useful for buffer-in-memory batches since creating tiny batches would results in too much metadata memory consumption | -| datafusion.execution.coalesce_batches | true | When set to true, record batches will be examined between each operator and small batches will be coalesced into larger batches. This is helpful when there are highly selective filters or joins that could produce tiny output batches. The target batch size is determined by the configuration setting | -| datafusion.execution.collect_statistics | false | Should DataFusion collect statistics after listing files | -| datafusion.execution.target_partitions | 0 | Number of partitions for query execution. Increasing partitions can increase concurrency. Defaults to the number of cpu cores on the system | -| datafusion.execution.time_zone | +00:00 | The default time zone Some functions, e.g. EXTRACT(HOUR from SOME_TIME), shift the underlying datetime according to this time zone, and then extract the hour | -| datafusion.execution.parquet.enable_page_index | false | If true, uses parquet data page level metadata (Page Index) statistics to reduce the number of rows decoded. 
| -| datafusion.execution.parquet.pruning | true | If true, the parquet reader attempts to skip entire row groups based on the predicate in the query and the metadata (min/max values) stored in the parquet file | -| datafusion.execution.parquet.skip_metadata | true | If true, the parquet reader skip the optional embedded metadata that may be in the file Schema. This setting can help avoid schema conflicts when querying multiple parquet files with schemas containing compatible types but different metadata | -| datafusion.execution.parquet.metadata_size_hint | NULL | If specified, the parquet reader will try and fetch the last `size_hint` bytes of the parquet file optimistically. If not specified, two read are required: One read to fetch the 8-byte parquet footer and another to fetch the metadata length encoded in the footer | -| datafusion.execution.parquet.pushdown_filters | false | If true, filter expressions are be applied during the parquet decoding operation to reduce the number of rows decoded | -| datafusion.execution.parquet.reorder_filters | false | If true, filter expressions evaluated during the parquet decoding operation will be reordered heuristically to minimize the cost of evaluation. If false, the filters are applied in the same order as written in the query | -| datafusion.optimizer.enable_round_robin_repartition | true | When set to true, the physical plan optimizer will try to add round robin repartition to increase parallelism to leverage more CPU cores | -| datafusion.optimizer.filter_null_join_keys | false | When set to true, the optimizer will insert filters before a join between a nullable and non-nullable column to filter out nulls on the nullable side. This filter can add additional overhead when the file format does not fully support predicate push down. 
| -| datafusion.optimizer.repartition_aggregations | true | Should DataFusion repartition data using the aggregate keys to execute aggregates in parallel using the provided `target_partitions` level" | -| datafusion.optimizer.repartition_joins | true | Should DataFusion repartition data using the join keys to execute joins in parallel using the provided `target_partitions` level" | -| datafusion.optimizer.repartition_windows | true | Should DataFusion repartition data using the partitions keys to execute window functions in parallel using the provided `target_partitions` level" | -| datafusion.optimizer.skip_failed_rules | true | When set to true, the logical plan optimizer will produce warning messages if any optimization rules produce errors and then proceed to the next rule. When set to false, any rules that produce errors will cause the query to fail | -| datafusion.optimizer.max_passes | 3 | Number of times that the optimizer will attempt to optimize the plan | -| datafusion.optimizer.top_down_join_key_reordering | true | When set to true, the physical plan optimizer will run a top down process to reorder the join keys | -| datafusion.optimizer.prefer_hash_join | true | When set to true, the physical plan optimizer will prefer HashJoin over SortMergeJoin. 
HashJoin can work more efficiently than SortMergeJoin but consumes more memory | -| datafusion.optimizer.hash_join_single_partition_threshold | 1048576 | The maximum estimated size in bytes for one input side of a HashJoin will be collected into a single partition | -| datafusion.explain.logical_plan_only | false | When set to true, the explain statement will only print logical plans | -| datafusion.explain.physical_plan_only | false | When set to true, the explain statement will only print physical plans | +| key | default | description | +| --------------------------------------------------------- | ---------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| datafusion.catalog.create_default_catalog_and_schema | true | Number of partitions for query execution. Increasing partitions can increase concurrency. Defaults to the number of cpu cores on the system. 
| +| datafusion.catalog.default_catalog | datafusion | The default catalog name - this impacts what SQL queries use if not specified | +| datafusion.catalog.default_schema | public | The default schema name - this impacts what SQL queries use if not specified | +| datafusion.catalog.information_schema | false | Should DataFusion provide access to `information_schema` virtual tables for displaying schema information | +| datafusion.catalog.location | NULL | Location scanned to load tables for `default` schema | +| datafusion.catalog.format | NULL | Type of `TableProvider` to use when loading `default` schema | +| datafusion.catalog.has_header | false | If the file has a header | +| datafusion.execution.batch_size | 8192 | Default batch size while creating new batches, it's especially useful for buffer-in-memory batches since creating tiny batches would results in too much metadata memory consumption | +| datafusion.execution.coalesce_batches | true | When set to true, record batches will be examined between each operator and small batches will be coalesced into larger batches. This is helpful when there are highly selective filters or joins that could produce tiny output batches. The target batch size is determined by the configuration setting | +| datafusion.execution.collect_statistics | false | Should DataFusion collect statistics after listing files | +| datafusion.execution.target_partitions | 0 | Number of partitions for query execution. Increasing partitions can increase concurrency. Defaults to the number of cpu cores on the system | +| datafusion.execution.time_zone | +00:00 | The default time zone Some functions, e.g. EXTRACT(HOUR from SOME_TIME), shift the underlying datetime according to this time zone, and then extract the hour | +| datafusion.execution.parquet.enable_page_index | false | If true, uses parquet data page level metadata (Page Index) statistics to reduce the number of rows decoded. 
| +| datafusion.execution.parquet.pruning | true | If true, the parquet reader attempts to skip entire row groups based on the predicate in the query and the metadata (min/max values) stored in the parquet file | +| datafusion.execution.parquet.skip_metadata | true | If true, the parquet reader skip the optional embedded metadata that may be in the file Schema. This setting can help avoid schema conflicts when querying multiple parquet files with schemas containing compatible types but different metadata | +| datafusion.execution.parquet.metadata_size_hint | NULL | If specified, the parquet reader will try and fetch the last `size_hint` bytes of the parquet file optimistically. If not specified, two read are required: One read to fetch the 8-byte parquet footer and another to fetch the metadata length encoded in the footer | +| datafusion.execution.parquet.pushdown_filters | false | If true, filter expressions are be applied during the parquet decoding operation to reduce the number of rows decoded | +| datafusion.execution.parquet.reorder_filters | false | If true, filter expressions evaluated during the parquet decoding operation will be reordered heuristically to minimize the cost of evaluation. If false, the filters are applied in the same order as written in the query | +| datafusion.optimizer.enable_round_robin_repartition | true | When set to true, the physical plan optimizer will try to add round robin repartition to increase parallelism to leverage more CPU cores | +| datafusion.optimizer.filter_null_join_keys | false | When set to true, the optimizer will insert filters before a join between a nullable and non-nullable column to filter out nulls on the nullable side. This filter can add additional overhead when the file format does not fully support predicate push down. 
| +| datafusion.optimizer.repartition_aggregations | true | Should DataFusion repartition data using the aggregate keys to execute aggregates in parallel using the provided `target_partitions` level" | +| datafusion.optimizer.repartition_joins | true | Should DataFusion repartition data using the join keys to execute joins in parallel using the provided `target_partitions` level" | +| datafusion.optimizer.repartition_windows | true | Should DataFusion repartition data using the partitions keys to execute window functions in parallel using the provided `target_partitions` level" | +| datafusion.optimizer.skip_failed_rules | true | When set to true, the logical plan optimizer will produce warning messages if any optimization rules produce errors and then proceed to the next rule. When set to false, any rules that produce errors will cause the query to fail | +| datafusion.optimizer.max_passes | 3 | Number of times that the optimizer will attempt to optimize the plan | +| datafusion.optimizer.top_down_join_key_reordering | true | When set to true, the physical plan optimizer will run a top down process to reorder the join keys | +| datafusion.optimizer.prefer_hash_join | true | When set to true, the physical plan optimizer will prefer HashJoin over SortMergeJoin. HashJoin can work more efficiently than SortMergeJoin but consumes more memory | +| datafusion.optimizer.hash_join_single_partition_threshold | 1048576 | The maximum estimated size in bytes for one input side of a HashJoin will be collected into a single partition | +| datafusion.explain.logical_plan_only | false | When set to true, the explain statement will only print logical plans | +| datafusion.explain.physical_plan_only | false | When set to true, the explain statement will only print physical plans |