Minor: change Datafusion --> DataFusion in docs #11439

Merged · 2 commits · Jul 13, 2024
Changes from all commits
4 changes: 2 additions & 2 deletions datafusion-examples/README.md
@@ -71,8 +71,8 @@ cargo run --example dataframe
- [`parquet_index.rs`](examples/parquet_index.rs): Create a secondary index over several parquet files and use it to speed up queries
- [`parquet_sql_multiple_files.rs`](examples/parquet_sql_multiple_files.rs): Build and run a query plan from a SQL statement against multiple local Parquet files
- [`parquet_exec_visitor.rs`](examples/parquet_exec_visitor.rs): Extract statistics by visiting an ExecutionPlan after execution
-- [`parse_sql_expr.rs`](examples/parse_sql_expr.rs): Parse SQL text into Datafusion `Expr`.
-- [`plan_to_sql.rs`](examples/plan_to_sql.rs): Generate SQL from Datafusion `Expr` and `LogicalPlan`
+- [`parse_sql_expr.rs`](examples/parse_sql_expr.rs): Parse SQL text into DataFusion `Expr`.
+- [`plan_to_sql.rs`](examples/plan_to_sql.rs): Generate SQL from DataFusion `Expr` and `LogicalPlan`
- [`pruning.rs`](examples/pruning.rs): Use pruning to rule out files based on statistics
- [`query-aws-s3.rs`](examples/external_dependency/query-aws-s3.rs): Configure `object_store` and run a query against files stored in AWS S3
- [`query-http-csv.rs`](examples/query-http-csv.rs): Configure `object_store` and run a query against files via HTTP
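The two renamed entries above describe a round trip between SQL text and DataFusion expressions. As a quick orientation (an editor's addition, not part of this PR), the sketch below builds an `Expr` and unparses it back to SQL; the column name and the exact rendered text are illustrative, and the `unparser` module location may vary by DataFusion version.

```rust
use datafusion::error::Result;
use datafusion::prelude::{col, lit};
use datafusion::sql::unparser::expr_to_sql;

fn main() -> Result<()> {
    // Build an `Expr` programmatically, then turn it back into SQL text,
    // as the `plan_to_sql.rs` example does in more depth.
    let expr = col("a").gt(lit(4));
    let sql_ast = expr_to_sql(&expr)?;
    println!("{sql_ast}"); // renders roughly as: (a > 4)
    Ok(())
}
```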
2 changes: 1 addition & 1 deletion datafusion-examples/examples/expr_api.rs
@@ -83,7 +83,7 @@ async fn main() -> Result<()> {
Ok(())
}

-/// Datafusion's `expr_fn` API makes it easy to create [`Expr`]s for the
+/// DataFusion's `expr_fn` API makes it easy to create [`Expr`]s for the
/// full range of expression types such as aggregates and window functions.
fn expr_fn_demo() -> Result<()> {
// Let's say you want to call the "first_value" aggregate function
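For readers unfamiliar with the API this doc comment names, here is a minimal sketch (not part of the PR) using the `col` and `lit` helpers from `datafusion::prelude`; the column names and comparison are made up, and aggregate or window helpers live in their own `expr_fn` modules.

```rust
use datafusion::prelude::{col, lit};

fn main() {
    // Build `price > 100 AND quantity < 10` without writing any SQL.
    let predicate = col("price").gt(lit(100)).and(col("quantity").lt(lit(10)));
    println!("{predicate}");
}
```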
2 changes: 1 addition & 1 deletion datafusion/common/src/config.rs
@@ -309,7 +309,7 @@ config_namespace! {
/// Currently experimental
pub split_file_groups_by_statistics: bool, default = false

-/// Should Datafusion keep the columns used for partition_by in the output RecordBatches
+/// Should DataFusion keep the columns used for partition_by in the output RecordBatches
pub keep_partition_by_columns: bool, default = false
}
}
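For context, the option documented above can also be set programmatically. The sketch below (not part of the PR) assumes the `SessionConfig::set_bool` builder and the `datafusion.execution.*` key spelling shown later in configs.md.

```rust
use datafusion::prelude::{SessionConfig, SessionContext};

fn main() {
    // Keep the partition_by columns in the output RecordBatches.
    let config = SessionConfig::new()
        .set_bool("datafusion.execution.keep_partition_by_columns", true);
    let _ctx = SessionContext::new_with_config(config);
}
```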
2 changes: 1 addition & 1 deletion datafusion/core/src/dataframe/mod.rs
@@ -1472,7 +1472,7 @@ impl DataFrame {
///
/// The method supports case sensitive rename by wrapping the column name in one of the following symbols ( " or ' or ` )
///
-/// Alternatively setting Datafusion param `datafusion.sql_parser.enable_ident_normalization` to `false` will enable
+/// Alternatively setting DataFusion param `datafusion.sql_parser.enable_ident_normalization` to `false` will enable
/// case sensitive rename without the need to wrap the column name in special symbols
///
/// # Example
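To make the quoting rule in this doc comment concrete, here is a small sketch (not part of the PR) that assumes the comment belongs to `DataFrame::with_column_renamed`; the column names are hypothetical.

```rust
use datafusion::error::Result;
use datafusion::prelude::SessionContext;

#[tokio::main]
async fn main() -> Result<()> {
    let ctx = SessionContext::new();
    let df = ctx.sql(r#"SELECT 1 AS "Amount""#).await?;
    // Wrapping the old name in double quotes keeps the lookup case sensitive.
    let df = df.with_column_renamed(r#""Amount""#, "total")?;
    df.show().await?;
    Ok(())
}
```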
2 changes: 1 addition & 1 deletion datafusion/expr/src/signature.rs
@@ -93,7 +93,7 @@ pub enum TypeSignature {
Variadic(Vec<DataType>),
/// The acceptable signature and coercions rules to coerce arguments to this
/// signature are special for this function. If this signature is specified,
-/// Datafusion will call [`ScalarUDFImpl::coerce_types`] to prepare argument types.
+/// DataFusion will call [`ScalarUDFImpl::coerce_types`] to prepare argument types.
///
/// [`ScalarUDFImpl::coerce_types`]: crate::udf::ScalarUDFImpl::coerce_types
UserDefined,
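As background for this variant, the sketch below (an editor's addition, not from the PR) shows how a UDF might opt into `TypeSignature::UserDefined` and supply its own coercion. The struct name, the cast-everything-to-Int64 rule, and the pass-through `invoke` body are all hypothetical, and the `ScalarUDFImpl` method set shown matches DataFusion of roughly this era; newer versions may differ.

```rust
use std::any::Any;

use datafusion::arrow::datatypes::DataType;
use datafusion::common::Result;
use datafusion::logical_expr::{
    ColumnarValue, ScalarUDFImpl, Signature, TypeSignature, Volatility,
};

/// Hypothetical UDF that accepts any arguments and coerces them all to Int64.
#[derive(Debug)]
struct CoercingUdf {
    signature: Signature,
}

impl CoercingUdf {
    fn new() -> Self {
        Self {
            // `UserDefined` tells the planner to call `coerce_types` below.
            signature: Signature {
                type_signature: TypeSignature::UserDefined,
                volatility: Volatility::Immutable,
            },
        }
    }
}

impl ScalarUDFImpl for CoercingUdf {
    fn as_any(&self) -> &dyn Any {
        self
    }
    fn name(&self) -> &str {
        "coercing_udf"
    }
    fn signature(&self) -> &Signature {
        &self.signature
    }
    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
        Ok(DataType::Int64)
    }
    fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
        // By the time this runs, every argument has already been cast to Int64.
        Ok(args[0].clone())
    }
    fn coerce_types(&self, arg_types: &[DataType]) -> Result<Vec<DataType>> {
        // Ask DataFusion to cast every argument to Int64 before invocation.
        Ok(vec![DataType::Int64; arg_types.len()])
    }
}

fn main() {
    let _udf = CoercingUdf::new();
}
```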
2 changes: 1 addition & 1 deletion datafusion/optimizer/src/unwrap_cast_in_comparison.rs
@@ -1080,7 +1080,7 @@ mod tests {
),
};

-// Datafusion ignores timezones for comparisons of ScalarValue
+// DataFusion ignores timezones for comparisons of ScalarValue
// so double check it here
assert_eq!(lit_tz_none, lit_tz_utc);

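To illustrate the behavior the comment above asserts, here is a standalone sketch (not part of the PR) that mirrors the test's `assert_eq!`; the timestamp value and the `UTC` tag are illustrative.

```rust
use datafusion::common::ScalarValue;
use datafusion::prelude::lit;

fn main() {
    // Two literal expressions for the same instant, one without a timezone
    // and one tagged as UTC.
    let lit_tz_none = lit(ScalarValue::TimestampNanosecond(Some(1_000), None));
    let lit_tz_utc = lit(ScalarValue::TimestampNanosecond(Some(1_000), Some("UTC".into())));
    // As the test comment notes, the timezone annotation is ignored here.
    assert_eq!(lit_tz_none, lit_tz_utc);
    println!("equal despite differing timezone annotations");
}
```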
2 changes: 1 addition & 1 deletion datafusion/physical-expr/src/intervals/cp_solver.rs
@@ -176,7 +176,7 @@ impl ExprIntervalGraphNode {
&self.interval
}

-/// This function creates a DAEG node from Datafusion's [`ExprTreeNode`]
+/// This function creates a DAEG node from DataFusion's [`ExprTreeNode`]
/// object. Literals are created with definite, singleton intervals while
/// any other expression starts with an indefinite interval ([-∞, ∞]).
pub fn make_node(node: &ExprTreeNode<NodeIndex>, schema: &Schema) -> Result<Self> {
2 changes: 1 addition & 1 deletion datafusion/physical-plan/src/aggregates/mod.rs
@@ -324,7 +324,7 @@ impl AggregateExec {

/// Create a new hash aggregate execution plan with the given schema.
/// This constructor isn't part of the public API, it is used internally
-/// by Datafusion to enforce schema consistency during when re-creating
+/// by DataFusion to enforce schema consistency during when re-creating
/// `AggregateExec`s inside optimization rules. Schema field names of an
`AggregateExec` depend on the names of aggregate expressions. Since
/// a rule may re-write aggregate expressions (e.g. reverse them) during
2 changes: 1 addition & 1 deletion datafusion/sql/src/parser.rs
@@ -253,7 +253,7 @@ fn ensure_not_set<T>(field: &Option<T>, name: &str) -> Result<(), ParserError> {
Ok(())
}

-/// Datafusion SQL Parser based on [`sqlparser`]
+/// DataFusion SQL Parser based on [`sqlparser`]
///
/// Parses DataFusion's SQL dialect, often delegating to [`sqlparser`]'s [`Parser`].
///
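As a quick illustration of the parser this doc comment describes, the sketch below (not part of the PR) feeds one DataFusion-specific statement to `DFParser::parse_sql`; the table name and file path are made up.

```rust
use datafusion::sql::parser::DFParser;

fn main() {
    // Standard SQL is delegated to `sqlparser`; DataFusion-specific syntax
    // such as CREATE EXTERNAL TABLE is handled by DFParser itself.
    let statements = DFParser::parse_sql(
        "CREATE EXTERNAL TABLE t STORED AS PARQUET LOCATION 'data/t.parquet'",
    )
    .expect("SQL should parse");
    println!("parsed {} statement(s)", statements.len());
}
```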
2 changes: 1 addition & 1 deletion datafusion/sqllogictest/README.md
@@ -225,7 +225,7 @@ query <type_string> <sort_mode>
<expected_result>
```

-- `test_name`: Uniquely identify the test name (Datafusion only)
+- `test_name`: Uniquely identify the test name (DataFusion only)
- `type_string`: A short string that specifies the number of result columns and the expected datatype of each result
column. There is one character in the <type_string> for each result column. The character codes are:
- 'B' - **B**oolean,
2 changes: 1 addition & 1 deletion datafusion/sqllogictest/test_files/information_schema.slt
@@ -257,7 +257,7 @@ datafusion.execution.batch_size 8192 Default batch size while creating new batch
datafusion.execution.coalesce_batches true When set to true, record batches will be examined between each operator and small batches will be coalesced into larger batches. This is helpful when there are highly selective filters or joins that could produce tiny output batches. The target batch size is determined by the configuration setting
datafusion.execution.collect_statistics false Should DataFusion collect statistics after listing files
datafusion.execution.enable_recursive_ctes true Should DataFusion support recursive CTEs
-datafusion.execution.keep_partition_by_columns false Should Datafusion keep the columns used for partition_by in the output RecordBatches
+datafusion.execution.keep_partition_by_columns false Should DataFusion keep the columns used for partition_by in the output RecordBatches
datafusion.execution.listing_table_ignore_subdirectory true Should sub directories be ignored when scanning directories for data files. Defaults to true (ignores subdirectories), consistent with Hive. Note that this setting does not affect reading partitioned tables (e.g. `/table/year=2021/month=01/data.parquet`).
datafusion.execution.max_buffered_batches_per_output_file 2 This is the maximum number of RecordBatches buffered for each output file being worked. Higher values can potentially give faster write performance at the cost of higher peak memory consumption
datafusion.execution.meta_fetch_concurrency 32 Number of files to read in parallel when inferring schema and statistics
6 changes: 3 additions & 3 deletions datafusion/sqllogictest/test_files/window.slt
@@ -2236,7 +2236,7 @@ SELECT SUM(c12) OVER(ORDER BY c1, c2 GROUPS BETWEEN 1 PRECEDING AND 1 FOLLOWING)
7.728066219895 NULL

# test_c9_rn_ordering_alias
-# These tests check whether Datafusion is aware of the ordering generated by the ROW_NUMBER() window function.
+# These tests check whether DataFusion is aware of the ordering generated by the ROW_NUMBER() window function.
# Physical plan shouldn't have a SortExec after the BoundedWindowAggExec since the table after BoundedWindowAggExec is already ordered by rn1 ASC and c9 DESC.
query TT
EXPLAIN SELECT c9, rn1 FROM (SELECT c9,
@@ -2275,7 +2275,7 @@ SELECT c9, rn1 FROM (SELECT c9,
145294611 5

# test_c9_rn_ordering_alias_opposite_direction
-# These tests check whether Datafusion is aware of the ordering generated by the ROW_NUMBER() window function.
+# These tests check whether DataFusion is aware of the ordering generated by the ROW_NUMBER() window function.
# Physical plan shouldn't have a SortExec after the BoundedWindowAggExec since the table after BoundedWindowAggExec is already ordered by rn1 ASC and c9 DESC.
query TT
EXPLAIN SELECT c9, rn1 FROM (SELECT c9,
@@ -2314,7 +2314,7 @@ SELECT c9, rn1 FROM (SELECT c9,
4076864659 5

# test_c9_rn_ordering_alias_opposite_direction2
-# These tests check whether Datafusion is aware of the ordering generated by the ROW_NUMBER() window function.
+# These tests check whether DataFusion is aware of the ordering generated by the ROW_NUMBER() window function.
# Physical plan _should_ have a SortExec after BoundedWindowAggExec since the table after BoundedWindowAggExec is ordered by rn1 ASC and c9 DESC, which is conflicting with the requirement rn1 DESC.
query TT
EXPLAIN SELECT c9, rn1 FROM (SELECT c9,
2 changes: 1 addition & 1 deletion docs/source/contributor-guide/inviting.md
@@ -59,7 +59,7 @@ the person. Here is an example:
To: [email protected]
Subject: [DISCUSS] $PERSONS_NAME for Committer

-$PERSONS_NAME has been an active contributor to the Datafusion community for the
+$PERSONS_NAME has been an active contributor to the DataFusion community for the
last 6 months[1][2], helping others, answering questions, and improving the
project's code.

2 changes: 1 addition & 1 deletion docs/source/user-guide/configs.md
@@ -86,7 +86,7 @@ Environment variables are read during `SessionConfig` initialisation so they mus
| datafusion.execution.listing_table_ignore_subdirectory | true | Should sub directories be ignored when scanning directories for data files. Defaults to true (ignores subdirectories), consistent with Hive. Note that this setting does not affect reading partitioned tables (e.g. `/table/year=2021/month=01/data.parquet`). |
| datafusion.execution.enable_recursive_ctes | true | Should DataFusion support recursive CTEs |
| datafusion.execution.split_file_groups_by_statistics | false | Attempt to eliminate sorts by packing & sorting files with non-overlapping statistics into the same file groups. Currently experimental |
-| datafusion.execution.keep_partition_by_columns | false | Should Datafusion keep the columns used for partition_by in the output RecordBatches |
+| datafusion.execution.keep_partition_by_columns | false | Should DataFusion keep the columns used for partition_by in the output RecordBatches |
| datafusion.optimizer.enable_distinct_aggregation_soft_limit | true | When set to true, the optimizer will push a limit operation into grouped aggregations which have no aggregate expressions, as a soft limit, emitting groups once the limit is reached, before all rows in the group are read. |
| datafusion.optimizer.enable_round_robin_repartition | true | When set to true, the physical plan optimizer will try to add round robin repartitioning to increase parallelism to leverage more CPU cores |
| datafusion.optimizer.enable_topk_aggregation | true | When set to true, the optimizer will attempt to perform limit operations during aggregations, if possible |
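As an aside on the environment-variable mechanism mentioned in the hunk header above, here is a small sketch (not part of the PR); the `DATAFUSION_EXECUTION_BATCH_SIZE` name follows the documented convention and the value 4096 is illustrative.

```rust
use datafusion::error::Result;
use datafusion::prelude::{SessionConfig, SessionContext};

fn main() -> Result<()> {
    // Environment variables must be set before the config is constructed.
    std::env::set_var("DATAFUSION_EXECUTION_BATCH_SIZE", "4096");
    let config = SessionConfig::from_env()?;
    let _ctx = SessionContext::new_with_config(config);
    Ok(())
}
```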