
[doc] typos and clarity
rdettai committed Oct 12, 2021
1 parent d69fc9a commit 2e4452a
Showing 10 changed files with 20 additions and 23 deletions.
2 changes: 1 addition & 1 deletion ballista/rust/client/src/context.rs
@@ -239,7 +239,7 @@ impl BallistaContext {
/// Create a DataFrame from a SQL statement.
///
/// This method is `async` because queries of type `CREATE EXTERNAL TABLE`
-/// might require the schema to be infered.
+/// might require the schema to be inferred.
pub async fn sql(&self, sql: &str) -> Result<Arc<dyn DataFrame>> {
let mut ctx = {
let state = self.state.lock().unwrap();
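For context, a minimal sketch of the API this doc comment describes, assuming the Ballista client interface of this era (the scheduler address and file path are hypothetical). The `await` matters because a `CREATE EXTERNAL TABLE` statement may have to open the file and infer its schema before the query can be planned:

use ballista::prelude::*;

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let config = BallistaConfig::builder().build()?;
    // Connect to a (hypothetical) Ballista scheduler.
    let ctx = BallistaContext::remote("localhost", 50050, &config);
    // Schema inference for the CSV file happens inside this call,
    // which is why `sql` is async.
    ctx.sql("CREATE EXTERNAL TABLE users STORED AS CSV LOCATION 'users.csv'")
        .await?;
    Ok(())
}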
4 changes: 2 additions & 2 deletions ballista/rust/core/src/serde/physical_plan/to_proto.rs
@@ -261,7 +261,7 @@ impl TryInto<protobuf::PhysicalPlanNode> for Arc<dyn ExecutionPlan> {
.as_ref()
.ok_or_else(|| {
BallistaError::General(
"projection in CsvExec dosn not exist.".to_owned(),
"projection in CsvExec does not exist.".to_owned(),
)
})?
.iter()
@@ -320,7 +320,7 @@ impl TryInto<protobuf::PhysicalPlanNode> for Arc<dyn ExecutionPlan> {
.as_ref()
.ok_or_else(|| {
BallistaError::General(
"projection in AvroExec dosn not exist.".to_owned(),
"projection in AvroExec does not exist.".to_owned(),
)
})?
.iter()
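The two hunks above fix the same message in two places. As a standalone sketch of the pattern (a hypothetical helper, not the actual Ballista code): an optional projection is surfaced as an error instead of being unwrapped.

/// Serialize an optional projection, turning a missing one into an
/// error rather than a panic.
fn projection_to_proto(projection: &Option<Vec<usize>>) -> Result<Vec<u32>, String> {
    let indices = projection
        .as_ref()
        .ok_or_else(|| "projection in CsvExec does not exist.".to_owned())?;
    Ok(indices.iter().map(|i| *i as u32).collect())
}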
7 changes: 3 additions & 4 deletions datafusion/src/datasource/file_format/mod.rs
@@ -27,7 +27,6 @@ use std::fmt;
use std::sync::Arc;

use crate::arrow::datatypes::SchemaRef;
-use crate::datasource::{create_max_min_accs, get_col_stats};
use crate::error::Result;
use crate::logical_plan::Expr;
use crate::physical_plan::{ExecutionPlan, Statistics};
@@ -44,15 +43,15 @@ pub struct PhysicalPlanConfig {
pub object_store: Arc<dyn ObjectStore>,
/// Schema before projection
pub schema: SchemaRef,
-/// Partitioned fields to process in the executor
+/// List of files to be processed, grouped into partitions
pub files: Vec<Vec<PartitionedFile>>,
-/// Estimated overall statistics of source plan
+/// Estimated overall statistics of the plan, taking `filters` into account
pub statistics: Statistics,
/// Columns on which to project the data
pub projection: Option<Vec<usize>>,
/// The maximum number of records per arrow column
pub batch_size: usize,
-/// The filters that where pushed down to this execution plan
+/// The filters that were pushed down to this execution plan
pub filters: Vec<Expr>,
/// The minimum number of records required from this source plan
pub limit: Option<usize>,
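To illustrate the reworded `files` doc comment, a plain-strings sketch (not the real `PartitionedFile` type, and the paths are hypothetical) of how files map to partitions:

fn main() {
    // Outer Vec: one entry per output partition; inner Vec: the files
    // that a single task reads back to back.
    let files: Vec<Vec<&str>> = vec![
        vec!["part-0.parquet", "part-1.parquet"], // partition 0
        vec!["part-2.parquet"],                   // partition 1
    ];
    assert_eq!(files.len(), 2); // number of output partitions
}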
2 changes: 1 addition & 1 deletion datafusion/src/datasource/file_format/parquet.rs
@@ -36,9 +36,9 @@ use parquet::file::statistics::Statistics as ParquetStatistics;

use super::FileFormat;
use super::PhysicalPlanConfig;
-use super::{create_max_min_accs, get_col_stats};
use crate::arrow::datatypes::{DataType, Field};
use crate::datasource::object_store::{ObjectReader, ObjectReaderStream};
+use crate::datasource::{create_max_min_accs, get_col_stats};
use crate::error::DataFusionError;
use crate::error::Result;
use crate::logical_plan::combine_filters;
14 changes: 7 additions & 7 deletions datafusion/src/datasource/listing.rs
@@ -46,7 +46,7 @@ pub struct ListingOptions {
pub file_extension: String,
/// The file format
pub format: Arc<dyn FileFormat>,
-/// The expected partition column names.
+/// The expected partition column names in the folder structure.
/// For example `Vec["a", "b"]` means that the two first levels of
/// partitioning expected should be named "a" and "b":
/// - If there is a third level of partitioning it will be ignored.
@@ -55,11 +55,11 @@
/// TODO implement case where partitions.len() > 0
pub partitions: Vec<String>,
/// Set true to try to guess statistics from the files.
-/// This can add a lot of overhead as it requires files to
-/// be opened and partially parsed.
+/// This can add a lot of overhead as it will usually require files
+/// to be opened and at least partially parsed.
pub collect_stat: bool,
-/// Group files to avoid that the number of partitions
-/// exceeds this limit
+/// Group files to avoid that the number of partitions exceeds
+/// this limit
pub target_partitions: usize,
}

@@ -80,8 +80,8 @@ impl ListingOptions {
}
}

-/// Infer the schema of the files at the given uri, including the partitioning
-/// columns.
+/// Infer the schema of the files at the given path on the provided object store.
+/// The inferred schema should include the partitioning columns.
///
/// This method will not be called by the table itself but before creating it.
/// This way when creating the logical plan we can decide to resolve the schema
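A sketch of how these options fit together, assuming `ParquetFormat::default()` exists and the module paths match this commit's file layout. It describes a hypothetical folder layout `/tables/events/a=<v>/b=<v>/*.parquet` whose first two levels are the expected partition columns:

use std::sync::Arc;

use datafusion::datasource::file_format::parquet::ParquetFormat;
use datafusion::datasource::listing::ListingOptions;

fn listing_options() -> ListingOptions {
    ListingOptions {
        file_extension: ".parquet".to_owned(),
        format: Arc::new(ParquetFormat::default()),
        // First two folder levels are partition columns "a" and "b";
        // a third level would be ignored, as documented above.
        partitions: vec!["a".to_owned(), "b".to_owned()],
        // Opens files to gather statistics, at the cost described above.
        collect_stat: true,
        // Group files so that at most 8 partitions are produced.
        target_partitions: 8,
    }
}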
3 changes: 0 additions & 3 deletions datafusion/src/datasource/mod.rs
@@ -40,7 +40,6 @@ use std::pin::Pin;
/// if the optional `limit` is provided, includes only sufficient files
/// needed to read up to `limit` number of rows
/// TODO fix case where `num_rows` and `total_byte_size` are not defined (stat should be None instead of Some(0))
-/// TODO move back to crate::datasource::mod.rs once legacy cleaned up
pub async fn get_statistics_with_limit(
all_files: impl Stream<Item = Result<(PartitionedFile, Statistics)>>,
schema: SchemaRef,
@@ -126,7 +125,6 @@
#[derive(Debug, Clone)]
/// A single file that should be read, along with its schema, statistics
/// and partition column values that need to be appended to each row.
-/// TODO move back to crate::datasource::mod.rs once legacy cleaned up
pub struct PartitionedFile {
/// Path for the file (e.g. URL, filesystem path, etc)
pub file_meta: FileMeta,
@@ -159,7 +157,6 @@ impl std::fmt::Display for PartitionedFile {

#[derive(Debug, Clone)]
/// A collection of files that should be read in a single task
-/// TODO move back to crate::datasource::mod.rs once legacy cleaned up
pub struct FilePartition {
/// The index of the partition among all partitions
pub index: usize,
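The `limit` behaviour of `get_statistics_with_limit` described above, as a standalone numeric sketch (plain row counts, not the real stream and statistics types):

/// Number of files needed so that their cumulative row count reaches `limit`.
fn files_needed(row_counts: &[usize], limit: usize) -> usize {
    let mut rows = 0;
    for (i, n) in row_counts.iter().enumerate() {
        rows += n;
        if rows >= limit {
            return i + 1; // enough rows collected at this file
        }
    }
    row_counts.len() // limit exceeds the table: every file is needed
}

fn main() {
    // Three files of 100 rows each; a limit of 150 only needs the first two.
    assert_eq!(files_needed(&[100, 100, 100], 150), 2);
}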
5 changes: 3 additions & 2 deletions datafusion/src/datasource/object_store/local.rs
@@ -152,12 +152,13 @@ async fn list_all(prefix: String) -> Result<FileMetaStream> {
}
}

-/// Create a stream of `ObjectReader` by opening each file in the `files` vector
+/// Create a stream of `ObjectReader` by converting each file in the `files` vector
+/// into instances of `LocalFileReader`
pub fn local_object_reader_stream(files: Vec<String>) -> ObjectReaderStream {
Box::pin(futures::stream::iter(files).map(|f| Ok(local_object_reader(f))))
}

-/// Helper method to convert a file location to an ObjectReader
+/// Helper method to convert a file location to a `LocalFileReader`
pub fn local_object_reader(file: String) -> Arc<dyn ObjectReader> {
LocalFileSystem
.file_reader(local_file_meta(file).sized_file)
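A usage sketch, assuming this module path; the file paths are hypothetical:

use datafusion::datasource::object_store::local::local_object_reader_stream;

fn main() {
    // Each path is converted into a `LocalFileReader` and yielded from
    // the stream as an `Arc<dyn ObjectReader>`.
    let _readers = local_object_reader_stream(vec![
        "/tmp/data/part-0.csv".to_owned(),
        "/tmp/data/part-1.csv".to_owned(),
    ]);
}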
2 changes: 1 addition & 1 deletion datafusion/src/datasource/object_store/mod.rs
@@ -73,7 +73,7 @@ pub enum ListEntry {
#[derive(Debug, Clone)]
pub struct SizedFile {
/// Path of the file. It is relative to the current object
-/// store (it does not specify the xx:// scheme).
+/// store (it does not specify the `xx://` scheme).
pub path: String,
/// File size in total
pub size: u64,
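An illustration of the clarified doc comment, assuming this module path; the path is hypothetical and deliberately scheme-less:

use datafusion::datasource::object_store::SizedFile;

fn main() {
    let file = SizedFile {
        // Relative to the object store that resolves it; a store
        // registered for `s3://` would supply that scheme itself.
        path: "my-bucket/events/part-0.parquet".to_owned(),
        size: 4096, // total file size in bytes
    };
    assert!(!file.path.contains("://"));
}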
2 changes: 1 addition & 1 deletion datafusion/src/execution/context.rs
@@ -183,7 +183,7 @@ impl ExecutionContext {
/// Creates a dataframe that will execute a SQL query.
///
/// This method is `async` because queries of type `CREATE EXTERNAL TABLE`
-/// might require the schema to be infered.
+/// might require the schema to be inferred.
pub async fn sql(&mut self, sql: &str) -> Result<Arc<dyn DataFrame>> {
let plan = self.create_logical_plan(sql)?;
match plan {
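A minimal sketch of the behaviour this doc comment explains, with a hypothetical CSV path; the first `await` may perform I/O because the file has to be opened to infer its schema:

use datafusion::error::Result;
use datafusion::prelude::*;

#[tokio::main]
async fn main() -> Result<()> {
    let mut ctx = ExecutionContext::new();
    // Schema inference for `users.csv` happens inside this call.
    ctx.sql("CREATE EXTERNAL TABLE users STORED AS CSV LOCATION 'users.csv'")
        .await?;
    let df = ctx.sql("SELECT count(*) FROM users").await?;
    let batches = df.collect().await?;
    println!("{:?}", batches);
    Ok(())
}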
2 changes: 1 addition & 1 deletion datafusion/src/physical_plan/expressions/binary.rs
@@ -484,7 +484,7 @@ pub fn binary_operator_data_type(
rhs_type: &DataType,
) -> Result<DataType> {
// validate that it is possible to perform the operation on incoming types.
-// (or the return datatype cannot be infered)
+// (or the return datatype cannot be inferred)
let common_type = common_binary_type(lhs_type, op, rhs_type)?;

match op {
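A sketch of what the function computes, assuming `binary_operator_data_type` and `Operator` are publicly exported at these paths: arithmetic returns the coerced common type, while comparisons return Boolean.

use datafusion::arrow::datatypes::DataType;
use datafusion::error::Result;
use datafusion::logical_plan::Operator;
use datafusion::physical_plan::expressions::binary_operator_data_type;

fn main() -> Result<()> {
    // Int32 + Float64 coerces to the common type Float64.
    let sum_ty =
        binary_operator_data_type(&DataType::Int32, &Operator::Plus, &DataType::Float64)?;
    assert_eq!(sum_ty, DataType::Float64);
    // Comparisons coerce their inputs but always return Boolean.
    let cmp_ty =
        binary_operator_data_type(&DataType::Int32, &Operator::Lt, &DataType::Int64)?;
    assert_eq!(cmp_ty, DataType::Boolean);
    Ok(())
}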
