From cd1d146d4916229dbcd304679abf804d8593d316 Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Mon, 8 Jul 2024 17:00:10 -0400
Subject: [PATCH] Improve `DataFrame` Users Guide (#11324)

* Improve `DataFrame` Users Guide

* typo

* Update docs/source/user-guide/dataframe.md

Co-authored-by: Oleks V

---------

Co-authored-by: Oleks V
---
 datafusion/core/src/lib.rs          |   6 ++
 docs/source/user-guide/dataframe.md | 123 +++++++++++-----------------
 2 files changed, 53 insertions(+), 76 deletions(-)

diff --git a/datafusion/core/src/lib.rs b/datafusion/core/src/lib.rs
index fb7abcd795e88..956e9f7246a36 100644
--- a/datafusion/core/src/lib.rs
+++ b/datafusion/core/src/lib.rs
@@ -626,6 +626,12 @@ doc_comment::doctest!(
     user_guide_configs
 );
 
+#[cfg(doctest)]
+doc_comment::doctest!(
+    "../../../docs/source/user-guide/dataframe.md",
+    user_guide_dataframe
+);
+
 #[cfg(doctest)]
 doc_comment::doctest!(
     "../../../docs/source/user-guide/expressions.md",
diff --git a/docs/source/user-guide/dataframe.md b/docs/source/user-guide/dataframe.md
index f011e68fadb29..c3d0b6c2d6881 100644
--- a/docs/source/user-guide/dataframe.md
+++ b/docs/source/user-guide/dataframe.md
@@ -19,17 +19,30 @@
 
 # DataFrame API
 
-A DataFrame represents a logical set of rows with the same named columns, similar to a [Pandas DataFrame](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html) or
-[Spark DataFrame](https://spark.apache.org/docs/latest/sql-programming-guide.html).
+A DataFrame represents a logical set of rows with the same named columns,
+similar to a [Pandas DataFrame] or [Spark DataFrame].
 
-DataFrames are typically created by calling a method on
-`SessionContext`, such as `read_csv`, and can then be modified
-by calling the transformation methods, such as `filter`, `select`, `aggregate`, and `limit`
-to build up a query definition.
+DataFrames are typically created by calling a method on [`SessionContext`], such
+as [`read_csv`], and can then be modified by calling the transformation methods,
+such as [`filter`], [`select`], [`aggregate`], and [`limit`] to build up a query
+definition.
 
-The query can be executed by calling the `collect` method.
+The query can be executed by calling the [`collect`] method.
 
-The DataFrame struct is part of DataFusion's prelude and can be imported with the following statement.
+DataFusion DataFrames use lazy evaluation, meaning that each transformation
+creates a new plan but does not actually perform any immediate actions. This
+approach allows for the overall plan to be optimized before execution. The plan
+is evaluated (executed) when an action method is invoked, such as [`collect`].
+See the [Library Users Guide] for more details.
+
+The DataFrame API is well documented in the [API reference on docs.rs].
+Please refer to the [Expressions Reference] for more information on
+building logical expressions (`Expr`) to use with the DataFrame API.
+
+## Example
+
+The DataFrame struct is part of DataFusion's `prelude` and can be imported with
+the following statement.
 
 ```rust
 use datafusion::prelude::*;
@@ -38,73 +51,31 @@ use datafusion::prelude::*;
 Here is a minimal example showing the execution of a query using the DataFrame API.
 
 ```rust
-let ctx = SessionContext::new();
-let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?;
-let df = df.filter(col("a").lt_eq(col("b")))?
-           .aggregate(vec![col("a")], vec![min(col("b"))])?
-           .limit(0, Some(100))?;
-// Print results
-df.show().await?;
+use datafusion::prelude::*;
+use datafusion::error::Result;
+
+#[tokio::main]
+async fn main() -> Result<()> {
+    let ctx = SessionContext::new();
+    let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?;
+    let df = df.filter(col("a").lt_eq(col("b")))?
+        .aggregate(vec![col("a")], vec![min(col("b"))])?
+        .limit(0, Some(100))?;
+    // Print results
+    df.show().await?;
+    Ok(())
+}
 ```
 
-The DataFrame API is well documented in the [API reference on docs.rs](https://docs.rs/datafusion/latest/datafusion/dataframe/struct.DataFrame.html).
-
-Refer to the [Expressions Reference](expressions) for available functions for building logical expressions for use with the
-DataFrame API.
-
-## DataFrame Transformations
-
-These methods create a new DataFrame after applying a transformation to the logical plan that the DataFrame represents.
-
-DataFusion DataFrames use lazy evaluation, meaning that each transformation is just creating a new query plan and
-not actually performing any transformations. This approach allows for the overall plan to be optimized before
-execution. The plan is evaluated (executed) when an action method is invoked, such as `collect`.
-
-| Function            | Notes                                                                                                                                       |
-| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
-| aggregate           | Perform an aggregate query with optional grouping expressions.                                                                              |
-| distinct            | Filter out duplicate rows.                                                                                                                  |
-| distinct_on         | Filter out duplicate rows based on provided expressions.                                                                                    |
-| drop_columns        | Create a projection with all but the provided column names.                                                                                 |
-| except              | Calculate the exception of two DataFrames. The two DataFrames must have exactly the same schema                                             |
-| filter              | Filter a DataFrame to only include rows that match the specified filter expression.                                                         |
-| intersect           | Calculate the intersection of two DataFrames. The two DataFrames must have exactly the same schema                                          |
-| join                | Join this DataFrame with another DataFrame using the specified columns as join keys.                                                        |
-| join_on             | Join this DataFrame with another DataFrame using arbitrary expressions.                                                                     |
-| limit               | Limit the number of rows returned from this DataFrame.                                                                                      |
-| repartition         | Repartition a DataFrame based on a logical partitioning scheme.                                                                             |
-| sort                | Sort the DataFrame by the specified sorting expressions. Any expression can be turned into a sort expression by calling its `sort` method.  |
-| select              | Create a projection based on arbitrary expressions. Example: `df.select(vec![col("c1"), abs(col("c2"))])?`                                  |
-| select_columns      | Create a projection based on column names. Example: `df.select_columns(&["id", "name"])?`.                                                  |
-| union               | Calculate the union of two DataFrames, preserving duplicate rows. The two DataFrames must have exactly the same schema.                     |
-| union_distinct      | Calculate the distinct union of two DataFrames. The two DataFrames must have exactly the same schema.                                       |
-| with_column         | Add an additional column to the DataFrame.                                                                                                  |
-| with_column_renamed | Rename one column by applying a new projection.                                                                                             |
-
-## DataFrame Actions
-
-These methods execute the logical plan represented by the DataFrame and either collects the results into memory, prints them to stdout, or writes them to disk.
-
-| Function                   | Notes                                                                                                                         |
-| -------------------------- | ----------------------------------------------------------------------------------------------------------------------------- |
-| collect                    | Executes this DataFrame and collects all results into a vector of RecordBatch.                                                 |
-| collect_partitioned        | Executes this DataFrame and collects all results into a vector of vector of RecordBatch maintaining the input partitioning.    |
-| count                      | Executes this DataFrame to get the total number of rows.                                                                       |
-| execute_stream             | Executes this DataFrame and returns a stream over a single partition.                                                          |
-| execute_stream_partitioned | Executes this DataFrame and returns one stream per partition.                                                                   |
-| show                       | Execute this DataFrame and print the results to stdout.                                                                        |
-| show_limit                 | Execute this DataFrame and print a subset of results to stdout.                                                                |
-| write_csv                  | Execute this DataFrame and write the results to disk in CSV format.                                                            |
-| write_json                 | Execute this DataFrame and write the results to disk in JSON format.                                                           |
-| write_parquet              | Execute this DataFrame and write the results to disk in Parquet format.                                                        |
-| write_table                | Execute this DataFrame and write the results via the insert_into method of the registered TableProvider                       |
-
-## Other DataFrame Methods
-
-| Function            | Notes                                                                                                                                          |
-| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
-| explain             | Return a DataFrame with the explanation of its plan so far.                                                                                    |
-| registry            | Return a `FunctionRegistry` used to plan udf's calls.                                                                                          |
-| schema              | Returns the schema describing the output of this DataFrame in terms of columns returned, where each column has a name, data type, and nullability attribute.  |
-| to_logical_plan     | Return the optimized logical plan represented by this DataFrame.                                                                               |
-| to_unoptimized_plan | Return the unoptimized logical plan represented by this DataFrame.                                                                             |
+[pandas dataframe]: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html
+[spark dataframe]: https://spark.apache.org/docs/latest/sql-programming-guide.html
+[`sessioncontext`]: https://docs.rs/datafusion/latest/datafusion/execution/context/struct.SessionContext.html
+[`read_csv`]: https://docs.rs/datafusion/latest/datafusion/execution/context/struct.SessionContext.html#method.read_csv
+[`filter`]: https://docs.rs/datafusion/latest/datafusion/dataframe/struct.DataFrame.html#method.filter
+[`select`]: https://docs.rs/datafusion/latest/datafusion/dataframe/struct.DataFrame.html#method.select
+[`aggregate`]: https://docs.rs/datafusion/latest/datafusion/dataframe/struct.DataFrame.html#method.aggregate
+[`limit`]: https://docs.rs/datafusion/latest/datafusion/dataframe/struct.DataFrame.html#method.limit
+[`collect`]: https://docs.rs/datafusion/latest/datafusion/dataframe/struct.DataFrame.html#method.collect
+[library users guide]: ../library-user-guide/using-the-dataframe-api.md
+[api reference on docs.rs]: https://docs.rs/datafusion/latest/datafusion/dataframe/struct.DataFrame.html
+[expressions reference]: expressions
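
The example added by the patch prints results with `show()`. The new prose also points readers at [`collect`], which executes the same kind of lazy plan but returns the results as Arrow `RecordBatch`es for further processing. The sketch below is one way that pattern could look; it reuses the `tests/data/example.csv` file and the `a`/`b` columns from the guide's example, and formats the output with arrow's `pretty_format_batches` helper re-exported through `datafusion::arrow` (an illustrative choice, not something the patch itself mandates).

```rust
use datafusion::arrow::util::pretty::pretty_format_batches;
use datafusion::error::Result;
use datafusion::prelude::*;

#[tokio::main]
async fn main() -> Result<()> {
    let ctx = SessionContext::new();

    // Each transformation only extends the logical plan; nothing runs yet.
    let df = ctx
        .read_csv("tests/data/example.csv", CsvReadOptions::new())
        .await?
        .filter(col("a").lt_eq(col("b")))?
        .select(vec![col("a"), col("b")])?
        .limit(0, Some(100))?;

    // `collect` is the action: the optimized plan executes and the results
    // come back as a Vec<RecordBatch> instead of being printed.
    let batches = df.collect().await?;
    println!("{}", pretty_format_batches(&batches)?);
    Ok(())
}
```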
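The action methods listed in the removed table also include streaming variants such as `execute_stream`, which run the plan but hand results back one batch at a time rather than buffering everything in memory. A rough sketch of that usage, assuming the `futures` crate is available for `StreamExt::next` (it is not pulled in by the DataFusion prelude):

```rust
use datafusion::error::Result;
use datafusion::prelude::*;
use futures::StreamExt;

#[tokio::main]
async fn main() -> Result<()> {
    let ctx = SessionContext::new();
    let df = ctx
        .read_csv("tests/data/example.csv", CsvReadOptions::new())
        .await?
        .filter(col("a").lt_eq(col("b")))?;

    // `execute_stream` is also an action: it executes the plan and yields
    // the results incrementally as RecordBatches.
    let mut stream = df.execute_stream().await?;
    while let Some(batch) = stream.next().await {
        println!("got a batch with {} rows", batch?.num_rows());
    }
    Ok(())
}
```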