diff --git a/datafusion/core/tests/capitalized_example.csv b/datafusion/core/tests/capitalized_example.csv new file mode 100644 index 000000000000..dbc8f5c5a0a6 --- /dev/null +++ b/datafusion/core/tests/capitalized_example.csv @@ -0,0 +1,5 @@ +A,b,c +1,2,3 +1,10,5 +2,5,6 +2,1,4 \ No newline at end of file diff --git a/docs/source/user-guide/example-usage.md b/docs/source/user-guide/example-usage.md index 48ca791ffe6f..ced84ffa6ec5 100644 --- a/docs/source/user-guide/example-usage.md +++ b/docs/source/user-guide/example-usage.md @@ -19,6 +19,10 @@ # Example Usage +In this example, some simple processing is performed on a CSV file. Please be aware that unquoted identifiers are converted to lower case in SQL, so if a column name in your CSV file contains capital letters (e.g. `Name`), you should put that column name in double quotes or the example won't work. + +The following example uses [this file](../../../datafusion/core/tests/capitalized_example.csv). + ## Update `Cargo.toml` Add the following to your `Cargo.toml` file: @@ -37,10 +41,10 @@ use datafusion::prelude::*; async fn main() -> datafusion::error::Result<()> { // register the table let ctx = SessionContext::new(); - ctx.register_csv("example", "tests/example.csv", CsvReadOptions::new()).await?; + ctx.register_csv("example", "tests/capitalized_example.csv", CsvReadOptions::new()).await?; // create a plan to run a SQL query - let df = ctx.sql("SELECT a, MIN(b) FROM example GROUP BY a LIMIT 100").await?; + let df = ctx.sql("SELECT \"A\", MIN(b) FROM example GROUP BY \"A\" LIMIT 100").await?; // execute and print results df.show().await?; @@ -57,10 +61,10 @@ use datafusion::prelude::*; async fn main() -> datafusion::error::Result<()> { // create the dataframe let ctx = SessionContext::new(); - let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new()).await?; + let df = ctx.read_csv("tests/capitalized_example.csv", CsvReadOptions::new()).await?; - let df = df.filter(col("a").lt_eq(col("b")))? 
- .aggregate(vec![col("a")], vec![min(col("b"))])?; + let df = df.filter(col("A").lt_eq(col("c")))? + .aggregate(vec![col("A")], vec![min(col("b"))])?; // execute and print results df.show_limit(100).await?; @@ -72,8 +76,9 @@ async fn main() -> datafusion::error::Result<()> { ```text +---+--------+ -| a | MIN(b) | +| A | MIN(b) | +---+--------+ +| 2 | 1 | | 1 | 2 | +---+--------+ ``` diff --git a/docs/source/user-guide/sql/select.md b/docs/source/user-guide/sql/select.md index 008981fde0d3..3eea252d7080 100644 --- a/docs/source/user-guide/sql/select.md +++ b/docs/source/user-guide/sql/select.md @@ -20,6 +20,9 @@ # SELECT syntax The queries in DataFusion scan data from tables and return 0 or more rows. +Please be aware that unquoted column names in queries are converted to lower case, while column names in the inferred schema keep their original case. Accordingly, if you +want to query against a capitalized field, make sure to wrap its name in double quotes. Please see this +[example](https://arrow.apache.org/datafusion/user-guide/example-usage.html) for clarification. In this documentation we describe the SQL syntax in DataFusion. DataFusion supports the following syntax for queries: