From a5940f401b194b8b45961be7caac46118bea84a6 Mon Sep 17 00:00:00 2001 From: DreaMer963 Date: Fri, 4 Feb 2022 22:26:12 +0800 Subject: [PATCH] format --- README.md | 356 +----------------- docs/source/index.rst | 1 + .../source/specification/quarterly_roadmap.md | 72 ++++ docs/source/user-guide/sql/index.rst | 1 + docs/source/user-guide/sql/sql_status.md | 241 ++++++++++++ 5 files changed, 323 insertions(+), 348 deletions(-) create mode 100644 docs/source/specification/quarterly_roadmap.md create mode 100644 docs/source/user-guide/sql/sql_status.md diff --git a/README.md b/README.md index 1ea972cc4df1..dc350f69bb9c 100644 --- a/README.md +++ b/README.md @@ -73,363 +73,23 @@ Here are some of the projects known to use DataFusion: ## Example Usage -Run a SQL query against data stored in a CSV: +Please see [example usage](https://arrow.apache.org/datafusion/user-guide/example-usage.html) to find how to use DataFusion. -```rust -use datafusion::prelude::*; -use datafusion::arrow::util::pretty::print_batches; -use datafusion::arrow::record_batch::RecordBatch; - -#[tokio::main] -async fn main() -> datafusion::error::Result<()> { - // register the table - let mut ctx = ExecutionContext::new(); - ctx.register_csv("example", "tests/example.csv", CsvReadOptions::new()).await?; - - // create a plan to run a SQL query - let df = ctx.sql("SELECT a, MIN(b) FROM example GROUP BY a LIMIT 100").await?; - - // execute and print results - df.show().await?; - Ok(()) -} -``` - -Use the DataFrame API to process data stored in a CSV: - -```rust -use datafusion::prelude::*; -use datafusion::arrow::util::pretty::print_batches; -use datafusion::arrow::record_batch::RecordBatch; - -#[tokio::main] -async fn main() -> datafusion::error::Result<()> { - // create the dataframe - let mut ctx = ExecutionContext::new(); - let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new()).await?; - - let df = df.filter(col("a").lt_eq(col("b")))? - .aggregate(vec![col("a")], vec![min(col("b"))])?; - - // execute and print results - df.show_limit(100).await?; - Ok(()) -} -``` - -Both of these examples will produce - -```text -+---+--------+ -| a | MIN(b) | -+---+--------+ -| 1 | 2 | -+---+--------+ -``` - -## Using DataFusion as a library - -DataFusion is [published on crates.io](https://crates.io/crates/datafusion), and is [well documented on docs.rs](https://docs.rs/datafusion/). - -To get started, add the following to your `Cargo.toml` file: - -```toml -[dependencies] -datafusion = "6.0.0" -``` - -## Using DataFusion as a binary - -DataFusion also includes a simple command-line interactive SQL utility. See the [CLI reference](https://arrow.apache.org/datafusion/cli/index.html) for more information. - -# Roadmap - -A quarterly roadmap will be published to give the DataFusion community visibility into the priorities of the projects contributors. This roadmap is not binding. - -## 2022 Q1 - -### DataFusion Core - -- Publish official Arrow2 branch -- Implementation of memory manager (i.e. to enable spilling to disk as needed) - -### Benchmarking - -- Inclusion in Db-Benchmark with all quries covered -- All TPCH queries covered - -### Performance Improvements - -- Predicate evaluation -- Improve multi-column comparisons (that can't be vectorized at the moment) -- Null constant support - -### New Features - -- Read JSON as table -- Simplify DDL with Datafusion-Cli -- Add Decimal128 data type and the attendant features such as Arrow Kernel and UDF support -- Add new experimental e-graph based optimizer - -### Ballista - -- Begin work on design documents and plan / priorities for development - -### Extensions ([datafusion-contrib](https://github.com/datafusion-contrib])) - -- Stable S3 support -- Begin design discussions and prototyping of a stream provider - -## Beyond 2022 Q1 - -There is no clear timeline for the below, but community members have expressed interest in working on these topics. - -### DataFusion Core - -- Custom SQL support -- Split DataFusion into multiple crates -- Push based query execution and code generation - -### Ballista - -- Evolve architecture so that it can be deployed in a multi-tenant cloud native environment -- Ensure Ballista is scalable, elastic, and stable for production usage -- Develop distributed ML capabilities - -# Status - -## General - -- [x] SQL Parser -- [x] SQL Query Planner -- [x] Query Optimizer -- [x] Constant folding -- [x] Join Reordering -- [x] Limit Pushdown -- [x] Projection push down -- [x] Predicate push down -- [x] Type coercion -- [x] Parallel query execution - -## SQL Support - -- [x] Projection -- [x] Filter (WHERE) -- [x] Filter post-aggregate (HAVING) -- [x] Limit -- [x] Aggregate -- [x] Common math functions -- [x] cast -- [x] try_cast -- [x] [`VALUES` lists](https://www.postgresql.org/docs/current/queries-values.html) -- Postgres compatible String functions - - [x] ascii - - [x] bit_length - - [x] btrim - - [x] char_length - - [x] character_length - - [x] chr - - [x] concat - - [x] concat_ws - - [x] initcap - - [x] left - - [x] length - - [x] lpad - - [x] ltrim - - [x] octet_length - - [x] regexp_replace - - [x] repeat - - [x] replace - - [x] reverse - - [x] right - - [x] rpad - - [x] rtrim - - [x] split_part - - [x] starts_with - - [x] strpos - - [x] substr - - [x] to_hex - - [x] translate - - [x] trim -- Miscellaneous/Boolean functions - - [x] nullif -- Approximation functions - - [x] approx_distinct -- Common date/time functions - - [ ] Basic date functions - - [ ] Basic time functions - - [x] Basic timestamp functions - - [x] [to_timestamp](docs/user-guide/book/sql/datafusion-functions.html#to_timestamp) - - [x] [to_timestamp_millis](docs/user-guide/book/sql/datafusion-functions.html#to_timestamp_millis) - - [x] [to_timestamp_micros](docs/user-guide/book/sql/datafusion-functions.html#to_timestamp_micros) - - [x] [to_timestamp_seconds](docs/user-guide/book/sql/datafusion-functions.html#to_timestamp_seconds) -- nested functions - - [x] Array of columns -- [x] Schema Queries - - [x] SHOW TABLES - - [x] SHOW COLUMNS - - [x] information_schema.{tables, columns} - - [ ] information_schema other views -- [x] Sorting -- [ ] Nested types -- [ ] Lists -- [x] Subqueries -- [x] Common table expressions -- [x] Set Operations - - [x] UNION ALL - - [x] UNION - - [x] INTERSECT - - [x] INTERSECT ALL - - [x] EXCEPT - - [x] EXCEPT ALL -- [x] Joins - - [x] INNER JOIN - - [x] LEFT JOIN - - [x] RIGHT JOIN - - [x] FULL JOIN - - [x] CROSS JOIN -- [ ] Window - - [x] Empty window - - [x] Common window functions - - [x] Window with PARTITION BY clause - - [x] Window with ORDER BY clause - - [ ] Window with FILTER clause - - [ ] [Window with custom WINDOW FRAME](https://github.com/apache/arrow-datafusion/issues/361) - - [ ] UDF and UDAF for window functions - -## Data Sources - -- [x] CSV -- [x] Parquet primitive types -- [ ] Parquet nested types - -## Extensibility - -DataFusion is designed to be extensible at all points. To that end, you can provide your own custom: - -- [x] User Defined Functions (UDFs) -- [x] User Defined Aggregate Functions (UDAFs) -- [x] User Defined Table Source (`TableProvider`) for tables -- [x] User Defined `Optimizer` passes (plan rewrites) -- [x] User Defined `LogicalPlan` nodes -- [x] User Defined `ExecutionPlan` nodes - -## Rust Version Compatbility - -This crate is tested with the latest stable version of Rust. We do not currently test against other, older versions of the Rust compiler. - -# Supported SQL - -This library currently supports many SQL constructs, including - -- `CREATE EXTERNAL TABLE X STORED AS PARQUET LOCATION '...';` to register a table's locations -- `SELECT ... FROM ...` together with any expression -- `ALIAS` to name an expression -- `CAST` to change types, including e.g. `Timestamp(Nanosecond, None)` -- Many mathematical unary and binary expressions such as `+`, `/`, `sqrt`, `tan`, `>=`. -- `WHERE` to filter -- `GROUP BY` together with one of the following aggregations: `MIN`, `MAX`, `COUNT`, `SUM`, `AVG`, `CORR`, `VAR`, `COVAR`, `STDDEV` (sample and population) -- `ORDER BY` together with an expression and optional `ASC` or `DESC` and also optional `NULLS FIRST` or `NULLS LAST` - -## Supported Functions - -DataFusion strives to implement a subset of the [PostgreSQL SQL dialect](https://www.postgresql.org/docs/current/functions.html) where possible. We explicitly choose a single dialect to maximize interoperability with other tools and allow reuse of the PostgreSQL documents and tutorials as much as possible. - -Currently, only a subset of the PostgreSQL dialect is implemented, and we will document any deviations. - -## Schema Metadata / Information Schema Support - -DataFusion supports the showing metadata about the tables available. This information can be accessed using the views of the ISO SQL `information_schema` schema or the DataFusion specific `SHOW TABLES` and `SHOW COLUMNS` commands. - -More information can be found in the [Postgres docs](https://www.postgresql.org/docs/13/infoschema-schema.html)). - -To show tables available for use in DataFusion, use the `SHOW TABLES` command or the `information_schema.tables` view: - -```sql -> show tables; -+---------------+--------------------+------------+------------+ -| table_catalog | table_schema | table_name | table_type | -+---------------+--------------------+------------+------------+ -| datafusion | public | t | BASE TABLE | -| datafusion | information_schema | tables | VIEW | -+---------------+--------------------+------------+------------+ - -> select * from information_schema.tables; - -+---------------+--------------------+------------+--------------+ -| table_catalog | table_schema | table_name | table_type | -+---------------+--------------------+------------+--------------+ -| datafusion | public | t | BASE TABLE | -| datafusion | information_schema | TABLES | SYSTEM TABLE | -+---------------+--------------------+------------+--------------+ -``` - -To show the schema of a table in DataFusion, use the `SHOW COLUMNS` command or the or `information_schema.columns` view: - -```sql -> show columns from t; -+---------------+--------------+------------+-------------+-----------+-------------+ -| table_catalog | table_schema | table_name | column_name | data_type | is_nullable | -+---------------+--------------+------------+-------------+-----------+-------------+ -| datafusion | public | t | a | Int32 | NO | -| datafusion | public | t | b | Utf8 | NO | -| datafusion | public | t | c | Float32 | NO | -+---------------+--------------+------------+-------------+-----------+-------------+ - -> select table_name, column_name, ordinal_position, is_nullable, data_type from information_schema.columns; -+------------+-------------+------------------+-------------+-----------+ -| table_name | column_name | ordinal_position | is_nullable | data_type | -+------------+-------------+------------------+-------------+-----------+ -| t | a | 0 | NO | Int32 | -| t | b | 1 | NO | Utf8 | -| t | c | 2 | NO | Float32 | -+------------+-------------+------------------+-------------+-----------+ -``` - -## Supported Data Types - -DataFusion uses Arrow, and thus the Arrow type system, for query -execution. The SQL types from -[sqlparser-rs](https://github.com/ballista-compute/sqlparser-rs/blob/main/src/ast/data_type.rs#L57) -are mapped to Arrow types according to the following table - -| SQL Data Type | Arrow DataType | -| ------------- | --------------------------------- | -| `CHAR` | `Utf8` | -| `VARCHAR` | `Utf8` | -| `UUID` | _Not yet supported_ | -| `CLOB` | _Not yet supported_ | -| `BINARY` | _Not yet supported_ | -| `VARBINARY` | _Not yet supported_ | -| `DECIMAL` | `Float64` | -| `FLOAT` | `Float32` | -| `SMALLINT` | `Int16` | -| `INT` | `Int32` | -| `BIGINT` | `Int64` | -| `REAL` | `Float32` | -| `DOUBLE` | `Float64` | -| `BOOLEAN` | `Boolean` | -| `DATE` | `Date32` | -| `TIME` | `Time64(TimeUnit::Millisecond)` | -| `TIMESTAMP` | `Timestamp(TimeUnit::Nanosecond)` | -| `INTERVAL` | _Not yet supported_ | -| `REGCLASS` | _Not yet supported_ | -| `TEXT` | _Not yet supported_ | -| `BYTEA` | _Not yet supported_ | -| `CUSTOM` | _Not yet supported_ | -| `ARRAY` | _Not yet supported_ | - -# Roadmap +## Roadmap Please see [Roadmap](docs/source/specification/roadmap.md) for information of where the project is headed. -# Architecture Overview +## Architecture Overview There is no formal document describing DataFusion's architecture yet, but the following presentations offer a good overview of its different components and how they interact together. - (March 2021): The DataFusion architecture is described in _Query Engine Design and the Rust-Based DataFusion in Apache Arrow_: [recording](https://www.youtube.com/watch?v=K6eCAVEk4kU) (DataFusion content starts [~ 15 minutes in](https://www.youtube.com/watch?v=K6eCAVEk4kU&t=875s)) and [slides](https://www.slideshare.net/influxdata/influxdb-iox-tech-talks-query-engine-design-and-the-rustbased-datafusion-in-apache-arrow-244161934) - (February 2021): How DataFusion is used within the Ballista Project is described in \*Ballista: Distributed Compute with Rust and Apache Arrow: [recording](https://www.youtube.com/watch?v=ZZHQaOap9pQ) -# Developer's guide +## User's guide + +Please see [User Guide](https://arrow.apache.org/datafusion/) for more information about DataFusion. + +## Developer's guide Please see [Developers Guide](DEVELOPERS.md) for information about developing DataFusion. diff --git a/docs/source/index.rst b/docs/source/index.rst index bf6b25096b4b..5109e60338fa 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -55,6 +55,7 @@ Table of content specification/roadmap specification/invariants specification/output-field-name-semantic + specification/quarterly_roadmap .. _toc.readme: diff --git a/docs/source/specification/quarterly_roadmap.md b/docs/source/specification/quarterly_roadmap.md new file mode 100644 index 000000000000..5bb805d7e7f0 --- /dev/null +++ b/docs/source/specification/quarterly_roadmap.md @@ -0,0 +1,72 @@ + + +# Roadmap + +A quarterly roadmap will be published to give the DataFusion community visibility into the priorities of the projects contributors. This roadmap is not binding. + +## 2022 Q1 + +### DataFusion Core + +- Publish official Arrow2 branch +- Implementation of memory manager (i.e. to enable spilling to disk as needed) + +### Benchmarking + +- Inclusion in Db-Benchmark with all quries covered +- All TPCH queries covered + +### Performance Improvements + +- Predicate evaluation +- Improve multi-column comparisons (that can't be vectorized at the moment) +- Null constant support + +### New Features + +- Read JSON as table +- Simplify DDL with Datafusion-Cli +- Add Decimal128 data type and the attendant features such as Arrow Kernel and UDF support +- Add new experimental e-graph based optimizer + +### Ballista + +- Begin work on design documents and plan / priorities for development + +### Extensions ([datafusion-contrib](https://github.com/datafusion-contrib])) + +- Stable S3 support +- Begin design discussions and prototyping of a stream provider + +## Beyond 2022 Q1 + +There is no clear timeline for the below, but community members have expressed interest in working on these topics. + +### DataFusion Core + +- Custom SQL support +- Split DataFusion into multiple crates +- Push based query execution and code generation + +### Ballista + +- Evolve architecture so that it can be deployed in a multi-tenant cloud native environment +- Ensure Ballista is scalable, elastic, and stable for production usage +- Develop distributed ML capabilities diff --git a/docs/source/user-guide/sql/index.rst b/docs/source/user-guide/sql/index.rst index 2489f6ba1f10..fc96acc8733c 100644 --- a/docs/source/user-guide/sql/index.rst +++ b/docs/source/user-guide/sql/index.rst @@ -21,6 +21,7 @@ SQL Reference .. toctree:: :maxdepth: 2 + sql_status select ddl DataFusion Functions diff --git a/docs/source/user-guide/sql/sql_status.md b/docs/source/user-guide/sql/sql_status.md new file mode 100644 index 000000000000..0df14e58a8be --- /dev/null +++ b/docs/source/user-guide/sql/sql_status.md @@ -0,0 +1,241 @@ + + +# Status + +## General + +- [x] SQL Parser +- [x] SQL Query Planner +- [x] Query Optimizer +- [x] Constant folding +- [x] Join Reordering +- [x] Limit Pushdown +- [x] Projection push down +- [x] Predicate push down +- [x] Type coercion +- [x] Parallel query execution + +## SQL Support + +- [x] Projection +- [x] Filter (WHERE) +- [x] Filter post-aggregate (HAVING) +- [x] Limit +- [x] Aggregate +- [x] Common math functions +- [x] cast +- [x] try_cast +- [x] [`VALUES` lists](https://www.postgresql.org/docs/current/queries-values.html) +- Postgres compatible String functions + - [x] ascii + - [x] bit_length + - [x] btrim + - [x] char_length + - [x] character_length + - [x] chr + - [x] concat + - [x] concat_ws + - [x] initcap + - [x] left + - [x] length + - [x] lpad + - [x] ltrim + - [x] octet_length + - [x] regexp_replace + - [x] repeat + - [x] replace + - [x] reverse + - [x] right + - [x] rpad + - [x] rtrim + - [x] split_part + - [x] starts_with + - [x] strpos + - [x] substr + - [x] to_hex + - [x] translate + - [x] trim +- Miscellaneous/Boolean functions + - [x] nullif +- Approximation functions + - [x] approx_distinct +- Common date/time functions + - [ ] Basic date functions + - [ ] Basic time functions + - [x] Basic timestamp functions + - [x] [to_timestamp](docs/user-guide/book/sql/datafusion-functions.html#to_timestamp) + - [x] [to_timestamp_millis](docs/user-guide/book/sql/datafusion-functions.html#to_timestamp_millis) + - [x] [to_timestamp_micros](docs/user-guide/book/sql/datafusion-functions.html#to_timestamp_micros) + - [x] [to_timestamp_seconds](docs/user-guide/book/sql/datafusion-functions.html#to_timestamp_seconds) +- nested functions + - [x] Array of columns +- [x] Schema Queries + - [x] SHOW TABLES + - [x] SHOW COLUMNS + - [x] information_schema.{tables, columns} + - [ ] information_schema other views +- [x] Sorting +- [ ] Nested types +- [ ] Lists +- [x] Subqueries +- [x] Common table expressions +- [x] Set Operations + - [x] UNION ALL + - [x] UNION + - [x] INTERSECT + - [x] INTERSECT ALL + - [x] EXCEPT + - [x] EXCEPT ALL +- [x] Joins + - [x] INNER JOIN + - [x] LEFT JOIN + - [x] RIGHT JOIN + - [x] FULL JOIN + - [x] CROSS JOIN +- [ ] Window + - [x] Empty window + - [x] Common window functions + - [x] Window with PARTITION BY clause + - [x] Window with ORDER BY clause + - [ ] Window with FILTER clause + - [ ] [Window with custom WINDOW FRAME](https://github.com/apache/arrow-datafusion/issues/361) + - [ ] UDF and UDAF for window functions + +## Data Sources + +- [x] CSV +- [x] Parquet primitive types +- [ ] Parquet nested types + +## Extensibility + +DataFusion is designed to be extensible at all points. To that end, you can provide your own custom: + +- [x] User Defined Functions (UDFs) +- [x] User Defined Aggregate Functions (UDAFs) +- [x] User Defined Table Source (`TableProvider`) for tables +- [x] User Defined `Optimizer` passes (plan rewrites) +- [x] User Defined `LogicalPlan` nodes +- [x] User Defined `ExecutionPlan` nodes + +## Rust Version Compatbility + +This crate is tested with the latest stable version of Rust. We do not currently test against other, older versions of the Rust compiler. + +# Supported SQL + +This library currently supports many SQL constructs, including + +- `CREATE EXTERNAL TABLE X STORED AS PARQUET LOCATION '...';` to register a table's locations +- `SELECT ... FROM ...` together with any expression +- `ALIAS` to name an expression +- `CAST` to change types, including e.g. `Timestamp(Nanosecond, None)` +- Many mathematical unary and binary expressions such as `+`, `/`, `sqrt`, `tan`, `>=`. +- `WHERE` to filter +- `GROUP BY` together with one of the following aggregations: `MIN`, `MAX`, `COUNT`, `SUM`, `AVG`, `CORR`, `VAR`, `COVAR`, `STDDEV` (sample and population) +- `ORDER BY` together with an expression and optional `ASC` or `DESC` and also optional `NULLS FIRST` or `NULLS LAST` + +## Supported Functions + +DataFusion strives to implement a subset of the [PostgreSQL SQL dialect](https://www.postgresql.org/docs/current/functions.html) where possible. We explicitly choose a single dialect to maximize interoperability with other tools and allow reuse of the PostgreSQL documents and tutorials as much as possible. + +Currently, only a subset of the PostgreSQL dialect is implemented, and we will document any deviations. + +## Schema Metadata / Information Schema Support + +DataFusion supports the showing metadata about the tables available. This information can be accessed using the views of the ISO SQL `information_schema` schema or the DataFusion specific `SHOW TABLES` and `SHOW COLUMNS` commands. + +More information can be found in the [Postgres docs](https://www.postgresql.org/docs/13/infoschema-schema.html)). + +To show tables available for use in DataFusion, use the `SHOW TABLES` command or the `information_schema.tables` view: + +```sql +> show tables; ++---------------+--------------------+------------+------------+ +| table_catalog | table_schema | table_name | table_type | ++---------------+--------------------+------------+------------+ +| datafusion | public | t | BASE TABLE | +| datafusion | information_schema | tables | VIEW | ++---------------+--------------------+------------+------------+ + +> select * from information_schema.tables; + ++---------------+--------------------+------------+--------------+ +| table_catalog | table_schema | table_name | table_type | ++---------------+--------------------+------------+--------------+ +| datafusion | public | t | BASE TABLE | +| datafusion | information_schema | TABLES | SYSTEM TABLE | ++---------------+--------------------+------------+--------------+ +``` + +To show the schema of a table in DataFusion, use the `SHOW COLUMNS` command or the or `information_schema.columns` view: + +```sql +> show columns from t; ++---------------+--------------+------------+-------------+-----------+-------------+ +| table_catalog | table_schema | table_name | column_name | data_type | is_nullable | ++---------------+--------------+------------+-------------+-----------+-------------+ +| datafusion | public | t | a | Int32 | NO | +| datafusion | public | t | b | Utf8 | NO | +| datafusion | public | t | c | Float32 | NO | ++---------------+--------------+------------+-------------+-----------+-------------+ + +> select table_name, column_name, ordinal_position, is_nullable, data_type from information_schema.columns; ++------------+-------------+------------------+-------------+-----------+ +| table_name | column_name | ordinal_position | is_nullable | data_type | ++------------+-------------+------------------+-------------+-----------+ +| t | a | 0 | NO | Int32 | +| t | b | 1 | NO | Utf8 | +| t | c | 2 | NO | Float32 | ++------------+-------------+------------------+-------------+-----------+ +``` + +## Supported Data Types + +DataFusion uses Arrow, and thus the Arrow type system, for query +execution. The SQL types from +[sqlparser-rs](https://github.com/ballista-compute/sqlparser-rs/blob/main/src/ast/data_type.rs#L57) +are mapped to Arrow types according to the following table + +| SQL Data Type | Arrow DataType | +| ------------- | --------------------------------- | +| `CHAR` | `Utf8` | +| `VARCHAR` | `Utf8` | +| `UUID` | _Not yet supported_ | +| `CLOB` | _Not yet supported_ | +| `BINARY` | _Not yet supported_ | +| `VARBINARY` | _Not yet supported_ | +| `DECIMAL` | `Float64` | +| `FLOAT` | `Float32` | +| `SMALLINT` | `Int16` | +| `INT` | `Int32` | +| `BIGINT` | `Int64` | +| `REAL` | `Float32` | +| `DOUBLE` | `Float64` | +| `BOOLEAN` | `Boolean` | +| `DATE` | `Date32` | +| `TIME` | `Time64(TimeUnit::Millisecond)` | +| `TIMESTAMP` | `Timestamp(TimeUnit::Nanosecond)` | +| `INTERVAL` | _Not yet supported_ | +| `REGCLASS` | _Not yet supported_ | +| `TEXT` | _Not yet supported_ | +| `BYTEA` | _Not yet supported_ | +| `CUSTOM` | _Not yet supported_ | +| `ARRAY` | _Not yet supported_ |