diff --git a/.github_changelog_generator b/.github_changelog_generator index 6ee6508b7216..45eef2f51836 100644 --- a/.github_changelog_generator +++ b/.github_changelog_generator @@ -18,8 +18,6 @@ # under the License. # -# point to the old changelog in apache/arrow -front-matter=For older versions, see [apache/arrow/CHANGELOG.md](https://github.com/apache/arrow/blob/master/CHANGELOG.md)\n # some issues are just documentation add-sections={"documentation":{"prefix":"**Documentation updates:**","labels":["documentation"]},"performance":{"prefix":"**Performance improvements:**","labels":["performance"]}} # uncomment to not show PRs. TBD if we shown them or not. diff --git a/README.md b/README.md index 5ca080231c62..1d2db9244448 100644 --- a/README.md +++ b/README.md @@ -129,7 +129,7 @@ To get started, add the following to your `Cargo.toml` file: ```toml [dependencies] -datafusion = "5.0.0" +datafusion = "6.0.0" ``` ## Using DataFusion as a binary diff --git a/ballista-examples/Cargo.toml b/ballista-examples/Cargo.toml index e6c15e0178ea..6b99f9b20c37 100644 --- a/ballista-examples/Cargo.toml +++ b/ballista-examples/Cargo.toml @@ -31,7 +31,7 @@ rust-version = "1.56" [dependencies] arrow-flight = { version = "6.1.0" } datafusion = { path = "../datafusion" } -ballista = { path = "../ballista/rust/client" } +ballista = { path = "../ballista/rust/client", version = "0.6.0" } prost = "0.8" tonic = "0.5" tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread", "sync"] } diff --git a/ballista/CHANGELOG.md b/ballista/CHANGELOG.md index 287229b05faa..b8268fc6537f 100644 --- a/ballista/CHANGELOG.md +++ b/ballista/CHANGELOG.md @@ -17,10 +17,96 @@ under the License. 
--> -For older versions, see [apache/arrow/CHANGELOG.md](https://github.com/apache/arrow/blob/master/CHANGELOG.md) - # Changelog +## [ballista-0.6.0](https://github.com/apache/arrow-datafusion/tree/ballista-0.6.0) (2021-11-13) + +[Full Changelog](https://github.com/apache/arrow-datafusion/compare/ballista-0.5.0...ballista-0.6.0) + +**Breaking changes:** + +- File partitioning for ListingTable [\#1141](https://github.com/apache/arrow-datafusion/pull/1141) ([rdettai](https://github.com/rdettai)) +- Register tables in BallistaContext using TableProviders instead of Dataframe [\#1028](https://github.com/apache/arrow-datafusion/pull/1028) ([rdettai](https://github.com/rdettai)) +- Make TableProvider.scan\(\) and PhysicalPlanner::create\_physical\_plan\(\) async [\#1013](https://github.com/apache/arrow-datafusion/pull/1013) ([rdettai](https://github.com/rdettai)) +- Reorganize table providers by table format [\#1010](https://github.com/apache/arrow-datafusion/pull/1010) ([rdettai](https://github.com/rdettai)) +- Move CBOs and Statistics to physical plan [\#965](https://github.com/apache/arrow-datafusion/pull/965) ([rdettai](https://github.com/rdettai)) +- Update to sqlparser v 0.10.0 [\#934](https://github.com/apache/arrow-datafusion/pull/934) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([alamb](https://github.com/alamb)) +- FilePartition and PartitionedFile for scanning flexibility [\#932](https://github.com/apache/arrow-datafusion/pull/932) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([yjshen](https://github.com/yjshen)) +- Improve SQLMetric APIs, port existing metrics [\#908](https://github.com/apache/arrow-datafusion/pull/908) ([alamb](https://github.com/alamb)) +- Add support for EXPLAIN ANALYZE [\#858](https://github.com/apache/arrow-datafusion/pull/858) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([alamb](https://github.com/alamb)) +- Rename concurrency to target\_partitions 
[\#706](https://github.com/apache/arrow-datafusion/pull/706) ([andygrove](https://github.com/andygrove)) + +**Implemented enhancements:** + +- Update datafusion-cli to support Ballista, or implement new ballista-cli [\#886](https://github.com/apache/arrow-datafusion/issues/886) +- Prepare Ballista crates for publishing [\#509](https://github.com/apache/arrow-datafusion/issues/509) +- Add drop table support [\#1266](https://github.com/apache/arrow-datafusion/pull/1266) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([viirya](https://github.com/viirya)) +- use arrow 6.1.0 [\#1255](https://github.com/apache/arrow-datafusion/pull/1255) ([Jimexist](https://github.com/Jimexist)) +- Add support for `create table as` via MemTable [\#1243](https://github.com/apache/arrow-datafusion/pull/1243) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([Dandandan](https://github.com/Dandandan)) +- add values list expression [\#1165](https://github.com/apache/arrow-datafusion/pull/1165) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([Jimexist](https://github.com/Jimexist)) +- Multiple files per partitions for CSV Avro Json [\#1138](https://github.com/apache/arrow-datafusion/pull/1138) ([rdettai](https://github.com/rdettai)) +- Implement INTERSECT & INTERSECT DISTINCT [\#1135](https://github.com/apache/arrow-datafusion/pull/1135) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([xudong963](https://github.com/xudong963)) +- Simplify file struct abstractions [\#1120](https://github.com/apache/arrow-datafusion/pull/1120) ([rdettai](https://github.com/rdettai)) +- Implement `is [not] distinct from` [\#1117](https://github.com/apache/arrow-datafusion/pull/1117) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([Dandandan](https://github.com/Dandandan)) +- add digest\(utf8, method\) function and refactor all current hash digest functions [\#1090](https://github.com/apache/arrow-datafusion/pull/1090) 
([Jimexist](https://github.com/Jimexist)) +- \[crypto\] add `blake3` algorithm to `digest` function [\#1086](https://github.com/apache/arrow-datafusion/pull/1086) ([Jimexist](https://github.com/Jimexist)) +- \[crypto\] add blake2b and blake2s functions [\#1081](https://github.com/apache/arrow-datafusion/pull/1081) ([Jimexist](https://github.com/Jimexist)) +- Update sqlparser-rs to 0.11 [\#1052](https://github.com/apache/arrow-datafusion/pull/1052) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([alamb](https://github.com/alamb)) +- remove hard coded partition count in ballista logicalplan deserialization [\#1044](https://github.com/apache/arrow-datafusion/pull/1044) ([xudong963](https://github.com/xudong963)) +- Indexed field access for List [\#1006](https://github.com/apache/arrow-datafusion/pull/1006) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([Igosuki](https://github.com/Igosuki)) +- Update DataFusion to arrow 6.0 [\#984](https://github.com/apache/arrow-datafusion/pull/984) ([alamb](https://github.com/alamb)) +- Implement Display for Expr, improve operator display [\#971](https://github.com/apache/arrow-datafusion/pull/971) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([matthewmturner](https://github.com/matthewmturner)) +- ObjectStore API to read from remote storage systems [\#950](https://github.com/apache/arrow-datafusion/pull/950) ([yjshen](https://github.com/yjshen)) +- fixes \#933 replace placeholder fmt\_as fr ExecutionPlan impls [\#939](https://github.com/apache/arrow-datafusion/pull/939) ([tiphaineruy](https://github.com/tiphaineruy)) +- Support `NotLike` in Ballista [\#916](https://github.com/apache/arrow-datafusion/pull/916) ([Dandandan](https://github.com/Dandandan)) +- Avro Table Provider [\#910](https://github.com/apache/arrow-datafusion/pull/910) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([Igosuki](https://github.com/Igosuki)) +- Add BaselineMetrics, Timestamp metrics, add 
for `CoalescePartitionsExec`, rename output\_time -\> elapsed\_compute [\#909](https://github.com/apache/arrow-datafusion/pull/909) ([alamb](https://github.com/alamb)) +- \[Ballista\] Add executor last seen info to the ui [\#895](https://github.com/apache/arrow-datafusion/pull/895) ([msathis](https://github.com/msathis)) +- add cross join support to ballista [\#891](https://github.com/apache/arrow-datafusion/pull/891) ([houqp](https://github.com/houqp)) +- Add Ballista support to DataFusion CLI [\#889](https://github.com/apache/arrow-datafusion/pull/889) ([andygrove](https://github.com/andygrove)) +- Add support for PostgreSQL regex match [\#870](https://github.com/apache/arrow-datafusion/pull/870) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([b41sh](https://github.com/b41sh)) + +**Fixed bugs:** + +- Test execution\_plans::shuffle\_writer::tests::test Fail [\#1040](https://github.com/apache/arrow-datafusion/issues/1040) +- Integration test fails to build docker images [\#918](https://github.com/apache/arrow-datafusion/issues/918) +- Ballista: Remove hard-coded concurrency from logical plan serde code [\#708](https://github.com/apache/arrow-datafusion/issues/708) +- How can I make ballista distributed compute work? 
[\#327](https://github.com/apache/arrow-datafusion/issues/327) +- fix subquery alias [\#1067](https://github.com/apache/arrow-datafusion/pull/1067) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([xudong963](https://github.com/xudong963)) +- Fix compilation for ballista in stand-alone mode [\#1008](https://github.com/apache/arrow-datafusion/pull/1008) ([Igosuki](https://github.com/Igosuki)) + +**Documentation updates:** + +- Add Ballista roadmap [\#1166](https://github.com/apache/arrow-datafusion/pull/1166) ([andygrove](https://github.com/andygrove)) +- Adds note on compatible rust version [\#1097](https://github.com/apache/arrow-datafusion/pull/1097) ([1nF0rmed](https://github.com/1nF0rmed)) +- implement `approx_distinct` function using HyperLogLog [\#1087](https://github.com/apache/arrow-datafusion/pull/1087) ([Jimexist](https://github.com/Jimexist)) +- Improve User Guide [\#954](https://github.com/apache/arrow-datafusion/pull/954) ([andygrove](https://github.com/andygrove)) +- Update plan\_query\_stages doc [\#951](https://github.com/apache/arrow-datafusion/pull/951) ([rdettai](https://github.com/rdettai)) +- \[DataFusion\] - Add show and show\_limit function for DataFrame [\#923](https://github.com/apache/arrow-datafusion/pull/923) ([francis-du](https://github.com/francis-du)) +- update docs related to protoc and optional syntax [\#902](https://github.com/apache/arrow-datafusion/pull/902) ([Jimexist](https://github.com/Jimexist)) +- Improve Ballista crate README content [\#878](https://github.com/apache/arrow-datafusion/pull/878) ([andygrove](https://github.com/andygrove)) + +**Performance improvements:** + +- optimize build profile for datafusion python binding, cli and ballista [\#1137](https://github.com/apache/arrow-datafusion/pull/1137) ([houqp](https://github.com/houqp)) + +**Closed issues:** + +- InList expr with NULL literals do not work [\#1190](https://github.com/apache/arrow-datafusion/issues/1190) +- update the homepage README to 
include values, `approx_distinct`, etc. [\#1171](https://github.com/apache/arrow-datafusion/issues/1171) +- \[Python\]: Inconsistencies with Python package name [\#1011](https://github.com/apache/arrow-datafusion/issues/1011) +- Wanting to contribute to project where to start? [\#983](https://github.com/apache/arrow-datafusion/issues/983) +- delete redundant code [\#973](https://github.com/apache/arrow-datafusion/issues/973) +- How to build DataFusion python wheel [\#853](https://github.com/apache/arrow-datafusion/issues/853) +- Produce a design for a metrics framework [\#21](https://github.com/apache/arrow-datafusion/issues/21) + +**Merged pull requests:** + +- \[nit\] simplify ballista executor `CollectExec` impl codes [\#1140](https://github.com/apache/arrow-datafusion/pull/1140) ([panarch](https://github.com/panarch)) + + +For older versions, see [apache/arrow/CHANGELOG.md](https://github.com/apache/arrow/blob/master/CHANGELOG.md) + ## [ballista-0.5.0](https://github.com/apache/arrow-datafusion/tree/ballista-0.5.0) (2021-08-10) [Full Changelog](https://github.com/apache/arrow-datafusion/compare/4.0.0...ballista-0.5.0) diff --git a/ballista/rust/client/Cargo.toml b/ballista/rust/client/Cargo.toml index adac150c92bb..f444689021d2 100644 --- a/ballista/rust/client/Cargo.toml +++ b/ballista/rust/client/Cargo.toml @@ -34,7 +34,7 @@ futures = "0.3" log = "0.4" tokio = "1.0" -datafusion = { path = "../../../datafusion", version = "5.1.0" } +datafusion = { path = "../../../datafusion", version = "6.0.0" } [features] default = [] diff --git a/ballista/rust/core/Cargo.toml b/ballista/rust/core/Cargo.toml index f90d03b07a2a..3d15e21e4292 100644 --- a/ballista/rust/core/Cargo.toml +++ b/ballista/rust/core/Cargo.toml @@ -45,7 +45,7 @@ chrono = "0.4" arrow-flight = { version = "6.1.0" } -datafusion = { path = "../../../datafusion", version = "5.1.0" } +datafusion = { path = "../../../datafusion", version = "6.0.0" } [dev-dependencies] tempfile = "3" diff --git 
a/ballista/rust/executor/Cargo.toml b/ballista/rust/executor/Cargo.toml index 5c01f1c2d147..08116f514725 100644 --- a/ballista/rust/executor/Cargo.toml +++ b/ballista/rust/executor/Cargo.toml @@ -35,7 +35,7 @@ anyhow = "1" async-trait = "0.1.36" ballista-core = { path = "../core", version = "0.6.0" } configure_me = "0.4.0" -datafusion = { path = "../../../datafusion", version = "5.1.0" } +datafusion = { path = "../../../datafusion", version = "6.0.0" } env_logger = "0.9" futures = "0.3" log = "0.4" diff --git a/ballista/rust/scheduler/Cargo.toml b/ballista/rust/scheduler/Cargo.toml index ac0d98738f80..a71be406fecc 100644 --- a/ballista/rust/scheduler/Cargo.toml +++ b/ballista/rust/scheduler/Cargo.toml @@ -35,7 +35,7 @@ anyhow = "1" ballista-core = { path = "../core", version = "0.6.0" } clap = "2" configure_me = "0.4.0" -datafusion = { path = "../../../datafusion", version = "5.1.0" } +datafusion = { path = "../../../datafusion", version = "6.0.0" } env_logger = "0.9" etcd-client = { version = "0.7", optional = true } futures = "0.3" diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index b1cc09ae58cf..3212b67f967b 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -30,6 +30,6 @@ rust-version = "1.56" clap = "2.33" rustyline = "9.0" tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread", "sync"] } -datafusion = { path = "../datafusion", version = "5.1.0" } +datafusion = { path = "../datafusion", version = "6.0.0" } arrow = { version = "6.1.0" } ballista = { path = "../ballista/rust/client", version = "0.6.0" } diff --git a/datafusion/CHANGELOG.md b/datafusion/CHANGELOG.md index 41afa286b796..c22b0553474c 100644 --- a/datafusion/CHANGELOG.md +++ b/datafusion/CHANGELOG.md @@ -17,10 +17,197 @@ under the License. 
--> -For older versions, see [apache/arrow/CHANGELOG.md](https://github.com/apache/arrow/blob/master/CHANGELOG.md) - # Changelog +## [6.0.0](https://github.com/apache/arrow-datafusion/tree/6.0.0) (2021-11-13) + +[Full Changelog](https://github.com/apache/arrow-datafusion/compare/5.0.0...6.0.0) + +**Breaking changes:** + +- Removed deprecated with\_concurrency [\#1200](https://github.com/apache/arrow-datafusion/pull/1200) ([rdettai](https://github.com/rdettai)) +- File partitioning for ListingTable [\#1141](https://github.com/apache/arrow-datafusion/pull/1141) ([rdettai](https://github.com/rdettai)) +- Add function volatility to Signature [\#1071](https://github.com/apache/arrow-datafusion/pull/1071) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([pjmore](https://github.com/pjmore)) +- fix: allow duplicate field names in table join, fix output with duplicated names [\#1023](https://github.com/apache/arrow-datafusion/pull/1023) ([houqp](https://github.com/houqp)) +- Make TableProvider.scan\(\) and PhysicalPlanner::create\_physical\_plan\(\) async [\#1013](https://github.com/apache/arrow-datafusion/pull/1013) ([rdettai](https://github.com/rdettai)) +- Reorganize table providers by table format [\#1010](https://github.com/apache/arrow-datafusion/pull/1010) ([rdettai](https://github.com/rdettai)) +- Make Metrics::labels\(\) public [\#999](https://github.com/apache/arrow-datafusion/pull/999) ([alamb](https://github.com/alamb)) +- Rename NthValue::{first\_value,last\_value,nth\_value} to satisfy clippy in Rust 1.55 [\#986](https://github.com/apache/arrow-datafusion/pull/986) ([alamb](https://github.com/alamb)) +- Move CBOs and Statistics to physical plan [\#965](https://github.com/apache/arrow-datafusion/pull/965) ([rdettai](https://github.com/rdettai)) +- Update to sqlparser v 0.10.0 [\#934](https://github.com/apache/arrow-datafusion/pull/934) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([alamb](https://github.com/alamb)) +- 
FilePartition and PartitionedFile for scanning flexibility [\#932](https://github.com/apache/arrow-datafusion/pull/932) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([yjshen](https://github.com/yjshen)) +- Improve SQLMetric APIs, port existing metrics [\#908](https://github.com/apache/arrow-datafusion/pull/908) ([alamb](https://github.com/alamb)) +- Add support for EXPLAIN ANALYZE [\#858](https://github.com/apache/arrow-datafusion/pull/858) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([alamb](https://github.com/alamb)) +- Rename concurrency to target\_partitions [\#706](https://github.com/apache/arrow-datafusion/pull/706) ([andygrove](https://github.com/andygrove)) + +**Implemented enhancements:** + +- Add booleans support to the `CASE` statement [\#1156](https://github.com/apache/arrow-datafusion/issues/1156) +- Implement General Purpose Constant Folding with the Expression Evaluator [\#1070](https://github.com/apache/arrow-datafusion/issues/1070) +- Mark volatility categories of functions [\#1069](https://github.com/apache/arrow-datafusion/issues/1069) +- Add "show" support to DataFrame API [\#937](https://github.com/apache/arrow-datafusion/issues/937) +- Add support for TRIM BOTH/LEADING/TRAILING [\#935](https://github.com/apache/arrow-datafusion/issues/935) +- Add "baseline" metrics to all built in operators [\#866](https://github.com/apache/arrow-datafusion/issues/866) +- Add SQL support for referencing fields in structs [\#119](https://github.com/apache/arrow-datafusion/issues/119) +- add filename completer for create table statement [\#1278](https://github.com/apache/arrow-datafusion/pull/1278) ([Jimexist](https://github.com/Jimexist)) +- Add drop table support [\#1266](https://github.com/apache/arrow-datafusion/pull/1266) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([viirya](https://github.com/viirya)) +- Dataframe supports except and update readme 
[\#1261](https://github.com/apache/arrow-datafusion/pull/1261) ([xudong963](https://github.com/xudong963)) +- Implement EXCEPT & EXCEPT DISTINCT [\#1259](https://github.com/apache/arrow-datafusion/pull/1259) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([xudong963](https://github.com/xudong963)) +- Add DataFrame support for `INTERSECT` and update readme [\#1258](https://github.com/apache/arrow-datafusion/pull/1258) ([xudong963](https://github.com/xudong963)) +- use arrow 6.1.0 [\#1255](https://github.com/apache/arrow-datafusion/pull/1255) ([Jimexist](https://github.com/Jimexist)) +- fix 1250, add editor support for datafusion cli with validation [\#1251](https://github.com/apache/arrow-datafusion/pull/1251) ([Jimexist](https://github.com/Jimexist)) +- Add support for `create table as` via MemTable [\#1243](https://github.com/apache/arrow-datafusion/pull/1243) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([Dandandan](https://github.com/Dandandan)) +- Add cli show columns command to describe tables [\#1231](https://github.com/apache/arrow-datafusion/pull/1231) ([Jimexist](https://github.com/Jimexist)) +- datafusion-cli to add list table command [\#1229](https://github.com/apache/arrow-datafusion/pull/1229) ([Jimexist](https://github.com/Jimexist)) +- datafusion cli to handle EoF and interrupt signal [\#1225](https://github.com/apache/arrow-datafusion/pull/1225) ([Jimexist](https://github.com/Jimexist)) +- add \q as quit command and add \? 
for help [\#1224](https://github.com/apache/arrow-datafusion/pull/1224) ([Jimexist](https://github.com/Jimexist)) +- Add algebraic simplifications to constant\_folding [\#1208](https://github.com/apache/arrow-datafusion/pull/1208) ([matthewmturner](https://github.com/matthewmturner)) +- Improve GetIndexedFieldExpr adding utf8 key based access for struct v… [\#1204](https://github.com/apache/arrow-datafusion/pull/1204) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([Igosuki](https://github.com/Igosuki)) +- Fix `between` in select query [\#1202](https://github.com/apache/arrow-datafusion/pull/1202) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([capkurmagati](https://github.com/capkurmagati)) +- Move code to fold Stable functions like `now()` from `Simplifier` to `ConstEvaluator` [\#1176](https://github.com/apache/arrow-datafusion/pull/1176) ([alamb](https://github.com/alamb)) +- DataFrame supports window function [\#1167](https://github.com/apache/arrow-datafusion/pull/1167) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([xudong963](https://github.com/xudong963)) +- add values list expression [\#1165](https://github.com/apache/arrow-datafusion/pull/1165) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([Jimexist](https://github.com/Jimexist)) +- Add booleans support to the CASE statement [\#1161](https://github.com/apache/arrow-datafusion/pull/1161) ([xudong963](https://github.com/xudong963)) +- Improve error messages when operations are not supported [\#1158](https://github.com/apache/arrow-datafusion/pull/1158) ([alamb](https://github.com/alamb)) +- Generic constant expression evaluation [\#1153](https://github.com/apache/arrow-datafusion/pull/1153) ([alamb](https://github.com/alamb)) +- python `lit` function to support bool and byte vec [\#1152](https://github.com/apache/arrow-datafusion/pull/1152) ([Jimexist](https://github.com/Jimexist)) +- \[nit\] simplify datafusion optimizer module codes 
[\#1146](https://github.com/apache/arrow-datafusion/pull/1146) ([panarch](https://github.com/panarch)) +- Add ScalarValue support for arbitrary list elements [\#1142](https://github.com/apache/arrow-datafusion/pull/1142) ([jonmmease](https://github.com/jonmmease)) +- Multiple files per partitions for CSV Avro Json [\#1138](https://github.com/apache/arrow-datafusion/pull/1138) ([rdettai](https://github.com/rdettai)) +- Implement INTERSECT & INTERSECT DISTINCT [\#1135](https://github.com/apache/arrow-datafusion/pull/1135) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([xudong963](https://github.com/xudong963)) +- Simplify file struct abstractions [\#1120](https://github.com/apache/arrow-datafusion/pull/1120) ([rdettai](https://github.com/rdettai)) +- Implement `is [not] distinct from` [\#1117](https://github.com/apache/arrow-datafusion/pull/1117) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([Dandandan](https://github.com/Dandandan)) +- Clean up spawned task on drop for `RepartitionExec`, `SortPreservingMergeExec`, `WindowAggExec` [\#1112](https://github.com/apache/arrow-datafusion/pull/1112) ([crepererum](https://github.com/crepererum)) +- add hyperloglog implementation \(`add` and `count`\) [\#1095](https://github.com/apache/arrow-datafusion/pull/1095) ([Jimexist](https://github.com/Jimexist)) +- Add ScalarValue::Struct variant [\#1091](https://github.com/apache/arrow-datafusion/pull/1091) ([jonmmease](https://github.com/jonmmease)) +- add digest\(utf8, method\) function and refactor all current hash digest functions [\#1090](https://github.com/apache/arrow-datafusion/pull/1090) ([Jimexist](https://github.com/Jimexist)) +- \[crypto\] add `blake3` algorithm to `digest` function [\#1086](https://github.com/apache/arrow-datafusion/pull/1086) ([Jimexist](https://github.com/Jimexist)) +- \[crypto\] add blake2b and blake2s functions [\#1081](https://github.com/apache/arrow-datafusion/pull/1081) ([Jimexist](https://github.com/Jimexist)) 
+- \[nit\] make schema qualifier error message in field lookup more readable [\#1079](https://github.com/apache/arrow-datafusion/pull/1079) ([Jimexist](https://github.com/Jimexist)) +- \[window function\] add `percent_rank` window function [\#1077](https://github.com/apache/arrow-datafusion/pull/1077) ([Jimexist](https://github.com/Jimexist)) +- \[window function\] add `cume_dist` implementation [\#1076](https://github.com/apache/arrow-datafusion/pull/1076) ([Jimexist](https://github.com/Jimexist)) +- Add a LogicalPlanBuilder::schema\(\) function [\#1075](https://github.com/apache/arrow-datafusion/pull/1075) ([alamb](https://github.com/alamb)) +- Add support for UNION \[DISTINCT\] sql [\#1068](https://github.com/apache/arrow-datafusion/pull/1068) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([xudong963](https://github.com/xudong963)) +- fix: fix joins on Float32/Float64 columns bug [\#1054](https://github.com/apache/arrow-datafusion/pull/1054) ([francis-du](https://github.com/francis-du)) +- Update sqlparser-rs to 0.11 [\#1052](https://github.com/apache/arrow-datafusion/pull/1052) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([alamb](https://github.com/alamb)) +- Support querying CSV files without providing the schema [\#1050](https://github.com/apache/arrow-datafusion/pull/1050) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([xudong963](https://github.com/xudong963)) +- remove hard coded partition count in ballista logicalplan deserialization [\#1044](https://github.com/apache/arrow-datafusion/pull/1044) ([xudong963](https://github.com/xudong963)) +- feat: add lit\_timestamp\_nanosecond [\#1030](https://github.com/apache/arrow-datafusion/pull/1030) ([NGA-TRAN](https://github.com/NGA-TRAN)) +- Ignore metadata on schema merge [\#1024](https://github.com/apache/arrow-datafusion/pull/1024) ([Smurphy000](https://github.com/Smurphy000)) +- add ExecutionConfig.with\_optimizer\_rules 
[\#1022](https://github.com/apache/arrow-datafusion/pull/1022) ([seddonm1](https://github.com/seddonm1)) +- Add baseline execution stats to `WindowAggExec` and `UnionExec`, and fixup `CoalescePartitionsExec` [\#1018](https://github.com/apache/arrow-datafusion/pull/1018) ([alamb](https://github.com/alamb)) +- Derive PartialOrd for Expr [\#1015](https://github.com/apache/arrow-datafusion/pull/1015) ([alamb](https://github.com/alamb)) +- Indexed field access for List [\#1006](https://github.com/apache/arrow-datafusion/pull/1006) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([Igosuki](https://github.com/Igosuki)) +- Add metrics for Limit and Projection, and CoalesceBatches [\#1004](https://github.com/apache/arrow-datafusion/pull/1004) ([alamb](https://github.com/alamb)) +- Update DataFusion to arrow 6.0 [\#984](https://github.com/apache/arrow-datafusion/pull/984) ([alamb](https://github.com/alamb)) +- Implement Display for Expr, improve operator display [\#971](https://github.com/apache/arrow-datafusion/pull/971) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([matthewmturner](https://github.com/matthewmturner)) +- Add metrics for FilterExec [\#960](https://github.com/apache/arrow-datafusion/pull/960) ([alamb](https://github.com/alamb)) +- Change compound column field name rules [\#952](https://github.com/apache/arrow-datafusion/pull/952) ([waynexia](https://github.com/waynexia)) +- ObjectStore API to read from remote storage systems [\#950](https://github.com/apache/arrow-datafusion/pull/950) ([yjshen](https://github.com/yjshen)) +- Add baseline metrics to `SortPreservingMergeExec` [\#948](https://github.com/apache/arrow-datafusion/pull/948) ([alamb](https://github.com/alamb)) +- Add support for TRIM LEADING/TRAILING/BOTH syntax [\#947](https://github.com/apache/arrow-datafusion/pull/947) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([adsharma](https://github.com/adsharma)) +- fixes \#933 replace placeholder fmt\_as 
fr ExecutionPlan impls [\#939](https://github.com/apache/arrow-datafusion/pull/939) ([tiphaineruy](https://github.com/tiphaineruy)) +- Add metrics for SortExect + HashAggregateExec [\#938](https://github.com/apache/arrow-datafusion/pull/938) ([alamb](https://github.com/alamb)) +- Add some additional asserts in `utils::from_plan` [\#930](https://github.com/apache/arrow-datafusion/pull/930) ([alamb](https://github.com/alamb)) +- Avro Table Provider [\#910](https://github.com/apache/arrow-datafusion/pull/910) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([Igosuki](https://github.com/Igosuki)) +- Add BaselineMetrics, Timestamp metrics, add for `CoalescePartitionsExec`, rename output\_time -\> elapsed\_compute [\#909](https://github.com/apache/arrow-datafusion/pull/909) ([alamb](https://github.com/alamb)) +- add cross join support to ballista [\#891](https://github.com/apache/arrow-datafusion/pull/891) ([houqp](https://github.com/houqp)) +- Add Ballista support to DataFusion CLI [\#889](https://github.com/apache/arrow-datafusion/pull/889) ([andygrove](https://github.com/andygrove)) +- support like on DictionaryArray [\#876](https://github.com/apache/arrow-datafusion/pull/876) ([b41sh](https://github.com/b41sh)) +- Register table based on known schema without file IO [\#872](https://github.com/apache/arrow-datafusion/pull/872) ([Dandandan](https://github.com/Dandandan)) +- Add support for PostgreSQL regex match [\#870](https://github.com/apache/arrow-datafusion/pull/870) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([b41sh](https://github.com/b41sh)) +- Include planning time in datafusion-cli printing [\#860](https://github.com/apache/arrow-datafusion/pull/860) ([Dandandan](https://github.com/Dandandan)) +- Implement basic common subexpression eliminate optimization [\#792](https://github.com/apache/arrow-datafusion/pull/792) ([waynexia](https://github.com/waynexia)) +- Impl `ops::Not` for `expr` 
[\#763](https://github.com/apache/arrow-datafusion/pull/763) ([Jimexist](https://github.com/Jimexist)) + +**Fixed bugs:** + +- Can not use `between` in the select list: [\#1196](https://github.com/apache/arrow-datafusion/issues/1196) +- ORDER BY does not work with literals: Sort operation is not applicable to scalar value 'foo' [\#1195](https://github.com/apache/arrow-datafusion/issues/1195) +- window functions with NULL literals in `partition by` and `order by` do not work: Internal\("Sort operation is not applicable to scalar value NULL"\) [\#1194](https://github.com/apache/arrow-datafusion/issues/1194) +- Operation name not included in internal errors -- Internal\("Data type Boolean not supported for binary operation on dyn arrays"\) [\#1157](https://github.com/apache/arrow-datafusion/issues/1157) +- Physical plan explain UNION query says "ExecutionPlan\(PlaceHolder\)" [\#933](https://github.com/apache/arrow-datafusion/issues/933) +- Can not use LIKE on DictionaryArray encoded strings [\#815](https://github.com/apache/arrow-datafusion/issues/815) +- physical\_plan::repartition::tests::repartition\_with\_dropping\_output\_stream failing locally [\#614](https://github.com/apache/arrow-datafusion/issues/614) +- Fix some `BuiltinScalarFunction` panics with zero arguments [\#1249](https://github.com/apache/arrow-datafusion/pull/1249) ([capkurmagati](https://github.com/capkurmagati)) +- fix: not do boolean folding on NULL and/or expr [\#1245](https://github.com/apache/arrow-datafusion/pull/1245) ([NGA-TRAN](https://github.com/NGA-TRAN)) +- ignore case of `with header row` in sql when creating external table [\#1237](https://github.com/apache/arrow-datafusion/pull/1237) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([lichuan6](https://github.com/lichuan6)) +- fix: Min/Max aggregation data type should not be dictionary [\#1235](https://github.com/apache/arrow-datafusion/pull/1235) ([NGA-TRAN](https://github.com/NGA-TRAN)) +- Fix build with 
`--no-default-features` [\#1219](https://github.com/apache/arrow-datafusion/pull/1219) ([alamb](https://github.com/alamb)) +- Prevent "future cannot be sent between threads safely" compilation error [\#1155](https://github.com/apache/arrow-datafusion/pull/1155) ([jonmmease](https://github.com/jonmmease)) +- Clean up spawned task on drop for `AnalyzeExec`, `CoalescePartitionsExec`, `HashAggregateExec` [\#1121](https://github.com/apache/arrow-datafusion/pull/1121) ([crepererum](https://github.com/crepererum)) +- Clean up spawned task on `SortStream` drop [\#1105](https://github.com/apache/arrow-datafusion/pull/1105) ([crepererum](https://github.com/crepererum)) +- fix UNION ALL bug: thread 'main' panicked at 'index out of bounds: the len is 1 but the index is 1', ./src/datatypes/schema.rs:165:10 [\#1088](https://github.com/apache/arrow-datafusion/pull/1088) ([xudong963](https://github.com/xudong963)) +- python: fix generated table name in dataframe creation [\#1078](https://github.com/apache/arrow-datafusion/pull/1078) ([houqp](https://github.com/houqp)) +- fix subquery alias [\#1067](https://github.com/apache/arrow-datafusion/pull/1067) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([xudong963](https://github.com/xudong963)) +- fix pattern handling in regexp\_match function [\#1065](https://github.com/apache/arrow-datafusion/pull/1065) ([houqp](https://github.com/houqp)) +- fix: joins on Timestamp columns [\#1055](https://github.com/apache/arrow-datafusion/pull/1055) ([francis-du](https://github.com/francis-du)) +- Fix metric name typo [\#943](https://github.com/apache/arrow-datafusion/pull/943) ([alamb](https://github.com/alamb)) +- EXPLAIN ANALYZE should run all Optimizer passes [\#929](https://github.com/apache/arrow-datafusion/pull/929) ([alamb](https://github.com/alamb)) + +**Documentation updates:** + +- update docs to fix DataFusion User Guide link [\#1238](https://github.com/apache/arrow-datafusion/pull/1238) 
([jiangzhx](https://github.com/jiangzhx)) +- \[docs\] datafusion cli run via homebrew [\#1198](https://github.com/apache/arrow-datafusion/pull/1198) ([Jimexist](https://github.com/Jimexist)) +- add support for unary and binary values in values list, update docs [\#1172](https://github.com/apache/arrow-datafusion/pull/1172) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([Jimexist](https://github.com/Jimexist)) +- Add additional docstring comments to `from_plan` [\#1168](https://github.com/apache/arrow-datafusion/pull/1168) ([alamb](https://github.com/alamb)) +- \[nit\] fix document issue for `approx_distinct` [\#1110](https://github.com/apache/arrow-datafusion/pull/1110) ([Jimexist](https://github.com/Jimexist)) +- implement `approx_distinct` function using HyperLogLog [\#1087](https://github.com/apache/arrow-datafusion/pull/1087) ([Jimexist](https://github.com/Jimexist)) +- Remove unused `use` statements from examples [\#1032](https://github.com/apache/arrow-datafusion/pull/1032) ([alamb](https://github.com/alamb)) +- consolidate datafusion docs with sphinx [\#993](https://github.com/apache/arrow-datafusion/pull/993) ([houqp](https://github.com/houqp)) +- Updated user-guide library docs with optimized config [\#976](https://github.com/apache/arrow-datafusion/pull/976) ([matthewmturner](https://github.com/matthewmturner)) +- Improve User Guide [\#954](https://github.com/apache/arrow-datafusion/pull/954) ([andygrove](https://github.com/andygrove)) +- \[MINOR\] Fix typos in doc comments [\#945](https://github.com/apache/arrow-datafusion/pull/945) ([alamb](https://github.com/alamb)) +- \[DataFusion\] - Add show and show\_limit function for DataFrame [\#923](https://github.com/apache/arrow-datafusion/pull/923) ([francis-du](https://github.com/francis-du)) +- Typo fix in DataFusion crate documentation [\#914](https://github.com/apache/arrow-datafusion/pull/914) ([antoinewdg](https://github.com/antoinewdg)) + +**Performance improvements:** + +- Improve 
avro reader performance by avoiding some cloning on avro\_rs::Value [\#1206](https://github.com/apache/arrow-datafusion/pull/1206) ([Igosuki](https://github.com/Igosuki)) +- optimize build profile for datafusion python binding, cli and ballista [\#1137](https://github.com/apache/arrow-datafusion/pull/1137) ([houqp](https://github.com/houqp)) +- Avoid stack overflow by reducing stack usage of `BinaryExpr::evaluate` in debug builds [\#1047](https://github.com/apache/arrow-datafusion/pull/1047) ([alamb](https://github.com/alamb)) +- Add ScalarValue::eq\_array optimized comparison function [\#844](https://github.com/apache/arrow-datafusion/pull/844) ([alamb](https://github.com/alamb)) +- Rework GroupByHash to for faster performance and support grouping by nulls [\#808](https://github.com/apache/arrow-datafusion/pull/808) ([alamb](https://github.com/alamb)) + +**Closed issues:** + +- InList expr with NULL literals do not work [\#1190](https://github.com/apache/arrow-datafusion/issues/1190) +- update the homepage README to include values, `approx_distinct`, etc. [\#1171](https://github.com/apache/arrow-datafusion/issues/1171) +- \[Python\]: Inconsistencies with Python package name [\#1011](https://github.com/apache/arrow-datafusion/issues/1011) +- Wanting to contribute to project where to start? 
[\#983](https://github.com/apache/arrow-datafusion/issues/983) +- delete redundant code [\#973](https://github.com/apache/arrow-datafusion/issues/973) +- How to build DataFusion python wheel [\#853](https://github.com/apache/arrow-datafusion/issues/853) +- Add support for partition pruning [\#204](https://github.com/apache/arrow-datafusion/issues/204) +- \[Datafusion\] Support joins on TimestampMillisecond columns [\#187](https://github.com/apache/arrow-datafusion/issues/187) +- TPC-H Query 21 [\#173](https://github.com/apache/arrow-datafusion/issues/173) +- TPC-H Query 13 [\#164](https://github.com/apache/arrow-datafusion/issues/164) +- TPC-H Query 8 [\#162](https://github.com/apache/arrow-datafusion/issues/162) +- implement split\_part\(string, delimiter, position\) [\#157](https://github.com/apache/arrow-datafusion/issues/157) +- Join Statement: Schema contains duplicate unqualified field name [\#155](https://github.com/apache/arrow-datafusion/issues/155) +- ParquetTable should avoid scanning all files twice [\#136](https://github.com/apache/arrow-datafusion/issues/136) +- Add support for reading partitioned Parquet files [\#133](https://github.com/apache/arrow-datafusion/issues/133) +- Add support for Parquet schema merging [\#132](https://github.com/apache/arrow-datafusion/issues/132) +- Catalog abstraction [\#126](https://github.com/apache/arrow-datafusion/issues/126) +- Optimizer rules should work with qualified column names [\#125](https://github.com/apache/arrow-datafusion/issues/125) +- Add optional qualifier to Expr::Column [\#121](https://github.com/apache/arrow-datafusion/issues/121) +- Implement modulus expression [\#99](https://github.com/apache/arrow-datafusion/issues/99) +- \[Rust\] Add constant folding to expressions during logically planning [\#98](https://github.com/apache/arrow-datafusion/issues/98) +- \[Rust\] Implement pretty print for physical query plan [\#93](https://github.com/apache/arrow-datafusion/issues/93) +- Can not group by boolean 
columns \(add boolean to valid keys of groupBy\) [\#91](https://github.com/apache/arrow-datafusion/issues/91) +- improve performance of building literal arrays [\#90](https://github.com/apache/arrow-datafusion/issues/90) +- \[rust\]\[datafusion\] optimize count\(\*\) queries on parquet sources [\#89](https://github.com/apache/arrow-datafusion/issues/89) +- Produce a design for a metrics framework [\#21](https://github.com/apache/arrow-datafusion/issues/21) + +**Merged pull requests:** + +- Add timezome string to stablize test [\#1265](https://github.com/apache/arrow-datafusion/pull/1265) ([viirya](https://github.com/viirya)) +- numerical\_coercion pattern match optimize [\#1256](https://github.com/apache/arrow-datafusion/pull/1256) ([Jimexist](https://github.com/Jimexist)) +- fix and update window function sql tests [\#1059](https://github.com/apache/arrow-datafusion/pull/1059) ([Jimexist](https://github.com/Jimexist)) +- reduce ScalarValue from trait boilerplate with macro [\#989](https://github.com/apache/arrow-datafusion/pull/989) ([houqp](https://github.com/houqp)) + + +For older versions, see [apache/arrow/CHANGELOG.md](https://github.com/apache/arrow/blob/master/CHANGELOG.md) + ## [5.0.0](https://github.com/apache/arrow-datafusion/tree/5.0.0) (2021-08-10) [Full Changelog](https://github.com/apache/arrow-datafusion/compare/4.0.0...5.0.0) diff --git a/datafusion/Cargo.toml b/datafusion/Cargo.toml index 7df8c8fd8620..f0f368a75c6e 100644 --- a/datafusion/Cargo.toml +++ b/datafusion/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "datafusion" description = "DataFusion is an in-memory query engine that uses Apache Arrow as the memory model" -version = "5.1.0" +version = "6.0.0" homepage = "https://github.com/apache/arrow-datafusion" repository = "https://github.com/apache/arrow-datafusion" readme = "../README.md" diff --git a/dev/release/README.md b/dev/release/README.md index 7a515732df65..775678a457e0 100644 --- a/dev/release/README.md +++ b/dev/release/README.md 
@@ -82,7 +82,11 @@ git fetch apache git checkout apache/master ``` -Update datafusion version in `datafusion/Cargo.toml` to `5.1.0`. +Update datafusion version in `datafusion/Cargo.toml` to `5.1.0`: + +``` +./dev/update_datafusion_versions.py 5.1.0 +``` If there is a ballista release, update versions in ballista Cargo.tomls, run @@ -101,19 +105,9 @@ git commit -a -m 'Update version' ### Update CHANGELOG.md -Create local release rc tags: - -``` -git tag -f 5.1.0-rc-local -# if there is ballista release -git tag -f ballista-0.5.0-rc-local -# if there is python binding release -git tag -f python-0.3.0-rc-local -``` - -Manully edit the previous release version tag in +Manually edit the base version tag argument in `dev/release/update_change_log-{ballista,datafusion,python}.sh`. Commits -between the previous verstion tag and the new rc tag will be used to +between the base version tag and the latest upstream master will be used to populate the changelog content. ```bash @@ -123,9 +117,6 @@ CHANGELOG_GITHUB_TOKEN= ./dev/release/update_change_log-all.sh git commit -a -m 'Create changelog for release' ``` -Note that when reviewing the change log, rather than editing the -`CHANGELOG.md`, it is preferred to update the issues and their labels. - You can add `invalid` or `development-process` label to exclude items from release notes. Add `datafusion`, `ballista` and `python` labels to group items into each sub-project's change log. diff --git a/dev/release/update_change_log-all.sh b/dev/release/update_change_log-all.sh index 9ef09eb93553..c5639cc23b70 100755 --- a/dev/release/update_change_log-all.sh +++ b/dev/release/update_change_log-all.sh @@ -18,6 +18,8 @@ # under the License. 
# +set -e + # Usage: # CHANGELOG_GITHUB_TOKEN= ./update_change_log-datafusion.sh diff --git a/dev/release/update_change_log-ballista.sh b/dev/release/update_change_log-ballista.sh index 05c5f6fe6984..b5ce827a8091 100755 --- a/dev/release/update_change_log-ballista.sh +++ b/dev/release/update_change_log-ballista.sh @@ -25,4 +25,8 @@ SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" CURRENT_VER=$(grep version "${SOURCE_TOP_DIR}/ballista/rust/client/Cargo.toml" | head -n 1 | awk '{print $3}' | tr -d '"') -${SOURCE_DIR}/update_change_log.sh ballista 4.0.0 "ballista-${CURRENT_VER}-rc-local" +${SOURCE_DIR}/update_change_log.sh \ + ballista \ + ballista-0.5.0 \ + --exclude-tags-regex "python-.+" \ + --future-release "ballista-${CURRENT_VER}" diff --git a/dev/release/update_change_log-datafusion.sh b/dev/release/update_change_log-datafusion.sh index 1570c9125275..4259c86465e9 100755 --- a/dev/release/update_change_log-datafusion.sh +++ b/dev/release/update_change_log-datafusion.sh @@ -25,4 +25,8 @@ SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" CURRENT_VER=$(grep version "${SOURCE_TOP_DIR}/datafusion/Cargo.toml" | head -n 1 | awk '{print $3}' | tr -d '"') -${SOURCE_DIR}/update_change_log.sh datafusion 4.0.0 "${CURRENT_VER}-rc-local" +${SOURCE_DIR}/update_change_log.sh \ + datafusion \ + 5.0.0 \ + --exclude-tags-regex "(python|ballista)-.+" \ + --future-release "${CURRENT_VER}" diff --git a/dev/release/update_change_log-python.sh b/dev/release/update_change_log-python.sh index 6b864f9be1b2..6d428e808d9b 100755 --- a/dev/release/update_change_log-python.sh +++ b/dev/release/update_change_log-python.sh @@ -25,4 +25,8 @@ SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" CURRENT_VER=$(grep version "${SOURCE_TOP_DIR}/python/Cargo.toml" | head -n 1 | awk '{print $3}' | tr -d '"') 
-${SOURCE_DIR}/update_change_log.sh python 4.0.0 "python-${CURRENT_VER}-rc-local" +${SOURCE_DIR}/update_change_log.sh \ + python \ + python-0.3.0 \ + --exclude-tags-regex "ballista-.+" \ + --future-release "python-${CURRENT_VER}" diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index 0c9c2332ce70..1d1570d68a62 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -27,34 +27,44 @@ # arrow-datafusion/.github_changelog_generator # # Usage: -# CHANGELOG_GITHUB_TOKEN= ./update_change_log.sh +# CHANGELOG_GITHUB_TOKEN= ./update_change_log.sh set -e SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" -if [[ "$#" -ne 3 ]]; then - echo "USAGE: $0 PROJECT FROM_VER TO_VER" +echo $1 + +if [[ "$#" -lt 2 ]]; then + echo "USAGE: $0 PROJECT SINCE_TAG EXTRA_ARGS..." exit 1 fi PROJECT=$1 -FROM_VER=$2 -TO_VER=$3 +SINCE_TAG=$2 +shift 2 + OUTPUT_PATH="${PROJECT}/CHANGELOG.md" pushd ${SOURCE_TOP_DIR} + +# reset content in changelog +git checkout "${SINCE_TAG}" "${OUTPUT_PATH}" +# remove license header so github-changelog-generator has a clean base to append +sed -i '1,18d' "${OUTPUT_PATH}" + docker run -it --rm \ -e CHANGELOG_GITHUB_TOKEN=$CHANGELOG_GITHUB_TOKEN \ -v "$(pwd)":/usr/local/src/your-app \ - githubchangeloggenerator/github-changelog-generator \ + githubchangeloggenerator/github-changelog-generator:1.16.2 \ --user apache \ --project arrow-datafusion \ - --since-tag "${FROM_VER}" \ + --since-tag "${SINCE_TAG}" \ --include-labels "${PROJECT}" \ + --base "${OUTPUT_PATH}" \ --output "${OUTPUT_PATH}" \ - --future-release "${TO_VER}" + "$@" sed -i "s/\\\n/\n\n/" "${OUTPUT_PATH}" diff --git a/dev/update_ballista_versions.py b/dev/update_ballista_versions.py index 7023541ad49e..57e055e964a2 100755 --- a/dev/update_ballista_versions.py +++ b/dev/update_ballista_versions.py @@ -35,10 +35,12 @@ def update_cargo_toml(cargo_toml: str, new_version: str): data = f.read() 
doc = tomlkit.parse(data) - doc.get('package')['version'] = new_version + if cargo_toml.startswith("ballista/"): + doc.get('package')['version'] = new_version # ballista crates also depend on each other ballista_deps = ( + 'ballista', 'ballista-core', 'ballista-executor', 'ballista-scheduler', @@ -80,6 +82,7 @@ def main(): 'ballista/rust/scheduler', 'ballista/rust/executor', 'ballista/rust/client', + 'datafusion-cli', ] ]) new_version = args.new_version @@ -89,7 +92,10 @@ def main(): for cargo_toml in ballista_crates: update_cargo_toml(cargo_toml, new_version) - for path in ("benchmarks/docker-compose.yaml", "docs/user-guide/src/distributed/docker-compose.md"): + for path in ( + "benchmarks/docker-compose.yaml", + "docs/source/user-guide/distributed/deployment/docker-compose.md", + ): path = os.path.join(repo_root, path) update_docker_compose(path, new_version) diff --git a/dev/update_datafusion_versions.py b/dev/update_datafusion_versions.py index d312f21a8108..af16b515f753 100755 --- a/dev/update_datafusion_versions.py +++ b/dev/update_datafusion_versions.py @@ -22,6 +22,7 @@ # dependencies: # pip install tomlkit +import re import os import argparse from pathlib import Path @@ -61,6 +62,16 @@ def update_downstream_versions(cargo_toml: str, new_version: str): f.write(tomlkit.dumps(doc)) +def update_docs(path: str, new_version: str): + """Point every `datafusion = "..."` pin in the file at `path` to `new_version`.""" + print(f"updating docs in {path}") + with open(path, 'r+') as fd: + content = re.sub(r'datafusion = "[^"]+"', f'datafusion = "{new_version}"', fd.read()) + fd.seek(0) + fd.write(content) + fd.truncate()  # drop leftover bytes when the new content is shorter + + def main(): parser = argparse.ArgumentParser( description=( @@ -79,6 +90,8 @@ def main(): for cargo_toml in repo_root.rglob('Cargo.toml'): update_downstream_versions(cargo_toml, new_version) + update_docs("README.md", new_version) + if __name__ == "__main__": main() diff --git a/python/CHANGELOG.md b/python/CHANGELOG.md index a4964abdd4bb..a07cb003c5cd 100644 --- a/python/CHANGELOG.md +++ b/python/CHANGELOG.md @@ -17,10 +17,67 
@@ under the License. --> -For older versions, see [apache/arrow/CHANGELOG.md](https://github.com/apache/arrow/blob/master/CHANGELOG.md) - # Changelog +## [python-0.4.0](https://github.com/apache/arrow-datafusion/tree/python-0.4.0) (2021-11-13) + +[Full Changelog](https://github.com/apache/arrow-datafusion/compare/python-0.3.0...python-0.4.0) + +**Breaking changes:** + +- Add function volatility to Signature [\#1071](https://github.com/apache/arrow-datafusion/pull/1071) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([pjmore](https://github.com/pjmore)) +- Make TableProvider.scan\(\) and PhysicalPlanner::create\_physical\_plan\(\) async [\#1013](https://github.com/apache/arrow-datafusion/pull/1013) ([rdettai](https://github.com/rdettai)) +- Reorganize table providers by table format [\#1010](https://github.com/apache/arrow-datafusion/pull/1010) ([rdettai](https://github.com/rdettai)) + +**Implemented enhancements:** + +- Build abi3 wheels for python binding [\#921](https://github.com/apache/arrow-datafusion/issues/921) +- Release documentation for python binding [\#837](https://github.com/apache/arrow-datafusion/issues/837) +- use arrow 6.1.0 [\#1255](https://github.com/apache/arrow-datafusion/pull/1255) ([Jimexist](https://github.com/Jimexist)) +- python `lit` function to support bool and byte vec [\#1152](https://github.com/apache/arrow-datafusion/pull/1152) ([Jimexist](https://github.com/Jimexist)) +- add python binding for `approx_distinct` aggregate function [\#1134](https://github.com/apache/arrow-datafusion/pull/1134) ([Jimexist](https://github.com/Jimexist)) +- refactor datafusion python `lit` function to allow different types [\#1130](https://github.com/apache/arrow-datafusion/pull/1130) ([Jimexist](https://github.com/Jimexist)) +- \[python\] add digest python function [\#1127](https://github.com/apache/arrow-datafusion/pull/1127) ([Jimexist](https://github.com/Jimexist)) +- \[crypto\] add `blake3` algorithm to `digest` function 
[\#1086](https://github.com/apache/arrow-datafusion/pull/1086) ([Jimexist](https://github.com/Jimexist)) +- \[crypto\] add blake2b and blake2s functions [\#1081](https://github.com/apache/arrow-datafusion/pull/1081) ([Jimexist](https://github.com/Jimexist)) +- fix: fix joins on Float32/Float64 columns bug [\#1054](https://github.com/apache/arrow-datafusion/pull/1054) ([francis-du](https://github.com/francis-du)) +- Update DataFusion to arrow 6.0 [\#984](https://github.com/apache/arrow-datafusion/pull/984) ([alamb](https://github.com/alamb)) +- \[Python\] Add support to perform sql query on in-memory datasource. [\#981](https://github.com/apache/arrow-datafusion/pull/981) ([mmuru](https://github.com/mmuru)) +- \[Python\] - Support show function for DataFrame api of python library [\#942](https://github.com/apache/arrow-datafusion/pull/942) ([francis-du](https://github.com/francis-du)) +- Rework the python bindings using conversion traits from arrow-rs [\#873](https://github.com/apache/arrow-datafusion/pull/873) ([kszucs](https://github.com/kszucs)) + +**Fixed bugs:** + +- Error in `python test` check / maturn python build: `function or associated item not found in `proc_macro::Literal` [\#961](https://github.com/apache/arrow-datafusion/issues/961) +- Use UUID to create unique table names in python binding [\#1111](https://github.com/apache/arrow-datafusion/pull/1111) ([hippowdon](https://github.com/hippowdon)) +- python: fix generated table name in dataframe creation [\#1078](https://github.com/apache/arrow-datafusion/pull/1078) ([houqp](https://github.com/houqp)) +- fix: joins on Timestamp columns [\#1055](https://github.com/apache/arrow-datafusion/pull/1055) ([francis-du](https://github.com/francis-du)) +- register datafusion.functions as a python package [\#995](https://github.com/apache/arrow-datafusion/pull/995) ([houqp](https://github.com/houqp)) + +**Documentation updates:** + +- python: update docs to use new APIs 
[\#1287](https://github.com/apache/arrow-datafusion/pull/1287) ([houqp](https://github.com/houqp)) +- Fix typo on Python functions [\#1207](https://github.com/apache/arrow-datafusion/pull/1207) ([j-a-m-l](https://github.com/j-a-m-l)) +- fix deadlink in python/readme [\#1002](https://github.com/apache/arrow-datafusion/pull/1002) ([waynexia](https://github.com/waynexia)) + +**Performance improvements:** + +- optimize build profile for datafusion python binding, cli and ballista [\#1137](https://github.com/apache/arrow-datafusion/pull/1137) ([houqp](https://github.com/houqp)) + +**Closed issues:** + +- InList expr with NULL literals do not work [\#1190](https://github.com/apache/arrow-datafusion/issues/1190) +- update the homepage README to include values, `approx_distinct`, etc. [\#1171](https://github.com/apache/arrow-datafusion/issues/1171) +- \[Python\]: Inconsistencies with Python package name [\#1011](https://github.com/apache/arrow-datafusion/issues/1011) +- Wanting to contribute to project where to start? 
[\#983](https://github.com/apache/arrow-datafusion/issues/983) +- delete redundant code [\#973](https://github.com/apache/arrow-datafusion/issues/973) +- \[Python\]: register custom datasource [\#906](https://github.com/apache/arrow-datafusion/issues/906) +- How to build DataFusion python wheel [\#853](https://github.com/apache/arrow-datafusion/issues/853) +- Produce a design for a metrics framework [\#21](https://github.com/apache/arrow-datafusion/issues/21) + + +For older versions, see [apache/arrow/CHANGELOG.md](https://github.com/apache/arrow/blob/master/CHANGELOG.md) + ## [python-0.3.0](https://github.com/apache/arrow-datafusion/tree/python-0.3.0) (2021-08-10) [Full Changelog](https://github.com/apache/arrow-datafusion/compare/4.0.0...python-0.3.0) diff --git a/python/Cargo.toml b/python/Cargo.toml index 3d3ebfa34540..568f3c7b35d3 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "datafusion-python" -version = "0.3.0" +version = "0.4.0" homepage = "https://github.com/apache/arrow" repository = "https://github.com/apache/arrow" authors = ["Apache Arrow "] @@ -31,7 +31,7 @@ rust-version = "1.56" tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread", "sync"] } rand = "0.7" pyo3 = { version = "0.14", features = ["extension-module", "abi3", "abi3-py36"] } -datafusion = { path = "../datafusion", version = "5.1.0", features = ["pyarrow"] } +datafusion = { path = "../datafusion", version = "6.0.0", features = ["pyarrow"] } uuid = { version = "0.8", features = ["v4"] } [lib]