Implement Streaming Aggregation: Do not break pipeline in aggregation if group by columns are ordered (V2) (#6124)

* add starting code for experimenting

* stream group by linear implementation

* sorted implementation

* minor changes

* simplifications

* Simplifications

* convert vec to Option

* minor changes

* minor changes

* minor changes

* simplifications

* minor changes

* all tests pass

* refactor

* simplifications

* remove unnecessary code

* simplifications

* minor changes

* simplifications

* minor changes

* Simplify the GroupByOrderMode type

* Address reviews

* separate fully ordered case and remaining cases

* change test data type

* address reviews

* Convert to option

* retract back to old API.

* Code quality: stylistic changes

* Separate bounded stream and hash stream

* Update comments

---------

Co-authored-by: Mehmet Ozan Kabak <[email protected]>
mustafasrepo and ozankabak authored Apr 27, 2023
1 parent a384809 commit aec3420
Showing 23 changed files with 1,818 additions and 199 deletions.
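At a high level, the patch exploits the fact that when the input is already sorted on the group-by columns, each group's rows arrive contiguously, so the new bounded_aggregate_stream.rs can emit a group as soon as its key changes instead of buffering every group in a hash table, which keeps the pipeline unbroken even for unbounded inputs. Below is a minimal, hypothetical sketch of that idea in plain Rust; it is not DataFusion's implementation and all names are illustrative only.

```rust
// Hypothetical sketch of streaming aggregation over input sorted by the
// group key: emit each (key, aggregate) pair as soon as the key changes.
// Names and types are illustrative; they are not DataFusion APIs.
fn streaming_sum_by_key(sorted_rows: impl IntoIterator<Item = (i64, i64)>) -> Vec<(i64, i64)> {
    let mut output = Vec::new();
    let mut current: Option<(i64, i64)> = None; // (current key, running sum)

    for (key, value) in sorted_rows {
        match &mut current {
            // Same key as the running group: just update the aggregate.
            Some((k, sum)) if *k == key => *sum += value,
            slot => {
                // Key changed (or first row): the previous group is complete
                // and can be emitted immediately, without waiting for the
                // end of the input.
                if let Some(done) = slot.take() {
                    output.push(done);
                }
                *slot = Some((key, value));
            }
        }
    }
    if let Some(done) = current {
        output.push(done);
    }
    output
}

fn main() {
    // Input sorted on the group key, e.g. a GROUP BY over an ordered column.
    let rows = vec![(1, 10), (1, 5), (2, 7), (3, 1), (3, 2)];
    assert_eq!(streaming_sum_by_key(rows), vec![(1, 15), (2, 7), (3, 3)]);
}
```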
1,043 changes: 1,043 additions & 0 deletions datafusion/core/src/physical_plan/aggregates/bounded_aggregate_stream.rs

Large diffs are not rendered by default.

291 changes: 257 additions & 34 deletions datafusion/core/src/physical_plan/aggregates/mod.rs

Large diffs are not rendered by default.

140 changes: 11 additions & 129 deletions datafusion/core/src/physical_plan/aggregates/row_hash.rs
@@ -24,33 +24,34 @@ use std::task::{Context, Poll};
use std::vec;

use ahash::RandomState;
use arrow::row::{OwnedRow, RowConverter, SortField};
use arrow::row::{RowConverter, SortField};
use datafusion_physical_expr::hash_utils::create_hashes;
use futures::ready;
use futures::stream::{Stream, StreamExt};

use crate::execution::context::TaskContext;
use crate::execution::memory_pool::proxy::{RawTableAllocExt, VecAllocExt};
use crate::execution::memory_pool::{MemoryConsumer, MemoryReservation};
use crate::physical_plan::aggregates::utils::{
aggr_state_schema, col_to_scalar, get_at_indices, get_optional_filters,
read_as_batch, slice_and_maybe_filter, ExecutionState, GroupState,
};
use crate::physical_plan::aggregates::{
evaluate_group_by, evaluate_many, evaluate_optional, group_schema, AccumulatorItem,
AggregateMode, PhysicalGroupBy, RowAccumulatorItem,
evaluate_group_by, evaluate_many, evaluate_optional, group_schema, AggregateMode,
PhysicalGroupBy, RowAccumulatorItem,
};
use crate::physical_plan::metrics::{BaselineMetrics, RecordOutput};
use crate::physical_plan::{aggregates, AggregateExpr, PhysicalExpr};
use crate::physical_plan::{RecordBatchStream, SendableRecordBatchStream};
use arrow::array::*;
use arrow::compute::{cast, filter};
use arrow::datatypes::{DataType, Schema, UInt32Type};
use arrow::{compute, datatypes::SchemaRef, record_batch::RecordBatch};
use arrow::compute::cast;
use arrow::datatypes::DataType;
use arrow::{datatypes::SchemaRef, record_batch::RecordBatch};
use datafusion_common::cast::as_boolean_array;
use datafusion_common::utils::get_arrayref_at_indices;
use datafusion_common::{Result, ScalarValue};
use datafusion_expr::Accumulator;
use datafusion_row::accessor::RowAccessor;
use datafusion_row::layout::RowLayout;
use datafusion_row::reader::{read_row, RowReader};
use datafusion_row::MutableRecordBatch;
use hashbrown::raw::RawTable;
use itertools::izip;

@@ -68,7 +69,6 @@ use itertools::izip;
/// 4. The state's RecordBatch is `merge`d to a new state
/// 5. The state is mapped to the final value
///
/// [Arrow-row]: OwnedRow
/// [WordAligned]: datafusion_row::layout
pub(crate) struct GroupedHashAggregateStream {
schema: SchemaRef,
@@ -107,22 +107,6 @@ pub(crate) struct GroupedHashAggregateStream {
indices: [Vec<Range<usize>>; 2],
}

#[derive(Debug)]
/// tracks what phase the aggregation is in
enum ExecutionState {
ReadingInput,
ProducingOutput,
Done,
}

fn aggr_state_schema(aggr_expr: &[Arc<dyn AggregateExpr>]) -> Result<SchemaRef> {
let fields = aggr_expr
.iter()
.flat_map(|expr| expr.state_fields().unwrap().into_iter())
.collect::<Vec<_>>();
Ok(Arc::new(Schema::new(fields)))
}

impl GroupedHashAggregateStream {
/// Create a new GroupedHashAggregateStream
#[allow(clippy::too_many_arguments)]
@@ -617,25 +601,8 @@ impl GroupedHashAggregateStream {
}
}

/// The state that is built for each output group.
#[derive(Debug)]
pub struct GroupState {
/// The actual group by values, stored sequentially
group_by_values: OwnedRow,

// Accumulator state, stored sequentially
pub aggregation_buffer: Vec<u8>,

// Accumulator state, one for each aggregate that doesn't support row accumulation
pub accumulator_set: Vec<AccumulatorItem>,

/// scratch space used to collect indices for input rows in a
/// batch that have values to aggregate. Reset on each batch
pub indices: Vec<u32>,
}

/// The state of all the groups
pub struct AggregationState {
pub(crate) struct AggregationState {
pub reservation: MemoryReservation,

/// Logically maps group values to an index in `group_states`
@@ -788,88 +755,3 @@ impl GroupedHashAggregateStream {
Ok(Some(RecordBatch::try_new(self.schema.clone(), output)?))
}
}

fn read_as_batch(rows: &[Vec<u8>], schema: &Schema) -> Vec<ArrayRef> {
let row_num = rows.len();
let mut output = MutableRecordBatch::new(row_num, Arc::new(schema.clone()));
let mut row = RowReader::new(schema);

for data in rows {
row.point_to(0, data);
read_row(&row, &mut output, schema);
}

output.output_as_columns()
}

fn get_at_indices(
input_values: &[Vec<ArrayRef>],
batch_indices: &PrimitiveArray<UInt32Type>,
) -> Result<Vec<Vec<ArrayRef>>> {
input_values
.iter()
.map(|array| get_arrayref_at_indices(array, batch_indices))
.collect()
}

fn get_optional_filters(
original_values: &[Option<Arc<dyn Array>>],
batch_indices: &PrimitiveArray<UInt32Type>,
) -> Vec<Option<Arc<dyn Array>>> {
original_values
.iter()
.map(|array| {
array.as_ref().map(|array| {
compute::take(
array.as_ref(),
batch_indices,
None, // None: no index check
)
.unwrap()
})
})
.collect()
}

fn slice_and_maybe_filter(
aggr_array: &[ArrayRef],
filter_opt: Option<&Arc<dyn Array>>,
offsets: &[usize],
) -> Result<Vec<ArrayRef>> {
let sliced_arrays: Vec<ArrayRef> = aggr_array
.iter()
.map(|array| array.slice(offsets[0], offsets[1] - offsets[0]))
.collect();

let filtered_arrays = match filter_opt.as_ref() {
Some(f) => {
let sliced = f.slice(offsets[0], offsets[1] - offsets[0]);
let filter_array = as_boolean_array(&sliced)?;

sliced_arrays
.iter()
.map(|array| filter(array, filter_array).unwrap())
.collect::<Vec<ArrayRef>>()
}
None => sliced_arrays,
};
Ok(filtered_arrays)
}

/// This method is similar to Scalar::try_from_array except for the Null handling.
/// This method returns [ScalarValue::Null] instead of [ScalarValue::Type(None)]
fn col_to_scalar(
array: &ArrayRef,
filter: &Option<&BooleanArray>,
row_index: usize,
) -> Result<ScalarValue> {
if array.is_null(row_index) {
return Ok(ScalarValue::Null);
}
if let Some(filter) = filter {
if !filter.value(row_index) {
return Ok(ScalarValue::Null);
}
}
ScalarValue::try_from_array(array, row_index)
}
151 changes: 151 additions & 0 deletions datafusion/core/src/physical_plan/aggregates/utils.rs
@@ -0,0 +1,151 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use crate::physical_plan::aggregates::AccumulatorItem;
use arrow::compute;
use arrow::compute::filter;
use arrow::row::OwnedRow;
use arrow_array::types::UInt32Type;
use arrow_array::{Array, ArrayRef, BooleanArray, PrimitiveArray};
use arrow_schema::{Schema, SchemaRef};
use datafusion_common::cast::as_boolean_array;
use datafusion_common::utils::get_arrayref_at_indices;
use datafusion_common::{Result, ScalarValue};
use datafusion_physical_expr::AggregateExpr;
use datafusion_row::reader::{read_row, RowReader};
use datafusion_row::MutableRecordBatch;
use std::sync::Arc;

/// The state that is built for each output group.
#[derive(Debug)]
pub(crate) struct GroupState {
/// The actual group by values, stored sequentially
pub group_by_values: OwnedRow,

// Accumulator state, stored sequentially
pub aggregation_buffer: Vec<u8>,

// Accumulator state, one for each aggregate that doesn't support row accumulation
pub accumulator_set: Vec<AccumulatorItem>,

/// scratch space used to collect indices for input rows in a
/// batch that have values to aggregate. Reset on each batch
pub indices: Vec<u32>,
}

#[derive(Debug)]
/// tracks what phase the aggregation is in
pub(crate) enum ExecutionState {
ReadingInput,
ProducingOutput,
Done,
}

pub(crate) fn aggr_state_schema(
aggr_expr: &[Arc<dyn AggregateExpr>],
) -> Result<SchemaRef> {
let fields = aggr_expr
.iter()
.flat_map(|expr| expr.state_fields().unwrap().into_iter())
.collect::<Vec<_>>();
Ok(Arc::new(Schema::new(fields)))
}

pub(crate) fn read_as_batch(rows: &[Vec<u8>], schema: &Schema) -> Vec<ArrayRef> {
let row_num = rows.len();
let mut output = MutableRecordBatch::new(row_num, Arc::new(schema.clone()));
let mut row = RowReader::new(schema);

for data in rows {
row.point_to(0, data);
read_row(&row, &mut output, schema);
}

output.output_as_columns()
}

pub(crate) fn get_at_indices(
input_values: &[Vec<ArrayRef>],
batch_indices: &PrimitiveArray<UInt32Type>,
) -> Result<Vec<Vec<ArrayRef>>> {
input_values
.iter()
.map(|array| get_arrayref_at_indices(array, batch_indices))
.collect()
}

pub(crate) fn get_optional_filters(
original_values: &[Option<Arc<dyn Array>>],
batch_indices: &PrimitiveArray<UInt32Type>,
) -> Vec<Option<Arc<dyn Array>>> {
original_values
.iter()
.map(|array| {
array.as_ref().map(|array| {
compute::take(
array.as_ref(),
batch_indices,
None, // None: no index check
)
.unwrap()
})
})
.collect()
}

pub(crate) fn slice_and_maybe_filter(
aggr_array: &[ArrayRef],
filter_opt: Option<&Arc<dyn Array>>,
offsets: &[usize],
) -> Result<Vec<ArrayRef>> {
let sliced_arrays: Vec<ArrayRef> = aggr_array
.iter()
.map(|array| array.slice(offsets[0], offsets[1] - offsets[0]))
.collect();

let filtered_arrays = match filter_opt.as_ref() {
Some(f) => {
let sliced = f.slice(offsets[0], offsets[1] - offsets[0]);
let filter_array = as_boolean_array(&sliced)?;

sliced_arrays
.iter()
.map(|array| filter(array, filter_array).unwrap())
.collect::<Vec<ArrayRef>>()
}
None => sliced_arrays,
};
Ok(filtered_arrays)
}

/// This method is similar to Scalar::try_from_array except for the Null handling.
/// This method returns [ScalarValue::Null] instead of [ScalarValue::Type(None)]
pub(crate) fn col_to_scalar(
array: &ArrayRef,
filter: &Option<&BooleanArray>,
row_index: usize,
) -> Result<ScalarValue> {
if array.is_null(row_index) {
return Ok(ScalarValue::Null);
}
if let Some(filter) = filter {
if !filter.value(row_index) {
return Ok(ScalarValue::Null);
}
}
ScalarValue::try_from_array(array, row_index)
}
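The helpers above were moved out of row_hash.rs into this new utils.rs (as pub(crate)), presumably so the existing hash-based stream and the new bounded stream can share them. The standalone sketch below mirrors the null/filter semantics documented for col_to_scalar; it is an illustrative reimplementation rather than a call into the crate-private helper, and the arrow_array / datafusion_common usage reflects an assumption about those crates' public APIs.

```rust
// Sketch mirroring col_to_scalar's handling: a null value, or a row masked
// out by the optional filter, yields ScalarValue::Null rather than a typed
// scalar holding None.
use std::sync::Arc;

use arrow_array::{Array, ArrayRef, BooleanArray, Int32Array};
use datafusion_common::{Result, ScalarValue};

fn to_scalar(array: &ArrayRef, filter: Option<&BooleanArray>, row: usize) -> Result<ScalarValue> {
    if array.is_null(row) || filter.map_or(false, |f| !f.value(row)) {
        // Null input or filtered-out row: report an untyped Null.
        return Ok(ScalarValue::Null);
    }
    ScalarValue::try_from_array(array, row)
}

fn main() -> Result<()> {
    let col: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), None, Some(3)]));
    let mask = BooleanArray::from(vec![true, true, false]);

    assert_eq!(to_scalar(&col, Some(&mask), 0)?, ScalarValue::Int32(Some(1)));
    assert_eq!(to_scalar(&col, Some(&mask), 1)?, ScalarValue::Null); // null value
    assert_eq!(to_scalar(&col, Some(&mask), 2)?, ScalarValue::Null); // filtered out
    Ok(())
}
```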
2 changes: 1 addition & 1 deletion datafusion/core/src/physical_plan/analyze.rs
@@ -77,7 +77,7 @@ impl ExecutionPlan for AnalyzeExec {
}

/// Specifies whether this plan generates an infinite stream of records.
/// If the plan does not support pipelining, but it its input(s) are
/// If the plan does not support pipelining, but its input(s) are
/// If the plan does not support pipelining, but its input(s) are
/// infinite, returns an error to indicate this.
fn unbounded_output(&self, children: &[bool]) -> Result<bool> {
if children[0] {
2 changes: 1 addition & 1 deletion datafusion/core/src/physical_plan/coalesce_batches.rs
@@ -96,7 +96,7 @@ impl ExecutionPlan for CoalesceBatchesExec {
}

/// Specifies whether this plan generates an infinite stream of records.
/// If the plan does not support pipelining, but it its input(s) are
/// If the plan does not support pipelining, but its input(s) are
/// If the plan does not support pipelining, but its input(s) are
/// infinite, returns an error to indicate this.
fn unbounded_output(&self, children: &[bool]) -> Result<bool> {
Ok(children[0])
2 changes: 1 addition & 1 deletion datafusion/core/src/physical_plan/coalesce_partitions.rs
@@ -81,7 +81,7 @@ impl ExecutionPlan for CoalescePartitionsExec {
}

/// Specifies whether this plan generates an infinite stream of records.
/// If the plan does not support pipelining, but it its input(s) are
/// If the plan does not support pipelining, but its input(s) are
/// If the plan does not support pipelining, but its input(s) are
/// infinite, returns an error to indicate this.
fn unbounded_output(&self, children: &[bool]) -> Result<bool> {
Ok(children[0])
2 changes: 1 addition & 1 deletion datafusion/core/src/physical_plan/filter.rs
@@ -107,7 +107,7 @@ impl ExecutionPlan for FilterExec {
}

/// Specifies whether this plan generates an infinite stream of records.
/// If the plan does not support pipelining, but it its input(s) are
/// If the plan does not support pipelining, but its input(s) are
/// If the plan does not support pipelining, but its input(s) are
/// infinite, returns an error to indicate this.
fn unbounded_output(&self, children: &[bool]) -> Result<bool> {
Ok(children[0])
2 changes: 1 addition & 1 deletion datafusion/core/src/physical_plan/joins/cross_join.rs
@@ -160,7 +160,7 @@ impl ExecutionPlan for CrossJoinExec {
}

/// Specifies whether this plan generates an infinite stream of records.
/// If the plan does not support pipelining, but it its input(s) are
/// If the plan does not support pipelining, but its input(s) are
/// If the plan does not support pipelining, but its input(s) are
/// infinite, returns an error to indicate this.
fn unbounded_output(&self, children: &[bool]) -> Result<bool> {
if children[0] || children[1] {
2 changes: 1 addition & 1 deletion datafusion/core/src/physical_plan/joins/hash_join.rs
@@ -247,7 +247,7 @@ impl ExecutionPlan for HashJoinExec {
}

/// Specifies whether this plan generates an infinite stream of records.
/// If the plan does not support pipelining, but it its input(s) are
/// If the plan does not support pipelining, but its input(s) are
/// If the plan does not support pipelining, but its input(s) are
/// infinite, returns an error to indicate this.
fn unbounded_output(&self, children: &[bool]) -> Result<bool> {
let (left, right) = (children[0], children[1]);
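The per-operator hunks above all touch the same `unbounded_output` doc comment. As a rough, hypothetical sketch of the contract that comment describes (the function names below are stand-ins, not DataFusion APIs): operators that can stream simply propagate whether their input is unbounded, while operators that must consume their entire input reject unbounded children with an error.

```rust
// Illustrative sketch of the unbounded_output convention; these free
// functions are made-up stand-ins, not DataFusion types or methods.
fn filter_like_unbounded_output(children: &[bool]) -> Result<bool, String> {
    // A streaming operator (e.g. a filter) emits output incrementally, so
    // its output is unbounded exactly when its single input is unbounded.
    Ok(children[0])
}

fn analyze_like_unbounded_output(children: &[bool]) -> Result<bool, String> {
    // An operator that must consume its whole input before producing output
    // can never finish on an unbounded input, so it reports an error.
    if children[0] {
        Err("operator does not support unbounded (infinite) inputs".to_string())
    } else {
        Ok(false)
    }
}

fn main() {
    assert_eq!(filter_like_unbounded_output(&[true]), Ok(true));
    assert!(analyze_like_unbounded_output(&[true]).is_err());
}
```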