GroupedHashAggregateStream breaks spill batch into smaller chunks to decrease memory required for merging.
milenkovicm committed Oct 31, 2023
1 parent d8e413c commit b814058
Showing 1 changed file with 12 additions and 1 deletion.
13 changes: 12 additions & 1 deletion datafusion/physical-plan/src/aggregates/row_hash.rs
@@ -673,7 +673,18 @@ impl GroupedHashAggregateStream
         let spillfile = self.runtime.disk_manager.create_tmp_file("HashAggSpill")?;
         let mut writer = IPCWriter::new(spillfile.path(), &emit.schema())?;
         // TODO: slice large `sorted` and write to multiple files in parallel
-        writer.write(&sorted)?;
+        let mut offset = 0;
+        let total_rows = sorted.num_rows();
+
+        while offset < total_rows {
+            // TODO: we could consider smaller batch size as there may be hundreds of batches
+            // loaded at the same time.
+            let length = std::cmp::min(total_rows - offset, self.batch_size);
+            let batch = sorted.slice(offset, length);
+            offset += batch.num_rows();
+            writer.write(&batch)?;
+        }
+
         writer.finish()?;
         self.spill_state.spills.push(spillfile);
         Ok(())
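For context, here is a minimal, self-contained sketch of the same slice-and-write pattern using plain arrow-rs rather than DataFusion's IPCWriter wrapper. The schema, sample data, output path, and batch_size value are illustrative assumptions, not part of the commit.

// A standalone sketch of the chunked spill write, assuming only arrow-rs
// (no DataFusion). Schema, data, path, and batch_size are made up for
// illustration.
use std::fs::File;
use std::sync::Arc;

use arrow::array::Int64Array;
use arrow::datatypes::{DataType, Field, Schema};
use arrow::error::Result;
use arrow::ipc::writer::FileWriter;
use arrow::record_batch::RecordBatch;

fn main() -> Result<()> {
    // Stand-in for the large sorted batch that would otherwise be spilled whole.
    let schema = Arc::new(Schema::new(vec![Field::new("v", DataType::Int64, false)]));
    let sorted = RecordBatch::try_new(
        schema.clone(),
        vec![Arc::new(Int64Array::from((0..10i64).collect::<Vec<i64>>()))],
    )?;

    let batch_size = 3; // assumed; DataFusion takes this from its session config
    let file = File::create("/tmp/hash_agg_spill.arrow")?;
    let mut writer = FileWriter::try_new(file, &schema)?;

    // Write `batch_size`-row chunks instead of one large IPC record batch.
    // `RecordBatch::slice` is zero-copy, so slicing itself costs almost nothing.
    let total_rows = sorted.num_rows();
    let mut offset = 0;
    while offset < total_rows {
        let length = std::cmp::min(total_rows - offset, batch_size);
        let chunk = sorted.slice(offset, length);
        offset += chunk.num_rows();
        writer.write(&chunk)?;
    }
    writer.finish()?;
    Ok(())
}

The payoff is on the read side: an Arrow IPC file is read back one record batch at a time, so a streaming merge over several spill files only needs to hold a batch_size-sized batch per file in memory rather than each file's entire spilled output, which is roughly the "memory required for merging" the commit message refers to.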
