-
Notifications
You must be signed in to change notification settings - Fork 1.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Optimize row hash #6065
Optimize row hash #6065
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -25,6 +25,7 @@ use std::vec; | |
|
||
use ahash::RandomState; | ||
use arrow::row::{OwnedRow, RowConverter, SortField}; | ||
use arrow_array::NullArray; | ||
use datafusion_physical_expr::hash_utils::create_hashes; | ||
use futures::ready; | ||
use futures::stream::{Stream, StreamExt}; | ||
|
@@ -836,24 +837,24 @@ fn slice_and_maybe_filter( | |
filter_opt: Option<&Arc<dyn Array>>, | ||
offsets: &[usize], | ||
) -> Result<Vec<ArrayRef>> { | ||
let sliced_arrays: Vec<ArrayRef> = aggr_array | ||
.iter() | ||
.map(|array| array.slice(offsets[0], offsets[1] - offsets[0])) | ||
.collect(); | ||
let null_array = Arc::new(NullArray::new(0)) as ArrayRef; | ||
let mut sliced_arrays: Vec<ArrayRef> = vec![null_array; aggr_array.len()]; | ||
|
||
let filtered_arrays = match filter_opt.as_ref() { | ||
Some(f) => { | ||
let sliced = f.slice(offsets[0], offsets[1] - offsets[0]); | ||
let filter_array = as_boolean_array(&sliced)?; | ||
if let Some(f) = filter_opt { | ||
let sliced = f.slice(offsets[0], offsets[1] - offsets[0]); | ||
let filter_array = as_boolean_array(&sliced)?; | ||
|
||
sliced_arrays | ||
.iter() | ||
.map(|array| filter(array, filter_array).unwrap()) | ||
.collect::<Vec<ArrayRef>>() | ||
for (i, arr) in aggr_array.iter().enumerate() { | ||
let sliced = &arr.slice(offsets[0], offsets[1] - offsets[0]); | ||
sliced_arrays[i] = filter(sliced, filter_array).unwrap(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Actually, I don't think it's a good idea to do more than one thing in a loop, especially the |
||
} | ||
} else { | ||
for (i, arr) in aggr_array.iter().enumerate() { | ||
sliced_arrays[i] = arr.slice(offsets[0], offsets[1] - offsets[0]); | ||
} | ||
None => sliced_arrays, | ||
}; | ||
Ok(filtered_arrays) | ||
} | ||
Comment on lines
+843
to
+855
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think writing these loops as a zip of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. With |
||
|
||
Ok(sliced_arrays) | ||
} | ||
|
||
/// This method is similar to Scalar::try_from_array except for the Null handling. | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't see why this should be faster? 🤔
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The key point of this PR is to get rid of extra allocations (see #5969 (comment)), which allows a 20-25% speed gain.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Another optimization is that we traverse the input collection only once, instead of twice as in the original implementation.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
See #5969 (comment) -- reported performance improvement