-
Notifications
You must be signed in to change notification settings - Fork 784
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Speed up the substring
kernel by about 2x
#1512
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,56 +24,61 @@ use crate::{ | |
error::{ArrowError, Result}, | ||
}; | ||
|
||
#[allow(clippy::unnecessary_wraps)] | ||
fn generic_substring<OffsetSize: StringOffsetSizeTrait>( | ||
array: &GenericStringArray<OffsetSize>, | ||
start: OffsetSize, | ||
length: &Option<OffsetSize>, | ||
) -> Result<ArrayRef> { | ||
// compute current offsets | ||
let offsets = array.data_ref().clone().buffers()[0].clone(); | ||
let offsets: &[OffsetSize] = unsafe { offsets.typed_data::<OffsetSize>() }; | ||
|
||
// compute null bitmap (copy) | ||
let offsets = array.value_offsets(); | ||
let null_bit_buffer = array.data_ref().null_buffer().cloned(); | ||
|
||
// compute values | ||
let values = &array.data_ref().buffers()[1]; | ||
let values = array.value_data(); | ||
let data = values.as_slice(); | ||
let zero = OffsetSize::zero(); | ||
|
||
let mut new_values = MutableBuffer::new(0); // we have no way to estimate how much this will be. | ||
let mut new_offsets: Vec<OffsetSize> = Vec::with_capacity(array.len() + 1); | ||
|
||
let mut length_so_far = OffsetSize::zero(); | ||
new_offsets.push(length_so_far); | ||
(0..array.len()).for_each(|i| { | ||
// the length of this entry | ||
let length_i: OffsetSize = offsets[i + 1] - offsets[i]; | ||
// compute where we should start slicing this entry | ||
let start = offsets[i] | ||
+ if start >= OffsetSize::zero() { | ||
start | ||
} else { | ||
length_i + start | ||
}; | ||
let cal_new_start: Box<dyn Fn(OffsetSize, OffsetSize) -> OffsetSize> = if start | ||
>= zero | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
{ | ||
// count from the start of string | ||
Box::new(|old_start: OffsetSize, end: OffsetSize| (old_start + start).min(end)) | ||
} else { | ||
// count from the end of string | ||
Box::new(|old_start: OffsetSize, end: OffsetSize| (end + start).max(old_start)) | ||
}; | ||
|
||
let start = start.max(offsets[i]).min(offsets[i + 1]); | ||
// compute the length of the slice | ||
let length: OffsetSize = length | ||
.unwrap_or(length_i) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same reason! we don't need to |
||
// .max(0) is not needed as it is guaranteed | ||
.min(offsets[i + 1] - start); // so we do not go beyond this entry | ||
let cal_new_length: Box<dyn Fn(OffsetSize, OffsetSize) -> OffsetSize> = | ||
if let Some(length) = length { | ||
Box::new(|start: OffsetSize, end: OffsetSize| (*length).min(end - start)) | ||
} else { | ||
Box::new(|start: OffsetSize, end: OffsetSize| end - start) | ||
}; | ||
|
||
length_so_far += length; | ||
// start and end offsets for each substring | ||
let mut new_starts_ends: Vec<(OffsetSize, OffsetSize)> = | ||
Vec::with_capacity(array.len()); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If we don't materialize There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It is a time-space trade-off, but not the major factor affecting performance |
||
let mut new_offsets: Vec<OffsetSize> = Vec::with_capacity(array.len() + 1); | ||
let mut len_so_far = zero; | ||
new_offsets.push(zero); | ||
|
||
new_offsets.push(length_so_far); | ||
offsets.windows(2).for_each(|pair| { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Purely functional style is not appropriate because |
||
let new_start = cal_new_start(pair[0], pair[1]); | ||
let new_length = cal_new_length(new_start, pair[1]); | ||
len_so_far += new_length; | ||
new_starts_ends.push((new_start, new_start + new_length)); | ||
new_offsets.push(len_so_far); | ||
}); | ||
|
||
// we need usize for ranges | ||
let start = start.to_usize().unwrap(); | ||
let length = length.to_usize().unwrap(); | ||
// concatenate substrings into a buffer | ||
let mut new_values = | ||
MutableBuffer::new(new_offsets.last().unwrap().to_usize().unwrap()); | ||
|
||
new_values.extend_from_slice(&data[start..start + length]); | ||
}); | ||
new_starts_ends | ||
.iter() | ||
.map(|(start, end)| { | ||
let start = start.to_usize().unwrap(); | ||
let end = end.to_usize().unwrap(); | ||
&data[start..end] | ||
}) | ||
.for_each(|slice| new_values.extend_from_slice(slice)); | ||
|
||
let data = unsafe { | ||
ArrayData::new_unchecked( | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We don't need to compare
start
withzero
multiple time. So I move this outside the for loop