Skip to content

Commit

Permalink
Speed up the substring kernel by about 2x (#1512)
Browse files Browse the repository at this point in the history
* speed up substring

Signed-off-by: remzi <[email protected]>

* add comments

Signed-off-by: remzi <[email protected]>

* reformat code

Signed-off-by: remzi <[email protected]>

* use trait opject to simplify the code

Signed-off-by: remzi <[email protected]>

* reformat code

Signed-off-by: remzi <[email protected]>

* fmt code

Signed-off-by: remzi <[email protected]>
  • Loading branch information
HaoYang670 authored Apr 7, 2022
1 parent c7c7af6 commit 497521f
Showing 1 changed file with 42 additions and 37 deletions.
79 changes: 42 additions & 37 deletions arrow/src/compute/kernels/substring.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,56 +24,61 @@ use crate::{
error::{ArrowError, Result},
};

#[allow(clippy::unnecessary_wraps)]
fn generic_substring<OffsetSize: StringOffsetSizeTrait>(
array: &GenericStringArray<OffsetSize>,
start: OffsetSize,
length: &Option<OffsetSize>,
) -> Result<ArrayRef> {
// compute current offsets
let offsets = array.data_ref().clone().buffers()[0].clone();
let offsets: &[OffsetSize] = unsafe { offsets.typed_data::<OffsetSize>() };

// compute null bitmap (copy)
let offsets = array.value_offsets();
let null_bit_buffer = array.data_ref().null_buffer().cloned();

// compute values
let values = &array.data_ref().buffers()[1];
let values = array.value_data();
let data = values.as_slice();
let zero = OffsetSize::zero();

let mut new_values = MutableBuffer::new(0); // we have no way to estimate how much this will be.
let mut new_offsets: Vec<OffsetSize> = Vec::with_capacity(array.len() + 1);

let mut length_so_far = OffsetSize::zero();
new_offsets.push(length_so_far);
(0..array.len()).for_each(|i| {
// the length of this entry
let length_i: OffsetSize = offsets[i + 1] - offsets[i];
// compute where we should start slicing this entry
let start = offsets[i]
+ if start >= OffsetSize::zero() {
start
} else {
length_i + start
};
let cal_new_start: Box<dyn Fn(OffsetSize, OffsetSize) -> OffsetSize> = if start
>= zero
{
// count from the start of string
Box::new(|old_start: OffsetSize, end: OffsetSize| (old_start + start).min(end))
} else {
// count from the end of string
Box::new(|old_start: OffsetSize, end: OffsetSize| (end + start).max(old_start))
};

let start = start.max(offsets[i]).min(offsets[i + 1]);
// compute the length of the slice
let length: OffsetSize = length
.unwrap_or(length_i)
// .max(0) is not needed as it is guaranteed
.min(offsets[i + 1] - start); // so we do not go beyond this entry
let cal_new_length: Box<dyn Fn(OffsetSize, OffsetSize) -> OffsetSize> =
if let Some(length) = length {
Box::new(|start: OffsetSize, end: OffsetSize| (*length).min(end - start))
} else {
Box::new(|start: OffsetSize, end: OffsetSize| end - start)
};

length_so_far += length;
// start and end offsets for each substring
let mut new_starts_ends: Vec<(OffsetSize, OffsetSize)> =
Vec::with_capacity(array.len());
let mut new_offsets: Vec<OffsetSize> = Vec::with_capacity(array.len() + 1);
let mut len_so_far = zero;
new_offsets.push(zero);

new_offsets.push(length_so_far);
offsets.windows(2).for_each(|pair| {
let new_start = cal_new_start(pair[0], pair[1]);
let new_length = cal_new_length(new_start, pair[1]);
len_so_far += new_length;
new_starts_ends.push((new_start, new_start + new_length));
new_offsets.push(len_so_far);
});

// we need usize for ranges
let start = start.to_usize().unwrap();
let length = length.to_usize().unwrap();
// concatenate substrings into a buffer
let mut new_values =
MutableBuffer::new(new_offsets.last().unwrap().to_usize().unwrap());

new_values.extend_from_slice(&data[start..start + length]);
});
new_starts_ends
.iter()
.map(|(start, end)| {
let start = start.to_usize().unwrap();
let end = end.to_usize().unwrap();
&data[start..end]
})
.for_each(|slice| new_values.extend_from_slice(slice));

let data = unsafe {
ArrayData::new_unchecked(
Expand Down

0 comments on commit 497521f

Please sign in to comment.