diff --git a/arrow/src/compute/kernels/substring.rs b/arrow/src/compute/kernels/substring.rs index 9b468f5d423c..1be829f72719 100644 --- a/arrow/src/compute/kernels/substring.rs +++ b/arrow/src/compute/kernels/substring.rs @@ -24,56 +24,61 @@ use crate::{ error::{ArrowError, Result}, }; -#[allow(clippy::unnecessary_wraps)] fn generic_substring( array: &GenericStringArray, start: OffsetSize, length: &Option, ) -> Result { - // compute current offsets - let offsets = array.data_ref().clone().buffers()[0].clone(); - let offsets: &[OffsetSize] = unsafe { offsets.typed_data::() }; - - // compute null bitmap (copy) + let offsets = array.value_offsets(); let null_bit_buffer = array.data_ref().null_buffer().cloned(); - - // compute values - let values = &array.data_ref().buffers()[1]; + let values = array.value_data(); let data = values.as_slice(); + let zero = OffsetSize::zero(); - let mut new_values = MutableBuffer::new(0); // we have no way to estimate how much this will be. - let mut new_offsets: Vec = Vec::with_capacity(array.len() + 1); - - let mut length_so_far = OffsetSize::zero(); - new_offsets.push(length_so_far); - (0..array.len()).for_each(|i| { - // the length of this entry - let length_i: OffsetSize = offsets[i + 1] - offsets[i]; - // compute where we should start slicing this entry - let start = offsets[i] - + if start >= OffsetSize::zero() { - start - } else { - length_i + start - }; + let cal_new_start: Box OffsetSize> = if start + >= zero + { + // count from the start of string + Box::new(|old_start: OffsetSize, end: OffsetSize| (old_start + start).min(end)) + } else { + // count from the end of string + Box::new(|old_start: OffsetSize, end: OffsetSize| (end + start).max(old_start)) + }; - let start = start.max(offsets[i]).min(offsets[i + 1]); - // compute the length of the slice - let length: OffsetSize = length - .unwrap_or(length_i) - // .max(0) is not needed as it is guaranteed - .min(offsets[i + 1] - start); // so we do not go beyond this entry + let cal_new_length: Box OffsetSize> = + if let Some(length) = length { + Box::new(|start: OffsetSize, end: OffsetSize| (*length).min(end - start)) + } else { + Box::new(|start: OffsetSize, end: OffsetSize| end - start) + }; - length_so_far += length; + // start and end offsets for each substring + let mut new_starts_ends: Vec<(OffsetSize, OffsetSize)> = + Vec::with_capacity(array.len()); + let mut new_offsets: Vec = Vec::with_capacity(array.len() + 1); + let mut len_so_far = zero; + new_offsets.push(zero); - new_offsets.push(length_so_far); + offsets.windows(2).for_each(|pair| { + let new_start = cal_new_start(pair[0], pair[1]); + let new_length = cal_new_length(new_start, pair[1]); + len_so_far += new_length; + new_starts_ends.push((new_start, new_start + new_length)); + new_offsets.push(len_so_far); + }); - // we need usize for ranges - let start = start.to_usize().unwrap(); - let length = length.to_usize().unwrap(); + // concatenate substrings into a buffer + let mut new_values = + MutableBuffer::new(new_offsets.last().unwrap().to_usize().unwrap()); - new_values.extend_from_slice(&data[start..start + length]); - }); + new_starts_ends + .iter() + .map(|(start, end)| { + let start = start.to_usize().unwrap(); + let end = end.to_usize().unwrap(); + &data[start..end] + }) + .for_each(|slice| new_values.extend_from_slice(slice)); let data = unsafe { ArrayData::new_unchecked(