Skip to content

Commit

Permalink
Optimize regexp_replace when the input is a sparse array (#3804)
Browse files Browse the repository at this point in the history
* Optimize `regexp_replace` when the input is a sparse array (by reusing null buffers)

* Add a test regarding the slicing behavior
  • Loading branch information
isidentical authored Oct 12, 2022
1 parent 37fe938 commit d5c361b
Showing 1 changed file with 97 additions and 8 deletions.
105 changes: 97 additions & 8 deletions datafusion/physical-expr/src/regex_expressions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@
//! Regex expressions
use arrow::array::{
new_null_array, Array, ArrayRef, GenericStringArray, OffsetSizeTrait,
new_null_array, Array, ArrayData, ArrayRef, BufferBuilder, GenericStringArray,
OffsetSizeTrait,
};
use arrow::compute;
use datafusion_common::{DataFusionError, Result};
Expand Down Expand Up @@ -254,13 +255,38 @@ fn _regexp_replace_static_pattern_replace<T: OffsetSizeTrait>(
// with rust ones.
let replacement = regex_replace_posix_groups(replacement);

let result = string_array
.iter()
.map(|string| {
string.map(|string| re.replacen(string, limit, replacement.as_str()))
})
.collect::<GenericStringArray<T>>();
Ok(Arc::new(result) as ArrayRef)
// We are going to create the underlying string buffer from its parts
// to be able to re-use the existing null buffer for sparse arrays.
let mut vals = BufferBuilder::<u8>::new({
let offsets = string_array.value_offsets();
(offsets[string_array.len()] - offsets[0])
.to_usize()
.unwrap()
});
let mut new_offsets = BufferBuilder::<T>::new(string_array.len() + 1);
new_offsets.append(T::zero());

string_array.iter().for_each(|val| {
if let Some(val) = val {
let result = re.replacen(val, limit, replacement.as_str());
vals.append_slice(result.as_bytes());
}
new_offsets.append(T::from_usize(vals.len()).unwrap());
});

let data = ArrayData::try_new(
GenericStringArray::<T>::DATA_TYPE,
string_array.len(),
string_array
.data_ref()
.null_buffer()
.map(|b| b.bit_slice(string_array.offset(), string_array.len())),
0,
vec![new_offsets.finish(), vals.finish()],
vec![],
)?;
let result_array = GenericStringArray::<T>::from(data);
Ok(Arc::new(result_array) as ArrayRef)
}

/// Determine which implementation of the regexp_replace to use based
Expand Down Expand Up @@ -513,4 +539,67 @@ mod tests {
}
}
}

#[test]
fn test_static_pattern_regexp_replace_with_null_buffers() {
let values = StringArray::from(vec![
Some("a"),
None,
Some("b"),
None,
Some("a"),
None,
None,
Some("c"),
]);
let patterns = StringArray::from(vec!["a"; 1]);
let replacements = StringArray::from(vec!["foo"; 1]);
let expected = StringArray::from(vec![
Some("foo"),
None,
Some("b"),
None,
Some("foo"),
None,
None,
Some("c"),
]);

let re = _regexp_replace_static_pattern_replace::<i32>(&[
Arc::new(values),
Arc::new(patterns),
Arc::new(replacements),
])
.unwrap();

assert_eq!(re.as_ref(), &expected);
assert_eq!(re.null_count(), 4);
}

#[test]
fn test_static_pattern_regexp_replace_with_sliced_null_buffer() {
let values = StringArray::from(vec![
Some("a"),
None,
Some("b"),
None,
Some("a"),
None,
None,
Some("c"),
]);
let values = values.slice(2, 5);
let patterns = StringArray::from(vec!["a"; 1]);
let replacements = StringArray::from(vec!["foo"; 1]);
let expected = StringArray::from(vec![Some("b"), None, Some("foo"), None, None]);

let re = _regexp_replace_static_pattern_replace::<i32>(&[
Arc::new(values),
Arc::new(patterns),
Arc::new(replacements),
])
.unwrap();
assert_eq!(re.as_ref(), &expected);
assert_eq!(re.null_count(), 3);
}
}

0 comments on commit d5c361b

Please sign in to comment.