From d5c361b63c31b57d0052c501a94c1b1f7847b402 Mon Sep 17 00:00:00 2001 From: Batuhan Taskaya Date: Wed, 12 Oct 2022 23:17:36 +0300 Subject: [PATCH] Optimize `regexp_replace` when the input is a sparse array (#3804) * Optimize `regexp_replace` when the input is a sparse array (by reusing null buffers) * Add a test regarding the slicing behavior --- .../physical-expr/src/regex_expressions.rs | 105 ++++++++++++++++-- 1 file changed, 97 insertions(+), 8 deletions(-) diff --git a/datafusion/physical-expr/src/regex_expressions.rs b/datafusion/physical-expr/src/regex_expressions.rs index d7edc3400601..b76bb2c45457 100644 --- a/datafusion/physical-expr/src/regex_expressions.rs +++ b/datafusion/physical-expr/src/regex_expressions.rs @@ -22,7 +22,8 @@ //! Regex expressions use arrow::array::{ - new_null_array, Array, ArrayRef, GenericStringArray, OffsetSizeTrait, + new_null_array, Array, ArrayData, ArrayRef, BufferBuilder, GenericStringArray, + OffsetSizeTrait, }; use arrow::compute; use datafusion_common::{DataFusionError, Result}; @@ -254,13 +255,38 @@ fn _regexp_replace_static_pattern_replace( // with rust ones. let replacement = regex_replace_posix_groups(replacement); - let result = string_array - .iter() - .map(|string| { - string.map(|string| re.replacen(string, limit, replacement.as_str())) - }) - .collect::>(); - Ok(Arc::new(result) as ArrayRef) + // We are going to create the underlying string buffer from its parts + // to be able to re-use the existing null buffer for sparse arrays. + let mut vals = BufferBuilder::::new({ + let offsets = string_array.value_offsets(); + (offsets[string_array.len()] - offsets[0]) + .to_usize() + .unwrap() + }); + let mut new_offsets = BufferBuilder::::new(string_array.len() + 1); + new_offsets.append(T::zero()); + + string_array.iter().for_each(|val| { + if let Some(val) = val { + let result = re.replacen(val, limit, replacement.as_str()); + vals.append_slice(result.as_bytes()); + } + new_offsets.append(T::from_usize(vals.len()).unwrap()); + }); + + let data = ArrayData::try_new( + GenericStringArray::::DATA_TYPE, + string_array.len(), + string_array + .data_ref() + .null_buffer() + .map(|b| b.bit_slice(string_array.offset(), string_array.len())), + 0, + vec![new_offsets.finish(), vals.finish()], + vec![], + )?; + let result_array = GenericStringArray::::from(data); + Ok(Arc::new(result_array) as ArrayRef) } /// Determine which implementation of the regexp_replace to use based @@ -513,4 +539,67 @@ mod tests { } } } + + #[test] + fn test_static_pattern_regexp_replace_with_null_buffers() { + let values = StringArray::from(vec![ + Some("a"), + None, + Some("b"), + None, + Some("a"), + None, + None, + Some("c"), + ]); + let patterns = StringArray::from(vec!["a"; 1]); + let replacements = StringArray::from(vec!["foo"; 1]); + let expected = StringArray::from(vec![ + Some("foo"), + None, + Some("b"), + None, + Some("foo"), + None, + None, + Some("c"), + ]); + + let re = _regexp_replace_static_pattern_replace::(&[ + Arc::new(values), + Arc::new(patterns), + Arc::new(replacements), + ]) + .unwrap(); + + assert_eq!(re.as_ref(), &expected); + assert_eq!(re.null_count(), 4); + } + + #[test] + fn test_static_pattern_regexp_replace_with_sliced_null_buffer() { + let values = StringArray::from(vec![ + Some("a"), + None, + Some("b"), + None, + Some("a"), + None, + None, + Some("c"), + ]); + let values = values.slice(2, 5); + let patterns = StringArray::from(vec!["a"; 1]); + let replacements = StringArray::from(vec!["foo"; 1]); + let expected = StringArray::from(vec![Some("b"), None, Some("foo"), None, None]); + + let re = _regexp_replace_static_pattern_replace::(&[ + Arc::new(values), + Arc::new(patterns), + Arc::new(replacements), + ]) + .unwrap(); + assert_eq!(re.as_ref(), &expected); + assert_eq!(re.null_count(), 3); + } }