From d3e42a533cd7957507774faeacb6141b12ffb034 Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Thu, 31 Oct 2024 16:23:37 +0100 Subject: [PATCH 1/5] Fix string view LIKE checks with NULL values --- arrow-string/src/like.rs | 119 +++++++++++++++++++++++++++++++++- arrow-string/src/predicate.rs | 29 +++++++-- 2 files changed, 141 insertions(+), 7 deletions(-) diff --git a/arrow-string/src/like.rs b/arrow-string/src/like.rs index 5c76d5c810a7..fee3b792a946 100644 --- a/arrow-string/src/like.rs +++ b/arrow-string/src/like.rs @@ -1708,7 +1708,93 @@ mod tests { } #[test] - fn like_scalar_null() { + fn string_null_like_pattern() { + // Different patterns have different execution code paths + for pattern in &[ + "", // can execute as equality check + "_", // can execute as length check + "%", // can execute as starts_with("") or non-null check + "a%", // can execute as starts_with("a") + "%a", // can execute as ends_with("") + "a%b", // can execute as starts_with("a") && ends_with("b") + "%a%", // can_execute as contains("a") + "%a%b_c_d%e", // can_execute as regular expression + ] { + let a = Scalar::new(StringArray::new_null(1)); + let b = StringArray::new_scalar(pattern); + let r = like(&a, &b).unwrap(); + assert_eq!(r.len(), 1, "With pattern {pattern}"); + assert_eq!(r.null_count(), 1, "With pattern {pattern}"); + assert!(r.is_null(0), "With pattern {pattern}"); + + let a = Scalar::new(StringArray::new_null(1)); + let b = StringArray::from_iter_values([pattern]); + let r = like(&a, &b).unwrap(); + assert_eq!(r.len(), 1, "With pattern {pattern}"); + assert_eq!(r.null_count(), 1, "With pattern {pattern}"); + assert!(r.is_null(0), "With pattern {pattern}"); + + let a = StringArray::new_null(1); + let b = StringArray::from_iter_values([pattern]); + let r = like(&a, &b).unwrap(); + assert_eq!(r.len(), 1, "With pattern {pattern}"); + assert_eq!(r.null_count(), 1, "With pattern {pattern}"); + assert!(r.is_null(0), "With pattern {pattern}"); + + let a = StringArray::new_null(1); + let b = StringArray::new_scalar(pattern); + let r = like(&a, &b).unwrap(); + assert_eq!(r.len(), 1, "With pattern {pattern}"); + assert_eq!(r.null_count(), 1, "With pattern {pattern}"); + assert!(r.is_null(0), "With pattern {pattern}"); + } + } + + #[test] + fn string_view_null_like_pattern() { + // Different patterns have different execution code paths + for pattern in &[ + "", // can execute as equality check + "_", // can execute as length check + "%", // can execute as starts_with("") or non-null check + "a%", // can execute as starts_with("a") + "%a", // can execute as ends_with("") + "a%b", // can execute as starts_with("a") && ends_with("b") + "%a%", // can_execute as contains("a") + "%a%b_c_d%e", // can_execute as regular expression + ] { + let a = Scalar::new(StringViewArray::new_null(1)); + let b = StringViewArray::new_scalar(pattern); + let r = like(&a, &b).unwrap(); + assert_eq!(r.len(), 1, "With pattern {pattern}"); + assert_eq!(r.null_count(), 1, "With pattern {pattern}"); + assert!(r.is_null(0), "With pattern {pattern}"); + + let a = Scalar::new(StringViewArray::new_null(1)); + let b = StringViewArray::from_iter_values([pattern]); + let r = like(&a, &b).unwrap(); + assert_eq!(r.len(), 1, "With pattern {pattern}"); + assert_eq!(r.null_count(), 1, "With pattern {pattern}"); + assert!(r.is_null(0), "With pattern {pattern}"); + + let a = StringViewArray::new_null(1); + let b = StringViewArray::from_iter_values([pattern]); + let r = like(&a, &b).unwrap(); + assert_eq!(r.len(), 1, "With pattern {pattern}"); + assert_eq!(r.null_count(), 1, "With pattern {pattern}"); + assert!(r.is_null(0), "With pattern {pattern}"); + + let a = StringViewArray::new_null(1); + let b = StringViewArray::new_scalar(pattern); + let r = like(&a, &b).unwrap(); + assert_eq!(r.len(), 1, "With pattern {pattern}"); + assert_eq!(r.null_count(), 1, "With pattern {pattern}"); + assert!(r.is_null(0), "With pattern {pattern}"); + } + } + + #[test] + fn string_like_scalar_null() { let a = StringArray::new_scalar("a"); let b = Scalar::new(StringArray::new_null(1)); let r = like(&a, &b).unwrap(); @@ -1737,4 +1823,35 @@ mod tests { assert_eq!(r.null_count(), 1); assert!(r.is_null(0)); } + + #[test] + fn string_view_like_scalar_null() { + let a = StringViewArray::new_scalar("a"); + let b = Scalar::new(StringViewArray::new_null(1)); + let r = like(&a, &b).unwrap(); + assert_eq!(r.len(), 1); + assert_eq!(r.null_count(), 1); + assert!(r.is_null(0)); + + let a = StringViewArray::from_iter_values(["a"]); + let b = Scalar::new(StringViewArray::new_null(1)); + let r = like(&a, &b).unwrap(); + assert_eq!(r.len(), 1); + assert_eq!(r.null_count(), 1); + assert!(r.is_null(0)); + + let a = StringViewArray::from_iter_values(["a"]); + let b = StringViewArray::new_null(1); + let r = like(&a, &b).unwrap(); + assert_eq!(r.len(), 1); + assert_eq!(r.null_count(), 1); + assert!(r.is_null(0)); + + let a = StringViewArray::new_scalar("a"); + let b = StringViewArray::new_null(1); + let r = like(&a, &b).unwrap(); + assert_eq!(r.len(), 1); + assert_eq!(r.null_count(), 1); + assert!(r.is_null(0)); + } } diff --git a/arrow-string/src/predicate.rs b/arrow-string/src/predicate.rs index f559088e6c96..04c6f2de852c 100644 --- a/arrow-string/src/predicate.rs +++ b/arrow-string/src/predicate.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use arrow_array::{ArrayAccessor, BooleanArray, StringViewArray}; +use arrow_array::{Array, ArrayAccessor, BooleanArray, StringViewArray}; use arrow_schema::ArrowError; use memchr::memchr2; use memchr::memmem::Finder; @@ -116,10 +116,17 @@ impl<'a> Predicate<'a> { }), Predicate::Contains(finder) => { if let Some(string_view_array) = array.as_any().downcast_ref::() { + let nulls = string_view_array.logical_nulls(); BooleanArray::from( string_view_array .bytes_iter() - .map(|haystack| finder.find(haystack).is_some() != negate) + .enumerate() + .map(|(idx, haystack)| { + if nulls.as_ref().map(|n| n.is_null(idx)).unwrap_or_default() { + return None; + } + Some(finder.find(haystack).is_some() != negate) + }) .collect::>(), ) } else { @@ -130,11 +137,16 @@ impl<'a> Predicate<'a> { } Predicate::StartsWith(v) => { if let Some(string_view_array) = array.as_any().downcast_ref::() { + let nulls = string_view_array.logical_nulls(); BooleanArray::from( string_view_array .prefix_bytes_iter(v.len()) - .map(|haystack| { - equals_bytes(haystack, v.as_bytes(), equals_kernel) != negate + .enumerate() + .map(|(idx, haystack)| { + if nulls.as_ref().map(|n| n.is_null(idx)).unwrap_or_default() { + return None; + } + Some(equals_bytes(haystack, v.as_bytes(), equals_kernel) != negate) }) .collect::>(), ) @@ -166,11 +178,16 @@ impl<'a> Predicate<'a> { } Predicate::EndsWith(v) => { if let Some(string_view_array) = array.as_any().downcast_ref::() { + let nulls = string_view_array.logical_nulls(); BooleanArray::from( string_view_array .suffix_bytes_iter(v.len()) - .map(|haystack| { - equals_bytes(haystack, v.as_bytes(), equals_kernel) != negate + .enumerate() + .map(|(idx, haystack)| { + if nulls.as_ref().map(|n| n.is_null(idx)).unwrap_or_default() { + return None; + } + Some(equals_bytes(haystack, v.as_bytes(), equals_kernel) != negate) }) .collect::>(), ) From 65d52ff0dfbb8d84b393dc3a6e722eada7a4d0cd Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Fri, 1 Nov 2024 21:37:12 +0100 Subject: [PATCH 2/5] Add TODO comments to operations yet to be fixed --- arrow-string/src/predicate.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arrow-string/src/predicate.rs b/arrow-string/src/predicate.rs index 04c6f2de852c..2bff39786edb 100644 --- a/arrow-string/src/predicate.rs +++ b/arrow-string/src/predicate.rs @@ -158,6 +158,7 @@ impl<'a> Predicate<'a> { } Predicate::IStartsWithAscii(v) => { if let Some(string_view_array) = array.as_any().downcast_ref::() { + // TODO respect null buffer BooleanArray::from( string_view_array .prefix_bytes_iter(v.len()) @@ -199,6 +200,7 @@ impl<'a> Predicate<'a> { } Predicate::IEndsWithAscii(v) => { if let Some(string_view_array) = array.as_any().downcast_ref::() { + // TODO respect null buffer BooleanArray::from( string_view_array .suffix_bytes_iter(v.len()) From 1e82884f3d12f55260e007b0cb42795635d3b470 Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Fri, 1 Nov 2024 21:37:59 +0100 Subject: [PATCH 3/5] Use from_unary for contains with string view from_unary takes care of nulls --- arrow-string/src/predicate.rs | 24 +++--------------------- 1 file changed, 3 insertions(+), 21 deletions(-) diff --git a/arrow-string/src/predicate.rs b/arrow-string/src/predicate.rs index 2bff39786edb..15a5cee6c40e 100644 --- a/arrow-string/src/predicate.rs +++ b/arrow-string/src/predicate.rs @@ -114,27 +114,9 @@ impl<'a> Predicate<'a> { Predicate::IEqAscii(v) => BooleanArray::from_unary(array, |haystack| { haystack.eq_ignore_ascii_case(v) != negate }), - Predicate::Contains(finder) => { - if let Some(string_view_array) = array.as_any().downcast_ref::() { - let nulls = string_view_array.logical_nulls(); - BooleanArray::from( - string_view_array - .bytes_iter() - .enumerate() - .map(|(idx, haystack)| { - if nulls.as_ref().map(|n| n.is_null(idx)).unwrap_or_default() { - return None; - } - Some(finder.find(haystack).is_some() != negate) - }) - .collect::>(), - ) - } else { - BooleanArray::from_unary(array, |haystack| { - finder.find(haystack.as_bytes()).is_some() != negate - }) - } - } + Predicate::Contains(finder) => BooleanArray::from_unary(array, |haystack| { + finder.find(haystack.as_bytes()).is_some() != negate + }), Predicate::StartsWith(v) => { if let Some(string_view_array) = array.as_any().downcast_ref::() { let nulls = string_view_array.logical_nulls(); From bede692824680c3da75cb8ca657e71416dca8f43 Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Tue, 5 Nov 2024 22:53:43 +0100 Subject: [PATCH 4/5] Fix performance --- arrow-string/src/predicate.rs | 39 ++++++++--------------------------- 1 file changed, 9 insertions(+), 30 deletions(-) diff --git a/arrow-string/src/predicate.rs b/arrow-string/src/predicate.rs index 15a5cee6c40e..d79df3d581cb 100644 --- a/arrow-string/src/predicate.rs +++ b/arrow-string/src/predicate.rs @@ -16,6 +16,7 @@ // under the License. use arrow_array::{Array, ArrayAccessor, BooleanArray, StringViewArray}; +use arrow_buffer::BooleanBuffer; use arrow_schema::ArrowError; use memchr::memchr2; use memchr::memmem::Finder; @@ -119,19 +120,15 @@ impl<'a> Predicate<'a> { }), Predicate::StartsWith(v) => { if let Some(string_view_array) = array.as_any().downcast_ref::() { - let nulls = string_view_array.logical_nulls(); - BooleanArray::from( + let values = BooleanBuffer::from( string_view_array .prefix_bytes_iter(v.len()) - .enumerate() - .map(|(idx, haystack)| { - if nulls.as_ref().map(|n| n.is_null(idx)).unwrap_or_default() { - return None; - } - Some(equals_bytes(haystack, v.as_bytes(), equals_kernel) != negate) + .map(|haystack| { + equals_bytes(haystack, v.as_bytes(), equals_kernel) != negate }) .collect::>(), - ) + ); + BooleanArray::new(values, string_view_array.logical_nulls()) } else { BooleanArray::from_unary(array, |haystack| { starts_with(haystack, v, equals_kernel) != negate @@ -159,27 +156,9 @@ impl<'a> Predicate<'a> { }) } } - Predicate::EndsWith(v) => { - if let Some(string_view_array) = array.as_any().downcast_ref::() { - let nulls = string_view_array.logical_nulls(); - BooleanArray::from( - string_view_array - .suffix_bytes_iter(v.len()) - .enumerate() - .map(|(idx, haystack)| { - if nulls.as_ref().map(|n| n.is_null(idx)).unwrap_or_default() { - return None; - } - Some(equals_bytes(haystack, v.as_bytes(), equals_kernel) != negate) - }) - .collect::>(), - ) - } else { - BooleanArray::from_unary(array, |haystack| { - ends_with(haystack, v, equals_kernel) != negate - }) - } - } + Predicate::EndsWith(v) => BooleanArray::from_unary(array, |haystack| { + ends_with(haystack, v, equals_kernel) != negate + }), Predicate::IEndsWithAscii(v) => { if let Some(string_view_array) = array.as_any().downcast_ref::() { // TODO respect null buffer From 2394f92070e5c49f48ae753835abcec9667d2447 Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Tue, 5 Nov 2024 23:05:38 +0100 Subject: [PATCH 5/5] fixup! Fix performance --- arrow-string/src/predicate.rs | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/arrow-string/src/predicate.rs b/arrow-string/src/predicate.rs index d79df3d581cb..d88eeac64022 100644 --- a/arrow-string/src/predicate.rs +++ b/arrow-string/src/predicate.rs @@ -120,6 +120,7 @@ impl<'a> Predicate<'a> { }), Predicate::StartsWith(v) => { if let Some(string_view_array) = array.as_any().downcast_ref::() { + let nulls = string_view_array.logical_nulls(); let values = BooleanBuffer::from( string_view_array .prefix_bytes_iter(v.len()) @@ -128,7 +129,7 @@ impl<'a> Predicate<'a> { }) .collect::>(), ); - BooleanArray::new(values, string_view_array.logical_nulls()) + BooleanArray::new(values, nulls) } else { BooleanArray::from_unary(array, |haystack| { starts_with(haystack, v, equals_kernel) != negate @@ -156,9 +157,24 @@ impl<'a> Predicate<'a> { }) } } - Predicate::EndsWith(v) => BooleanArray::from_unary(array, |haystack| { - ends_with(haystack, v, equals_kernel) != negate - }), + Predicate::EndsWith(v) => { + if let Some(string_view_array) = array.as_any().downcast_ref::() { + let nulls = string_view_array.logical_nulls(); + let values = BooleanBuffer::from( + string_view_array + .suffix_bytes_iter(v.len()) + .map(|haystack| { + equals_bytes(haystack, v.as_bytes(), equals_kernel) != negate + }) + .collect::>(), + ); + BooleanArray::new(values, nulls) + } else { + BooleanArray::from_unary(array, |haystack| { + ends_with(haystack, v, equals_kernel) != negate + }) + } + } Predicate::IEndsWithAscii(v) => { if let Some(string_view_array) = array.as_any().downcast_ref::() { // TODO respect null buffer