From db22383d54df9f9d526275ff98b435f6e2848396 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 8 Jan 2022 07:14:33 -0500 Subject: [PATCH 1/4] Fix undefined behavor in GenericStringArray::from_iter_values --- arrow/src/array/array_string.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/arrow/src/array/array_string.rs b/arrow/src/array/array_string.rs index 82c9f16fe99d..24981bcfa187 100644 --- a/arrow/src/array/array_string.rs +++ b/arrow/src/array/array_string.rs @@ -187,8 +187,15 @@ impl GenericStringArray { offsets.push(length_so_far); values.extend_from_slice(s.as_bytes()); } + + // iterator size hint may not be correct so compute the actual number of offsets + let actual_len = match offsets.len() { + 0 => 0, + len => (len / std::mem::size_of::()) - 1 + }; + let array_data = ArrayData::builder(OffsetSize::DATA_TYPE) - .len(data_len) + .len(actual_len) .add_buffer(offsets.into()) .add_buffer(values.into()); let array_data = unsafe { array_data.build_unchecked() }; From 3097223caaf2a5c8b172e544a97dc71482fc6de6 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sun, 9 Jan 2022 10:16:31 -0500 Subject: [PATCH 2/4] Cleanup code and tests --- arrow/src/array/array_string.rs | 69 ++++++++++++++++++++++++++++++--- arrow/src/util/test_util.rs | 48 +++++++++++++++++++++++ 2 files changed, 112 insertions(+), 5 deletions(-) diff --git a/arrow/src/array/array_string.rs b/arrow/src/array/array_string.rs index 24981bcfa187..7461dcd67ae9 100644 --- a/arrow/src/array/array_string.rs +++ b/arrow/src/array/array_string.rs @@ -189,10 +189,8 @@ impl GenericStringArray { } // iterator size hint may not be correct so compute the actual number of offsets - let actual_len = match offsets.len() { - 0 => 0, - len => (len / std::mem::size_of::()) - 1 - }; + assert!(offsets.len() > 0); // wrote at least one + let actual_len = (offsets.len() / std::mem::size_of::()) - 1; let array_data = ArrayData::builder(OffsetSize::DATA_TYPE) .len(actual_len) @@ -381,7 +379,10 @@ impl From> for GenericStringArray< #[cfg(test)] mod tests { - use crate::array::{ListBuilder, StringBuilder}; + use crate::{ + array::{ListBuilder, StringBuilder}, + util::test_util::BadIterator, + }; use super::*; @@ -579,4 +580,62 @@ mod tests { .validate_full() .expect("All null array has valid array data"); } + + #[test] + fn bad_size_collect_string() { + let data = vec![Some("foo"), None, Some("bar")]; + let expected: StringArray = data.clone().into_iter().collect(); + + // Iterator reports too many items + let arr: StringArray = BadIterator::new(3, 10, data.clone()).collect(); + assert_eq!(expected, arr); + + // Iterator reports too few items + let arr: StringArray = BadIterator::new(3, 1, data.clone()).collect(); + assert_eq!(expected, arr); + } + + #[test] + fn bad_size_collect_large_string() { + let data = vec![Some("foo"), None, Some("bar")]; + let expected: LargeStringArray = data.clone().into_iter().collect(); + + // Iterator reports too many items + let arr: LargeStringArray = BadIterator::new(3, 10, data.clone()).collect(); + assert_eq!(expected, arr); + + // Iterator reports too few items + let arr: LargeStringArray = BadIterator::new(3, 1, data.clone()).collect(); + assert_eq!(expected, arr); + } + + #[test] + fn bad_size_iter_values_string() { + let data = vec!["foo", "bar", "baz"]; + let expected: StringArray = data.clone().into_iter().map(Some).collect(); + + // Iterator reports too many items + let arr = StringArray::from_iter_values(BadIterator::new(3, 10, data.clone())); + assert_eq!(expected, arr); + + // Iterator reports too few items + let arr = StringArray::from_iter_values(BadIterator::new(3, 1, data.clone())); + assert_eq!(expected, arr); + } + + #[test] + fn bad_size_iter_values_large_string() { + let data = vec!["foo", "bar", "baz"]; + let expected: LargeStringArray = data.clone().into_iter().map(Some).collect(); + + // Iterator reports too many items + let arr = + LargeStringArray::from_iter_values(BadIterator::new(3, 10, data.clone())); + assert_eq!(expected, arr); + + // Iterator reports too few items + let arr = + LargeStringArray::from_iter_values(BadIterator::new(3, 1, data.clone())); + assert_eq!(expected, arr); + } } diff --git a/arrow/src/util/test_util.rs b/arrow/src/util/test_util.rs index f02c8d09831d..6b1ad4be1d76 100644 --- a/arrow/src/util/test_util.rs +++ b/arrow/src/util/test_util.rs @@ -152,6 +152,54 @@ fn get_data_dir(udf_env: &str, submodule_data: &str) -> Result { + /// where the iterator currently is + cur: usize, + /// How many items will this iterator *actually* make + limit: usize, + /// How many items this iterator claims it will make + claimed: usize, + /// The items to return. If there are fewer items than `limit` + /// they will be repeated + pub items: Vec, +} + +impl BadIterator { + /// Create a new iterator for items, but that reports to + /// produce items. Must provide at least 1 item. + pub fn new(limit: usize, claimed: usize, items: Vec) -> Self { + assert!(!items.is_empty()); + Self { + cur: 0, + limit, + claimed, + items, + } + } +} + +impl Iterator for BadIterator { + type Item = T; + + fn next(&mut self) -> Option { + if self.cur < self.limit { + let next_item_idx = self.cur % self.items.len(); + let next_item = self.items[next_item_idx].clone(); + self.cur += 1; + Some(next_item) + } else { + None + } + } + + /// report whatever the iterator says to + fn size_hint(&self) -> (usize, Option) { + (0, Some(self.claimed as usize)) + } +} + #[cfg(test)] mod tests { use super::*; From b9dee676d2a5ce8173131a4ed078bed9add961e0 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sun, 9 Jan 2022 10:23:02 -0500 Subject: [PATCH 3/4] clippy --- arrow/src/array/array_string.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow/src/array/array_string.rs b/arrow/src/array/array_string.rs index 7461dcd67ae9..52eb07fe146e 100644 --- a/arrow/src/array/array_string.rs +++ b/arrow/src/array/array_string.rs @@ -189,7 +189,7 @@ impl GenericStringArray { } // iterator size hint may not be correct so compute the actual number of offsets - assert!(offsets.len() > 0); // wrote at least one + assert!(!offsets.is_empty()); // wrote at least one let actual_len = (offsets.len() / std::mem::size_of::()) - 1; let array_data = ArrayData::builder(OffsetSize::DATA_TYPE) From 13d6baceb79aeda4d67b2c65f19c95cd308e0305 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sun, 9 Jan 2022 15:43:06 -0500 Subject: [PATCH 4/4] Fix test --- arrow/src/array/array_string.rs | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/arrow/src/array/array_string.rs b/arrow/src/array/array_string.rs index 52eb07fe146e..d1826efaaed7 100644 --- a/arrow/src/array/array_string.rs +++ b/arrow/src/array/array_string.rs @@ -379,10 +379,7 @@ impl From> for GenericStringArray< #[cfg(test)] mod tests { - use crate::{ - array::{ListBuilder, StringBuilder}, - util::test_util::BadIterator, - }; + use crate::array::{ListBuilder, StringBuilder}; use super::*; @@ -581,8 +578,10 @@ mod tests { .expect("All null array has valid array data"); } + #[cfg(feature = "test_utils")] #[test] fn bad_size_collect_string() { + use crate::util::test_util::BadIterator; let data = vec![Some("foo"), None, Some("bar")]; let expected: StringArray = data.clone().into_iter().collect(); @@ -595,8 +594,10 @@ mod tests { assert_eq!(expected, arr); } + #[cfg(feature = "test_utils")] #[test] fn bad_size_collect_large_string() { + use crate::util::test_util::BadIterator; let data = vec![Some("foo"), None, Some("bar")]; let expected: LargeStringArray = data.clone().into_iter().collect(); @@ -609,8 +610,10 @@ mod tests { assert_eq!(expected, arr); } + #[cfg(feature = "test_utils")] #[test] fn bad_size_iter_values_string() { + use crate::util::test_util::BadIterator; let data = vec!["foo", "bar", "baz"]; let expected: StringArray = data.clone().into_iter().map(Some).collect(); @@ -623,8 +626,10 @@ mod tests { assert_eq!(expected, arr); } + #[cfg(feature = "test_utils")] #[test] fn bad_size_iter_values_large_string() { + use crate::util::test_util::BadIterator; let data = vec!["foo", "bar", "baz"]; let expected: LargeStringArray = data.clone().into_iter().map(Some).collect();