diff --git a/arrow/src/array/array_string.rs b/arrow/src/array/array_string.rs index 82c9f16fe99d..d1826efaaed7 100644 --- a/arrow/src/array/array_string.rs +++ b/arrow/src/array/array_string.rs @@ -187,8 +187,13 @@ impl GenericStringArray { offsets.push(length_so_far); values.extend_from_slice(s.as_bytes()); } + + // iterator size hint may not be correct so compute the actual number of offsets + assert!(!offsets.is_empty()); // wrote at least one + let actual_len = (offsets.len() / std::mem::size_of::()) - 1; + let array_data = ArrayData::builder(OffsetSize::DATA_TYPE) - .len(data_len) + .len(actual_len) .add_buffer(offsets.into()) .add_buffer(values.into()); let array_data = unsafe { array_data.build_unchecked() }; @@ -572,4 +577,70 @@ mod tests { .validate_full() .expect("All null array has valid array data"); } + + #[cfg(feature = "test_utils")] + #[test] + fn bad_size_collect_string() { + use crate::util::test_util::BadIterator; + let data = vec![Some("foo"), None, Some("bar")]; + let expected: StringArray = data.clone().into_iter().collect(); + + // Iterator reports too many items + let arr: StringArray = BadIterator::new(3, 10, data.clone()).collect(); + assert_eq!(expected, arr); + + // Iterator reports too few items + let arr: StringArray = BadIterator::new(3, 1, data.clone()).collect(); + assert_eq!(expected, arr); + } + + #[cfg(feature = "test_utils")] + #[test] + fn bad_size_collect_large_string() { + use crate::util::test_util::BadIterator; + let data = vec![Some("foo"), None, Some("bar")]; + let expected: LargeStringArray = data.clone().into_iter().collect(); + + // Iterator reports too many items + let arr: LargeStringArray = BadIterator::new(3, 10, data.clone()).collect(); + assert_eq!(expected, arr); + + // Iterator reports too few items + let arr: LargeStringArray = BadIterator::new(3, 1, data.clone()).collect(); + assert_eq!(expected, arr); + } + + #[cfg(feature = "test_utils")] + #[test] + fn bad_size_iter_values_string() { + use crate::util::test_util::BadIterator; + let data = vec!["foo", "bar", "baz"]; + let expected: StringArray = data.clone().into_iter().map(Some).collect(); + + // Iterator reports too many items + let arr = StringArray::from_iter_values(BadIterator::new(3, 10, data.clone())); + assert_eq!(expected, arr); + + // Iterator reports too few items + let arr = StringArray::from_iter_values(BadIterator::new(3, 1, data.clone())); + assert_eq!(expected, arr); + } + + #[cfg(feature = "test_utils")] + #[test] + fn bad_size_iter_values_large_string() { + use crate::util::test_util::BadIterator; + let data = vec!["foo", "bar", "baz"]; + let expected: LargeStringArray = data.clone().into_iter().map(Some).collect(); + + // Iterator reports too many items + let arr = + LargeStringArray::from_iter_values(BadIterator::new(3, 10, data.clone())); + assert_eq!(expected, arr); + + // Iterator reports too few items + let arr = + LargeStringArray::from_iter_values(BadIterator::new(3, 1, data.clone())); + assert_eq!(expected, arr); + } } diff --git a/arrow/src/util/test_util.rs b/arrow/src/util/test_util.rs index f02c8d09831d..6b1ad4be1d76 100644 --- a/arrow/src/util/test_util.rs +++ b/arrow/src/util/test_util.rs @@ -152,6 +152,54 @@ fn get_data_dir(udf_env: &str, submodule_data: &str) -> Result { + /// where the iterator currently is + cur: usize, + /// How many items will this iterator *actually* make + limit: usize, + /// How many items this iterator claims it will make + claimed: usize, + /// The items to return. If there are fewer items than `limit` + /// they will be repeated + pub items: Vec, +} + +impl BadIterator { + /// Create a new iterator for items, but that reports to + /// produce items. Must provide at least 1 item. + pub fn new(limit: usize, claimed: usize, items: Vec) -> Self { + assert!(!items.is_empty()); + Self { + cur: 0, + limit, + claimed, + items, + } + } +} + +impl Iterator for BadIterator { + type Item = T; + + fn next(&mut self) -> Option { + if self.cur < self.limit { + let next_item_idx = self.cur % self.items.len(); + let next_item = self.items[next_item_idx].clone(); + self.cur += 1; + Some(next_item) + } else { + None + } + } + + /// report whatever the iterator says to + fn size_hint(&self) -> (usize, Option) { + (0, Some(self.claimed as usize)) + } +} + #[cfg(test)] mod tests { use super::*;