Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix undefined behavor in GenericStringArray::from_iter_values #1145

Merged
merged 4 commits into from
Jan 10, 2022
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 68 additions & 2 deletions arrow/src/array/array_string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -187,8 +187,13 @@ impl<OffsetSize: StringOffsetSizeTrait> GenericStringArray<OffsetSize> {
offsets.push(length_so_far);
values.extend_from_slice(s.as_bytes());
}

// iterator size hint may not be correct so compute the actual number of offsets
assert!(!offsets.is_empty()); // wrote at least one
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

here is the fix -- the rest of the PR is tests

let actual_len = (offsets.len() / std::mem::size_of::<OffsetSize>()) - 1;

let array_data = ArrayData::builder(OffsetSize::DATA_TYPE)
.len(data_len)
.len(actual_len)
.add_buffer(offsets.into())
.add_buffer(values.into());
let array_data = unsafe { array_data.build_unchecked() };
Expand Down Expand Up @@ -374,7 +379,10 @@ impl<T: StringOffsetSizeTrait> From<GenericListArray<T>> for GenericStringArray<
#[cfg(test)]
mod tests {

use crate::array::{ListBuilder, StringBuilder};
use crate::{
array::{ListBuilder, StringBuilder},
util::test_util::BadIterator,
};

use super::*;

Expand Down Expand Up @@ -572,4 +580,62 @@ mod tests {
.validate_full()
.expect("All null array has valid array data");
}

#[test]
fn bad_size_collect_string() {
let data = vec![Some("foo"), None, Some("bar")];
let expected: StringArray = data.clone().into_iter().collect();

// Iterator reports too many items
let arr: StringArray = BadIterator::new(3, 10, data.clone()).collect();
assert_eq!(expected, arr);

// Iterator reports too few items
let arr: StringArray = BadIterator::new(3, 1, data.clone()).collect();
assert_eq!(expected, arr);
}

#[test]
fn bad_size_collect_large_string() {
let data = vec![Some("foo"), None, Some("bar")];
let expected: LargeStringArray = data.clone().into_iter().collect();

// Iterator reports too many items
let arr: LargeStringArray = BadIterator::new(3, 10, data.clone()).collect();
assert_eq!(expected, arr);

// Iterator reports too few items
let arr: LargeStringArray = BadIterator::new(3, 1, data.clone()).collect();
assert_eq!(expected, arr);
}

#[test]
fn bad_size_iter_values_string() {
let data = vec!["foo", "bar", "baz"];
let expected: StringArray = data.clone().into_iter().map(Some).collect();

// Iterator reports too many items
let arr = StringArray::from_iter_values(BadIterator::new(3, 10, data.clone()));
assert_eq!(expected, arr);

// Iterator reports too few items
let arr = StringArray::from_iter_values(BadIterator::new(3, 1, data.clone()));
assert_eq!(expected, arr);
}

#[test]
fn bad_size_iter_values_large_string() {
let data = vec!["foo", "bar", "baz"];
let expected: LargeStringArray = data.clone().into_iter().map(Some).collect();

// Iterator reports too many items
let arr =
LargeStringArray::from_iter_values(BadIterator::new(3, 10, data.clone()));
assert_eq!(expected, arr);

// Iterator reports too few items
let arr =
LargeStringArray::from_iter_values(BadIterator::new(3, 1, data.clone()));
assert_eq!(expected, arr);
}
}
48 changes: 48 additions & 0 deletions arrow/src/util/test_util.rs
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,54 @@ fn get_data_dir(udf_env: &str, submodule_data: &str) -> Result<PathBuf, Box<dyn
}
}

/// An iterator that is untruthful about its actual length
#[derive(Debug, Clone)]
pub struct BadIterator<T> {
/// where the iterator currently is
cur: usize,
/// How many items will this iterator *actually* make
limit: usize,
/// How many items this iterator claims it will make
claimed: usize,
/// The items to return. If there are fewer items than `limit`
/// they will be repeated
pub items: Vec<T>,
}

impl<T> BadIterator<T> {
/// Create a new iterator for <limit> items, but that reports to
/// produce <claimed> items. Must provide at least 1 item.
pub fn new(limit: usize, claimed: usize, items: Vec<T>) -> Self {
assert!(!items.is_empty());
Self {
cur: 0,
limit,
claimed,
items,
}
}
}

impl<T: Clone> Iterator for BadIterator<T> {
type Item = T;

fn next(&mut self) -> Option<Self::Item> {
if self.cur < self.limit {
let next_item_idx = self.cur % self.items.len();
let next_item = self.items[next_item_idx].clone();
self.cur += 1;
Some(next_item)
} else {
None
}
}

/// report whatever the iterator says to
fn size_hint(&self) -> (usize, Option<usize>) {
(0, Some(self.claimed as usize))
}
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down