-
Notifications
You must be signed in to change notification settings - Fork 784
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix reading/writing nested null arrays (#1480) (#1036) (#1399) #1481
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -225,11 +225,10 @@ where | |
|
||
/// Reads at most `batch_size` records into array. | ||
fn next_batch(&mut self, batch_size: usize) -> Result<ArrayRef> { | ||
let records_read = | ||
read_records(&mut self.record_reader, self.pages.as_mut(), batch_size)?; | ||
read_records(&mut self.record_reader, self.pages.as_mut(), batch_size)?; | ||
|
||
// convert to arrays | ||
let array = arrow::array::NullArray::new(records_read); | ||
let array = arrow::array::NullArray::new(self.record_reader.num_values()); | ||
|
||
// save definition and repetition buffers | ||
self.def_levels_buffer = self.record_reader.consume_def_levels()?; | ||
|
@@ -887,7 +886,7 @@ fn remove_indices( | |
Ok(Arc::new(StructArray::from((new_columns, valid.finish())))) | ||
} | ||
} | ||
ArrowType::Null => Ok(Arc::new(NullArray::new(arr.len()))), | ||
ArrowType::Null => Ok(Arc::new(NullArray::new(arr.len() - indices.len()))), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm, I remember by deducting the null indices, the resulting record batch will be empty, but actually there are a row with empty list in previous issue. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That was caused by #1448 (comment) if I'm not mistaken There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
_ => Err(ParquetError::General(format!( | ||
"ListArray of type List({:?}) is not supported by array_reader", | ||
item_type | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1513,6 +1513,43 @@ mod tests { | |
required_and_optional::<LargeStringArray, _>(raw_strs); | ||
} | ||
|
||
#[test] | ||
fn null_list_single_column() { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is an adapted version of the test added by @novemberkilo in https://github.com/apache/arrow-rs/pull/1477/files#diff-92fdd8b1e0a40e7072901fe74855da759d2d97f02c4f7224f4217468d545ecacR1276 |
||
let null_field = Field::new("item", DataType::Null, true); | ||
let list_field = | ||
Field::new("emptylist", DataType::List(Box::new(null_field)), true); | ||
|
||
let schema = Schema::new(vec![list_field]); | ||
|
||
// Build [[], null, [null, null]] | ||
let a_values = NullArray::new(2); | ||
let a_value_offsets = arrow::buffer::Buffer::from(&[0, 0, 0, 2].to_byte_slice()); | ||
let a_list_data = ArrayData::builder(DataType::List(Box::new(Field::new( | ||
"item", | ||
DataType::Null, | ||
true, | ||
)))) | ||
.len(3) | ||
.add_buffer(a_value_offsets) | ||
.null_bit_buffer(Buffer::from(vec![0b00000101])) | ||
.add_child_data(a_values.data().clone()) | ||
.build() | ||
.unwrap(); | ||
|
||
let a = ListArray::from(a_list_data); | ||
|
||
assert!(a.is_valid(0)); | ||
assert!(!a.is_valid(1)); | ||
assert!(a.is_valid(2)); | ||
|
||
assert_eq!(a.value(0).len(), 0); | ||
assert_eq!(a.value(2).len(), 2); | ||
assert_eq!(a.value(2).null_count(), 2); | ||
|
||
let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a)]).unwrap(); | ||
roundtrip(batch, None); | ||
} | ||
|
||
#[test] | ||
fn list_single_column() { | ||
let a_values = Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); | ||
|
@@ -1564,6 +1601,25 @@ mod tests { | |
one_column_roundtrip(values, true, Some(SMALL_SIZE / 2)); | ||
} | ||
|
||
#[test] | ||
fn list_nested_nulls() { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This test was just something I wrote whilst exploring the problem space, and I think is generally useful |
||
use arrow::datatypes::Int32Type; | ||
let data = vec![ | ||
Some(vec![Some(1)]), | ||
Some(vec![Some(2), Some(3)]), | ||
None, | ||
Some(vec![Some(4), Some(5), None]), | ||
Some(vec![None]), | ||
Some(vec![Some(6), Some(7)]), | ||
]; | ||
|
||
let list = ListArray::from_iter_primitive::<Int32Type, _, _>(data.clone()); | ||
one_column_roundtrip(Arc::new(list), true, Some(SMALL_SIZE / 2)); | ||
|
||
let list = LargeListArray::from_iter_primitive::<Int32Type, _, _>(data); | ||
one_column_roundtrip(Arc::new(list), true, Some(SMALL_SIZE / 2)); | ||
} | ||
|
||
#[test] | ||
fn struct_single_column() { | ||
let a_values = Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -200,9 +200,8 @@ impl LevelInfo { | |
); | ||
|
||
match child_array.data_type() { | ||
// TODO: The behaviour of a <list<null>> is untested | ||
DataType::Null => vec![list_level], | ||
DataType::Boolean | ||
DataType::Null | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is the fix for #1036 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yea, i think this is correct. |
||
| DataType::Boolean | ||
| DataType::Int8 | ||
| DataType::Int16 | ||
| DataType::Int32 | ||
|
@@ -677,8 +676,9 @@ impl LevelInfo { | |
len: usize, | ||
) -> (Vec<i64>, Vec<bool>) { | ||
match array.data_type() { | ||
DataType::Null | ||
| DataType::Boolean | ||
// A NullArray is entirely nulls, despite not containing a null buffer | ||
DataType::Null => ((0..=(len as i64)).collect(), vec![false; len]), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is the fix for #1480 |
||
DataType::Boolean | ||
| DataType::Int8 | ||
| DataType::Int16 | ||
| DataType::Int32 | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This was a bug, the NullArray should have the value count number of elements, i.e. the same number of elements as the definition/repetition levels if any.
In the non-nested case the two quantities will be the same, but in the event of a nested array a single record might correspond to 1 or more values.
This would then cause this line to fire - https://github.com/apache/arrow-rs/blob/master/parquet/src/arrow/array_reader.rs#L930