Skip to content

Commit

Permalink
Additional tests for parquet reader utf8 validation (#6023)
Browse files Browse the repository at this point in the history
  • Loading branch information
alamb authored and XiangpengHao committed Jul 11, 2024
1 parent 70e1c5a commit 5b6bc55
Showing 1 changed file with 146 additions and 56 deletions.
202 changes: 146 additions & 56 deletions parquet/src/arrow/arrow_reader/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2425,89 +2425,179 @@ mod tests {

fn test_invalid_utf8_string_array_inner<O: OffsetSizeTrait>() {
let cases = [
(
invalid_utf8_first_char::<O>(),
"Parquet argument error: Parquet error: encountered non UTF-8 data",
),
(
invalid_utf8_later_char::<O>(),
"Parquet argument error: Parquet error: encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 6",
),
invalid_utf8_first_char::<O>(),
invalid_utf8_first_char_long_strings::<O>(),
invalid_utf8_later_char::<O>(),
invalid_utf8_later_char_long_strings::<O>(),
invalid_utf8_later_char_really_long_strings::<O>(),
invalid_utf8_later_char_really_long_strings2::<O>(),
];
for (array, expected_error) in cases {
// data is not valid utf8 we can not construct a correct StringArray
// safely, so purposely create an invalid StringArray
let array = unsafe {
GenericStringArray::<O>::new_unchecked(
array.offsets().clone(),
array.values().clone(),
array.nulls().cloned(),
)
};
let data_type = array.data_type().clone();
let data = write_to_parquet(Arc::new(array));
let err = read_from_parquet(data).unwrap_err();
assert_eq!(err.to_string(), expected_error, "data type: {data_type:?}")
for array in &cases {
for encoding in STRING_ENCODINGS {
// data is not valid utf8 we can not construct a correct StringArray
// safely, so purposely create an invalid StringArray
let array = unsafe {
GenericStringArray::<O>::new_unchecked(
array.offsets().clone(),
array.values().clone(),
array.nulls().cloned(),
)
};
let data_type = array.data_type().clone();
let data = write_to_parquet_with_encoding(Arc::new(array), *encoding);
let err = read_from_parquet(data).unwrap_err();
let expected_err =
"Parquet argument error: Parquet error: encountered non UTF-8 data";
assert!(
err.to_string().contains(expected_err),
"data type: {data_type:?}, expected: {expected_err}, got: {err}"
);
}
}
}

#[test]
fn test_invalid_utf8_string_view_array() {
let cases = [
(
invalid_utf8_first_char::<i32>(),
"Parquet argument error: Parquet error: encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 11",
),
(
invalid_utf8_later_char::<i32>(),
"Parquet argument error: Parquet error: encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 14",
),
invalid_utf8_first_char::<i32>(),
invalid_utf8_first_char_long_strings::<i32>(),
invalid_utf8_later_char::<i32>(),
invalid_utf8_later_char_long_strings::<i32>(),
invalid_utf8_later_char_really_long_strings::<i32>(),
invalid_utf8_later_char_really_long_strings2::<i32>(),
];
for (array, expected_error) in cases {
let array = arrow_cast::cast(&array, &ArrowDataType::BinaryView).unwrap();
let array = array.as_binary_view();

// data is not valid utf8 we can not construct a correct StringArray
// safely, so purposely create an invalid StringArray
let array = unsafe {
StringViewArray::new_unchecked(
array.views().clone(),
array.data_buffers().to_vec(),
array.nulls().cloned(),
)
};
let data_type = array.data_type().clone();
let data = write_to_parquet(Arc::new(array));
let err = read_from_parquet(data).unwrap_err();
assert_eq!(err.to_string(), expected_error, "data type: {data_type:?}")

for encoding in STRING_ENCODINGS {
for array in &cases {
let array = arrow_cast::cast(&array, &ArrowDataType::BinaryView).unwrap();
let array = array.as_binary_view();

// data is not valid utf8 we can not construct a correct StringArray
// safely, so purposely create an invalid StringViewArray
let array = unsafe {
StringViewArray::new_unchecked(
array.views().clone(),
array.data_buffers().to_vec(),
array.nulls().cloned(),
)
};

let data_type = array.data_type().clone();
let data = write_to_parquet_with_encoding(Arc::new(array), *encoding);
let err = read_from_parquet(data).unwrap_err();
let expected_err =
"Parquet argument error: Parquet error: encountered non UTF-8 data";
assert!(
err.to_string().contains(expected_err),
"data type: {data_type:?}, expected: {expected_err}, got: {err}"
);
}
}
}

/// Encodings suitable for string data
const STRING_ENCODINGS: &[Option<Encoding>] = &[
None,
Some(Encoding::PLAIN),
Some(Encoding::DELTA_LENGTH_BYTE_ARRAY),
Some(Encoding::DELTA_BYTE_ARRAY),
];

/// Invalid Utf-8 sequence in the first character
/// <https://stackoverflow.com/questions/1301402/example-invalid-utf8-string>
const INVALID_UTF8_FIRST_CHAR: &[u8] = &[0xa0, 0xa1, 0x20, 0x20];

/// Invalid Utf=8 sequence in NOT the first character
/// <https://stackoverflow.com/questions/1301402/example-invalid-utf8-string>
const INVALID_UTF8_LATER_CHAR: &[u8] = &[0x20, 0x20, 0x20, 0xa0, 0xa1, 0x20, 0x20];

/// returns a BinaryArray with invalid UTF8 data in the first character
fn invalid_utf8_first_char<O: OffsetSizeTrait>() -> GenericBinaryArray<O> {
// invalid sequence in the first character
// https://stackoverflow.com/questions/1301402/example-invalid-utf8-string
let valid: &[u8] = b" ";
let invalid: &[u8] = &[0xa0, 0xa1, 0x20, 0x20];
let invalid = INVALID_UTF8_FIRST_CHAR;
GenericBinaryArray::<O>::from_iter(vec![None, Some(valid), None, Some(invalid)])
}

/// Returns a BinaryArray with invalid UTF8 data in the first character of a
/// string larger than 12 bytes which is handled specially when reading
/// `ByteViewArray`s
fn invalid_utf8_first_char_long_strings<O: OffsetSizeTrait>() -> GenericBinaryArray<O> {
let valid: &[u8] = b" ";
let mut invalid = vec![];
invalid.extend_from_slice(b"ThisStringIsCertainlyLongerThan12Bytes");
invalid.extend_from_slice(INVALID_UTF8_FIRST_CHAR);
GenericBinaryArray::<O>::from_iter(vec![None, Some(valid), None, Some(&invalid)])
}

/// returns a BinaryArray with invalid UTF8 data in a character other than
/// the first (this is checked in a special codepath)
fn invalid_utf8_later_char<O: OffsetSizeTrait>() -> GenericBinaryArray<O> {
// invalid sequence in NOT the first character
// https://stackoverflow.com/questions/1301402/example-invalid-utf8-string
let valid: &[u8] = b" ";
let invalid: &[u8] = &[0x20, 0x20, 0x20, 0xa0, 0xa1, 0x20, 0x20];
let invalid: &[u8] = INVALID_UTF8_LATER_CHAR;
GenericBinaryArray::<O>::from_iter(vec![None, Some(valid), None, Some(invalid)])
}

// writes the array into a single column parquet file
fn write_to_parquet(array: ArrayRef) -> Vec<u8> {
/// returns a BinaryArray with invalid UTF8 data in a character other than
/// the first in a string larger than 12 bytes which is handled specially
/// when reading `ByteViewArray`s (this is checked in a special codepath)
fn invalid_utf8_later_char_long_strings<O: OffsetSizeTrait>() -> GenericBinaryArray<O> {
let valid: &[u8] = b" ";
let mut invalid = vec![];
invalid.extend_from_slice(b"ThisStringIsCertainlyLongerThan12Bytes");
invalid.extend_from_slice(INVALID_UTF8_LATER_CHAR);
GenericBinaryArray::<O>::from_iter(vec![None, Some(valid), None, Some(&invalid)])
}

/// returns a BinaryArray with invalid UTF8 data in a character other than
/// the first in a string larger than 128 bytes which is handled specially
/// when reading `ByteViewArray`s (this is checked in a special codepath)
fn invalid_utf8_later_char_really_long_strings<O: OffsetSizeTrait>() -> GenericBinaryArray<O> {
let valid: &[u8] = b" ";
let mut invalid = vec![];
for _ in 0..10 {
// each instance is 38 bytes
invalid.extend_from_slice(b"ThisStringIsCertainlyLongerThan12Bytes");
}
invalid.extend_from_slice(INVALID_UTF8_LATER_CHAR);
GenericBinaryArray::<O>::from_iter(vec![None, Some(valid), None, Some(&invalid)])
}

/// returns a BinaryArray with small invalid UTF8 data followed by a large
/// invalid UTF8 data in a character other than the first in a string larger
fn invalid_utf8_later_char_really_long_strings2<O: OffsetSizeTrait>() -> GenericBinaryArray<O> {
let valid: &[u8] = b" ";
let mut valid_long = vec![];
for _ in 0..10 {
// each instance is 38 bytes
valid_long.extend_from_slice(b"ThisStringIsCertainlyLongerThan12Bytes");
}
let invalid = INVALID_UTF8_LATER_CHAR;
GenericBinaryArray::<O>::from_iter(vec![
None,
Some(valid),
Some(invalid),
None,
Some(&valid_long),
Some(valid),
])
}

/// writes the array into a single column parquet file with the specified
/// encoding.
///
/// If no encoding is specified, use default (dictionary) encoding
fn write_to_parquet_with_encoding(array: ArrayRef, encoding: Option<Encoding>) -> Vec<u8> {
let batch = RecordBatch::try_from_iter(vec![("c", array)]).unwrap();
let mut data = vec![];
let schema = batch.schema();
let props = None;
let props = encoding.map(|encoding| {
WriterProperties::builder()
// must disable dictionary encoding to actually use encoding
.set_dictionary_enabled(false)
.set_encoding(encoding)
.build()
});

{
let mut writer = ArrowWriter::try_new(&mut data, schema, props).unwrap();
writer.write(&batch).unwrap();
Expand Down

0 comments on commit 5b6bc55

Please sign in to comment.