From 91d26cdc11bb9b4debd7a8ab68724ade1814c8d0 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Fri, 25 Oct 2024 14:02:50 -0700 Subject: [PATCH 1/2] return error rather than panic when too many row groups --- parquet/src/file/writer.rs | 44 +++++++++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 95ff109a3dd0..4f8917a32688 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -378,7 +378,12 @@ fn write_bloom_filters( .ordinal() .expect("Missing row group ordinal") .try_into() - .expect("Negative row group ordinal"); + .map_err(|_| { + ParquetError::General(format!( + "Negative row group ordinal: {})", + row_group.ordinal().unwrap() + )) + })?; let row_group_idx = row_group_idx as usize; for (column_idx, column_chunk) in row_group.columns_mut().iter_mut().enumerate() { if let Some(bloom_filter) = bloom_filters[row_group_idx][column_idx].take() { @@ -1892,6 +1897,43 @@ mod tests { assert_eq!(page_sizes[0], unenc_size); } + #[test] + fn test_too_many_rowgroups() { + let message_type = " + message test_schema { + REQUIRED BYTE_ARRAY a (UTF8); + } + "; + let schema = Arc::new(parse_message_type(message_type).unwrap()); + let file: File = tempfile::tempfile().unwrap(); + let props = Arc::new( + WriterProperties::builder() + .set_statistics_enabled(EnabledStatistics::None) + .set_max_row_group_size(1) + .build(), + ); + let mut writer = SerializedFileWriter::new(&file, schema, props).unwrap(); + + // create 32k empty rowgroups + for i in 0..0x8001 { + match writer.next_row_group() { + Ok(mut row_group_writer) => { + let col_writer = row_group_writer.next_column().unwrap().unwrap(); + col_writer.close().unwrap(); + row_group_writer.close().unwrap(); + } + Err(e) => { + assert_eq!(i, 0x8000); + assert_eq!( + e.to_string(), + "Parquet error: Parquet does not support more than 32767 row groups per file (currently: 32768)" + ); + } + } + } + writer.close().unwrap(); + } + #[test] fn test_size_statistics_with_repetition_and_nulls() { let message_type = " From 9c8dff09a7bcda41332bcfa42920eaf2db04a011 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Fri, 25 Oct 2024 14:32:36 -0700 Subject: [PATCH 2/2] clean up test a bit --- parquet/src/file/writer.rs | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 4f8917a32688..b84c57a60e19 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -1900,10 +1900,10 @@ mod tests { #[test] fn test_too_many_rowgroups() { let message_type = " - message test_schema { - REQUIRED BYTE_ARRAY a (UTF8); - } - "; + message test_schema { + REQUIRED BYTE_ARRAY a (UTF8); + } + "; let schema = Arc::new(parse_message_type(message_type).unwrap()); let file: File = tempfile::tempfile().unwrap(); let props = Arc::new( @@ -1914,10 +1914,11 @@ mod tests { ); let mut writer = SerializedFileWriter::new(&file, schema, props).unwrap(); - // create 32k empty rowgroups + // Create 32k empty rowgroups. Should error when i == 32768. for i in 0..0x8001 { match writer.next_row_group() { Ok(mut row_group_writer) => { + assert_ne!(i, 0x8000); let col_writer = row_group_writer.next_column().unwrap().unwrap(); col_writer.close().unwrap(); row_group_writer.close().unwrap();