From 46b14cf60a21be9e3d311a13178144c807d7bb96 Mon Sep 17 00:00:00 2001 From: mwish Date: Wed, 18 Oct 2023 21:21:03 +0800 Subject: [PATCH 1/5] basic impl --- cpp/src/parquet/column_reader.cc | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index e09369effb698..a6befa4aeebf6 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -594,10 +594,17 @@ std::shared_ptr SerializedPageReader::DecompressIfNeeded( } // Decompress the values - PARQUET_THROW_NOT_OK(decompressor_->Decompress( - compressed_len - levels_byte_len, page_buffer->data() + levels_byte_len, - uncompressed_len - levels_byte_len, - decompression_buffer_->mutable_data() + levels_byte_len)); + PARQUET_ASSIGN_OR_THROW( + auto decompressed_len, + decompressor_->Decompress(compressed_len - levels_byte_len, + page_buffer->data() + levels_byte_len, + uncompressed_len - levels_byte_len, + decompression_buffer_->mutable_data() + levels_byte_len)); + if (decompressed_len != uncompressed_len - levels_byte_len) { + throw ParquetException("Page didn't decompress to expected size, expect: " + + std::to_string(uncompressed_len - levels_byte_len) + + ", but got:" + std::to_string(decompressed_len)); + } return decompression_buffer_; } From 6fad105a3c13c5aa1055104c1c72910d1172ef08 Mon Sep 17 00:00:00 2001 From: mwish Date: Wed, 18 Oct 2023 22:45:58 +0800 Subject: [PATCH 2/5] apply suggestions --- cpp/src/parquet/column_reader.cc | 2 +- cpp/src/parquet/column_reader_test.cc | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index a6befa4aeebf6..ecc48811e46fc 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -601,7 +601,7 @@ std::shared_ptr SerializedPageReader::DecompressIfNeeded( uncompressed_len - levels_byte_len, decompression_buffer_->mutable_data() + levels_byte_len)); if (decompressed_len != uncompressed_len - levels_byte_len) { - throw ParquetException("Page didn't decompress to expected size, expect: " + + throw ParquetException("Page didn't decompress to expected size, expected: " + std::to_string(uncompressed_len - levels_byte_len) + ", but got:" + std::to_string(decompressed_len)); } diff --git a/cpp/src/parquet/column_reader_test.cc b/cpp/src/parquet/column_reader_test.cc index b3b75a7bb58d2..bed7e06786e70 100644 --- a/cpp/src/parquet/column_reader_test.cc +++ b/cpp/src/parquet/column_reader_test.cc @@ -1587,8 +1587,8 @@ class ByteArrayRecordReaderTest : public ::testing::TestWithParam { }; // Tests reading and skipping a ByteArray field. -// The binary readers only differ in DeocdeDense and DecodeSpaced functions, so -// testing optional is sufficient in excercising those code paths. +// The binary readers only differ in DecodeDense and DecodeSpaced functions, so +// testing optional is sufficient in exercising those code paths. TEST_P(ByteArrayRecordReaderTest, ReadAndSkipOptional) { MakeRecordReader(/*levels_per_page=*/90, /*num_pages=*/1); From c9e5af9d801884bd25b10fdf1eb0f57084c8f443 Mon Sep 17 00:00:00 2001 From: mwish Date: Thu, 19 Oct 2023 00:48:37 +0800 Subject: [PATCH 3/5] add test --- cpp/src/parquet/file_deserialize_test.cc | 32 ++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/cpp/src/parquet/file_deserialize_test.cc b/cpp/src/parquet/file_deserialize_test.cc index b2a918915e4e6..ed26d9e7706b0 100644 --- a/cpp/src/parquet/file_deserialize_test.cc +++ b/cpp/src/parquet/file_deserialize_test.cc @@ -879,6 +879,38 @@ TEST_F(TestPageSerde, DataPageV2CrcCheckNonExistent) { /* write_data_page_v2 */ true); } +TEST_F(TestPageSerde, BadCompressedPageSize) { + auto codec_types = GetSupportedCodecTypes(); + const int32_t num_rows = 32; // dummy value + data_page_header_.num_values = num_rows; + std::vector faux_data; + test::random_bytes(1024, 0, &faux_data); + for (auto codec_type : codec_types) { + auto codec = GetCodec(codec_type); + std::vector buffer; + const uint8_t* data = faux_data.data(); + int data_size = static_cast(faux_data.size()); + + int64_t max_compressed_size = codec->MaxCompressedLen(data_size, data); + buffer.resize(max_compressed_size); + + int64_t actual_size; + ASSERT_OK_AND_ASSIGN( + actual_size, codec->Compress(data_size, data, max_compressed_size, &buffer[0])); + + actual_size += 1; // corrupt the compressed data + if (buffer.size() < static_cast(actual_size)) { + buffer.resize(actual_size); + } + ASSERT_NO_FATAL_FAILURE( + WriteDataPageHeader(1024, data_size, static_cast(actual_size))); + ASSERT_OK(out_stream_->Write(buffer.data(), actual_size)); + InitSerializedPageReader(num_rows, codec_type); + EXPECT_THROW(page_reader_->NextPage(), ParquetException) << codec_type; + ResetStream(); + } +} + // ---------------------------------------------------------------------- // File structure tests From 98d047b64fd7c2d416899ff659c9115fbc82e894 Mon Sep 17 00:00:00 2001 From: mwish Date: Thu, 19 Oct 2023 01:08:24 +0800 Subject: [PATCH 4/5] fix test --- cpp/src/parquet/file_deserialize_test.cc | 25 +++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/cpp/src/parquet/file_deserialize_test.cc b/cpp/src/parquet/file_deserialize_test.cc index ed26d9e7706b0..24ef327a69032 100644 --- a/cpp/src/parquet/file_deserialize_test.cc +++ b/cpp/src/parquet/file_deserialize_test.cc @@ -881,10 +881,12 @@ TEST_F(TestPageSerde, DataPageV2CrcCheckNonExistent) { TEST_F(TestPageSerde, BadCompressedPageSize) { auto codec_types = GetSupportedCodecTypes(); + const int data_page_bytes = 8192; const int32_t num_rows = 32; // dummy value data_page_header_.num_values = num_rows; std::vector faux_data; - test::random_bytes(1024, 0, &faux_data); + // Resize and make all to 1 to make it well-compressed. + faux_data.resize(data_page_bytes, 1); for (auto codec_type : codec_types) { auto codec = GetCodec(codec_type); std::vector buffer; @@ -897,16 +899,21 @@ TEST_F(TestPageSerde, BadCompressedPageSize) { int64_t actual_size; ASSERT_OK_AND_ASSIGN( actual_size, codec->Compress(data_size, data, max_compressed_size, &buffer[0])); - - actual_size += 1; // corrupt the compressed data - if (buffer.size() < static_cast(actual_size)) { - buffer.resize(actual_size); - } - ASSERT_NO_FATAL_FAILURE( - WriteDataPageHeader(1024, data_size, static_cast(actual_size))); + ASSERT_NO_FATAL_FAILURE(WriteDataPageHeader(data_page_bytes, data_size + 1, + static_cast(actual_size))); ASSERT_OK(out_stream_->Write(buffer.data(), actual_size)); InitSerializedPageReader(num_rows, codec_type); - EXPECT_THROW(page_reader_->NextPage(), ParquetException) << codec_type; + + EXPECT_THROW_THAT( + [&]() { page_reader_->NextPage(); }, ParquetException, + ::testing::AnyOf( + ::testing::Property( + &ParquetException::what, + ::testing::HasSubstr("Page didn't decompress to expected size")), + // Some decompressor, like zstd, might be able to detect the error + // before checking the page size. + ::testing::Property(&ParquetException::what, + ::testing::HasSubstr("IOError")))); ResetStream(); } } From 4898b5951d1c8d537e5ab9cc4f51599d2b0236f8 Mon Sep 17 00:00:00 2001 From: mwish Date: Thu, 19 Oct 2023 22:35:42 +0800 Subject: [PATCH 5/5] apply suggestions --- cpp/src/parquet/file_deserialize_test.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cpp/src/parquet/file_deserialize_test.cc b/cpp/src/parquet/file_deserialize_test.cc index 24ef327a69032..4377e714a240b 100644 --- a/cpp/src/parquet/file_deserialize_test.cc +++ b/cpp/src/parquet/file_deserialize_test.cc @@ -880,12 +880,14 @@ TEST_F(TestPageSerde, DataPageV2CrcCheckNonExistent) { } TEST_F(TestPageSerde, BadCompressedPageSize) { + // GH-38326: an exception should be raised if a compressed data page + // decompresses to a smaller size than declared in the data page header. auto codec_types = GetSupportedCodecTypes(); const int data_page_bytes = 8192; const int32_t num_rows = 32; // dummy value data_page_header_.num_values = num_rows; std::vector faux_data; - // Resize and make all to 1 to make it well-compressed. + // A well-compressible piece of data faux_data.resize(data_page_bytes, 1); for (auto codec_type : codec_types) { auto codec = GetCodec(codec_type); @@ -899,6 +901,7 @@ TEST_F(TestPageSerde, BadCompressedPageSize) { int64_t actual_size; ASSERT_OK_AND_ASSIGN( actual_size, codec->Compress(data_size, data, max_compressed_size, &buffer[0])); + // Write a data page header declaring a larger decompressed size than actual ASSERT_NO_FATAL_FAILURE(WriteDataPageHeader(data_page_bytes, data_size + 1, static_cast(actual_size))); ASSERT_OK(out_stream_->Write(buffer.data(), actual_size));