diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index e09369effb698..ecc48811e46fc 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -594,10 +594,17 @@ std::shared_ptr SerializedPageReader::DecompressIfNeeded( } // Decompress the values - PARQUET_THROW_NOT_OK(decompressor_->Decompress( - compressed_len - levels_byte_len, page_buffer->data() + levels_byte_len, - uncompressed_len - levels_byte_len, - decompression_buffer_->mutable_data() + levels_byte_len)); + PARQUET_ASSIGN_OR_THROW( + auto decompressed_len, + decompressor_->Decompress(compressed_len - levels_byte_len, + page_buffer->data() + levels_byte_len, + uncompressed_len - levels_byte_len, + decompression_buffer_->mutable_data() + levels_byte_len)); + if (decompressed_len != uncompressed_len - levels_byte_len) { + throw ParquetException("Page didn't decompress to expected size, expected: " + + std::to_string(uncompressed_len - levels_byte_len) + + ", but got:" + std::to_string(decompressed_len)); + } return decompression_buffer_; } diff --git a/cpp/src/parquet/column_reader_test.cc b/cpp/src/parquet/column_reader_test.cc index b3b75a7bb58d2..bed7e06786e70 100644 --- a/cpp/src/parquet/column_reader_test.cc +++ b/cpp/src/parquet/column_reader_test.cc @@ -1587,8 +1587,8 @@ class ByteArrayRecordReaderTest : public ::testing::TestWithParam { }; // Tests reading and skipping a ByteArray field. -// The binary readers only differ in DeocdeDense and DecodeSpaced functions, so -// testing optional is sufficient in excercising those code paths. +// The binary readers only differ in DecodeDense and DecodeSpaced functions, so +// testing optional is sufficient in exercising those code paths. TEST_P(ByteArrayRecordReaderTest, ReadAndSkipOptional) { MakeRecordReader(/*levels_per_page=*/90, /*num_pages=*/1); diff --git a/cpp/src/parquet/file_deserialize_test.cc b/cpp/src/parquet/file_deserialize_test.cc index b2a918915e4e6..4377e714a240b 100644 --- a/cpp/src/parquet/file_deserialize_test.cc +++ b/cpp/src/parquet/file_deserialize_test.cc @@ -879,6 +879,48 @@ TEST_F(TestPageSerde, DataPageV2CrcCheckNonExistent) { /* write_data_page_v2 */ true); } +TEST_F(TestPageSerde, BadCompressedPageSize) { + // GH-38326: an exception should be raised if a compressed data page + // decompresses to a smaller size than declared in the data page header. + auto codec_types = GetSupportedCodecTypes(); + const int data_page_bytes = 8192; + const int32_t num_rows = 32; // dummy value + data_page_header_.num_values = num_rows; + std::vector faux_data; + // A well-compressible piece of data + faux_data.resize(data_page_bytes, 1); + for (auto codec_type : codec_types) { + auto codec = GetCodec(codec_type); + std::vector buffer; + const uint8_t* data = faux_data.data(); + int data_size = static_cast(faux_data.size()); + + int64_t max_compressed_size = codec->MaxCompressedLen(data_size, data); + buffer.resize(max_compressed_size); + + int64_t actual_size; + ASSERT_OK_AND_ASSIGN( + actual_size, codec->Compress(data_size, data, max_compressed_size, &buffer[0])); + // Write a data page header declaring a larger decompressed size than actual + ASSERT_NO_FATAL_FAILURE(WriteDataPageHeader(data_page_bytes, data_size + 1, + static_cast(actual_size))); + ASSERT_OK(out_stream_->Write(buffer.data(), actual_size)); + InitSerializedPageReader(num_rows, codec_type); + + EXPECT_THROW_THAT( + [&]() { page_reader_->NextPage(); }, ParquetException, + ::testing::AnyOf( + ::testing::Property( + &ParquetException::what, + ::testing::HasSubstr("Page didn't decompress to expected size")), + // Some decompressor, like zstd, might be able to detect the error + // before checking the page size. + ::testing::Property(&ParquetException::what, + ::testing::HasSubstr("IOError")))); + ResetStream(); + } +} + // ---------------------------------------------------------------------- // File structure tests