Skip to content

Commit

Permalink
apacheGH-38326: [C++][Parquet] check the decompressed page size same …
Browse files Browse the repository at this point in the history
…as size in page header (apache#38327)

### Rationale for this change

As mentioned in issue, currently we only decompress the page without checking the decompress size. 
This patch add a checkings.

### What changes are included in this PR?

Throw exception when size not matches in `SerializedPageReader::DecompressIfNeeded`

### Are these changes tested?

Yes.

### Are there any user-facing changes?

Non-conforming files may throw an exception while they would silently return invalid results before.

* Closes: apache#38326

Authored-by: mwish <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
  • Loading branch information
mapleFU authored and Jeremy Aguilon committed Oct 23, 2023
1 parent f87077d commit 26df515
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 6 deletions.
15 changes: 11 additions & 4 deletions cpp/src/parquet/column_reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -594,10 +594,17 @@ std::shared_ptr<Buffer> SerializedPageReader::DecompressIfNeeded(
}

// Decompress the values
PARQUET_THROW_NOT_OK(decompressor_->Decompress(
compressed_len - levels_byte_len, page_buffer->data() + levels_byte_len,
uncompressed_len - levels_byte_len,
decompression_buffer_->mutable_data() + levels_byte_len));
PARQUET_ASSIGN_OR_THROW(
auto decompressed_len,
decompressor_->Decompress(compressed_len - levels_byte_len,
page_buffer->data() + levels_byte_len,
uncompressed_len - levels_byte_len,
decompression_buffer_->mutable_data() + levels_byte_len));
if (decompressed_len != uncompressed_len - levels_byte_len) {
throw ParquetException("Page didn't decompress to expected size, expected: " +
std::to_string(uncompressed_len - levels_byte_len) +
", but got:" + std::to_string(decompressed_len));
}

return decompression_buffer_;
}
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/parquet/column_reader_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1587,8 +1587,8 @@ class ByteArrayRecordReaderTest : public ::testing::TestWithParam<bool> {
};

// Tests reading and skipping a ByteArray field.
// The binary readers only differ in DeocdeDense and DecodeSpaced functions, so
// testing optional is sufficient in excercising those code paths.
// The binary readers only differ in DecodeDense and DecodeSpaced functions, so
// testing optional is sufficient in exercising those code paths.
TEST_P(ByteArrayRecordReaderTest, ReadAndSkipOptional) {
MakeRecordReader(/*levels_per_page=*/90, /*num_pages=*/1);

Expand Down
42 changes: 42 additions & 0 deletions cpp/src/parquet/file_deserialize_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -879,6 +879,48 @@ TEST_F(TestPageSerde, DataPageV2CrcCheckNonExistent) {
/* write_data_page_v2 */ true);
}

TEST_F(TestPageSerde, BadCompressedPageSize) {
// GH-38326: an exception should be raised if a compressed data page
// decompresses to a smaller size than declared in the data page header.
auto codec_types = GetSupportedCodecTypes();
const int data_page_bytes = 8192;
const int32_t num_rows = 32; // dummy value
data_page_header_.num_values = num_rows;
std::vector<uint8_t> faux_data;
// A well-compressible piece of data
faux_data.resize(data_page_bytes, 1);
for (auto codec_type : codec_types) {
auto codec = GetCodec(codec_type);
std::vector<uint8_t> buffer;
const uint8_t* data = faux_data.data();
int data_size = static_cast<int>(faux_data.size());

int64_t max_compressed_size = codec->MaxCompressedLen(data_size, data);
buffer.resize(max_compressed_size);

int64_t actual_size;
ASSERT_OK_AND_ASSIGN(
actual_size, codec->Compress(data_size, data, max_compressed_size, &buffer[0]));
// Write a data page header declaring a larger decompressed size than actual
ASSERT_NO_FATAL_FAILURE(WriteDataPageHeader(data_page_bytes, data_size + 1,
static_cast<int32_t>(actual_size)));
ASSERT_OK(out_stream_->Write(buffer.data(), actual_size));
InitSerializedPageReader(num_rows, codec_type);

EXPECT_THROW_THAT(
[&]() { page_reader_->NextPage(); }, ParquetException,
::testing::AnyOf(
::testing::Property(
&ParquetException::what,
::testing::HasSubstr("Page didn't decompress to expected size")),
// Some decompressor, like zstd, might be able to detect the error
// before checking the page size.
::testing::Property(&ParquetException::what,
::testing::HasSubstr("IOError"))));
ResetStream();
}
}

// ----------------------------------------------------------------------
// File structure tests

Expand Down

0 comments on commit 26df515

Please sign in to comment.