From 4ef95eb89f9202dfcd9017633cf55671d56e337f Mon Sep 17 00:00:00 2001 From: William Butler Date: Fri, 4 Mar 2022 15:48:33 -0800 Subject: [PATCH] PARQUET-2131: Number values decoded DCHECKs should be exceptions As discussed on some other bugs, there are some parquet-cpp DCHECKs on the number of values decoded that really should be exceptions. When invalid Parquet files are read, it is possible for the decoders to return less values than expected and this should be signaled back to the user even in non-debug mode and it should not be a crash in debug mode. Closes #12490 from tachyonwill/arrow_decode_check Authored-by: William Butler Signed-off-by: Micah Kornfield --- cpp/src/parquet/column_reader.cc | 21 +++++++++++++++------ testing | 2 +- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index c31cf7eb4072f..76476c5da7382 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -78,6 +78,15 @@ inline bool HasSpacedValues(const ColumnDescriptor* descr) { return false; } } + +// Throws exception if number_decoded does not match expected. +inline void CheckNumberDecoded(int64_t number_decoded, int64_t expected) { + if (ARROW_PREDICT_FALSE(number_decoded != expected)) { + ParquetException::EofException("Decoded values " + std::to_string(number_decoded) + + " does not match expected " + + std::to_string(expected)); + } +} } // namespace LevelDecoder::LevelDecoder() : num_values_remaining_(0) {} @@ -1478,13 +1487,13 @@ class TypedRecordReader : public ColumnReaderImplBase, int64_t num_decoded = this->current_decoder_->DecodeSpaced( ValuesHead(), static_cast(values_with_nulls), static_cast(null_count), valid_bits, valid_bits_offset); - DCHECK_EQ(num_decoded, values_with_nulls); + CheckNumberDecoded(num_decoded, values_with_nulls); } virtual void ReadValuesDense(int64_t values_to_read) { int64_t num_decoded = this->current_decoder_->Decode(ValuesHead(), static_cast(values_to_read)); - DCHECK_EQ(num_decoded, values_to_read); + CheckNumberDecoded(num_decoded, values_to_read); } // Return number of logical records read @@ -1612,7 +1621,7 @@ class FLBARecordReader : public TypedRecordReader, auto values = ValuesHead(); int64_t num_decoded = this->current_decoder_->Decode(values, static_cast(values_to_read)); - DCHECK_EQ(num_decoded, values_to_read); + CheckNumberDecoded(num_decoded, values_to_read); for (int64_t i = 0; i < num_decoded; i++) { PARQUET_THROW_NOT_OK(builder_->Append(values[i].ptr)); @@ -1668,7 +1677,7 @@ class ByteArrayChunkedRecordReader : public TypedRecordReader, void ReadValuesDense(int64_t values_to_read) override { int64_t num_decoded = this->current_decoder_->DecodeArrowNonNull( static_cast(values_to_read), &accumulator_); - DCHECK_EQ(num_decoded, values_to_read); + CheckNumberDecoded(num_decoded, values_to_read); ResetValues(); } @@ -1676,7 +1685,7 @@ class ByteArrayChunkedRecordReader : public TypedRecordReader, int64_t num_decoded = this->current_decoder_->DecodeArrow( static_cast(values_to_read), static_cast(null_count), valid_bits_->mutable_data(), values_written_, &accumulator_); - DCHECK_EQ(num_decoded, values_to_read - null_count); + CheckNumberDecoded(num_decoded, values_to_read - null_count); ResetValues(); } @@ -1737,7 +1746,7 @@ class ByteArrayDictionaryRecordReader : public TypedRecordReader, /// Flush values since they have been copied into the builder ResetValues(); } - DCHECK_EQ(num_decoded, values_to_read); + CheckNumberDecoded(num_decoded, values_to_read); } void ReadValuesSpaced(int64_t values_to_read, int64_t null_count) override { diff --git a/testing b/testing index d2e832754b0ae..53b498047109d 160000 --- a/testing +++ b/testing @@ -1 +1 @@ -Subproject commit d2e832754b0ae1fb0b9a5f637b44d68fa4bf6989 +Subproject commit 53b498047109d9940fcfab388bd9d6aeb8c57425