Skip to content

Commit

Permalink
GH-41545: [C++][Parquet] Fix DeltaLengthByteArrayEncoder::EstimatedDa…
Browse files Browse the repository at this point in the history
…taEncodedSize (#41546)

### Rationale for this change

`DeltaLengthByteArrayEncoder::EstimatedDataEncodedSize` would return an wrong estimate when `Put(const Array&)` was called.

### What changes are included in this PR?

Remove `encoded_size_` and uses `sink_.length()` as `encoded_size_`.

### Are these changes tested?

Yes

### Are there any user-facing changes?

No

* GitHub Issue: #41545

Authored-by: mwish <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
  • Loading branch information
mapleFU authored May 7, 2024
1 parent 9cf0ee7 commit 51689a0
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 8 deletions.
18 changes: 10 additions & 8 deletions cpp/src/parquet/encoding.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2740,13 +2740,12 @@ class DeltaLengthByteArrayEncoder : public EncoderImpl,
: EncoderImpl(descr, Encoding::DELTA_LENGTH_BYTE_ARRAY,
pool = ::arrow::default_memory_pool()),
sink_(pool),
length_encoder_(nullptr, pool),
encoded_size_{0} {}
length_encoder_(nullptr, pool) {}

std::shared_ptr<Buffer> FlushValues() override;

int64_t EstimatedDataEncodedSize() override {
return encoded_size_ + length_encoder_.EstimatedDataEncodedSize();
return sink_.length() + length_encoder_.EstimatedDataEncodedSize();
}

using TypedEncoder<ByteArrayType>::Put;
Expand All @@ -2768,6 +2767,11 @@ class DeltaLengthByteArrayEncoder : public EncoderImpl,
return Status::Invalid(
"Parquet cannot store strings with size 2GB or more, got: ", view.size());
}
if (ARROW_PREDICT_FALSE(
view.size() + sink_.length() >
static_cast<size_t>(std::numeric_limits<int32_t>::max()))) {
return Status::Invalid("excess expansion in DELTA_LENGTH_BYTE_ARRAY");
}
length_encoder_.Put({static_cast<int32_t>(view.length())}, 1);
PARQUET_THROW_NOT_OK(sink_.Append(view.data(), view.length()));
return Status::OK();
Expand All @@ -2777,7 +2781,6 @@ class DeltaLengthByteArrayEncoder : public EncoderImpl,

::arrow::BufferBuilder sink_;
DeltaBitPackEncoder<Int32Type> length_encoder_;
uint32_t encoded_size_;
};

template <typename DType>
Expand All @@ -2803,15 +2806,15 @@ void DeltaLengthByteArrayEncoder<DType>::Put(const T* src, int num_values) {
const int batch_size = std::min(kBatchSize, num_values - idx);
for (int j = 0; j < batch_size; ++j) {
const int32_t len = src[idx + j].len;
if (AddWithOverflow(total_increment_size, len, &total_increment_size)) {
if (ARROW_PREDICT_FALSE(
AddWithOverflow(total_increment_size, len, &total_increment_size))) {
throw ParquetException("excess expansion in DELTA_LENGTH_BYTE_ARRAY");
}
lengths[j] = len;
}
length_encoder_.Put(lengths.data(), batch_size);
}

if (AddWithOverflow(encoded_size_, total_increment_size, &encoded_size_)) {
if (sink_.length() + total_increment_size > std::numeric_limits<int32_t>::max()) {
throw ParquetException("excess expansion in DELTA_LENGTH_BYTE_ARRAY");
}
PARQUET_THROW_NOT_OK(sink_.Reserve(total_increment_size));
Expand Down Expand Up @@ -2850,7 +2853,6 @@ std::shared_ptr<Buffer> DeltaLengthByteArrayEncoder<DType>::FlushValues() {

std::shared_ptr<Buffer> buffer;
PARQUET_THROW_NOT_OK(sink_.Finish(&buffer, true));
encoded_size_ = 0;
return buffer;
}

Expand Down
9 changes: 9 additions & 0 deletions cpp/src/parquet/encoding_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -577,6 +577,11 @@ TEST(PlainEncodingAdHoc, ArrowBinaryDirectPut) {
auto decoder = MakeTypedDecoder<ByteArrayType>(Encoding::PLAIN);

ASSERT_NO_THROW(encoder->Put(*values));
// For Plain encoding, the estimated size should be at least the total byte size
auto& string_array = dynamic_cast<const ::arrow::StringArray&>(*values);
EXPECT_GE(encoder->EstimatedDataEncodedSize(), string_array.total_values_length())
<< "Estimated size should be at least the total byte size";

auto buf = encoder->FlushValues();

int num_values = static_cast<int>(values->length() - values->null_count());
Expand Down Expand Up @@ -2160,6 +2165,10 @@ TEST(DeltaLengthByteArrayEncodingAdHoc, ArrowBinaryDirectPut) {

auto CheckSeed = [&](std::shared_ptr<::arrow::Array> values) {
ASSERT_NO_THROW(encoder->Put(*values));
auto* binary_array = checked_cast<const ::arrow::BinaryArray*>(values.get());
// For DeltaLength encoding, the estimated size should be at least the total byte size
EXPECT_GE(encoder->EstimatedDataEncodedSize(), binary_array->total_values_length())
<< "Estimated size should be at least the total byte size";
auto buf = encoder->FlushValues();

int num_values = static_cast<int>(values->length() - values->null_count());
Expand Down

0 comments on commit 51689a0

Please sign in to comment.