From 60708150f1e668ccf51302959921d3a9977d7118 Mon Sep 17 00:00:00 2001 From: mwish Date: Tue, 28 Nov 2023 00:50:54 +0800 Subject: [PATCH] GH-38432: [C++][Parquet] Try to fix performance regression in the DictByteArrayDecoderImpl (#38784) ### Rationale for this change Do some changes mentioned in https://github.com/apache/arrow/issues/38432 I believe this might fix https://github.com/apache/arrow/issues/38577 Problem1: The `BinaryHelper` might call `Prepare()` and `Prepare(estimated-output-binary-length)` for data. This might because: 1. For Plain Encoding ByteArray, the `len_` is similar to the data-page size, so `Reserve` is related. 2. For Dict Encoding. The Data Page is just a RLE encoding Page, it's `len_` might didn't directly related to output-binary. Problem2: `Prepare` using `::arrow::kBinaryMemoryLimit` as min-value, we should use `this->chunk_space_remaining_`. Problem3: `std::optional` is hard to optimize for some compilers ### What changes are included in this PR? Mention the behavior of BinaryHelper. And trying to fix it. ### Are these changes tested? No ### Are there any user-facing changes? Regression fixes * Closes: #38432 Lead-authored-by: mwish Co-authored-by: mwish <1506118561@qq.com> Co-authored-by: Gang Wu Signed-off-by: Antoine Pitrou --- cpp/src/parquet/encoding.cc | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 1bb487c20d3e2..04c6935d2de6e 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1196,24 +1196,40 @@ struct ArrowBinaryHelper { chunk_space_remaining_(::arrow::kBinaryMemoryLimit - acc_->builder->value_data_length()) {} + // Prepare will reserve the number of entries remaining in the current chunk. + // If estimated_data_length is provided, it will also reserve the estimated data length, + // and the caller should better call `UnsafeAppend` instead of `Append` to avoid + // double-checking the data length. Status Prepare(std::optional estimated_data_length = {}) { RETURN_NOT_OK(acc_->builder->Reserve(entries_remaining_)); if (estimated_data_length.has_value()) { RETURN_NOT_OK(acc_->builder->ReserveData( - std::min(*estimated_data_length, ::arrow::kBinaryMemoryLimit))); + std::min(*estimated_data_length, this->chunk_space_remaining_))); } return Status::OK(); } + Status PrepareNextInput(int64_t next_value_length) { + if (ARROW_PREDICT_FALSE(!CanFit(next_value_length))) { + // This element would exceed the capacity of a chunk + RETURN_NOT_OK(PushChunk()); + RETURN_NOT_OK(acc_->builder->Reserve(entries_remaining_)); + } + return Status::OK(); + } + + // If estimated_remaining_data_length is provided, it will also reserve the estimated + // data length, and the caller should better call `UnsafeAppend` instead of + // `Append` to avoid double-checking the data length. Status PrepareNextInput(int64_t next_value_length, - std::optional estimated_remaining_data_length = {}) { + int64_t estimated_remaining_data_length) { if (ARROW_PREDICT_FALSE(!CanFit(next_value_length))) { // This element would exceed the capacity of a chunk RETURN_NOT_OK(PushChunk()); RETURN_NOT_OK(acc_->builder->Reserve(entries_remaining_)); - if (estimated_remaining_data_length.has_value()) { + if (estimated_remaining_data_length) { RETURN_NOT_OK(acc_->builder->ReserveData( - std::min(*estimated_remaining_data_length, chunk_space_remaining_))); + std::min(estimated_remaining_data_length, chunk_space_remaining_))); } } return Status::OK(); @@ -1271,8 +1287,10 @@ struct ArrowBinaryHelper { return acc_->Reserve(entries_remaining_); } + Status PrepareNextInput(int64_t next_value_length) { return Status::OK(); } + Status PrepareNextInput(int64_t next_value_length, - std::optional estimated_remaining_data_length = {}) { + int64_t estimated_remaining_data_length) { return Status::OK(); } @@ -1915,6 +1933,9 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, int32_t indices[kBufferSize]; ArrowBinaryHelper helper(out, num_values); + // The `len_` in the ByteArrayDictDecoder is the total length of the + // RLE/Bit-pack encoded data size, so, we cannot use `len_` to reserve + // space for binary data. RETURN_NOT_OK(helper.Prepare()); auto dict_values = reinterpret_cast(dictionary_->data()); @@ -1983,7 +2004,10 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, int values_decoded = 0; ArrowBinaryHelper helper(out, num_values); - RETURN_NOT_OK(helper.Prepare(len_)); + // The `len_` in the ByteArrayDictDecoder is the total length of the + // RLE/Bit-pack encoded data size, so, we cannot use `len_` to reserve + // space for binary data. + RETURN_NOT_OK(helper.Prepare()); auto dict_values = reinterpret_cast(dictionary_->data());