From a9c767de9b673a16818a0422958b5231d61b648f Mon Sep 17 00:00:00 2001 From: mwish Date: Sun, 8 Sep 2024 21:06:16 +0800 Subject: [PATCH] enhance the document for column-reader --- cpp/src/parquet/column_reader.cc | 32 +++++++++++++++++++++++++------- cpp/src/parquet/column_reader.h | 9 +++++++-- cpp/src/parquet/decoder.cc | 2 ++ 3 files changed, 34 insertions(+), 9 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 60a8a2176b0a8..38a899764ea3d 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -2050,6 +2050,14 @@ class TypedRecordReader : public TypedColumnReaderImpl, LevelInfo leaf_info_; }; +/// In FLBARecordReader, we read fixed length byte array values. +/// +/// Unlike other fixed length types, the `values_` buffer is not used to store +/// values, instead we use `data_builder_` to store the values, and `null_bitmap_builder_` +/// is used to store the null bitmap. +/// +/// The `values_` buffer is used to store the temporary values for `Decode`, and it would +/// be Reset after each `Decode` call. The `valid_bits_` buffer is never used. class FLBARecordReader final : public TypedRecordReader, virtual public BinaryRecordReader { public: @@ -2134,6 +2142,13 @@ class FLBARecordReader final : public TypedRecordReader, ::arrow::BufferBuilder data_builder_; }; +/// ByteArrayChunkedRecordReader reads variable length byte array values. +/// +/// It only calls `DecodeArrowNonNull` and `DecodeArrow` to read values, and +/// `Decode` and `DecodeSpaced` are not used. +/// +/// The `values_` buffer is never used, and the `accumulator_` +/// is used to store the values. 
class ByteArrayChunkedRecordReader final : public TypedRecordReader, virtual public BinaryRecordReader { public: @@ -2147,7 +2162,7 @@ class ByteArrayChunkedRecordReader final : public TypedRecordReaderlength() > 0) { + if (result.empty() || accumulator_.builder->length() > 0) { std::shared_ptr<::arrow::Array> last_chunk; PARQUET_THROW_NOT_OK(accumulator_.builder->Finish(&last_chunk)); result.push_back(std::move(last_chunk)); @@ -2176,6 +2191,11 @@ class ByteArrayChunkedRecordReader final : public TypedRecordReader::Accumulator accumulator_; }; +/// ByteArrayDictionaryRecordReader reads ::arrow::dictionary(index: int32, values: +/// binary). +/// +/// If the underlying column is dictionary encoded, it will call `DecodeIndices` to read, +/// otherwise it will call `DecodeArrowNonNull` to read. class ByteArrayDictionaryRecordReader final : public TypedRecordReader, virtual public DictionaryRecordReader { public: @@ -2225,10 +2245,9 @@ class ByteArrayDictionaryRecordReader final : public TypedRecordReadercurrent_decoder_->DecodeArrowNonNull( static_cast(values_to_read), &builder_); - - /// Flush values since they have been copied into the builder - ResetValues(); } + // Flush values since they have been copied into the builder + ResetValues(); CheckNumberDecoded(num_decoded, values_to_read); } @@ -2244,11 +2263,10 @@ class ByteArrayDictionaryRecordReader final : public TypedRecordReadercurrent_decoder_->DecodeArrow( static_cast(values_to_read), static_cast(null_count), valid_bits_->mutable_data(), values_written_, &builder_); - - /// Flush values since they have been copied into the builder - ResetValues(); } ARROW_DCHECK_EQ(num_decoded, values_to_read - null_count); + // Flush values since they have been copied into the builder + ResetValues(); } private: diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index 29e1b2a25e437..61d79d4f9b1b3 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -446,7 +446,9 @@ 
class PARQUET_EXPORT RecordReader { int64_t null_count_; /// \brief Each bit corresponds to one element in 'values_' and specifies if it - /// is null or not null. Not set if read_dense_for_nullable_ is true. + /// is null or not null. + /// + /// Not set if leaf type is not nullable or read_dense_for_nullable_ is true. std::shared_ptr<::arrow::ResizableBuffer> valid_bits_; /// \brief Buffer for definition levels. May contain more levels than @@ -471,7 +473,10 @@ class PARQUET_EXPORT RecordReader { bool read_dictionary_ = false; // If true, we will not leave any space for the null values in the values_ - // vector. + // vector or fill null values in BinaryRecordReader/DictionaryRecordReader. + // + // If read_dense_for_nullable_ is true, the BinaryRecordReader/DictionaryRecordReader + // might still populate the validity bitmap buffer. bool read_dense_for_nullable_ = false; }; diff --git a/cpp/src/parquet/decoder.cc b/cpp/src/parquet/decoder.cc index 70810461605b1..7063f423096dc 100644 --- a/cpp/src/parquet/decoder.cc +++ b/cpp/src/parquet/decoder.cc @@ -2047,6 +2047,8 @@ class DeltaByteArrayDecoderImpl : public DecoderImpl, public TypedDecoderImpl
buffered_prefix_length_; + // Buffer for decoded strings, which guarantees that the decoded strings stay valid + // until the next call of Decode. std::shared_ptr buffered_data_; };