Skip to content

Commit

Permalink
enhance the document for column-reader
Browse files Browse the repository at this point in the history
  • Loading branch information
mapleFU committed Sep 8, 2024
1 parent 5549fa9 commit a9c767d
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 9 deletions.
32 changes: 25 additions & 7 deletions cpp/src/parquet/column_reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2050,6 +2050,14 @@ class TypedRecordReader : public TypedColumnReaderImpl<DType>,
LevelInfo leaf_info_;
};

/// In FLBARecordReader, we read fixed length byte array values.
///
/// Unlike other fixed length types, the `values_` buffer is not used to store
/// values. Instead, `data_builder_` stores the values and `null_bitmap_builder_`
/// stores the null bitmap.
///
/// The `values_` buffer is used to store the temporary values for `Decode`, and it is
/// reset after each `Decode` call. The `valid_bits_` buffer is never used.
class FLBARecordReader final : public TypedRecordReader<FLBAType>,
virtual public BinaryRecordReader {
public:
Expand Down Expand Up @@ -2134,6 +2142,13 @@ class FLBARecordReader final : public TypedRecordReader<FLBAType>,
::arrow::BufferBuilder data_builder_;
};

/// ByteArrayChunkedRecordReader reads variable length byte array values.
///
/// It only calls `DecodeArrowNonNull` and `DecodeArrow` to read values, and
/// `Decode` and `DecodeSpaced` are not used.
///
/// The `values_` buffer is never used; the `accumulator_`
/// is used to store the values instead.
class ByteArrayChunkedRecordReader final : public TypedRecordReader<ByteArrayType>,
virtual public BinaryRecordReader {
public:
Expand All @@ -2147,7 +2162,7 @@ class ByteArrayChunkedRecordReader final : public TypedRecordReader<ByteArrayTyp

::arrow::ArrayVector GetBuilderChunks() override {
::arrow::ArrayVector result = accumulator_.chunks;
if (result.size() == 0 || accumulator_.builder->length() > 0) {
if (result.empty() || accumulator_.builder->length() > 0) {
std::shared_ptr<::arrow::Array> last_chunk;
PARQUET_THROW_NOT_OK(accumulator_.builder->Finish(&last_chunk));
result.push_back(std::move(last_chunk));
Expand Down Expand Up @@ -2176,6 +2191,11 @@ class ByteArrayChunkedRecordReader final : public TypedRecordReader<ByteArrayTyp
typename EncodingTraits<ByteArrayType>::Accumulator accumulator_;
};

/// ByteArrayDictionaryRecordReader reads values as ::arrow::dictionary(index: int32,
/// values: binary).
///
/// If the underlying column is dictionary encoded, it calls `DecodeIndices` to read;
/// otherwise it calls `DecodeArrowNonNull` (or `DecodeArrow` when nulls are present).
class ByteArrayDictionaryRecordReader final : public TypedRecordReader<ByteArrayType>,
virtual public DictionaryRecordReader {
public:
Expand Down Expand Up @@ -2225,10 +2245,9 @@ class ByteArrayDictionaryRecordReader final : public TypedRecordReader<ByteArray
} else {
num_decoded = this->current_decoder_->DecodeArrowNonNull(
static_cast<int>(values_to_read), &builder_);

/// Flush values since they have been copied into the builder
ResetValues();
}
// Flush values since they have been copied into the builder
ResetValues();
CheckNumberDecoded(num_decoded, values_to_read);
}

Expand All @@ -2244,11 +2263,10 @@ class ByteArrayDictionaryRecordReader final : public TypedRecordReader<ByteArray
num_decoded = this->current_decoder_->DecodeArrow(
static_cast<int>(values_to_read), static_cast<int>(null_count),
valid_bits_->mutable_data(), values_written_, &builder_);

/// Flush values since they have been copied into the builder
ResetValues();
}
ARROW_DCHECK_EQ(num_decoded, values_to_read - null_count);
// Flush values since they have been copied into the builder
ResetValues();
}

private:
Expand Down
9 changes: 7 additions & 2 deletions cpp/src/parquet/column_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -446,7 +446,9 @@ class PARQUET_EXPORT RecordReader {
int64_t null_count_;

/// \brief Each bit corresponds to one element in 'values_' and specifies if it
/// is null or not null. Not set if read_dense_for_nullable_ is true.
/// is null or not null.
///
/// Not set if the leaf type is not nullable or if read_dense_for_nullable_ is true.
std::shared_ptr<::arrow::ResizableBuffer> valid_bits_;

/// \brief Buffer for definition levels. May contain more levels than
Expand All @@ -471,7 +473,10 @@ class PARQUET_EXPORT RecordReader {

bool read_dictionary_ = false;
// If true, we will not leave any space for the null values in the values_
// vector.
// vector or fill null values in BinaryRecordReader/DictionaryRecordReader.
//
// If read_dense_for_nullable_ is true, the BinaryRecordReader/DictionaryRecordReader
// might still populate the validity bitmap buffer.
bool read_dense_for_nullable_ = false;
};

Expand Down
2 changes: 2 additions & 0 deletions cpp/src/parquet/decoder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2047,6 +2047,8 @@ class DeltaByteArrayDecoderImpl : public DecoderImpl, public TypedDecoderImpl<DT
int num_valid_values_{0};
uint32_t prefix_len_offset_{0};
std::shared_ptr<ResizableBuffer> buffered_prefix_length_;
// Buffer for decoded strings, which guarantees the lifetime of the decoded strings
// until the next call of Decode.
std::shared_ptr<ResizableBuffer> buffered_data_;
};

Expand Down

0 comments on commit a9c767d

Please sign in to comment.