From 192d232d41f2a61ae45ab520af870675af674355 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 20 Sep 2024 06:39:10 +0900 Subject: [PATCH] GH-44008: [C++][Parquet] Add support for arrow::ArrayStatistics: boolean (#44009) ### Rationale for this change Statistics is useful for fast processing. Target types: * `Boolean` ### What changes are included in this PR? Map `ColumnChunkMetaData` information to `arrow::ArrayStatistics`. ### Are these changes tested? Yes. ### Are there any user-facing changes? Yes. * GitHub Issue: #44008 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- .../parquet/arrow/arrow_statistics_test.cc | 4 ++ cpp/src/parquet/arrow/reader_internal.cc | 40 ++++++++++++------- 2 files changed, 30 insertions(+), 14 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_statistics_test.cc b/cpp/src/parquet/arrow/arrow_statistics_test.cc index 5011bf89112c6..a8e2287d37085 100644 --- a/cpp/src/parquet/arrow/arrow_statistics_test.cc +++ b/cpp/src/parquet/arrow/arrow_statistics_test.cc @@ -248,6 +248,10 @@ void TestStatisticsReadArray(std::shared_ptr<::arrow::DataType> arrow_type) { } } // namespace +TEST(TestStatisticsRead, Boolean) { + TestStatisticsReadArray<::arrow::BooleanType, bool>(::arrow::boolean()); +} + TEST(TestStatisticsRead, Int8) { TestStatisticsReadArray<::arrow::Int8Type, int64_t>(::arrow::int8()); } diff --git a/cpp/src/parquet/arrow/reader_internal.cc b/cpp/src/parquet/arrow/reader_internal.cc index aa84a7a92bbe1..9d3171ea1a95d 100644 --- a/cpp/src/parquet/arrow/reader_internal.cc +++ b/cpp/src/parquet/arrow/reader_internal.cc @@ -342,21 +342,24 @@ void AttachStatistics(::arrow::ArrayData* data, static_cast<::parquet::TypedStatistics<ParquetType>*>(statistics); const ArrowCType min = typed_statistics->min(); const ArrowCType max = typed_statistics->max(); - if (std::is_floating_point<ArrowCType>::value) { + if constexpr (std::is_same<ArrowCType, bool>::value) { + array_statistics->min = static_cast<bool>(min); + array_statistics->max = static_cast<bool>(max); + } else if constexpr
(std::is_floating_point<ArrowCType>::value) { array_statistics->min = static_cast<double>(min); array_statistics->max = static_cast<double>(max); - } else if (std::is_signed<ArrowCType>::value) { + } else if constexpr (std::is_signed<ArrowCType>::value) { array_statistics->min = static_cast<int64_t>(min); array_statistics->max = static_cast<int64_t>(max); } else { array_statistics->min = static_cast<uint64_t>(min); array_statistics->max = static_cast<uint64_t>(max); } - // We can assume that integer and floating point number based - // min/max are always exact if they exist. Apache Parquet's - // "Statistics" has "is_min_value_exact" and - // "is_max_value_exact" but we can ignore them for integer and - // floating point number based min/max. + // We can assume that integer/floating point number/boolean + // based min/max are always exact if they exist. Apache + // Parquet's "Statistics" has "is_min_value_exact" and + // "is_max_value_exact" but we can ignore them for integer/ + // floating point number/boolean based min/max. // // See also the discussion at dev@parquet.apache.org: // https://lists.apache.org/thread/zfnmg5p51b7oylft5w5k4670wgkd4zv4 @@ -414,11 +417,13 @@ std::shared_ptr<Array> TransferZeroCopy( return ::arrow::MakeArray(std::move(data)); } -Status TransferBool(RecordReader* reader, bool nullable, MemoryPool* pool, Datum* out) { +Status TransferBool(RecordReader* reader, + std::unique_ptr<::parquet::ColumnChunkMetaData> metadata, + const ReaderContext* ctx, bool nullable, Datum* out) { int64_t length = reader->values_written(); const int64_t buffer_size = bit_util::BytesForBits(length); - ARROW_ASSIGN_OR_RAISE(auto data, ::arrow::AllocateBuffer(buffer_size, pool)); + ARROW_ASSIGN_OR_RAISE(auto data, ::arrow::AllocateBuffer(buffer_size, ctx->pool)); // Transfer boolean values to packed bitmap auto values = reinterpret_cast<const bool*>(reader->values()); @@ -431,13 +436,19 @@ Status TransferBool(RecordReader* reader, bool nullable, MemoryPool* pool, Datum } } + std::shared_ptr<::arrow::ArrayData> array_data; if (nullable) { - *out = std::make_shared<BooleanArray>(length,
std::move(data), - reader->ReleaseIsValid(), reader->null_count()); + array_data = ::arrow::ArrayData::Make(::arrow::boolean(), length, + {reader->ReleaseIsValid(), std::move(data)}, + reader->null_count()); } else { - *out = std::make_shared<BooleanArray>(length, std::move(data), - /*null_bitmap=*/nullptr, /*null_count=*/0); + array_data = ::arrow::ArrayData::Make(::arrow::boolean(), length, + {/*null_bitmap=*/nullptr, std::move(data)}, + /*null_count=*/0); } + AttachStatistics<::arrow::BooleanType, BooleanType>(array_data.get(), + std::move(metadata), ctx); + *out = std::make_shared<BooleanArray>(std::move(array_data)); return Status::OK(); } @@ -833,7 +844,8 @@ Status TransferColumnData(RecordReader* reader, reader, std::move(metadata), ctx, value_field); break; case ::arrow::Type::BOOL: - RETURN_NOT_OK(TransferBool(reader, std::move(metadata), ctx, + value_field->nullable(), &result)); break; TRANSFER_INT32(UINT8, ::arrow::UInt8Type); TRANSFER_INT32(INT8, ::arrow::Int8Type);