From 3dc999c918401cd0f0a942c7e6578d30322514f9 Mon Sep 17 00:00:00 2001
From: Sutou Kouhei
Date: Sat, 21 Dec 2024 14:45:54 +0900
Subject: [PATCH] GH-44579: [C++] Use array type to compute min/max statistics arrow type

---
 cpp/src/arrow/array/statistics.h   | 64 ++++++++++++++++++++++++++----
 cpp/src/arrow/record_batch.cc      |  8 ++--
 cpp/src/arrow/record_batch_test.cc |  2 +-
 3 files changed, 63 insertions(+), 11 deletions(-)

diff --git a/cpp/src/arrow/array/statistics.h b/cpp/src/arrow/array/statistics.h
index 99b853ab0fe73..6ccd2f4766e67 100644
--- a/cpp/src/arrow/array/statistics.h
+++ b/cpp/src/arrow/array/statistics.h
@@ -22,7 +22,7 @@
 #include <string>
 #include <variant>
 
-#include "arrow/type_fwd.h"
+#include "arrow/type.h"
 #include "arrow/util/visibility.h"
 
 namespace arrow {
@@ -34,22 +34,38 @@ namespace arrow {
 /// as Apache Parquet may have statistics. Statistics associated with
 /// data source can be read unified API via this class.
 struct ARROW_EXPORT ArrayStatistics {
+  /// \brief The type for maximum and minimum values. If the target
+  /// value exists, one of them is used. `std::nullopt` is used
+  /// otherwise.
   using ValueType = std::variant<bool, int64_t, uint64_t, double, std::string>;
 
   static const std::shared_ptr<DataType>& ValueToArrowType(
-      const std::optional<ValueType>& value) {
+      const std::optional<ValueType>& value,
+      const std::shared_ptr<DataType>& array_type) {
     if (!value.has_value()) {
       return null();
     }
 
     struct Visitor {
+      const std::shared_ptr<DataType>& array_type;
+
       const std::shared_ptr<DataType>& operator()(const bool&) { return boolean(); }
       const std::shared_ptr<DataType>& operator()(const int64_t&) { return int64(); }
       const std::shared_ptr<DataType>& operator()(const uint64_t&) { return uint64(); }
       const std::shared_ptr<DataType>& operator()(const double&) { return float64(); }
-      // GH-44579: How to support binary data?
-      const std::shared_ptr<DataType>& operator()(const std::string&) { return utf8(); }
-    } visitor;
+      const std::shared_ptr<DataType>& operator()(const std::string&) {
+        switch (array_type->id()) {
+          case Type::STRING:
+          case Type::BINARY:
+          case Type::FIXED_SIZE_BINARY:
+          case Type::LARGE_STRING:
+          case Type::LARGE_BINARY:
+            return array_type;
+          default:
+            return utf8();
+        }
+      }
+    } visitor{array_type};
     return std::visit(visitor, value.value());
   }
 
@@ -62,7 +78,24 @@ struct ARROW_EXPORT ArrayStatistics {
   /// \brief The minimum value, may not be set
   std::optional<ValueType> min = std::nullopt;
 
-  const std::shared_ptr<DataType>& MinArrowType() { return ValueToArrowType(min); }
+  /// \brief Compute Arrow type of the minimum value.
+  ///
+  /// If \ref ValueType is `std::string`, `array_type` may be
+  /// used. If `array_type` is a binary-like type such as \ref
+  /// arrow::binary and \ref arrow::large_utf8, `array_type` is
+  /// returned. \ref arrow::utf8 is returned otherwise.
+  ///
+  /// If \ref ValueType isn't `std::string`, `array_type` isn't used.
+  ///
+  /// \param array_type The Arrow type of the associated array.
+  ///
+  /// \return \ref arrow::null if the minimum value is `std::nullopt`,
+  ///         Arrow type based on \ref ValueType of the \ref min
+  ///         otherwise.
+  const std::shared_ptr<DataType>& MinArrowType(
+      const std::shared_ptr<DataType>& array_type) {
+    return ValueToArrowType(min, array_type);
+  }
 
   /// \brief Whether the minimum value is exact or not
   bool is_min_exact = false;
@@ -70,7 +103,24 @@ struct ARROW_EXPORT ArrayStatistics {
   /// \brief The maximum value, may not be set
   std::optional<ValueType> max = std::nullopt;
 
-  const std::shared_ptr<DataType>& MaxArrowType() { return ValueToArrowType(max); }
+  /// \brief Compute Arrow type of the maximum value.
+  ///
+  /// If \ref ValueType is `std::string`, `array_type` may be
+  /// used. If `array_type` is a binary-like type such as \ref
+  /// arrow::binary and \ref arrow::large_utf8, `array_type` is
+  /// returned. \ref arrow::utf8 is returned otherwise.
+  ///
+  /// If \ref ValueType isn't `std::string`, `array_type` isn't used.
+  ///
+  /// \param array_type The Arrow type of the associated array.
+  ///
+  /// \return \ref arrow::null if the maximum value is `std::nullopt`,
+  ///         Arrow type based on \ref ValueType of the \ref max
+  ///         otherwise.
+  const std::shared_ptr<DataType>& MaxArrowType(
+      const std::shared_ptr<DataType>& array_type) {
+    return ValueToArrowType(max, array_type);
+  }
 
   /// \brief Whether the maximum value is exact or not
   bool is_max_exact = false;
diff --git a/cpp/src/arrow/record_batch.cc b/cpp/src/arrow/record_batch.cc
index 3f8237188dc75..5ce33a3731e7e 100644
--- a/cpp/src/arrow/record_batch.cc
+++ b/cpp/src/arrow/record_batch.cc
@@ -493,8 +493,10 @@ Status EnumerateStatistics(const RecordBatch& record_batch, OnStatistics on_stat
   RETURN_NOT_OK(on_statistics(statistics));
   statistics.start_new_column = false;
 
-  const auto num_fields = record_batch.schema()->num_fields();
+  const auto& schema = record_batch.schema();
+  const auto num_fields = schema->num_fields();
   for (int nth_column = 0; nth_column < num_fields; ++nth_column) {
+    const auto& field = schema->field(nth_column);
     auto column_statistics = record_batch.column(nth_column)->statistics();
     if (!column_statistics) {
       continue;
@@ -527,7 +529,7 @@ Status EnumerateStatistics(const RecordBatch& record_batch, OnStatistics on_stat
       } else {
         statistics.key = ARROW_STATISTICS_KEY_MIN_VALUE_APPROXIMATE;
       }
-      statistics.type = column_statistics->MinArrowType();
+      statistics.type = column_statistics->MinArrowType(field->type());
       statistics.value = column_statistics->min.value();
       RETURN_NOT_OK(on_statistics(statistics));
       statistics.start_new_column = false;
@@ -540,7 +542,7 @@ Status EnumerateStatistics(const RecordBatch& record_batch, OnStatistics on_stat
       } else {
         statistics.key = ARROW_STATISTICS_KEY_MAX_VALUE_APPROXIMATE;
       }
-      statistics.type = column_statistics->MaxArrowType();
+      statistics.type = column_statistics->MaxArrowType(field->type());
       statistics.value = column_statistics->max.value();
       RETURN_NOT_OK(on_statistics(statistics));
       statistics.start_new_column = false;
diff --git a/cpp/src/arrow/record_batch_test.cc b/cpp/src/arrow/record_batch_test.cc
index 21202c6acb05a..21d51ae5068b6 100644
--- a/cpp/src/arrow/record_batch_test.cc
+++ b/cpp/src/arrow/record_batch_test.cc
@@ -1116,7 +1116,7 @@ Result<std::shared_ptr<Array>> MakeStatisticsArray(
       }
       keys_indices.push_back(key_index);
 
-      auto values_type = ArrayStatistics::ValueToArrowType(value);
+      auto values_type = ArrayStatistics::ValueToArrowType(value, arrow::null());
       int8_t values_type_code = 0;
       for (; values_type_code < static_cast<int8_t>(values_types.size());
            ++values_type_code) {