Skip to content

Commit

Permalink
apacheGH-44579: [C++] Use array type to compute min/max statistics arrow type
Browse files Browse the repository at this point in the history
  • Loading branch information
kou committed Dec 21, 2024
1 parent 164e18a commit 3dc999c
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 11 deletions.
64 changes: 57 additions & 7 deletions cpp/src/arrow/array/statistics.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
#include <string>
#include <variant>

#include "arrow/type_fwd.h"
#include "arrow/type.h"
#include "arrow/util/visibility.h"

namespace arrow {
Expand All @@ -34,22 +34,38 @@ namespace arrow {
/// as Apache Parquet may have statistics. Statistics associated with
/// a data source can be read through a unified API via this class.
struct ARROW_EXPORT ArrayStatistics {
/// \brief The type for maximum and minimum values.
///
/// When a minimum or maximum value exists, it is held as one of the
/// alternatives of this variant. `std::nullopt` is used when there is
/// no value.
using ValueType = std::variant<bool, int64_t, uint64_t, double, std::string>;

/// \brief Compute the Arrow type that corresponds to a statistics value.
///
/// NOTE(review): this span shows both the one-argument and the
/// two-argument form of the parameter list and of the `std::string`
/// visitor overload; the description below follows the two-argument
/// form that takes `array_type`.
///
/// \param value The optional statistics value to inspect.
/// \param array_type The Arrow type of the associated array. It is
/// consulted only when `value` holds a `std::string`.
///
/// \return The result of `null()` if `value` is `std::nullopt`,
/// otherwise the type selected by the visitor for the alternative
/// held by `value`. For a `std::string` value the returned reference
/// may be `array_type` itself.
static const std::shared_ptr<DataType>& ValueToArrowType(
const std::optional<ValueType>& value) {
const std::optional<ValueType>& value,
const std::shared_ptr<DataType>& array_type) {
// No value: report the null type.
if (!value.has_value()) {
return null();
}

// Maps each alternative of ValueType to an Arrow type.
struct Visitor {
// The array's type, kept by reference for the std::string overload.
const std::shared_ptr<DataType>& array_type;

// Non-string alternatives map to one fixed Arrow type each.
const std::shared_ptr<DataType>& operator()(const bool&) { return boolean(); }
const std::shared_ptr<DataType>& operator()(const int64_t&) { return int64(); }
const std::shared_ptr<DataType>& operator()(const uint64_t&) { return uint64(); }
const std::shared_ptr<DataType>& operator()(const double&) { return float64(); }
// GH-44579: How to support binary data?
const std::shared_ptr<DataType>& operator()(const std::string&) { return utf8(); }
} visitor;
const std::shared_ptr<DataType>& operator()(const std::string&) {
// A std::string value is reported with the array's own type when
// that type is one of the string/binary types listed here; any
// other array type falls back to utf8().
switch (array_type->id()) {
case Type::STRING:
case Type::BINARY:
case Type::FIXED_SIZE_BINARY:
case Type::LARGE_STRING:
case Type::LARGE_BINARY:
return array_type;
default:
return utf8();
}
}
} visitor{array_type};
// Dispatch on the alternative currently held by the value.
return std::visit(visitor, value.value());
}

Expand All @@ -62,15 +78,49 @@ struct ARROW_EXPORT ArrayStatistics {
/// \brief The minimum value, may not be set
std::optional<ValueType> min = std::nullopt;

const std::shared_ptr<DataType>& MinArrowType() { return ValueToArrowType(min); }
/// \brief Compute Arrow type of the minimum value.
///
/// If \ref ValueType is `std::string`, `array_type` may be
/// used. If `array_type` is a binary-like type such as \ref
/// arrow::binary and \ref arrow::large_utf8, `array_type` is
/// returned. \ref arrow::utf8 is returned otherwise.
///
/// If \ref ValueType isn't `std::string`, `array_type` isn't used.
///
/// \param array_type The Arrow type of the associated array.
///
/// \return \ref arrow::null if the minimum value is `std::nullopt`,
/// Arrow type based on \ref ValueType of the \ref min
/// otherwise.
const std::shared_ptr<DataType>& MinArrowType(
const std::shared_ptr<DataType>& array_type) {
return ValueToArrowType(min, array_type);
}

/// \brief Whether the minimum value is exact or not
bool is_min_exact = false;

/// \brief The maximum value, may not be set
std::optional<ValueType> max = std::nullopt;

const std::shared_ptr<DataType>& MaxArrowType() { return ValueToArrowType(max); }
/// \brief Compute Arrow type of the maximum value.
///
/// If \ref ValueType is `std::string`, `array_type` may be
/// used. If `array_type` is a binary-like type such as \ref
/// arrow::binary and \ref arrow::large_utf8, `array_type` is
/// returned. \ref arrow::utf8 is returned otherwise.
///
/// If \ref ValueType isn't `std::string`, `array_type` isn't used.
///
/// \param array_type The Arrow type of the associated array.
///
/// \return \ref arrow::null if the maximum value is `std::nullopt`,
/// Arrow type based on \ref ValueType of the \ref max
/// otherwise.
const std::shared_ptr<DataType>& MaxArrowType(
const std::shared_ptr<DataType>& array_type) {
return ValueToArrowType(max, array_type);
}

/// \brief Whether the maximum value is exact or not
bool is_max_exact = false;
Expand Down
8 changes: 5 additions & 3 deletions cpp/src/arrow/record_batch.cc
Original file line number Diff line number Diff line change
Expand Up @@ -493,8 +493,10 @@ Status EnumerateStatistics(const RecordBatch& record_batch, OnStatistics on_stat
RETURN_NOT_OK(on_statistics(statistics));
statistics.start_new_column = false;

const auto num_fields = record_batch.schema()->num_fields();
const auto& schema = record_batch.schema();
const auto num_fields = schema->num_fields();
for (int nth_column = 0; nth_column < num_fields; ++nth_column) {
const auto& field = schema->field(nth_column);
auto column_statistics = record_batch.column(nth_column)->statistics();
if (!column_statistics) {
continue;
Expand Down Expand Up @@ -527,7 +529,7 @@ Status EnumerateStatistics(const RecordBatch& record_batch, OnStatistics on_stat
} else {
statistics.key = ARROW_STATISTICS_KEY_MIN_VALUE_APPROXIMATE;
}
statistics.type = column_statistics->MinArrowType();
statistics.type = column_statistics->MinArrowType(field->type());
statistics.value = column_statistics->min.value();
RETURN_NOT_OK(on_statistics(statistics));
statistics.start_new_column = false;
Expand All @@ -540,7 +542,7 @@ Status EnumerateStatistics(const RecordBatch& record_batch, OnStatistics on_stat
} else {
statistics.key = ARROW_STATISTICS_KEY_MAX_VALUE_APPROXIMATE;
}
statistics.type = column_statistics->MaxArrowType();
statistics.type = column_statistics->MaxArrowType(field->type());
statistics.value = column_statistics->max.value();
RETURN_NOT_OK(on_statistics(statistics));
statistics.start_new_column = false;
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/record_batch_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1116,7 +1116,7 @@ Result<std::shared_ptr<Array>> MakeStatisticsArray(
}
keys_indices.push_back(key_index);

auto values_type = ArrayStatistics::ValueToArrowType(value);
auto values_type = ArrayStatistics::ValueToArrowType(value, arrow::null());
int8_t values_type_code = 0;
for (; values_type_code < static_cast<int32_t>(values_types.size());
++values_type_code) {
Expand Down

0 comments on commit 3dc999c

Please sign in to comment.