From 8ee9b69330f46bda928b4fb81b1285515412a062 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 23 Aug 2024 14:32:07 +0900 Subject: [PATCH] GH-43797: [C++] Attach `arrow::ArrayStatistics` to `arrow::ArrayData` If we can attach associated statistics to an array via `ArrayData`, we can use them in later processing steps such as query planning. --- cpp/src/arrow/array/array_base.h | 8 +++ cpp/src/arrow/array/array_test.cc | 102 ++++++++++++++++++++++++++++++ cpp/src/arrow/array/data.h | 12 +++- 3 files changed, 120 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/array/array_base.h b/cpp/src/arrow/array/array_base.h index 716ae0722069e..c36d4518bdbd9 100644 --- a/cpp/src/arrow/array/array_base.h +++ b/cpp/src/arrow/array/array_base.h @@ -232,6 +232,14 @@ class ARROW_EXPORT Array { /// \return DeviceAllocationType DeviceAllocationType device_type() const { return data_->device_type(); } + /// \brief Return the statistics of this Array + /// + /// This just returns the statistics member of the underlying ArrayData + /// object which backs this Array. 
+ /// + /// \return const ArrayStatistics& + const ArrayStatistics& statistics() const { return data_->statistics; } + protected: Array() = default; ARROW_DEFAULT_MOVE_AND_ASSIGN(Array); diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index 32806d9d2edb3..7fdf60fbf3d40 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -3709,6 +3709,108 @@ TEST(TestSwapEndianArrayData, InvalidLength) { } } +class TestArrayDataStatistics : public ::testing::Test { + public: + void SetUp() { + valids_ = {1, 0, 1, 1}; + null_count_ = std::count(valids_.begin(), valids_.end(), 0); + null_buffer_ = *internal::BytesToBits(valids_); + values_ = {1, 0, 3, -4}; + min_ = *std::min_element(values_.begin(), values_.end()); + max_ = *std::max_element(values_.begin(), values_.end()); + values_buffer_ = Buffer::FromVector(values_); + data_ = ArrayData::Make(int32(), values_.size(), {null_buffer_, values_buffer_}, + null_count_); + data_->statistics.null_count = null_count_; + data_->statistics.min = min_; + data_->statistics.is_min_exact = true; + data_->statistics.max = max_; + data_->statistics.is_max_exact = true; + } + + protected: + std::vector valids_; + size_t null_count_; + std::shared_ptr null_buffer_; + std::vector values_; + int64_t min_; + int64_t max_; + std::shared_ptr values_buffer_; + std::shared_ptr data_; +}; + +TEST_F(TestArrayDataStatistics, MoveConstructor) { + ArrayData copied_data(*data_); + ArrayData moved_data(std::move(copied_data)); + + ASSERT_TRUE(moved_data.statistics.null_count.has_value()); + ASSERT_EQ(null_count_, moved_data.statistics.null_count.value()); + + ASSERT_TRUE(moved_data.statistics.min.has_value()); + ASSERT_TRUE(std::holds_alternative(moved_data.statistics.min.value())); + ASSERT_EQ(min_, std::get(moved_data.statistics.min.value())); + ASSERT_TRUE(moved_data.statistics.is_min_exact); + + ASSERT_TRUE(moved_data.statistics.max.has_value()); + 
ASSERT_TRUE(std::holds_alternative(moved_data.statistics.max.value())); + ASSERT_EQ(max_, std::get(moved_data.statistics.max.value())); + ASSERT_TRUE(moved_data.statistics.is_max_exact); +} + +TEST_F(TestArrayDataStatistics, CopyConstructor) { + ArrayData copied_data(*data_); + + ASSERT_TRUE(copied_data.statistics.null_count.has_value()); + ASSERT_EQ(null_count_, copied_data.statistics.null_count.value()); + + ASSERT_TRUE(copied_data.statistics.min.has_value()); + ASSERT_TRUE(std::holds_alternative(copied_data.statistics.min.value())); + ASSERT_EQ(min_, std::get(copied_data.statistics.min.value())); + ASSERT_TRUE(copied_data.statistics.is_min_exact); + + ASSERT_TRUE(copied_data.statistics.max.has_value()); + ASSERT_TRUE(std::holds_alternative(copied_data.statistics.max.value())); + ASSERT_EQ(max_, std::get(copied_data.statistics.max.value())); + ASSERT_TRUE(copied_data.statistics.is_max_exact); +} + +TEST_F(TestArrayDataStatistics, MoveAssignment) { + ArrayData copied_data(*data_); + ArrayData moved_data; + moved_data = std::move(copied_data); + + ASSERT_TRUE(moved_data.statistics.null_count.has_value()); + ASSERT_EQ(null_count_, moved_data.statistics.null_count.value()); + + ASSERT_TRUE(moved_data.statistics.min.has_value()); + ASSERT_TRUE(std::holds_alternative(moved_data.statistics.min.value())); + ASSERT_EQ(min_, std::get(moved_data.statistics.min.value())); + ASSERT_TRUE(moved_data.statistics.is_min_exact); + + ASSERT_TRUE(moved_data.statistics.max.has_value()); + ASSERT_TRUE(std::holds_alternative(moved_data.statistics.max.value())); + ASSERT_EQ(max_, std::get(moved_data.statistics.max.value())); + ASSERT_TRUE(moved_data.statistics.is_max_exact); +} + +TEST_F(TestArrayDataStatistics, CopyAssignment) { + ArrayData copied_data; + copied_data = *data_; + + ASSERT_TRUE(copied_data.statistics.null_count.has_value()); + ASSERT_EQ(null_count_, copied_data.statistics.null_count.value()); + + ASSERT_TRUE(copied_data.statistics.min.has_value()); + 
ASSERT_TRUE(std::holds_alternative(copied_data.statistics.min.value())); + ASSERT_EQ(min_, std::get(copied_data.statistics.min.value())); + ASSERT_TRUE(copied_data.statistics.is_min_exact); + + ASSERT_TRUE(copied_data.statistics.max.has_value()); + ASSERT_TRUE(std::holds_alternative(copied_data.statistics.max.value())); + ASSERT_EQ(max_, std::get(copied_data.statistics.max.value())); + ASSERT_TRUE(copied_data.statistics.is_max_exact); +} + template class TestPrimitiveArray : public ::testing::Test { public: diff --git a/cpp/src/arrow/array/data.h b/cpp/src/arrow/array/data.h index e0508fe6980a7..14eaed67e71d7 100644 --- a/cpp/src/arrow/array/data.h +++ b/cpp/src/arrow/array/data.h @@ -24,6 +24,7 @@ #include #include +#include "arrow/array/statistics.h" #include "arrow/buffer.h" #include "arrow/result.h" #include "arrow/type.h" @@ -152,7 +153,8 @@ struct ARROW_EXPORT ArrayData { offset(other.offset), buffers(std::move(other.buffers)), child_data(std::move(other.child_data)), - dictionary(std::move(other.dictionary)) { + dictionary(std::move(other.dictionary)), + statistics(std::move(other.statistics)) { SetNullCount(other.null_count); } @@ -163,7 +165,8 @@ struct ARROW_EXPORT ArrayData { offset(other.offset), buffers(other.buffers), child_data(other.child_data), - dictionary(other.dictionary) { + dictionary(other.dictionary), + statistics(other.statistics) { SetNullCount(other.null_count); } @@ -176,6 +179,7 @@ struct ARROW_EXPORT ArrayData { buffers = std::move(other.buffers); child_data = std::move(other.child_data); dictionary = std::move(other.dictionary); + statistics = std::move(other.statistics); return *this; } @@ -188,6 +192,7 @@ struct ARROW_EXPORT ArrayData { buffers = other.buffers; child_data = other.child_data; dictionary = other.dictionary; + statistics = other.statistics; return *this; } @@ -390,6 +395,9 @@ struct ARROW_EXPORT ArrayData { // The dictionary for this Array, if any. 
Only used for dictionary type std::shared_ptr dictionary; + + // The statistics for this Array. + ArrayStatistics statistics{}; }; /// \brief A non-owning Buffer reference