Skip to content

Commit

Permalink
apacheGH-43797: [C++] Attach arrow::ArrayStatistics to `arrow::Arra…
Browse files Browse the repository at this point in the history
…yData`

If we can attach associated statistics to an array via `ArrayData`, we
can use it in later processes such as query planning.
  • Loading branch information
kou committed Aug 30, 2024
1 parent 07420b0 commit 0002140
Show file tree
Hide file tree
Showing 3 changed files with 120 additions and 2 deletions.
8 changes: 8 additions & 0 deletions cpp/src/arrow/array/array_base.h
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,14 @@ class ARROW_EXPORT Array {
/// \return DeviceAllocationType
DeviceAllocationType device_type() const { return data_->device_type(); }

/// \brief Return the statistics of this Array
///
/// This just delegates to calling statistics on the underlying ArrayData
/// object which backs this Array.
///
/// \return const ArrayStatistics&
const ArrayStatistics& statistics() const { return data_->statistics; }

protected:
Array() = default;
ARROW_DEFAULT_MOVE_AND_ASSIGN(Array);
Expand Down
102 changes: 102 additions & 0 deletions cpp/src/arrow/array/array_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3709,6 +3709,108 @@ TEST(TestSwapEndianArrayData, InvalidLength) {
}
}

class TestArrayDataStatistics : public ::testing::Test {
public:
void SetUp() {
valids_ = {1, 0, 1, 1};
null_count_ = std::count(valids_.begin(), valids_.end(), 0);
null_buffer_ = *internal::BytesToBits(valids_);
values_ = {1, 0, 3, -4};
min_ = *std::min_element(values_.begin(), values_.end());
max_ = *std::max_element(values_.begin(), values_.end());
values_buffer_ = Buffer::FromVector(values_);
data_ = ArrayData::Make(int32(), values_.size(), {null_buffer_, values_buffer_},
null_count_);
data_->statistics.null_count = null_count_;
data_->statistics.min = min_;
data_->statistics.is_min_exact = true;
data_->statistics.max = max_;
data_->statistics.is_max_exact = true;
}

protected:
std::vector<uint8_t> valids_;
size_t null_count_;
std::shared_ptr<Buffer> null_buffer_;
std::vector<int32_t> values_;
int64_t min_;
int64_t max_;
std::shared_ptr<Buffer> values_buffer_;
std::shared_ptr<ArrayData> data_;
};

TEST_F(TestArrayDataStatistics, MoveConstructor) {
ArrayData copied_data(*data_);
ArrayData moved_data(std::move(copied_data));

ASSERT_TRUE(moved_data.statistics.null_count.has_value());
ASSERT_EQ(null_count_, moved_data.statistics.null_count.value());

ASSERT_TRUE(moved_data.statistics.min.has_value());
ASSERT_TRUE(std::holds_alternative<int64_t>(moved_data.statistics.min.value()));
ASSERT_EQ(min_, std::get<int64_t>(moved_data.statistics.min.value()));
ASSERT_TRUE(moved_data.statistics.is_min_exact);

ASSERT_TRUE(moved_data.statistics.max.has_value());
ASSERT_TRUE(std::holds_alternative<int64_t>(moved_data.statistics.max.value()));
ASSERT_EQ(max_, std::get<int64_t>(moved_data.statistics.max.value()));
ASSERT_TRUE(moved_data.statistics.is_max_exact);
}

TEST_F(TestArrayDataStatistics, CopyConstructor) {
ArrayData copied_data(*data_);

ASSERT_TRUE(copied_data.statistics.null_count.has_value());
ASSERT_EQ(null_count_, copied_data.statistics.null_count.value());

ASSERT_TRUE(copied_data.statistics.min.has_value());
ASSERT_TRUE(std::holds_alternative<int64_t>(copied_data.statistics.min.value()));
ASSERT_EQ(min_, std::get<int64_t>(copied_data.statistics.min.value()));
ASSERT_TRUE(copied_data.statistics.is_min_exact);

ASSERT_TRUE(copied_data.statistics.max.has_value());
ASSERT_TRUE(std::holds_alternative<int64_t>(copied_data.statistics.max.value()));
ASSERT_EQ(max_, std::get<int64_t>(copied_data.statistics.max.value()));
ASSERT_TRUE(copied_data.statistics.is_max_exact);
}

TEST_F(TestArrayDataStatistics, MoveAssignment) {
ArrayData copied_data(*data_);
ArrayData moved_data;
moved_data = std::move(copied_data);

ASSERT_TRUE(moved_data.statistics.null_count.has_value());
ASSERT_EQ(null_count_, moved_data.statistics.null_count.value());

ASSERT_TRUE(moved_data.statistics.min.has_value());
ASSERT_TRUE(std::holds_alternative<int64_t>(moved_data.statistics.min.value()));
ASSERT_EQ(min_, std::get<int64_t>(moved_data.statistics.min.value()));
ASSERT_TRUE(moved_data.statistics.is_min_exact);

ASSERT_TRUE(moved_data.statistics.max.has_value());
ASSERT_TRUE(std::holds_alternative<int64_t>(moved_data.statistics.max.value()));
ASSERT_EQ(max_, std::get<int64_t>(moved_data.statistics.max.value()));
ASSERT_TRUE(moved_data.statistics.is_max_exact);
}

TEST_F(TestArrayDataStatistics, CopyAssignment) {
ArrayData copied_data;
copied_data = *data_;

ASSERT_TRUE(copied_data.statistics.null_count.has_value());
ASSERT_EQ(null_count_, copied_data.statistics.null_count.value());

ASSERT_TRUE(copied_data.statistics.min.has_value());
ASSERT_TRUE(std::holds_alternative<int64_t>(copied_data.statistics.min.value()));
ASSERT_EQ(min_, std::get<int64_t>(copied_data.statistics.min.value()));
ASSERT_TRUE(copied_data.statistics.is_min_exact);

ASSERT_TRUE(copied_data.statistics.max.has_value());
ASSERT_TRUE(std::holds_alternative<int64_t>(copied_data.statistics.max.value()));
ASSERT_EQ(max_, std::get<int64_t>(copied_data.statistics.max.value()));
ASSERT_TRUE(copied_data.statistics.is_max_exact);
}

template <typename PType>
class TestPrimitiveArray : public ::testing::Test {
public:
Expand Down
12 changes: 10 additions & 2 deletions cpp/src/arrow/array/data.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include <utility>
#include <vector>

#include "arrow/array/statistics.h"
#include "arrow/buffer.h"
#include "arrow/result.h"
#include "arrow/type.h"
Expand Down Expand Up @@ -152,7 +153,8 @@ struct ARROW_EXPORT ArrayData {
offset(other.offset),
buffers(std::move(other.buffers)),
child_data(std::move(other.child_data)),
dictionary(std::move(other.dictionary)) {
dictionary(std::move(other.dictionary)),
statistics(std::move(other.statistics)) {
SetNullCount(other.null_count);
}

Expand All @@ -163,7 +165,8 @@ struct ARROW_EXPORT ArrayData {
offset(other.offset),
buffers(other.buffers),
child_data(other.child_data),
dictionary(other.dictionary) {
dictionary(other.dictionary),
statistics(other.statistics) {
SetNullCount(other.null_count);
}

Expand All @@ -176,6 +179,7 @@ struct ARROW_EXPORT ArrayData {
buffers = std::move(other.buffers);
child_data = std::move(other.child_data);
dictionary = std::move(other.dictionary);
statistics = std::move(other.statistics);
return *this;
}

Expand All @@ -188,6 +192,7 @@ struct ARROW_EXPORT ArrayData {
buffers = other.buffers;
child_data = other.child_data;
dictionary = other.dictionary;
statistics = other.statistics;
return *this;
}

Expand Down Expand Up @@ -390,6 +395,9 @@ struct ARROW_EXPORT ArrayData {

// The dictionary for this Array, if any. Only used for dictionary type
std::shared_ptr<ArrayData> dictionary;

// The statistics for this Array.
ArrayStatistics statistics{};
};

/// \brief A non-owning Buffer reference
Expand Down

0 comments on commit 0002140

Please sign in to comment.