Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-43666: [C++] Attach arrow::ArrayStatistics to arrow::Array #43705

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 37 additions & 17 deletions cpp/src/arrow/array/array_base.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include <vector>

#include "arrow/array/data.h"
#include "arrow/array/statistics.h"
#include "arrow/buffer.h"
#include "arrow/compare.h"
#include "arrow/result.h"
Expand Down Expand Up @@ -232,15 +233,32 @@ class ARROW_EXPORT Array {
/// \return DeviceAllocationType
DeviceAllocationType device_type() const { return data_->device_type(); }

/// \brief Return the statistics of this Array
///
/// \return const std::shared_ptr<ArrayStatistics>&
const std::shared_ptr<ArrayStatistics>& statistics() const { return statistics_; }

protected:
Array() = default;
ARROW_DEFAULT_MOVE_AND_ASSIGN(Array);

std::shared_ptr<ArrayData> data_;
const uint8_t* null_bitmap_data_ = NULLPTR;

/// Protected method for constructors
void SetData(const std::shared_ptr<ArrayData>& data) {
/// Protected method for constructors. This must be called from each
/// array class to call its SetData(). You can't use call this in a
/// parent array class.
void Init(const std::shared_ptr<ArrayData>& data,
const std::shared_ptr<ArrayStatistics>& statistics) {
SetData(data);
if (statistics) {
SetStatistics(statistics);
}
}

/// Protected method for constructors. Don't call this method
/// directly. This should be called from Init().
virtual void SetData(const std::shared_ptr<ArrayData>& data) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This probably shouldn't be virtual since it's called from constructors, which adds nuances. Specifically, if BaseArray::BaseArray() calls Array::Init() then that will not call DerivedArray::SetData().

Is there a reason to leave this virtual?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This probably shouldn't be virtual since it's called from constructors, which adds nuances. Specifically, if BaseArray::BaseArray() calls Array::Init() then that will not call DerivedArray::SetData().

You're right. So this implementation ensures calling Array::Init() from DerivedArray::DerivedArray() not BaseArray::BaseArray(). It calls DerivedArray::SetData() but this is a tricky limitation. I'll consider another approach.

if (data->buffers.size() > 0) {
null_bitmap_data_ = data->GetValuesSafe<uint8_t>(0, /*offset=*/0);
} else {
Expand All @@ -249,6 +267,15 @@ class ARROW_EXPORT Array {
data_ = data;
}

// The statistics for this Array.
std::shared_ptr<ArrayStatistics> statistics_;

/// Protected method for constructors. Don't call this method
/// directly. This should be called from Init().
void SetStatistics(const std::shared_ptr<ArrayStatistics>& statistics) {
statistics_ = statistics;
}

private:
ARROW_DISALLOW_COPY_AND_ASSIGN(Array);

Expand All @@ -261,32 +288,22 @@ static inline std::ostream& operator<<(std::ostream& os, const Array& x) {
}

/// Base class for non-nested arrays
class ARROW_EXPORT FlatArray : public Array {
protected:
using Array::Array;
};
class ARROW_EXPORT FlatArray : public Array {};

/// Base class for arrays of fixed-size logical types
class ARROW_EXPORT PrimitiveArray : public FlatArray {
public:
PrimitiveArray(const std::shared_ptr<DataType>& type, int64_t length,
const std::shared_ptr<Buffer>& data,
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);

/// Does not account for any slice offset
const std::shared_ptr<Buffer>& values() const { return data_->buffers[1]; }

protected:
PrimitiveArray() : raw_values_(NULLPTR) {}

void SetData(const std::shared_ptr<ArrayData>& data) {
this->Array::SetData(data);
void SetData(const std::shared_ptr<ArrayData>& data) override {
Array::SetData(data);
raw_values_ = data->GetValuesSafe<uint8_t>(1, /*offset=*/0);
}

explicit PrimitiveArray(const std::shared_ptr<ArrayData>& data) { SetData(data); }

const uint8_t* raw_values_;
};

Expand All @@ -295,11 +312,14 @@ class ARROW_EXPORT NullArray : public FlatArray {
public:
using TypeClass = NullType;

explicit NullArray(const std::shared_ptr<ArrayData>& data) { SetData(data); }
explicit NullArray(const std::shared_ptr<ArrayData>& data,
const std::shared_ptr<ArrayStatistics>& statistics = NULLPTR) {
Init(data, statistics);
}
explicit NullArray(int64_t length);

private:
void SetData(const std::shared_ptr<ArrayData>& data) {
void SetData(const std::shared_ptr<ArrayData>& data) override {
null_bitmap_data_ = NULLPTR;
data->null_count = data->length;
data_ = data;
Expand Down
78 changes: 33 additions & 45 deletions cpp/src/arrow/array/array_binary.cc
Original file line number Diff line number Diff line change
Expand Up @@ -32,36 +32,19 @@ namespace arrow {

using internal::checked_cast;

BinaryArray::BinaryArray(const std::shared_ptr<ArrayData>& data) {
void BinaryArray::SetData(const std::shared_ptr<ArrayData>& data) {
ARROW_CHECK(is_binary_like(data->type->id()));
SetData(data);
BaseBinaryArray<BinaryType>::SetData(data);
}

BinaryArray::BinaryArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
const std::shared_ptr<Buffer>& data,
const std::shared_ptr<Buffer>& null_bitmap, int64_t null_count,
int64_t offset) {
SetData(ArrayData::Make(binary(), length, {null_bitmap, value_offsets, data},
null_count, offset));
}

LargeBinaryArray::LargeBinaryArray(const std::shared_ptr<ArrayData>& data) {
void LargeBinaryArray::SetData(const std::shared_ptr<ArrayData>& data) {
ARROW_CHECK(is_large_binary_like(data->type->id()));
SetData(data);
BaseBinaryArray<LargeBinaryType>::SetData(data);
}

LargeBinaryArray::LargeBinaryArray(int64_t length,
const std::shared_ptr<Buffer>& value_offsets,
const std::shared_ptr<Buffer>& data,
const std::shared_ptr<Buffer>& null_bitmap,
int64_t null_count, int64_t offset) {
SetData(ArrayData::Make(large_binary(), length, {null_bitmap, value_offsets, data},
null_count, offset));
}

StringArray::StringArray(const std::shared_ptr<ArrayData>& data) {
void StringArray::SetData(const std::shared_ptr<ArrayData>& data) {
ARROW_CHECK_EQ(data->type->id(), Type::STRING);
SetData(data);
BinaryArray::SetData(data);
}

StringArray::StringArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
Expand All @@ -74,9 +57,9 @@ StringArray::StringArray(int64_t length, const std::shared_ptr<Buffer>& value_of

Status StringArray::ValidateUTF8() const { return internal::ValidateUTF8(*data_); }

LargeStringArray::LargeStringArray(const std::shared_ptr<ArrayData>& data) {
void LargeStringArray::SetData(const std::shared_ptr<ArrayData>& data) {
ARROW_CHECK_EQ(data->type->id(), Type::LARGE_STRING);
SetData(data);
LargeBinaryArray::SetData(data);
}

LargeStringArray::LargeStringArray(int64_t length,
Expand All @@ -90,45 +73,50 @@ LargeStringArray::LargeStringArray(int64_t length,

Status LargeStringArray::ValidateUTF8() const { return internal::ValidateUTF8(*data_); }

BinaryViewArray::BinaryViewArray(std::shared_ptr<ArrayData> data) {
ARROW_CHECK_EQ(data->type->id(), Type::BINARY_VIEW);
SetData(std::move(data));
void BinaryViewArray::SetData(const std::shared_ptr<ArrayData>& data) {
ARROW_CHECK_EQ(data->type->id(), expected_type_id());
FlatArray::SetData(data);
raw_values_ = data_->GetValuesSafe<c_type>(1);
}

namespace {
void InitViewArrayBuffers(BufferVector& buffers, std::shared_ptr<Buffer> views,
std::shared_ptr<Buffer> null_bitmap) {
buffers.insert(buffers.begin(), std::move(views));
buffers.insert(buffers.begin(), std::move(null_bitmap));
}
}; // namespace

BinaryViewArray::BinaryViewArray(std::shared_ptr<DataType> type, int64_t length,
std::shared_ptr<Buffer> views, BufferVector buffers,
std::shared_ptr<Buffer> null_bitmap, int64_t null_count,
int64_t offset) {
buffers.insert(buffers.begin(), std::move(views));
buffers.insert(buffers.begin(), std::move(null_bitmap));
SetData(
ArrayData::Make(std::move(type), length, std::move(buffers), null_count, offset));
InitViewArrayBuffers(buffers, std::move(views), std::move(null_bitmap));
Init(ArrayData::Make(std::move(type), length, std::move(buffers), null_count, offset),
nullptr);
}

std::string_view BinaryViewArray::GetView(int64_t i) const {
const std::shared_ptr<Buffer>* data_buffers = data_->buffers.data() + 2;
return util::FromBinaryView(raw_values_[i], data_buffers);
}

StringViewArray::StringViewArray(std::shared_ptr<ArrayData> data) {
ARROW_CHECK_EQ(data->type->id(), Type::STRING_VIEW);
SetData(std::move(data));
StringViewArray::StringViewArray(std::shared_ptr<DataType> type, int64_t length,
std::shared_ptr<Buffer> views, BufferVector buffers,
std::shared_ptr<Buffer> null_bitmap, int64_t null_count,
int64_t offset) {
InitViewArrayBuffers(buffers, std::move(views), std::move(null_bitmap));
Init(ArrayData::Make(std::move(type), length, std::move(buffers), null_count, offset),
nullptr);
}

Status StringViewArray::ValidateUTF8() const { return internal::ValidateUTF8(*data_); }

FixedSizeBinaryArray::FixedSizeBinaryArray(const std::shared_ptr<ArrayData>& data) {
SetData(data);
void FixedSizeBinaryArray::SetData(const std::shared_ptr<ArrayData>& data) {
PrimitiveArray::SetData(data);
byte_width_ = internal::checked_cast<const FixedSizeBinaryType&>(*type()).byte_width();
}

FixedSizeBinaryArray::FixedSizeBinaryArray(const std::shared_ptr<DataType>& type,
int64_t length,
const std::shared_ptr<Buffer>& data,
const std::shared_ptr<Buffer>& null_bitmap,
int64_t null_count, int64_t offset)
: PrimitiveArray(type, length, data, null_bitmap, null_count, offset),
byte_width_(checked_cast<const FixedSizeBinaryType&>(*type).byte_width()) {}

const uint8_t* FixedSizeBinaryArray::GetValue(int64_t i) const {
return raw_values_ + (i + data_->offset) * byte_width_;
}
Expand Down
Loading
Loading