From 5733de79a36471d3324573578a47c33d0b3b1ec4 Mon Sep 17 00:00:00 2001 From: fengguangyuan Date: Wed, 20 Apr 2016 13:40:59 +0800 Subject: [PATCH 1/5] ARROW-60: Struct type builder API Implement the basic classes, StructArray and StructBuilder, meanwhile, add the perspective test cases for them. Other necessary methods will be added subsequetly. --- cpp/src/arrow/type.h | 1 + cpp/src/arrow/types/construct.cc | 15 +++ cpp/src/arrow/types/construct.h | 7 +- cpp/src/arrow/types/struct-test.cc | 141 ++++++++++++++++++++++++++++- cpp/src/arrow/types/struct.h | 134 ++++++++++++++++++++++++++- 5 files changed, 294 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 77404cd702524..f366645cd5cf9 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -161,6 +161,7 @@ struct Field { std::string ToString() const; }; +typedef std::shared_ptr FieldPtr; template struct PrimitiveType : public DataType { diff --git a/cpp/src/arrow/types/construct.cc b/cpp/src/arrow/types/construct.cc index 78036d4bf5711..5f5b5d9dd3f09 100644 --- a/cpp/src/arrow/types/construct.cc +++ b/cpp/src/arrow/types/construct.cc @@ -23,6 +23,7 @@ #include "arrow/types/list.h" #include "arrow/types/primitive.h" #include "arrow/types/string.h" +#include "arrow/types/struct.h" #include "arrow/util/buffer.h" #include "arrow/util/status.h" @@ -71,6 +72,20 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, } } +Status MakeStructBuilder(MemoryPool* pool, const std::shared_ptr& type, + const std::vector>& fields, + std::shared_ptr* out) { + std::vector> values_builder; + + for (auto it = fields.cbegin(); it != fields.cend(); it++) { + std::shared_ptr builder; + RETURN_NOT_OK(MakeBuilder(pool, it->get()->type, &builder)); + values_builder.push_back(builder); + } + out->reset(new StructBuilder(pool, type, fields, values_builder)); + return Status::OK(); +} + #define MAKE_PRIMITIVE_ARRAY_CASE(ENUM, ArrayType) \ case Type::ENUM: \ out->reset(new ArrayType(type, length, data, null_count, null_bitmap)); \ diff --git a/cpp/src/arrow/types/construct.h b/cpp/src/arrow/types/construct.h index 43c0018c67e41..f7182215006bf 100644 --- a/cpp/src/arrow/types/construct.h +++ b/cpp/src/arrow/types/construct.h @@ -20,19 +20,24 @@ #include #include - +#include namespace arrow { class Array; class ArrayBuilder; class Buffer; struct DataType; +struct Field; class MemoryPool; class Status; Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, std::shared_ptr* out); +Status MakeStructBuilder(MemoryPool* pool, const std::shared_ptr& type, + const std::vector>& fields, + std::shared_ptr* out); + // Create new arrays for logical types that are backed by primitive arrays. Status MakePrimitiveArray(const std::shared_ptr& type, int32_t length, const std::shared_ptr& data, int32_t null_count, diff --git a/cpp/src/arrow/types/struct-test.cc b/cpp/src/arrow/types/struct-test.cc index 79d560e19bcc0..427d13e1e4567 100644 --- a/cpp/src/arrow/types/struct-test.cc +++ b/cpp/src/arrow/types/struct-test.cc @@ -22,6 +22,16 @@ #include "gtest/gtest.h" #include "arrow/type.h" +#include "arrow/array.h" +#include "arrow/builder.h" +#include "arrow/test-util.h" +#include "arrow/types/struct.h" +#include "arrow/types/construct.h" +#include "arrow/types/list.h" +#include "arrow/types/primitive.h" +#include "arrow/types/test-common.h" +#include "arrow/util/status.h" + using std::shared_ptr; using std::string; @@ -52,4 +62,133 @@ TEST(TestStructType, Basics) { // TODO(wesm): out of bounds for field(...) } -} // namespace arrow +// ............................................................................. +// Struct test +class TestStructBuilder : public TestBuilder { + public: + void SetUp() { + TestBuilder::SetUp(); + + auto value_type = TypePtr(new Int32Type()); + auto char_type = TypePtr(new Int8Type()); + auto list_type = TypePtr(new ListType(char_type)); + + std::vector types = {list_type, value_type}; + std::vector fields; + fields.push_back(FieldPtr(new Field("list", list_type))); + fields.push_back(FieldPtr(new Field("int", value_type))); + + type_ = TypePtr(new StructType(fields)); + value_fields_ = fields; + + std::shared_ptr tmp; + ASSERT_OK(MakeStructBuilder(pool_, type_, fields, &tmp)); + + builder_ = std::dynamic_pointer_cast(tmp); + } + + void Done() { + result_ = std::dynamic_pointer_cast(builder_->Finish()); + } + + protected: + std::vector value_fields_; + TypePtr type_; + + std::shared_ptr builder_; + std::shared_ptr result_; +}; + +TEST_F(TestStructBuilder, TestAppendNull) { + ASSERT_OK(builder_->AppendNull()); + ASSERT_OK(builder_->AppendNull()); + ASSERT_EQ(2, builder_->value_builder().size()); + + Done(); + + ASSERT_EQ(2, result_->values().size()); + ASSERT_EQ(2, result_->length()); + ASSERT_TRUE(result_->IsNull(0)); + ASSERT_TRUE(result_->IsNull(1)); + + + auto list_char = static_cast(result_->values(0).get()); + auto chars = static_cast(list_char->values().get()); + auto int32 = static_cast(result_->values(1).get()); + ASSERT_EQ(0, list_char->length()); + ASSERT_EQ(0, chars->length()); + ASSERT_EQ(0, int32->length()); + + ASSERT_EQ(Type::LIST, list_char->type_enum()); + ASSERT_EQ(Type::INT8, list_char->values()->type_enum()); + ASSERT_EQ(Type::INT32, int32->type_enum()); +} + +TEST_F(TestStructBuilder, TestBasics) { + vector int_values = {1, 2, 3, 4}; + vector list_values = {'j', 'o', 'e', 'b', 'o', 'b', 'm', 'a', 'r', 'k'}; + vector list_lengths = {3, 0, 3, 4}; + vector list_offsets = {0, 3, 3, 6, 10}; + vector list_is_not_null = {1, 0, 1, 1}; + vector struct_is_not_null = {1, 1, 1, 1}; + + ListBuilder* list_vb = static_cast( + builder_->value_builder().at(0).get()); + Int8Builder* char_vb = static_cast( + list_vb->value_builder().get()); + Int32Builder* int_vb = static_cast( + builder_->value_builder().at(1).get()); + ASSERT_EQ(2, builder_->value_builder().size()); + + EXPECT_OK(builder_->Reserve(list_lengths.size())); + EXPECT_OK(char_vb->Reserve(list_values.size())); + EXPECT_OK(int_vb->Reserve(int_values.size())); + + int pos = 0; + for (size_t i = 0; i < list_lengths.size(); ++i) { + ASSERT_OK(list_vb->Append(list_is_not_null[i] > 0)); + int_vb->Append(int_values[i]); + for (int j = 0; j < list_lengths[i]; ++j) { + char_vb->Append(list_values[pos++]); + } + } + + for (size_t i = 0; i < struct_is_not_null.size(); ++i) { + ASSERT_OK(builder_->Append(struct_is_not_null[i] > 0)); + } + + Done(); + + ASSERT_EQ(4, result_->length()); + + auto list_char = static_cast(result_->values(0).get()); + auto chars = static_cast(list_char->values().get()); + auto int32 = static_cast(result_->values(1).get()); + + ASSERT_EQ(0, result_->null_count()); + ASSERT_EQ(1, list_char->null_count()); + ASSERT_EQ(0, int32->null_count()); + + + for (int i = 0; i < result_->length(); ++i) { + ASSERT_EQ(!static_cast(struct_is_not_null[i]), result_->IsNull(i)); + ASSERT_EQ(!static_cast(list_is_not_null[i]), list_char->IsNull(i)); + } + + // List + ASSERT_EQ(4, list_char->length()); + ASSERT_EQ(10, list_char->values()->length()); + for (size_t i = 0; i < list_offsets.size(); ++i) { + ASSERT_EQ(list_offsets[i], list_char->offsets()[i]); + } + for (size_t i = 0; i < list_values.size(); ++i) { + ASSERT_EQ(list_values[i], chars->Value(i)); + } + + // Int32 + ASSERT_EQ(4, int32->length()); + for (size_t i = 0; i < int_values.size(); ++i) { + ASSERT_EQ(int_values[i], int32->Value(i)); + } +} +} // namespace arrow diff --git a/cpp/src/arrow/types/struct.h b/cpp/src/arrow/types/struct.h index 17e32993bf975..bf2cc3d90f90c 100644 --- a/cpp/src/arrow/types/struct.h +++ b/cpp/src/arrow/types/struct.h @@ -23,7 +23,137 @@ #include #include "arrow/type.h" +#include "arrow/types/primitive.h" +#include "arrow/types/list.h" -namespace arrow {} // namespace arrow +namespace arrow { -#endif // ARROW_TYPES_STRUCT_H +class StructArray : public Array { + public: + StructArray(const TypePtr& type, int32_t length, + std::vector& values, + int32_t null_count = 0, + std::shared_ptr null_bitmap = nullptr) : + Array(type, length, null_count, null_bitmap) { + type_ = type; + values_ = values; + } + + virtual ~StructArray() {} + + // Return a shared pointer in case the requestor desires to share ownership + // with this array. + const std::shared_ptr& values(int32_t pos) const {return values_.at(pos);} + const std::vector& values() const {return values_;} + + const std::shared_ptr& value_type(int32_t pos) const { + return values_.at(pos)->type(); + } + + bool EqualsExact(const ListArray& other) const { + return true; + } + bool Equals(const std::shared_ptr& arr) const override { + return true; + } + + protected: + // Contains kinds of Arrays. + std::vector values_; +}; + +// ............................................................................ +// StrcutArray builder +class StructBuilder : public ArrayBuilder { + public: + StructBuilder(MemoryPool* pool, const std::shared_ptr& type, + const std::vector& fields, + std::vector>& value_builder) + : ArrayBuilder(pool, type) { + fields_ = fields; + value_builder_ = value_builder; + } + + Status Init(int32_t elements) { + return ArrayBuilder::Init(elements); + } + + Status Resize(int32_t capacity) { + // Need space for the end offset + if (capacity < MIN_BUILDER_CAPACITY) { + capacity = MIN_BUILDER_CAPACITY; + } + + if (capacity_ == 0) { + RETURN_NOT_OK(ArrayBuilder::Init(capacity)); + } else { + RETURN_NOT_OK(ArrayBuilder::Resize(capacity)); + } + capacity_ = capacity; + + for (auto it = value_builder_.begin(); it != value_builder_.end(); ++it) { + it->get()->Resize(capacity); + } + return Status::OK(); + } + + // Vector append + // + // If passed, valid_bytes is of equal length to values, and any zero byte + // will be considered as a null for that slot + Status Append(const uint8_t* null_bitmap, int32_t length) { + RETURN_NOT_OK(Reserve(length)); + UnsafeAppendToBitmap(null_bitmap, length); + return Status::OK(); + } + + template + std::shared_ptr Transfer() { + DCHECK(value_builder_.size()); + + std::vector> items; + for (auto it = value_builder_.cbegin(); it != value_builder_.cend(); it++) { + items.push_back(it->get()->Finish()); + } + // Here, for ListArray, offsets_ is needed, but StructArray dont need it. + auto result = std::make_shared(type_, length_, items, + null_count_, null_bitmap_); + + null_bitmap_ = nullptr; + capacity_ = length_ = null_count_ = 0; + + return result; + } + + std::shared_ptr Finish() override { + return Transfer(); + } + + // Start a new variable-length list slot + // + // This function should be called before beginning to append elements to the + // value builder + // TODO: This method should be a virtual method or not, to append null to + // it's all children? + Status Append(bool is_valid = true) { + RETURN_NOT_OK(Reserve(1)); + UnsafeAppendToBitmap(is_valid); + return Status::OK(); + } + + Status AppendNull() { + return Append(false); + } + + const std::vector>& value_builder() const { + return value_builder_; + } + + protected: + std::vector> value_builder_; + std::vector fields_; +}; + +} // namespace arrow + +#endif // ARROW_TYPES_STRUCT_H From bfabdc1e1a04d4b2ce9a7ccd331ac61768a5e091 Mon Sep 17 00:00:00 2001 From: fengguangyuan Date: Mon, 25 Apr 2016 17:39:32 +0800 Subject: [PATCH 2/5] ARROW-60: Struct type builder API Refine the previous committed patch. Add validate methods for testing StructArray and StructBuilder. TODO, Equals methods also need to be tested, but now it's not convient to do it. --- cpp/src/arrow/types/construct.cc | 28 ++--- cpp/src/arrow/types/construct.h | 4 - cpp/src/arrow/types/struct-test.cc | 192 ++++++++++++++++++++--------- cpp/src/arrow/types/struct.cc | 92 +++++++++++++- cpp/src/arrow/types/struct.h | 117 ++++++++---------- 5 files changed, 291 insertions(+), 142 deletions(-) diff --git a/cpp/src/arrow/types/construct.cc b/cpp/src/arrow/types/construct.cc index 5f5b5d9dd3f09..188f023850bfd 100644 --- a/cpp/src/arrow/types/construct.cc +++ b/cpp/src/arrow/types/construct.cc @@ -67,23 +67,23 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, out->reset(new ListBuilder(pool, value_builder)); return Status::OK(); } - default: - return Status::NotImplemented(type->ToString()); - } -} -Status MakeStructBuilder(MemoryPool* pool, const std::shared_ptr& type, - const std::vector>& fields, - std::shared_ptr* out) { - std::vector> values_builder; + case Type::STRUCT: { + auto fields = type->children_; + std::vector> values_builder; - for (auto it = fields.cbegin(); it != fields.cend(); it++) { - std::shared_ptr builder; - RETURN_NOT_OK(MakeBuilder(pool, it->get()->type, &builder)); - values_builder.push_back(builder); + for (auto it : fields) { + std::shared_ptr builder; + RETURN_NOT_OK(MakeBuilder(pool, it->type, &builder)); + values_builder.push_back(builder); + } + out->reset(new StructBuilder(pool, type, values_builder)); + return Status::OK(); + } + + default: + return Status::NotImplemented(type->ToString()); } - out->reset(new StructBuilder(pool, type, fields, values_builder)); - return Status::OK(); } #define MAKE_PRIMITIVE_ARRAY_CASE(ENUM, ArrayType) \ diff --git a/cpp/src/arrow/types/construct.h b/cpp/src/arrow/types/construct.h index f7182215006bf..d0370840ca108 100644 --- a/cpp/src/arrow/types/construct.h +++ b/cpp/src/arrow/types/construct.h @@ -34,10 +34,6 @@ class Status; Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, std::shared_ptr* out); -Status MakeStructBuilder(MemoryPool* pool, const std::shared_ptr& type, - const std::vector>& fields, - std::shared_ptr* out); - // Create new arrays for logical types that are backed by primitive arrays. Status MakePrimitiveArray(const std::shared_ptr& type, int32_t length, const std::shared_ptr& data, int32_t null_count, diff --git a/cpp/src/arrow/types/struct-test.cc b/cpp/src/arrow/types/struct-test.cc index 427d13e1e4567..b6cdc66340fe3 100644 --- a/cpp/src/arrow/types/struct-test.cc +++ b/cpp/src/arrow/types/struct-test.cc @@ -21,18 +21,17 @@ #include "gtest/gtest.h" -#include "arrow/type.h" #include "arrow/array.h" #include "arrow/builder.h" #include "arrow/test-util.h" -#include "arrow/types/struct.h" +#include "arrow/type.h" #include "arrow/types/construct.h" #include "arrow/types/list.h" #include "arrow/types/primitive.h" +#include "arrow/types/struct.h" #include "arrow/types/test-common.h" #include "arrow/util/status.h" - using std::shared_ptr; using std::string; using std::vector; @@ -62,7 +61,39 @@ TEST(TestStructType, Basics) { // TODO(wesm): out of bounds for field(...) } -// ............................................................................. +void ValidateBasicStructArray(const StructArray* result, + const vector& struct_is_valid, const vector& list_values, + const vector& list_is_valid, const vector& list_lengths, + const vector& list_offsets, const vector& int_values) { + ASSERT_EQ(4, result->length()); + ASSERT_OK(result->Validate()); + + auto list_char_arr = static_cast(result->field(0).get()); + auto char_arr = static_cast(list_char_arr->values().get()); + auto int32_arr = static_cast(result->field(1).get()); + + ASSERT_EQ(0, result->null_count()); + ASSERT_EQ(1, list_char_arr->null_count()); + ASSERT_EQ(0, int32_arr->null_count()); + + // List + ASSERT_EQ(4, list_char_arr->length()); + ASSERT_EQ(10, list_char_arr->values()->length()); + for (size_t i = 0; i < list_offsets.size(); ++i) { + ASSERT_EQ(list_offsets[i], list_char_arr->offsets()[i]); + } + for (size_t i = 0; i < list_values.size(); ++i) { + ASSERT_EQ(list_values[i], char_arr->Value(i)); + } + + // Int32 + ASSERT_EQ(4, int32_arr->length()); + for (size_t i = 0; i < int_values.size(); ++i) { + ASSERT_EQ(int_values[i], int32_arr->Value(i)); + } +} + +// ---------------------------------------------------------------------------------- // Struct test class TestStructBuilder : public TestBuilder { public: @@ -82,14 +113,12 @@ class TestStructBuilder : public TestBuilder { value_fields_ = fields; std::shared_ptr tmp; - ASSERT_OK(MakeStructBuilder(pool_, type_, fields, &tmp)); + ASSERT_OK(MakeBuilder(pool_, type_, &tmp)); builder_ = std::dynamic_pointer_cast(tmp); } - void Done() { - result_ = std::dynamic_pointer_cast(builder_->Finish()); - } + void Done() { result_ = std::dynamic_pointer_cast(builder_->Finish()); } protected: std::vector value_fields_; @@ -102,26 +131,35 @@ class TestStructBuilder : public TestBuilder { TEST_F(TestStructBuilder, TestAppendNull) { ASSERT_OK(builder_->AppendNull()); ASSERT_OK(builder_->AppendNull()); - ASSERT_EQ(2, builder_->value_builder().size()); + ASSERT_EQ(2, builder_->field_builders().size()); + + ListBuilder* list_vb = static_cast(builder_->field_builder(0).get()); + ASSERT_OK(list_vb->AppendNull()); + ASSERT_OK(list_vb->AppendNull()); + ASSERT_EQ(2, list_vb->length()); + + Int32Builder* int_vb = static_cast(builder_->field_builder(1).get()); + ASSERT_OK(int_vb->AppendNull()); + ASSERT_OK(int_vb->AppendNull()); + ASSERT_EQ(2, int_vb->length()); Done(); - ASSERT_EQ(2, result_->values().size()); + ASSERT_OK(result_->Validate()); + + ASSERT_EQ(2, result_->fields().size()); ASSERT_EQ(2, result_->length()); + ASSERT_EQ(2, result_->field(0)->length()); + ASSERT_EQ(2, result_->field(1)->length()); ASSERT_TRUE(result_->IsNull(0)); ASSERT_TRUE(result_->IsNull(1)); + ASSERT_TRUE(result_->field(0)->IsNull(0)); + ASSERT_TRUE(result_->field(0)->IsNull(1)); + ASSERT_TRUE(result_->field(1)->IsNull(0)); + ASSERT_TRUE(result_->field(1)->IsNull(1)); - - auto list_char = static_cast(result_->values(0).get()); - auto chars = static_cast(list_char->values().get()); - auto int32 = static_cast(result_->values(1).get()); - ASSERT_EQ(0, list_char->length()); - ASSERT_EQ(0, chars->length()); - ASSERT_EQ(0, int32->length()); - - ASSERT_EQ(Type::LIST, list_char->type_enum()); - ASSERT_EQ(Type::INT8, list_char->values()->type_enum()); - ASSERT_EQ(Type::INT32, int32->type_enum()); + ASSERT_EQ(Type::LIST, result_->field(0)->type_enum()); + ASSERT_EQ(Type::INT32, result_->field(1)->type_enum()); } TEST_F(TestStructBuilder, TestBasics) { @@ -129,16 +167,13 @@ TEST_F(TestStructBuilder, TestBasics) { vector list_values = {'j', 'o', 'e', 'b', 'o', 'b', 'm', 'a', 'r', 'k'}; vector list_lengths = {3, 0, 3, 4}; vector list_offsets = {0, 3, 3, 6, 10}; - vector list_is_not_null = {1, 0, 1, 1}; - vector struct_is_not_null = {1, 1, 1, 1}; + vector list_is_valid = {1, 0, 1, 1}; + vector struct_is_valid = {1, 1, 1, 1}; - ListBuilder* list_vb = static_cast( - builder_->value_builder().at(0).get()); - Int8Builder* char_vb = static_cast( - list_vb->value_builder().get()); - Int32Builder* int_vb = static_cast( - builder_->value_builder().at(1).get()); - ASSERT_EQ(2, builder_->value_builder().size()); + ListBuilder* list_vb = static_cast(builder_->field_builder(0).get()); + Int8Builder* char_vb = static_cast(list_vb->value_builder().get()); + Int32Builder* int_vb = static_cast(builder_->field_builder(1).get()); + ASSERT_EQ(2, builder_->field_builders().size()); EXPECT_OK(builder_->Reserve(list_lengths.size())); EXPECT_OK(char_vb->Reserve(list_values.size())); @@ -146,49 +181,92 @@ TEST_F(TestStructBuilder, TestBasics) { int pos = 0; for (size_t i = 0; i < list_lengths.size(); ++i) { - ASSERT_OK(list_vb->Append(list_is_not_null[i] > 0)); - int_vb->Append(int_values[i]); + ASSERT_OK(list_vb->Append(list_is_valid[i] > 0)); + int_vb->UnsafeAppend(int_values[i]); for (int j = 0; j < list_lengths[i]; ++j) { - char_vb->Append(list_values[pos++]); + char_vb->UnsafeAppend(list_values[pos++]); } } - for (size_t i = 0; i < struct_is_not_null.size(); ++i) { - ASSERT_OK(builder_->Append(struct_is_not_null[i] > 0)); + for (size_t i = 0; i < struct_is_valid.size(); ++i) { + ASSERT_OK(builder_->Append(struct_is_valid[i] > 0)); } Done(); - ASSERT_EQ(4, result_->length()); + ValidateBasicStructArray(result_.get(), struct_is_valid, list_values, list_is_valid, + list_lengths, list_offsets, int_values); +} - auto list_char = static_cast(result_->values(0).get()); - auto chars = static_cast(list_char->values().get()); - auto int32 = static_cast(result_->values(1).get()); +TEST_F(TestStructBuilder, BulkAppend) { + vector int_values = {1, 2, 3, 4}; + vector list_values = {'j', 'o', 'e', 'b', 'o', 'b', 'm', 'a', 'r', 'k'}; + vector list_lengths = {3, 0, 3, 4}; + vector list_offsets = {0, 3, 3, 6}; + vector list_is_valid = {1, 0, 1, 1}; + vector struct_is_valid = {1, 1, 1, 1}; - ASSERT_EQ(0, result_->null_count()); - ASSERT_EQ(1, list_char->null_count()); - ASSERT_EQ(0, int32->null_count()); + ListBuilder* list_vb = static_cast(builder_->field_builder(0).get()); + Int8Builder* char_vb = static_cast(list_vb->value_builder().get()); + Int32Builder* int_vb = static_cast(builder_->field_builder(1).get()); + ASSERT_OK(builder_->Reserve(list_lengths.size())); + ASSERT_OK(char_vb->Reserve(list_values.size())); + ASSERT_OK(int_vb->Reserve(int_values.size())); - for (int i = 0; i < result_->length(); ++i) { - ASSERT_EQ(!static_cast(struct_is_not_null[i]), result_->IsNull(i)); - ASSERT_EQ(!static_cast(list_is_not_null[i]), list_char->IsNull(i)); + builder_->Append(struct_is_valid.data(), struct_is_valid.size()); + + list_vb->Append(list_offsets.data(), list_offsets.size(), list_is_valid.data()); + for (int8_t value : list_values) { + char_vb->UnsafeAppend(value); } - // List - ASSERT_EQ(4, list_char->length()); - ASSERT_EQ(10, list_char->values()->length()); - for (size_t i = 0; i < list_offsets.size(); ++i) { - ASSERT_EQ(list_offsets[i], list_char->offsets()[i]); + for (int32_t value : int_values) { + int_vb->UnsafeAppend(value); } - for (size_t i = 0; i < list_values.size(); ++i) { - ASSERT_EQ(list_values[i], chars->Value(i)); + + Done(); + ValidateBasicStructArray(result_.get(), struct_is_valid, list_values, list_is_valid, + list_lengths, list_offsets, int_values); +} + +TEST_F(TestStructBuilder, BulkAppendInvalid) { + vector int_values = {1, 2, 3, 4}; + vector list_values = {'j', 'o', 'e', 'b', 'o', 'b', 'm', 'a', 'r', 'k'}; + vector list_lengths = {3, 0, 3, 4}; + vector list_offsets = {0, 3, 3, 6}; + vector list_is_valid = {1, 0, 1, 1}; + vector struct_is_valid = {1, 0, 1, 1}; // should be 1, 1, 1, 1 + + ListBuilder* list_vb = static_cast(builder_->field_builder(0).get()); + Int8Builder* char_vb = static_cast(list_vb->value_builder().get()); + Int32Builder* int_vb = static_cast(builder_->field_builder(1).get()); + + ASSERT_OK(builder_->Reserve(list_lengths.size())); + ASSERT_OK(char_vb->Reserve(list_values.size())); + ASSERT_OK(int_vb->Reserve(int_values.size())); + + builder_->Append(struct_is_valid.data(), struct_is_valid.size()); + + list_vb->Append(list_offsets.data(), list_offsets.size(), list_is_valid.data()); + for (int8_t value : list_values) { + char_vb->UnsafeAppend(value); } - // Int32 - ASSERT_EQ(4, int32->length()); - for (size_t i = 0; i < int_values.size(); ++i) { - ASSERT_EQ(int_values[i], int32->Value(i)); + for (int32_t value : int_values) { + int_vb->UnsafeAppend(value); } + + Done(); + ASSERT_RAISES(Invalid, result_->Validate()); +} + +TEST_F(TestStructBuilder, TestEquals) {} + +TEST_F(TestStructBuilder, TestZeroLength) { + // All buffers are null + Done(); + ASSERT_OK(result_->Validate()); } -} // namespace arrow + +} // namespace arrow diff --git a/cpp/src/arrow/types/struct.cc b/cpp/src/arrow/types/struct.cc index 04a277a86fa58..883007c9bffbf 100644 --- a/cpp/src/arrow/types/struct.cc +++ b/cpp/src/arrow/types/struct.cc @@ -17,4 +17,94 @@ #include "arrow/types/struct.h" -namespace arrow {} // namespace arrow +#include + +namespace arrow { + +bool StructArray::EqualsExact(const StructArray& other) const { + if (this == &other) { return true; } + if (null_count_ != other.null_count_) { return false; } + + bool equal_null_bitmap = true; + if (null_count_ > 0) { + equal_null_bitmap = + null_bitmap_->Equals(*other.null_bitmap_, util::bytes_for_bits(length_)); + } + + if (!equal_null_bitmap) { return false; } + bool values_equal = false; + for (size_t i = 0; i < field_arrays_.size(); ++i) { + values_equal = field_arrays_.at(i).get()->Equals(other.fields().at(i)); + } + + return values_equal; +} + +bool StructArray::Equals(const std::shared_ptr& arr) const { + if (this == arr.get()) { return true; } + if (this->type_enum() != arr->type_enum()) { return false; } + return EqualsExact(*static_cast(arr.get())); +} + +Status StructArray::Validate() const { + if (length_ < 0) { return Status::Invalid("Length was negative"); } + + if (null_count() > length_) { + return Status::Invalid("Null count exceeds the length of this struct"); + } + + if (fields().size() <= 0 && length_ > 0) { + return Status::Invalid("Fields are not existed, but length is not zero"); + } + + int32_t struct_length = length(); + if (struct_length > 0) { + // Validate fields + bool fields_equal = true; + for (auto it : field_arrays_) { + fields_equal = (it->length() == struct_length); + if (fields_equal == false) { + std::stringstream ss; + ss << "The length of field [ " << it->type()->ToString() << " ] " + << "is not equal to this StructArray's length"; + return Status::Invalid(ss.str()); + } + fields_equal = true; + + const Status child_valid = it->Validate(); + if (!child_valid.ok()) { + std::stringstream ss; + ss << "Child array invalid: " << child_valid.ToString(); + return Status::Invalid(ss.str()); + } + } + + // Validate bitmap + // If struct is null at pos i, then its children should be null + // at the same position. + bool children_null = true; + for (int i = 0; i < length_; ++i) { + for (size_t j = 0; j < field_arrays_.size(); ++j) { + children_null &= field_arrays_[j]->IsNull(i); + } + + if (children_null == IsNull(i)) { + children_null = true; + continue; + } + + std::stringstream ss; + if (true == IsNull(i)) { + ss << type()->ToString() << "is null at position " << i + << ", but some of its child fields are not null at the same position"; + } else { + ss << type()->ToString() << "is not null at position " << i + << ", but some of its child fields are null at the same position"; + } + return Status::Invalid(ss.str()); + } + } + return Status::OK(); +} + +} // namespace arrow diff --git a/cpp/src/arrow/types/struct.h b/cpp/src/arrow/types/struct.h index bf2cc3d90f90c..83c742d403fc9 100644 --- a/cpp/src/arrow/types/struct.h +++ b/cpp/src/arrow/types/struct.h @@ -23,66 +23,59 @@ #include #include "arrow/type.h" -#include "arrow/types/primitive.h" #include "arrow/types/list.h" +#include "arrow/types/primitive.h" namespace arrow { class StructArray : public Array { public: - StructArray(const TypePtr& type, int32_t length, - std::vector& values, - int32_t null_count = 0, - std::shared_ptr null_bitmap = nullptr) : - Array(type, length, null_count, null_bitmap) { + StructArray(const TypePtr& type, int32_t length, std::vector& field_arrays, + int32_t null_count = 0, std::shared_ptr null_bitmap = nullptr) + : Array(type, length, null_count, null_bitmap) { type_ = type; - values_ = values; + field_arrays_ = field_arrays; } + Status Validate() const override; + virtual ~StructArray() {} // Return a shared pointer in case the requestor desires to share ownership // with this array. - const std::shared_ptr& values(int32_t pos) const {return values_.at(pos);} - const std::vector& values() const {return values_;} + const std::shared_ptr& field(int32_t pos) const { return field_arrays_.at(pos); } + const std::vector& fields() const { return field_arrays_; } - const std::shared_ptr& value_type(int32_t pos) const { - return values_.at(pos)->type(); + const std::shared_ptr& field_type(int32_t pos) const { + return field_arrays_.at(pos)->type(); } - bool EqualsExact(const ListArray& other) const { - return true; - } - bool Equals(const std::shared_ptr& arr) const override { - return true; - } + bool EqualsExact(const StructArray& other) const; + bool Equals(const std::shared_ptr& arr) const override; protected: - // Contains kinds of Arrays. - std::vector values_; + // The child arrays corresponding to each field of the struct data type. + std::vector field_arrays_; }; -// ............................................................................ -// StrcutArray builder +// --------------------------------------------------------------------------------- +// StructArray builder class StructBuilder : public ArrayBuilder { public: StructBuilder(MemoryPool* pool, const std::shared_ptr& type, - const std::vector& fields, - std::vector>& value_builder) - : ArrayBuilder(pool, type) { - fields_ = fields; - value_builder_ = value_builder; + std::vector>& field_builders) + : ArrayBuilder(pool, type) { + field_builders_ = field_builders; } - Status Init(int32_t elements) { - return ArrayBuilder::Init(elements); + Status Init(int32_t elements) override { + RETURN_NOT_OK(ArrayBuilder::Init(elements)); + return Status::OK(); } - Status Resize(int32_t capacity) { + Status Resize(int32_t capacity) override { // Need space for the end offset - if (capacity < MIN_BUILDER_CAPACITY) { - capacity = MIN_BUILDER_CAPACITY; - } + if (capacity < MIN_BUILDER_CAPACITY) { capacity = MIN_BUILDER_CAPACITY; } if (capacity_ == 0) { RETURN_NOT_OK(ArrayBuilder::Init(capacity)); @@ -91,33 +84,32 @@ class StructBuilder : public ArrayBuilder { } capacity_ = capacity; - for (auto it = value_builder_.begin(); it != value_builder_.end(); ++it) { - it->get()->Resize(capacity); + for (auto it : field_builders_) { + RETURN_NOT_OK(it->Resize(capacity)); } return Status::OK(); } - // Vector append - // - // If passed, valid_bytes is of equal length to values, and any zero byte - // will be considered as a null for that slot + // null_bitmap is of equal length to every child field, and any zero byte + // will be considered as a null for that field, but users must using app- + // end methods or advance methods of the child builders independently to + // insert data. Status Append(const uint8_t* null_bitmap, int32_t length) { RETURN_NOT_OK(Reserve(length)); UnsafeAppendToBitmap(null_bitmap, length); return Status::OK(); } - template - std::shared_ptr Transfer() { - DCHECK(value_builder_.size()); + std::shared_ptr Finish() override { + DCHECK(field_builders_.size()); - std::vector> items; - for (auto it = value_builder_.cbegin(); it != value_builder_.cend(); it++) { - items.push_back(it->get()->Finish()); + std::vector fields; + for (auto it : field_builders_) { + fields.push_back(it->Finish()); } - // Here, for ListArray, offsets_ is needed, but StructArray dont need it. - auto result = std::make_shared(type_, length_, items, - null_count_, null_bitmap_); + + auto result = + std::make_shared(type_, length_, fields, null_count_, null_bitmap_); null_bitmap_ = nullptr; capacity_ = length_ = null_count_ = 0; @@ -125,35 +117,28 @@ class StructBuilder : public ArrayBuilder { return result; } - std::shared_ptr Finish() override { - return Transfer(); - } - - // Start a new variable-length list slot - // - // This function should be called before beginning to append elements to the - // value builder - // TODO: This method should be a virtual method or not, to append null to - // it's all children? + // This function just appends elements to StructBuilder, if you want to + // inserting data into the child fields, please make sure that the chi- + // ld builders' append methods must be called independently before that. Status Append(bool is_valid = true) { RETURN_NOT_OK(Reserve(1)); UnsafeAppendToBitmap(is_valid); return Status::OK(); } - Status AppendNull() { - return Append(false); - } + Status AppendNull() { return Append(false); } - const std::vector>& value_builder() const { - return value_builder_; + const std::shared_ptr field_builder(int pos) const { + return field_builders_.at(pos); + } + const std::vector>& field_builders() const { + return field_builders_; } protected: - std::vector> value_builder_; - std::vector fields_; + std::vector> field_builders_; }; -} // namespace arrow +} // namespace arrow -#endif // ARROW_TYPES_STRUCT_H +#endif // ARROW_TYPES_STRUCT_H From fa856fda63f2ea0e531912f00b442d905ab52ba3 Mon Sep 17 00:00:00 2001 From: fengguangyuan Date: Mon, 23 May 2016 11:12:39 +0800 Subject: [PATCH 3/5] ARROW-60:[C++] Struct typebuilder API Modify Validate() refered to the specification. --- cpp/src/arrow/types/struct-test.cc | 26 +++++++------ cpp/src/arrow/types/struct.cc | 62 +++++++++--------------------- cpp/src/arrow/types/struct.h | 52 ++++++++----------------- 3 files changed, 47 insertions(+), 93 deletions(-) diff --git a/cpp/src/arrow/types/struct-test.cc b/cpp/src/arrow/types/struct-test.cc index b6cdc66340fe3..4d6d36628c6f8 100644 --- a/cpp/src/arrow/types/struct-test.cc +++ b/cpp/src/arrow/types/struct-test.cc @@ -100,14 +100,14 @@ class TestStructBuilder : public TestBuilder { void SetUp() { TestBuilder::SetUp(); - auto value_type = TypePtr(new Int32Type()); + auto int32_type = TypePtr(new Int32Type()); auto char_type = TypePtr(new Int8Type()); auto list_type = TypePtr(new ListType(char_type)); - std::vector types = {list_type, value_type}; + std::vector types = {list_type, int32_type}; std::vector fields; fields.push_back(FieldPtr(new Field("list", list_type))); - fields.push_back(FieldPtr(new Field("int", value_type))); + fields.push_back(FieldPtr(new Field("int", int32_type))); type_ = TypePtr(new StructType(fields)); value_fields_ = fields; @@ -116,6 +116,7 @@ class TestStructBuilder : public TestBuilder { ASSERT_OK(MakeBuilder(pool_, type_, &tmp)); builder_ = std::dynamic_pointer_cast(tmp); + ASSERT_EQ(2, builder_->field_builders().size()); } void Done() { result_ = std::dynamic_pointer_cast(builder_->Finish()); } @@ -175,9 +176,9 @@ TEST_F(TestStructBuilder, TestBasics) { Int32Builder* int_vb = static_cast(builder_->field_builder(1).get()); ASSERT_EQ(2, builder_->field_builders().size()); - EXPECT_OK(builder_->Reserve(list_lengths.size())); - EXPECT_OK(char_vb->Reserve(list_values.size())); - EXPECT_OK(int_vb->Reserve(int_values.size())); + EXPECT_OK(builder_->Resize(list_lengths.size())); + EXPECT_OK(char_vb->Resize(list_values.size())); + EXPECT_OK(int_vb->Resize(int_values.size())); int pos = 0; for (size_t i = 0; i < list_lengths.size(); ++i) { @@ -210,11 +211,11 @@ TEST_F(TestStructBuilder, BulkAppend) { Int8Builder* char_vb = static_cast(list_vb->value_builder().get()); Int32Builder* int_vb = static_cast(builder_->field_builder(1).get()); - ASSERT_OK(builder_->Reserve(list_lengths.size())); - ASSERT_OK(char_vb->Reserve(list_values.size())); - ASSERT_OK(int_vb->Reserve(int_values.size())); + ASSERT_OK(builder_->Resize(list_lengths.size())); + ASSERT_OK(char_vb->Resize(list_values.size())); + ASSERT_OK(int_vb->Resize(int_values.size())); - builder_->Append(struct_is_valid.data(), struct_is_valid.size()); + builder_->Appends(struct_is_valid.size(), struct_is_valid.data()); list_vb->Append(list_offsets.data(), list_offsets.size(), list_is_valid.data()); for (int8_t value : list_values) { @@ -246,7 +247,7 @@ TEST_F(TestStructBuilder, BulkAppendInvalid) { ASSERT_OK(char_vb->Reserve(list_values.size())); ASSERT_OK(int_vb->Reserve(int_values.size())); - builder_->Append(struct_is_valid.data(), struct_is_valid.size()); + builder_->Appends(struct_is_valid.size(), struct_is_valid.data()); list_vb->Append(list_offsets.data(), list_offsets.size(), list_is_valid.data()); for (int8_t value : list_values) { @@ -258,7 +259,8 @@ TEST_F(TestStructBuilder, BulkAppendInvalid) { } Done(); - ASSERT_RAISES(Invalid, result_->Validate()); + // Even null bitmap of the parent Struct is not valid, Validate() will ignore it. + ASSERT_OK(result_->Validate()); } TEST_F(TestStructBuilder, TestEquals) {} diff --git a/cpp/src/arrow/types/struct.cc b/cpp/src/arrow/types/struct.cc index 883007c9bffbf..5b81a954111db 100644 --- a/cpp/src/arrow/types/struct.cc +++ b/cpp/src/arrow/types/struct.cc @@ -23,25 +23,26 @@ namespace arrow { bool StructArray::EqualsExact(const StructArray& other) const { if (this == &other) { return true; } + if (field_arrays_.size() != other.fields().size()) { return false; } if (null_count_ != other.null_count_) { return false; } - bool equal_null_bitmap = true; if (null_count_ > 0) { - equal_null_bitmap = + bool equal_null_bitmap = null_bitmap_->Equals(*other.null_bitmap_, util::bytes_for_bits(length_)); - } + if (!equal_null_bitmap) { return false; } - if (!equal_null_bitmap) { return false; } - bool values_equal = false; - for (size_t i = 0; i < field_arrays_.size(); ++i) { - values_equal = field_arrays_.at(i).get()->Equals(other.fields().at(i)); + bool fields_equal = true; + for (size_t i = 0; i < field_arrays_.size(); ++i) { + fields_equal = field_arrays_[i].get()->Equals(other.field(i)); + if (!fields_equal) { return false; } + } } - - return values_equal; + return true; } bool StructArray::Equals(const std::shared_ptr& arr) const { if (this == arr.get()) { return true; } + if (!arr) { return false; } if (this->type_enum() != arr->type_enum()) { return false; } return EqualsExact(*static_cast(arr.get())); } @@ -53,23 +54,15 @@ Status StructArray::Validate() const { return Status::Invalid("Null count exceeds the length of this struct"); } - if (fields().size() <= 0 && length_ > 0) { - return Status::Invalid("Fields are not existed, but length is not zero"); - } - - int32_t struct_length = length(); - if (struct_length > 0) { + if (field_arrays_.size() > 0) { // Validate fields - bool fields_equal = true; + int32_t array_length = field_arrays_[0]->length(); for (auto it : field_arrays_) { - fields_equal = (it->length() == struct_length); - if (fields_equal == false) { + if (!(it->length() == array_length)) { std::stringstream ss; - ss << "The length of field [ " << it->type()->ToString() << " ] " - << "is not equal to this StructArray's length"; + ss << "Length is not equal from field " << it->type()->ToString(); return Status::Invalid(ss.str()); } - fields_equal = true; const Status child_valid = it->Validate(); if (!child_valid.ok()) { @@ -79,29 +72,10 @@ Status StructArray::Validate() const { } } - // Validate bitmap - // If struct is null at pos i, then its children should be null - // at the same position. - bool children_null = true; - for (int i = 0; i < length_; ++i) { - for (size_t j = 0; j < field_arrays_.size(); ++j) { - children_null &= field_arrays_[j]->IsNull(i); - } - - if (children_null == IsNull(i)) { - children_null = true; - continue; - } - - std::stringstream ss; - if (true == IsNull(i)) { - ss << type()->ToString() << "is null at position " << i - << ", but some of its child fields are not null at the same position"; - } else { - ss << type()->ToString() << "is not null at position " << i - << ", but some of its child fields are null at the same position"; - } - return Status::Invalid(ss.str()); + // Validate null bitmap + // TODO: No checks for the consistency of bitmaps. Its cost maybe expensive. + if (array_length > 0 && array_length != length_) { + return Status::Invalid("Struct's length is not equal to its child arrays"); } } return Status::OK(); diff --git a/cpp/src/arrow/types/struct.h b/cpp/src/arrow/types/struct.h index 83c742d403fc9..b18688c7d35ab 100644 --- a/cpp/src/arrow/types/struct.h +++ b/cpp/src/arrow/types/struct.h @@ -43,12 +43,11 @@ class StructArray : public Array { // Return a shared pointer in case the requestor desires to share ownership // with this array. - const std::shared_ptr& field(int32_t pos) const { return field_arrays_.at(pos); } - const std::vector& fields() const { return field_arrays_; } - - const std::shared_ptr& field_type(int32_t pos) const { - return field_arrays_.at(pos)->type(); + const std::shared_ptr& field(int32_t pos) const { + DCHECK_GT(field_arrays_.size(), 0); + return field_arrays_[pos]; } + const std::vector& fields() const { return field_arrays_; } bool EqualsExact(const StructArray& other) const; bool Equals(const std::shared_ptr& arr) const override; @@ -60,49 +59,28 @@ class StructArray : public Array { // --------------------------------------------------------------------------------- // StructArray builder +// Append, Appends, Resize and Reserve methods are acting on StructBuilder. +// Please make sure all these methods of all child-builders' are consistently +// called to maintain data-structure consistency. class StructBuilder : public ArrayBuilder { public: StructBuilder(MemoryPool* pool, const std::shared_ptr& type, - std::vector>& field_builders) + const std::vector>& field_builders) : ArrayBuilder(pool, type) { field_builders_ = field_builders; } - Status Init(int32_t elements) override { - RETURN_NOT_OK(ArrayBuilder::Init(elements)); - return Status::OK(); - } - - Status Resize(int32_t capacity) override { - // Need space for the end offset - if (capacity < MIN_BUILDER_CAPACITY) { capacity = MIN_BUILDER_CAPACITY; } - - if (capacity_ == 0) { - RETURN_NOT_OK(ArrayBuilder::Init(capacity)); - } else { - RETURN_NOT_OK(ArrayBuilder::Resize(capacity)); - } - capacity_ = capacity; - - for (auto it : field_builders_) { - RETURN_NOT_OK(it->Resize(capacity)); - } - return Status::OK(); - } - // null_bitmap is of equal length to every child field, and any zero byte // will be considered as a null for that field, but users must using app- - // end methods or advance methods of the child builders independently to + // end methods or advance methods of the child builders' independently to // insert data. - Status Append(const uint8_t* null_bitmap, int32_t length) { + Status Appends(int32_t length, const uint8_t* valid_bytes) { RETURN_NOT_OK(Reserve(length)); - UnsafeAppendToBitmap(null_bitmap, length); + UnsafeAppendToBitmap(valid_bytes, length); return Status::OK(); } std::shared_ptr Finish() override { - DCHECK(field_builders_.size()); - std::vector fields; for (auto it : field_builders_) { fields.push_back(it->Finish()); @@ -117,9 +95,8 @@ class StructBuilder : public ArrayBuilder { return result; } - // This function just appends elements to StructBuilder, if you want to - // inserting data into the child fields, please make sure that the chi- - // ld builders' append methods must be called independently before that. + // Append an element to the Struct. All child-builders' Append method must + // be called independently to maintain data-structure consistency. Status Append(bool is_valid = true) { RETURN_NOT_OK(Reserve(1)); UnsafeAppendToBitmap(is_valid); @@ -129,7 +106,8 @@ class StructBuilder : public ArrayBuilder { Status AppendNull() { return Append(false); } const std::shared_ptr field_builder(int pos) const { - return field_builders_.at(pos); + DCHECK_GT(field_builders_.size(), 0); + return field_builders_[pos]; } const std::vector>& field_builders() const { return field_builders_; From ae74c80339b8fd0ec76d806212c69c683fbdeeef Mon Sep 17 00:00:00 2001 From: fengguangyuan Date: Wed, 25 May 2016 18:27:40 +0800 Subject: [PATCH 4/5] ARROW-60: Struct type builder API Add RangeEquals method to implement Equals method. --- cpp/src/arrow/types/construct.cc | 2 +- cpp/src/arrow/types/struct-test.cc | 123 +++++++++++++++++++++++++++-- cpp/src/arrow/types/struct.cc | 43 +++++----- cpp/src/arrow/types/struct.h | 8 +- 4 files changed, 146 insertions(+), 30 deletions(-) diff --git a/cpp/src/arrow/types/construct.cc b/cpp/src/arrow/types/construct.cc index 188f023850bfd..bcb0ec490901f 100644 --- a/cpp/src/arrow/types/construct.cc +++ b/cpp/src/arrow/types/construct.cc @@ -69,7 +69,7 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, } case Type::STRUCT: { - auto fields = type->children_; + std::vector& fields = type->children_; std::vector> values_builder; for (auto it : fields) { diff --git a/cpp/src/arrow/types/struct-test.cc b/cpp/src/arrow/types/struct-test.cc index 4d6d36628c6f8..d2bd2971d0438 100644 --- a/cpp/src/arrow/types/struct-test.cc +++ b/cpp/src/arrow/types/struct-test.cc @@ -215,13 +215,12 @@ TEST_F(TestStructBuilder, BulkAppend) { ASSERT_OK(char_vb->Resize(list_values.size())); ASSERT_OK(int_vb->Resize(int_values.size())); - builder_->Appends(struct_is_valid.size(), struct_is_valid.data()); + builder_->Append(struct_is_valid.size(), struct_is_valid.data()); list_vb->Append(list_offsets.data(), list_offsets.size(), list_is_valid.data()); for (int8_t value : list_values) { char_vb->UnsafeAppend(value); } - for (int32_t value : int_values) { int_vb->UnsafeAppend(value); } @@ -247,13 +246,12 @@ TEST_F(TestStructBuilder, BulkAppendInvalid) { ASSERT_OK(char_vb->Reserve(list_values.size())); ASSERT_OK(int_vb->Reserve(int_values.size())); - builder_->Appends(struct_is_valid.size(), struct_is_valid.data()); + builder_->Append(struct_is_valid.size(), struct_is_valid.data()); list_vb->Append(list_offsets.data(), list_offsets.size(), list_is_valid.data()); for (int8_t value : list_values) { char_vb->UnsafeAppend(value); } - for (int32_t value : int_values) { int_vb->UnsafeAppend(value); } @@ -263,7 +261,122 @@ TEST_F(TestStructBuilder, BulkAppendInvalid) { ASSERT_OK(result_->Validate()); } -TEST_F(TestStructBuilder, TestEquals) {} +TEST_F(TestStructBuilder, TestEquality) { + ArrayPtr array, equal_array; + ArrayPtr unequal_bitmap_array, unequal_offsets_array, unequal_values_array; + + vector int_values = {1, 2, 3, 4}; + vector list_values = {'j', 'o', 'e', 'b', 'o', 'b', 'm', 'a', 'r', 'k'}; + vector list_lengths = {3, 0, 3, 4}; + vector list_offsets = {0, 3, 3, 6}; + vector list_is_valid = {1, 0, 1, 1}; + vector struct_is_valid = {1, 1, 1, 1}; + + vector unequal_int_values = {4, 2, 3, 1}; + vector unequal_list_values = {'j', 'o', 'e', 'b', 'o', 'b', 'l', 'u', 'c', 'y'}; + vector unequal_list_offsets = {0, 3, 4, 6}; + vector unequal_list_is_valid = {1, 1, 1, 1}; + vector unequal_struct_is_valid = {1, 0, 0, 1}; + + ListBuilder* list_vb = static_cast(builder_->field_builder(0).get()); + Int8Builder* char_vb = static_cast(list_vb->value_builder().get()); + Int32Builder* int_vb = static_cast(builder_->field_builder(1).get()); + ASSERT_OK(builder_->Reserve(list_lengths.size())); + ASSERT_OK(char_vb->Reserve(list_values.size())); + ASSERT_OK(int_vb->Reserve(int_values.size())); + + // setup two equal arrays, one of which takes an unequal bitmap + builder_->Append(struct_is_valid.size(), struct_is_valid.data()); + list_vb->Append(list_offsets.data(), list_offsets.size(), list_is_valid.data()); + for (int8_t value : list_values) { + char_vb->UnsafeAppend(value); + } + for (int32_t value : int_values) { + int_vb->UnsafeAppend(value); + } + array = builder_->Finish(); + + ASSERT_OK(builder_->Resize(list_lengths.size())); + ASSERT_OK(char_vb->Resize(list_values.size())); + ASSERT_OK(int_vb->Resize(int_values.size())); + + builder_->Append(struct_is_valid.size(), struct_is_valid.data()); + list_vb->Append(list_offsets.data(), list_offsets.size(), list_is_valid.data()); + for (int8_t value : list_values) { + char_vb->UnsafeAppend(value); + } + for (int32_t value : int_values) { + int_vb->UnsafeAppend(value); + } + equal_array = builder_->Finish(); + + ASSERT_OK(builder_->Resize(list_lengths.size())); + ASSERT_OK(char_vb->Resize(list_values.size())); + ASSERT_OK(int_vb->Resize(int_values.size())); + + // setup an unequal one with the unequal bitmap + builder_->Append(unequal_struct_is_valid.size(), unequal_struct_is_valid.data()); + list_vb->Append(list_offsets.data(), list_offsets.size(), list_is_valid.data()); + for (int8_t value : list_values) { + char_vb->UnsafeAppend(value); + } + for (int32_t value : int_values) { + int_vb->UnsafeAppend(value); + } + unequal_bitmap_array = builder_->Finish(); + + ASSERT_OK(builder_->Resize(list_lengths.size())); + ASSERT_OK(char_vb->Resize(list_values.size())); + ASSERT_OK(int_vb->Resize(int_values.size())); + + // setup an unequal one with unequal offsets + builder_->Append(struct_is_valid.size(), struct_is_valid.data()); + list_vb->Append(unequal_list_offsets.data(), unequal_list_offsets.size(), + unequal_list_is_valid.data()); + for (int8_t value : list_values) { + char_vb->UnsafeAppend(value); + } + for (int32_t value : int_values) { + int_vb->UnsafeAppend(value); + } + unequal_offsets_array = builder_->Finish(); + + ASSERT_OK(builder_->Resize(list_lengths.size())); + ASSERT_OK(char_vb->Resize(list_values.size())); + ASSERT_OK(int_vb->Resize(int_values.size())); + + // setup anunequal one with unequal values + builder_->Append(struct_is_valid.size(), struct_is_valid.data()); + list_vb->Append(list_offsets.data(), list_offsets.size(), list_is_valid.data()); + for (int8_t value : unequal_list_values) { + char_vb->UnsafeAppend(value); + } + for (int32_t value : unequal_int_values) { + int_vb->UnsafeAppend(value); + } + unequal_values_array = builder_->Finish(); + + // Test array equality + EXPECT_TRUE(array->Equals(array)); + EXPECT_TRUE(array->Equals(equal_array)); + EXPECT_TRUE(equal_array->Equals(array)); + EXPECT_FALSE(equal_array->Equals(unequal_bitmap_array)); + EXPECT_FALSE(unequal_bitmap_array->Equals(equal_array)); + EXPECT_FALSE(unequal_bitmap_array->Equals(unequal_values_array)); + EXPECT_FALSE(unequal_values_array->Equals(unequal_bitmap_array)); + EXPECT_FALSE(unequal_bitmap_array->Equals(unequal_offsets_array)); + EXPECT_FALSE(unequal_offsets_array->Equals(unequal_bitmap_array)); + + // Test range equality + EXPECT_TRUE(array->RangeEquals(0, 4, 0, equal_array)); + EXPECT_TRUE(array->RangeEquals(3, 4, 3, unequal_bitmap_array)); + EXPECT_TRUE(array->RangeEquals(0, 1, 0, unequal_offsets_array)); + EXPECT_FALSE(array->RangeEquals(0, 2, 0, unequal_offsets_array)); + EXPECT_FALSE(array->RangeEquals(1, 2, 1, unequal_offsets_array)); + EXPECT_FALSE(array->RangeEquals(0, 1, 0, unequal_values_array)); + EXPECT_TRUE(array->RangeEquals(1, 3, 1, unequal_values_array)); + EXPECT_FALSE(array->RangeEquals(3, 4, 3, unequal_values_array)); +} TEST_F(TestStructBuilder, TestZeroLength) { // All buffers are null diff --git a/cpp/src/arrow/types/struct.cc b/cpp/src/arrow/types/struct.cc index 5b81a954111db..4deddf6f9f39b 100644 --- a/cpp/src/arrow/types/struct.cc +++ b/cpp/src/arrow/types/struct.cc @@ -21,30 +21,32 @@ namespace arrow { -bool StructArray::EqualsExact(const StructArray& other) const { - if (this == &other) { return true; } - if (field_arrays_.size() != other.fields().size()) { return false; } - if (null_count_ != other.null_count_) { return false; } +bool StructArray::Equals(const std::shared_ptr& arr) const { + if (this == arr.get()) { return true; } + if (!arr) { return false; } + if (this->type_enum() != arr->type_enum()) { return false; } + if (null_count_ != arr->null_count()) { return false; } + return RangeEquals(0, length_, 0, arr); +} - if (null_count_ > 0) { - bool equal_null_bitmap = - null_bitmap_->Equals(*other.null_bitmap_, util::bytes_for_bits(length_)); - if (!equal_null_bitmap) { return false; } +bool StructArray::RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx, + const std::shared_ptr& arr) const { + if (this == arr.get()) { return true; } + if (!arr) { return false; } + if (Type::STRUCT != arr->type_enum()) { return false; } + const auto other = static_cast(arr.get()); - bool fields_equal = true; - for (size_t i = 0; i < field_arrays_.size(); ++i) { - fields_equal = field_arrays_[i].get()->Equals(other.field(i)); - if (!fields_equal) { return false; } + bool equal_fields = true; + for (int32_t i = start_idx, o_i = other_start_idx; i < end_idx; ++i, ++o_i) { + if (IsNull(i) != arr->IsNull(o_i)) { return false; } + if (IsNull(i)) continue; + for (size_t j = 0; j < field_arrays_.size(); ++j) { + equal_fields = field(j)->RangeEquals(i, i + 1, o_i, other->field(j)); + if (!equal_fields) { return false; } } } - return true; -} -bool StructArray::Equals(const std::shared_ptr& arr) const { - if (this == arr.get()) { return true; } - if (!arr) { return false; } - if (this->type_enum() != arr->type_enum()) { return false; } - return EqualsExact(*static_cast(arr.get())); + return true; } Status StructArray::Validate() const { @@ -58,7 +60,7 @@ Status StructArray::Validate() const { // Validate fields int32_t array_length = field_arrays_[0]->length(); for (auto it : field_arrays_) { - if (!(it->length() == array_length)) { + if (it->length() != array_length) { std::stringstream ss; ss << "Length is not equal from field " << it->type()->ToString(); return Status::Invalid(ss.str()); @@ -73,7 +75,6 @@ Status StructArray::Validate() const { } // Validate null bitmap - // TODO: No checks for the consistency of bitmaps. Its cost maybe expensive. if (array_length > 0 && array_length != length_) { return Status::Invalid("Struct's length is not equal to its child arrays"); } diff --git a/cpp/src/arrow/types/struct.h b/cpp/src/arrow/types/struct.h index b18688c7d35ab..78afd29eb8df5 100644 --- a/cpp/src/arrow/types/struct.h +++ b/cpp/src/arrow/types/struct.h @@ -51,6 +51,8 @@ class StructArray : public Array { bool EqualsExact(const StructArray& other) const; bool Equals(const std::shared_ptr& arr) const override; + bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx, + const std::shared_ptr& arr) const override; protected: // The child arrays corresponding to each field of the struct data type. @@ -59,7 +61,7 @@ class StructArray : public Array { // --------------------------------------------------------------------------------- // StructArray builder -// Append, Appends, Resize and Reserve methods are acting on StructBuilder. +// Append, Resize and Reserve methods are acting on StructBuilder. // Please make sure all these methods of all child-builders' are consistently // called to maintain data-structure consistency. class StructBuilder : public ArrayBuilder { @@ -70,11 +72,11 @@ class StructBuilder : public ArrayBuilder { field_builders_ = field_builders; } - // null_bitmap is of equal length to every child field, and any zero byte + // Null bitmap is of equal length to every child field, and any zero byte // will be considered as a null for that field, but users must using app- // end methods or advance methods of the child builders' independently to // insert data. - Status Appends(int32_t length, const uint8_t* valid_bytes) { + Status Append(int32_t length, const uint8_t* valid_bytes) { RETURN_NOT_OK(Reserve(length)); UnsafeAppendToBitmap(valid_bytes, length); return Status::OK(); From 190967f9990e8a1ec6e8a631c3fc8731e9adfbb3 Mon Sep 17 00:00:00 2001 From: fengguangyuan Date: Sat, 28 May 2016 20:33:24 +0800 Subject: [PATCH 5/5] ARROW-60: [C++] Struct type builder API Add field index and TODO comment. --- cpp/src/arrow/types/struct.cc | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/types/struct.cc b/cpp/src/arrow/types/struct.cc index 4deddf6f9f39b..e8176f08268b4 100644 --- a/cpp/src/arrow/types/struct.cc +++ b/cpp/src/arrow/types/struct.cc @@ -41,6 +41,8 @@ bool StructArray::RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_ if (IsNull(i) != arr->IsNull(o_i)) { return false; } if (IsNull(i)) continue; for (size_t j = 0; j < field_arrays_.size(); ++j) { + // TODO: really we should be comparing stretches of non-null data rather + // than looking at one value at a time. equal_fields = field(j)->RangeEquals(i, i + 1, o_i, other->field(j)); if (!equal_fields) { return false; } } @@ -59,22 +61,25 @@ Status StructArray::Validate() const { if (field_arrays_.size() > 0) { // Validate fields int32_t array_length = field_arrays_[0]->length(); + size_t idx = 0; for (auto it : field_arrays_) { if (it->length() != array_length) { std::stringstream ss; - ss << "Length is not equal from field " << it->type()->ToString(); + ss << "Length is not equal from field " << it->type()->ToString() + << " at position {" << idx << "}"; return Status::Invalid(ss.str()); } const Status child_valid = it->Validate(); if (!child_valid.ok()) { std::stringstream ss; - ss << "Child array invalid: " << child_valid.ToString(); + ss << "Child array invalid: " << child_valid.ToString() << " at position {" << idx + << "}"; return Status::Invalid(ss.str()); } + ++idx; } - // Validate null bitmap if (array_length > 0 && array_length != length_) { return Status::Invalid("Struct's length is not equal to its child arrays"); }