Skip to content

Commit

Permalink
ARROW-5588: [C++] Better support for building union arrays
Browse files Browse the repository at this point in the history
- Simplify DenseUnionBuilder
- Add SparesUnionBuilder
- MakeBuilder can now produce a {Sparse,Dense}UnionBuilder
- ArrayFromJSON can now produce union arrays

Author: Benjamin Kietzman <[email protected]>

Closes #4781 from bkietz/5588-Better-support-for-building-UnionArrays and squashes the following commits:

38e2828 <Benjamin Kietzman> iwyu #include <limits>
0efe91d <Benjamin Kietzman> address review comments
17e6e27 <Benjamin Kietzman> construct offset_builder_ with a MemoryPool
4131fe3 <Benjamin Kietzman> separate child builder array indexable by type_id
fd64c1b <Benjamin Kietzman> rewrite union builder to share a base class, let children_ be indexed by type_id
37de5f2 <Benjamin Kietzman> explicit uint8_t for msvc
673916e <Benjamin Kietzman> Disable ListOfDictionary test until ListBuilder is updated
cf1c5be <Benjamin Kietzman> revert changes to reader.cc
5742db9 <Benjamin Kietzman> debugging: highlight the broken case and a similar one
5b1ec93 <Benjamin Kietzman> improve doccomments, dedupe test code
33fade1 <Benjamin Kietzman> Adding support for DenseUnions to ArrayFromJSON
6245c82 <Benjamin Kietzman> add SparseUnionBuilder and MakeBuilder case
8d4f36d <Benjamin Kietzman> add tests for building lists where the value builder has mutable type
351905d <Benjamin Kietzman> add test for lazily typed union builder
7902d12 <Benjamin Kietzman> first pass at updating DenseUnionBuilder
d20055a <Benjamin Kietzman> minor refactors, adding some asserts
  • Loading branch information
bkietz authored and kszucs committed Jul 22, 2019
1 parent 2356c9e commit 4124a4b
Show file tree
Hide file tree
Showing 17 changed files with 788 additions and 120 deletions.
45 changes: 45 additions & 0 deletions cpp/src/arrow/array-dict-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -948,4 +948,49 @@ TEST(TestDictionary, TransposeNulls) {
AssertArraysEqual(*expected, *out);
}

TEST(TestDictionary, DISABLED_ListOfDictionary) {
std::unique_ptr<ArrayBuilder> root_builder;
ASSERT_OK(MakeBuilder(default_memory_pool(), list(dictionary(int8(), utf8())),
&root_builder));
auto list_builder = checked_cast<ListBuilder*>(root_builder.get());
auto dict_builder =
checked_cast<DictionaryBuilder<StringType>*>(list_builder->value_builder());

ASSERT_OK(list_builder->Append());
std::vector<std::string> expected;
for (char a : "abc") {
for (char d : "def") {
for (char g : "ghi") {
for (char j : "jkl") {
for (char m : "mno") {
for (char p : "pqr") {
if ((static_cast<int>(a) + d + g + j + m + p) % 16 == 0) {
ASSERT_OK(list_builder->Append());
}
// 3**6 distinct strings; too large for int8
char str[6] = {a, d, g, j, m, p};
ASSERT_OK(dict_builder->Append(str));
expected.push_back(str);
}
}
}
}
}
}
std::shared_ptr<Array> expected_dict;
ArrayFromVector<StringType, std::string>(expected, &expected_dict);

std::shared_ptr<Array> array;
ASSERT_OK(root_builder->Finish(&array));
ASSERT_OK(ValidateArray(*array));

auto expected_type = list(dictionary(int16(), utf8()));
ASSERT_EQ(array->type()->ToString(), expected_type->ToString());

auto list_array = checked_cast<const ListArray*>(array.get());
auto actual_dict =
checked_cast<const DictionaryArray&>(*list_array->values()).dictionary();
ASSERT_ARRAYS_EQUAL(*expected_dict, *actual_dict);
}

} // namespace arrow
215 changes: 215 additions & 0 deletions cpp/src/arrow/array-union-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -188,4 +188,219 @@ TEST_F(TestUnionArrayFactories, TestMakeSparse) {
type_codes);
}

template <typename B>
class UnionBuilderTest : public ::testing::Test {
public:
uint8_t I8 = 8, STR = 13, DBL = 7;

virtual void AppendInt(int8_t i) {
expected_types_vector.push_back(I8);
ASSERT_OK(union_builder->Append(I8));
ASSERT_OK(i8_builder->Append(i));
}

virtual void AppendString(const std::string& str) {
expected_types_vector.push_back(STR);
ASSERT_OK(union_builder->Append(STR));
ASSERT_OK(str_builder->Append(str));
}

virtual void AppendDouble(double dbl) {
expected_types_vector.push_back(DBL);
ASSERT_OK(union_builder->Append(DBL));
ASSERT_OK(dbl_builder->Append(dbl));
}

void AppendBasics() {
AppendInt(33);
AppendString("abc");
AppendDouble(1.0);
AppendDouble(-1.0);
AppendString("");
AppendInt(10);
AppendString("def");
AppendInt(-10);
AppendDouble(0.5);
ASSERT_OK(union_builder->Finish(&actual));
ArrayFromVector<Int8Type, uint8_t>(expected_types_vector, &expected_types);
}

void AppendInferred() {
I8 = union_builder->AppendChild(i8_builder, "i8");
ASSERT_EQ(I8, 0);
AppendInt(33);
AppendInt(10);

STR = union_builder->AppendChild(str_builder, "str");
ASSERT_EQ(STR, 1);
AppendString("abc");
AppendString("");
AppendString("def");
AppendInt(-10);

DBL = union_builder->AppendChild(dbl_builder, "dbl");
ASSERT_EQ(DBL, 2);
AppendDouble(1.0);
AppendDouble(-1.0);
AppendDouble(0.5);
ASSERT_OK(union_builder->Finish(&actual));
ArrayFromVector<Int8Type, uint8_t>(expected_types_vector, &expected_types);

ASSERT_EQ(I8, 0);
ASSERT_EQ(STR, 1);
ASSERT_EQ(DBL, 2);
}

void AppendListOfInferred(std::shared_ptr<ListArray>* actual) {
ListBuilder list_builder(default_memory_pool(), union_builder);

ASSERT_OK(list_builder.Append());
I8 = union_builder->AppendChild(i8_builder, "i8");
ASSERT_EQ(I8, 0);
AppendInt(10);

ASSERT_OK(list_builder.Append());
STR = union_builder->AppendChild(str_builder, "str");
ASSERT_EQ(STR, 1);
AppendString("abc");
AppendInt(-10);

ASSERT_OK(list_builder.Append());
DBL = union_builder->AppendChild(dbl_builder, "dbl");
ASSERT_EQ(DBL, 2);
AppendDouble(0.5);

ASSERT_OK(list_builder.Finish(actual));
ArrayFromVector<Int8Type, uint8_t>(expected_types_vector, &expected_types);

ASSERT_EQ(I8, 0);
ASSERT_EQ(STR, 1);
ASSERT_EQ(DBL, 2);
}

std::vector<uint8_t> expected_types_vector;
std::shared_ptr<Array> expected_types;
std::shared_ptr<Int8Builder> i8_builder = std::make_shared<Int8Builder>();
std::shared_ptr<StringBuilder> str_builder = std::make_shared<StringBuilder>();
std::shared_ptr<DoubleBuilder> dbl_builder = std::make_shared<DoubleBuilder>();
std::shared_ptr<B> union_builder{new B(default_memory_pool())};
std::shared_ptr<UnionArray> actual;
};

class DenseUnionBuilderTest : public UnionBuilderTest<DenseUnionBuilder> {};
class SparseUnionBuilderTest : public UnionBuilderTest<SparseUnionBuilder> {
public:
using Base = UnionBuilderTest<SparseUnionBuilder>;

void AppendInt(int8_t i) override {
Base::AppendInt(i);
ASSERT_OK(str_builder->AppendNull());
ASSERT_OK(dbl_builder->AppendNull());
}

void AppendString(const std::string& str) override {
Base::AppendString(str);
ASSERT_OK(i8_builder->AppendNull());
ASSERT_OK(dbl_builder->AppendNull());
}

void AppendDouble(double dbl) override {
Base::AppendDouble(dbl);
ASSERT_OK(i8_builder->AppendNull());
ASSERT_OK(str_builder->AppendNull());
}
};

TEST_F(DenseUnionBuilderTest, Basics) {
union_builder.reset(new DenseUnionBuilder(
default_memory_pool(), {i8_builder, str_builder, dbl_builder},
union_({field("i8", int8()), field("str", utf8()), field("dbl", float64())},
{I8, STR, DBL}, UnionMode::DENSE)));
AppendBasics();

auto expected_i8 = ArrayFromJSON(int8(), "[33, 10, -10]");
auto expected_str = ArrayFromJSON(utf8(), R"(["abc", "", "def"])");
auto expected_dbl = ArrayFromJSON(float64(), "[1.0, -1.0, 0.5]");

auto expected_offsets = ArrayFromJSON(int32(), "[0, 0, 0, 1, 1, 1, 2, 2, 2]");

std::shared_ptr<Array> expected;
ASSERT_OK(UnionArray::MakeDense(*expected_types, *expected_offsets,
{expected_i8, expected_str, expected_dbl},
{"i8", "str", "dbl"}, {I8, STR, DBL}, &expected));

ASSERT_EQ(expected->type()->ToString(), actual->type()->ToString());
ASSERT_ARRAYS_EQUAL(*expected, *actual);
}

TEST_F(DenseUnionBuilderTest, InferredType) {
AppendInferred();

auto expected_i8 = ArrayFromJSON(int8(), "[33, 10, -10]");
auto expected_str = ArrayFromJSON(utf8(), R"(["abc", "", "def"])");
auto expected_dbl = ArrayFromJSON(float64(), "[1.0, -1.0, 0.5]");

auto expected_offsets = ArrayFromJSON(int32(), "[0, 1, 0, 1, 2, 2, 0, 1, 2]");

std::shared_ptr<Array> expected;
ASSERT_OK(UnionArray::MakeDense(*expected_types, *expected_offsets,
{expected_i8, expected_str, expected_dbl},
{"i8", "str", "dbl"}, {I8, STR, DBL}, &expected));

ASSERT_EQ(expected->type()->ToString(), actual->type()->ToString());
ASSERT_ARRAYS_EQUAL(*expected, *actual);
}

TEST_F(DenseUnionBuilderTest, ListOfInferredType) {
std::shared_ptr<ListArray> actual;
AppendListOfInferred(&actual);

auto expected_type =
list(union_({field("i8", int8()), field("str", utf8()), field("dbl", float64())},
{I8, STR, DBL}, UnionMode::DENSE));
ASSERT_EQ(expected_type->ToString(), actual->type()->ToString());
}

TEST_F(SparseUnionBuilderTest, Basics) {
union_builder.reset(new SparseUnionBuilder(
default_memory_pool(), {i8_builder, str_builder, dbl_builder},
union_({field("i8", int8()), field("str", utf8()), field("dbl", float64())},
{I8, STR, DBL}, UnionMode::SPARSE)));

AppendBasics();

auto expected_i8 =
ArrayFromJSON(int8(), "[33, null, null, null, null, 10, null, -10, null]");
auto expected_str =
ArrayFromJSON(utf8(), R"([null, "abc", null, null, "", null, "def", null, null])");
auto expected_dbl =
ArrayFromJSON(float64(), "[null, null, 1.0, -1.0, null, null, null, null, 0.5]");

std::shared_ptr<Array> expected;
ASSERT_OK(UnionArray::MakeSparse(*expected_types,
{expected_i8, expected_str, expected_dbl},
{"i8", "str", "dbl"}, {I8, STR, DBL}, &expected));

ASSERT_EQ(expected->type()->ToString(), actual->type()->ToString());
ASSERT_ARRAYS_EQUAL(*expected, *actual);
}

TEST_F(SparseUnionBuilderTest, InferredType) {
AppendInferred();

auto expected_i8 =
ArrayFromJSON(int8(), "[33, 10, null, null, null, -10, null, null, null]");
auto expected_str =
ArrayFromJSON(utf8(), R"([null, null, "abc", "", "def", null, null, null, null])");
auto expected_dbl =
ArrayFromJSON(float64(), "[null, null, null, null, null, null, 1.0, -1.0, 0.5]");

std::shared_ptr<Array> expected;
ASSERT_OK(UnionArray::MakeSparse(*expected_types,
{expected_i8, expected_str, expected_dbl},
{"i8", "str", "dbl"}, {I8, STR, DBL}, &expected));

ASSERT_EQ(expected->type()->ToString(), actual->type()->ToString());
ASSERT_ARRAYS_EQUAL(*expected, *actual);
}
} // namespace arrow
1 change: 1 addition & 0 deletions cpp/src/arrow/array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -606,6 +606,7 @@ void UnionArray::SetData(const std::shared_ptr<ArrayData>& data) {

ARROW_CHECK_EQ(data->type->id(), Type::UNION);
ARROW_CHECK_EQ(data->buffers.size(), 3);
union_type_ = checked_cast<const UnionType*>(data_->type.get());

auto type_ids = data_->buffers[1];
auto value_offsets = data_->buffers[2];
Expand Down
7 changes: 4 additions & 3 deletions cpp/src/arrow/array.h
Original file line number Diff line number Diff line change
Expand Up @@ -1032,9 +1032,9 @@ class ARROW_EXPORT UnionArray : public Array {
const type_id_t* raw_type_ids() const { return raw_type_ids_ + data_->offset; }
const int32_t* raw_value_offsets() const { return raw_value_offsets_ + data_->offset; }

UnionMode::type mode() const {
return internal::checked_cast<const UnionType&>(*type()).mode();
}
const UnionType* union_type() const { return union_type_; }

UnionMode::type mode() const { return union_type_->mode(); }

// Return the given field as an individual array.
// For sparse unions, the returned array has its offset, length and null
Expand All @@ -1047,6 +1047,7 @@ class ARROW_EXPORT UnionArray : public Array {

const type_id_t* raw_type_ids_;
const int32_t* raw_value_offsets_;
const UnionType* union_type_;

// For caching boxed child data
mutable std::vector<std::shared_ptr<Array>> boxed_fields_;
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/arrow/array/builder_nested.cc
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ Status ListBuilder::CheckNextOffset() const {
const int64_t num_values = value_builder_->length();
ARROW_RETURN_IF(
num_values > kListMaximumElements,
Status::CapacityError("ListArray cannot contain more then 2^31 - 1 child elements,",
Status::CapacityError("ListArray cannot contain more than 2^31 - 1 child elements,",
" have ", num_values));
return Status::OK();
}
Expand Down Expand Up @@ -300,7 +300,7 @@ Status FixedSizeListBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
// Struct

StructBuilder::StructBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool,
std::vector<std::shared_ptr<ArrayBuilder>>&& field_builders)
std::vector<std::shared_ptr<ArrayBuilder>> field_builders)
: ArrayBuilder(type, pool) {
children_ = std::move(field_builders);
}
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/array/builder_nested.h
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ class ARROW_EXPORT FixedSizeListBuilder : public ArrayBuilder {
class ARROW_EXPORT StructBuilder : public ArrayBuilder {
public:
StructBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool,
std::vector<std::shared_ptr<ArrayBuilder>>&& field_builders);
std::vector<std::shared_ptr<ArrayBuilder>> field_builders);

Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

Expand Down
Loading

0 comments on commit 4124a4b

Please sign in to comment.