Skip to content

Commit

Permalink
rewrite union builder to share a base class, let children_ be indexed…
Browse files Browse the repository at this point in the history
… by type_id
  • Loading branch information
bkietz committed Jul 11, 2019
1 parent 37de5f2 commit fd64c1b
Show file tree
Hide file tree
Showing 4 changed files with 113 additions and 171 deletions.
9 changes: 0 additions & 9 deletions cpp/src/arrow/array-union-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -188,15 +188,6 @@ TEST_F(TestUnionArrayFactories, TestMakeSparse) {
type_codes);
}

// FIXME(bkietz):
// segfault in test_serialization.py: trying to roundtrip {(): 2}
// this is a dict -> list<struct<
// union<list<union<>>>,
// union<int64>
// >>
// This breaks when trying to deserialize the *values* array, which has null offsets for
// some reason

template <typename B>
class UnionBuilderTest : public ::testing::Test {
public:
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/array/builder_nested.cc
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ Status ListBuilder::CheckNextOffset() const {
const int64_t num_values = value_builder_->length();
ARROW_RETURN_IF(
num_values > kListMaximumElements,
Status::CapacityError("ListArray cannot contain more then 2^31 - 1 child elements,",
Status::CapacityError("ListArray cannot contain more than 2^31 - 1 child elements,",
" have ", num_values));
return Status::OK();
}
Expand Down
138 changes: 49 additions & 89 deletions cpp/src/arrow/array/builder_union.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,122 +26,82 @@ namespace arrow {

using internal::checked_cast;

DenseUnionBuilder::DenseUnionBuilder(MemoryPool* pool)
: ArrayBuilder(nullptr, pool), types_builder_(pool), offsets_builder_(pool) {}

DenseUnionBuilder::DenseUnionBuilder(MemoryPool* pool,
std::vector<std::shared_ptr<ArrayBuilder>> children,
const std::shared_ptr<DataType>& type)
: ArrayBuilder(type, pool), types_builder_(pool), offsets_builder_(pool) {
auto union_type = checked_cast<const UnionType*>(type.get());
DCHECK_NE(union_type, nullptr);
type_id_to_child_num_.resize(union_type->max_type_code() + 1, -1);
DCHECK_EQ(union_type->mode(), UnionMode::DENSE);
int child_i = 0;
for (auto type_id : union_type->type_codes()) {
type_id_to_child_num_[type_id] = child_i++;
}
children_ = std::move(children);
}

Status DenseUnionBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
std::shared_ptr<Buffer> types, offsets, null_bitmap;
Status BasicUnionBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
std::shared_ptr<Buffer> types, null_bitmap;
RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
RETURN_NOT_OK(types_builder_.Finish(&types));
RETURN_NOT_OK(offsets_builder_.Finish(&offsets));

std::vector<std::shared_ptr<ArrayData>> child_data(children_.size());
for (size_t i = 0; i < children_.size(); ++i) {
std::shared_ptr<ArrayData> data;
RETURN_NOT_OK(children_[i]->FinishInternal(&data));
child_data[i] = data;
// If the type has not been specified in the constructor, gather type_ids
std::vector<uint8_t> type_ids;
if (type_ == nullptr) {
for (size_t i = 0; i < children_.size(); ++i) {
type_ids.push_back(static_cast<uint8_t>(i));
}
} else {
type_ids = checked_cast<const UnionType&>(*type_).type_codes();
}

std::vector<std::shared_ptr<ArrayData>> child_data(type_ids.size());
for (size_t i = 0; i < type_ids.size(); ++i) {
RETURN_NOT_OK(children_[type_ids[i]]->FinishInternal(&child_data[i]));
}

// If the type has not been specified in the constructor, infer it
if (!type_) {
if (type_ == nullptr) {
std::vector<std::shared_ptr<Field>> fields;
std::vector<uint8_t> type_ids;
for (size_t i = 0; i < children_.size(); ++i) {
fields.push_back(field(field_names_[i], children_[i]->type()));
type_ids.push_back(static_cast<uint8_t>(i));
auto field_names_it = field_names_.begin();
for (auto&& data : child_data) {
fields.push_back(field(*field_names_it++, data->type));
}
type_ = union_(fields, type_ids, UnionMode::DENSE);
type_ = union_(fields, type_ids, mode_);
}

*out = ArrayData::Make(type_, length(), {null_bitmap, types, offsets}, null_count_);
*out = ArrayData::Make(type_, length(), {null_bitmap, types, nullptr}, null_count_);
(*out)->child_data = std::move(child_data);
return Status::OK();
}

SparseUnionBuilder::SparseUnionBuilder(MemoryPool* pool)
: ArrayBuilder(nullptr, pool), types_builder_(pool) {}

SparseUnionBuilder::SparseUnionBuilder(
MemoryPool* pool, std::vector<std::shared_ptr<ArrayBuilder>> children,
BasicUnionBuilder::BasicUnionBuilder(
MemoryPool* pool, UnionMode::type mode,
const std::vector<std::shared_ptr<ArrayBuilder>>& children,
const std::shared_ptr<DataType>& type)
: ArrayBuilder(type, pool), types_builder_(pool) {
: ArrayBuilder(type, pool), mode_(mode), types_builder_(pool) {
auto union_type = checked_cast<const UnionType*>(type.get());
DCHECK_NE(union_type, nullptr);
type_id_to_child_num_.resize(union_type->max_type_code() + 1, -1);
DCHECK_EQ(union_type->mode(), UnionMode::SPARSE);
int child_i = 0;
for (auto type_id : union_type->type_codes()) {
type_id_to_child_num_[type_id] = child_i++;
}
children_ = std::move(children);
for (auto&& child : children_) {
DCHECK_EQ(child->length(), 0);
}
}
DCHECK_EQ(union_type->mode(), mode);

Status SparseUnionBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
std::shared_ptr<Buffer> types, offsets, null_bitmap;
RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
RETURN_NOT_OK(types_builder_.Finish(&types));
// NB: children_ is indexed by the child array's type_id, *not* by the index
// of the child_data in the Finished array data
children_.resize(union_type->max_type_code() + 1, nullptr);

std::vector<std::shared_ptr<ArrayData>> child_data(children_.size());
for (size_t i = 0; i < children_.size(); ++i) {
std::shared_ptr<ArrayData> data;
RETURN_NOT_OK(children_[i]->FinishInternal(&data));
child_data[i] = data;
}

// If the type has not been specified in the constructor, infer it
if (!type_) {
std::vector<std::shared_ptr<Field>> fields;
std::vector<uint8_t> type_ids;
for (size_t i = 0; i < children_.size(); ++i) {
fields.push_back(field(field_names_[i], children_[i]->type()));
type_ids.push_back(static_cast<uint8_t>(i));
}
type_ = union_(fields, type_ids, UnionMode::SPARSE);
auto field_it = type->children().begin();
auto children_it = children.begin();
for (auto type_id : union_type->type_codes()) {
children_[type_id] = *children_it++;
field_names_.push_back((*field_it++)->name());
}

*out = ArrayData::Make(type_, length(), {null_bitmap, types, offsets}, null_count_);
(*out)->child_data = std::move(child_data);
return Status::OK();
DCHECK_EQ(children_it, children.end());
DCHECK_EQ(field_it, type->children().end());
}

int8_t SparseUnionBuilder::AppendChild(const std::shared_ptr<ArrayBuilder>& child,
const std::string& field_name) {
int8_t BasicUnionBuilder::AppendChild(const std::shared_ptr<ArrayBuilder>& new_child,
const std::string& field_name) {
// force type inferrence in Finish
type_ = NULLPTR;
DCHECK_EQ(child->length(), length_);
type_ = nullptr;

children_.push_back(child);
field_names_.push_back(field_name);
auto child_num = static_cast<int8_t>(children_.size() - 1);
// search for an available type_id
// FIXME(bkietz) far from optimal
auto max_type = static_cast<int8_t>(type_id_to_child_num_.size());
for (int8_t type = 0; type < max_type; ++type) {
if (type_id_to_child_num_[type] == -1) {
type_id_to_child_num_[type] = child_num;
return type;

for (int8_t type_id = dense_type_id_; static_cast<size_t>(type_id) < children_.size();
++type_id) {
if (children_[type_id] == NULLPTR) {
children_[type_id] = new_child;
return type_id;
}
dense_type_id_ = type_id;
}
type_id_to_child_num_.push_back(child_num);
return max_type;

children_.push_back(new_child);
return dense_type_id_++;
}

} // namespace arrow
135 changes: 63 additions & 72 deletions cpp/src/arrow/array/builder_union.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,20 +27,64 @@

namespace arrow {

class ARROW_EXPORT BasicUnionBuilder : public ArrayBuilder {
public:
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

/// \cond FALSE
using ArrayBuilder::Finish;
/// \endcond

Status Finish(std::shared_ptr<UnionArray>* out) { return FinishTyped(out); }

/// \brief Make a new child builder available to the UnionArray
///
/// \param[in] new_child the child builder
/// \param[in] field_name the name of the field in the union array type
/// if type inference is used
/// \return child index, which is the "type" argument that needs
/// to be passed to the "Append" method to add a new element to
/// the union array.
int8_t AppendChild(const std::shared_ptr<ArrayBuilder>& new_child,
const std::string& field_name = "");

protected:
/// Use this constructor to initialize the UnionBuilder with no child builders,
/// allowing type to be inferred. You will need to call AppendChild for each of the
/// children builders you want to use.
explicit BasicUnionBuilder(MemoryPool* pool, UnionMode::type mode)
: ArrayBuilder(nullptr, pool), mode_(mode), types_builder_(pool) {}

/// Use this constructor to specify the type explicitly.
/// You can still add child builders to the union after using this constructor
BasicUnionBuilder(MemoryPool* pool, UnionMode::type mode,
const std::vector<std::shared_ptr<ArrayBuilder>>& children,
const std::shared_ptr<DataType>& type);

UnionMode::type mode_;
int8_t dense_type_id_ = 0;
TypedBufferBuilder<int8_t> types_builder_;
std::vector<std::string> field_names_;
};

/// \class DenseUnionBuilder
///
/// This API is EXPERIMENTAL.
class ARROW_EXPORT DenseUnionBuilder : public ArrayBuilder {
class ARROW_EXPORT DenseUnionBuilder : public BasicUnionBuilder {
public:
/// Use this constructor to initialize the UnionBuilder with no child builders,
/// allowing type to be inferred. You will need to call AppendChild for each of the
/// children builders you want to use.
explicit DenseUnionBuilder(MemoryPool* pool);
explicit DenseUnionBuilder(MemoryPool* pool)
: BasicUnionBuilder(pool, UnionMode::DENSE) {}

/// Use this constructor to specify the type explicitly.
/// You can still add child builders to the union after using this constructor
DenseUnionBuilder(MemoryPool* pool, std::vector<std::shared_ptr<ArrayBuilder>> children,
const std::shared_ptr<DataType>& type);
DenseUnionBuilder(MemoryPool* pool,
const std::vector<std::shared_ptr<ArrayBuilder>>& children,
const std::shared_ptr<DataType>& type)
: BasicUnionBuilder(pool, UnionMode::DENSE, children, type),
offsets_builder_(pool) {}

Status AppendNull() final {
ARROW_RETURN_NOT_OK(types_builder_.Append(0));
Expand All @@ -63,71 +107,42 @@ class ARROW_EXPORT DenseUnionBuilder : public ArrayBuilder {
/// is called.
Status Append(int8_t next_type) {
ARROW_RETURN_NOT_OK(types_builder_.Append(next_type));
auto offset =
static_cast<int32_t>(children_[type_id_to_child_num_[next_type]]->length());
if (children_[next_type]->length() == kListMaximumElements) {
return Status::CapacityError(
"a dense UnionArray cannot contain more than 2^31 - 1 elements from a single "
"child");
}
auto offset = static_cast<int32_t>(children_[next_type]->length());
ARROW_RETURN_NOT_OK(offsets_builder_.Append(offset));
return AppendToBitmap(true);
}

Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

/// \cond FALSE
using ArrayBuilder::Finish;
/// \endcond

Status Finish(std::shared_ptr<UnionArray>* out) { return FinishTyped(out); }

/// \brief Make a new child builder available to the UnionArray
///
/// \param[in] child the child builder
/// \param[in] field_name the name of the field in the union array type
/// if type inference is used
/// \return child index, which is the "type" argument that needs
/// to be passed to the "Append" method to add a new element to
/// the union array.
int8_t AppendChild(const std::shared_ptr<ArrayBuilder>& child,
const std::string& field_name = "") {
// force type inferrence in Finish
type_ = NULLPTR;

children_.push_back(child);
field_names_.push_back(field_name);
auto child_num = static_cast<int8_t>(children_.size() - 1);
// search for an available type_id
// FIXME(bkietz) far from optimal
auto max_type = static_cast<int8_t>(type_id_to_child_num_.size());
for (int8_t type = 0; type < max_type; ++type) {
if (type_id_to_child_num_[type] == -1) {
type_id_to_child_num_[type] = child_num;
return type;
}
}
type_id_to_child_num_.push_back(child_num);
return max_type;
Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
ARROW_RETURN_NOT_OK(BasicUnionBuilder::FinishInternal(out));
return offsets_builder_.Finish(&(*out)->buffers[2]);
}

private:
TypedBufferBuilder<int8_t> types_builder_;
TypedBufferBuilder<int32_t> offsets_builder_;
std::vector<std::string> field_names_;
std::vector<int8_t> type_id_to_child_num_;
};

/// \class SparseUnionBuilder
///
/// This API is EXPERIMENTAL.
class ARROW_EXPORT SparseUnionBuilder : public ArrayBuilder {
class ARROW_EXPORT SparseUnionBuilder : public BasicUnionBuilder {
public:
/// Use this constructor to initialize the UnionBuilder with no child builders,
/// allowing type to be inferred. You will need to call AppendChild for each of the
/// children builders you want to use.
explicit SparseUnionBuilder(MemoryPool* pool);
explicit SparseUnionBuilder(MemoryPool* pool)
: BasicUnionBuilder(pool, UnionMode::SPARSE) {}

/// Use this constructor to specify the type explicitly.
/// You can still add child builders to the union after using this constructor
SparseUnionBuilder(MemoryPool* pool,
std::vector<std::shared_ptr<ArrayBuilder>> children,
const std::shared_ptr<DataType>& type);
const std::vector<std::shared_ptr<ArrayBuilder>>& children,
const std::shared_ptr<DataType>& type)
: BasicUnionBuilder(pool, UnionMode::SPARSE, children, type) {}

Status AppendNull() final {
ARROW_RETURN_NOT_OK(types_builder_.Append(0));
Expand All @@ -150,30 +165,6 @@ class ARROW_EXPORT SparseUnionBuilder : public ArrayBuilder {
ARROW_RETURN_NOT_OK(types_builder_.Append(next_type));
return AppendToBitmap(true);
}

Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

/// \cond FALSE
using ArrayBuilder::Finish;
/// \endcond

Status Finish(std::shared_ptr<UnionArray>* out) { return FinishTyped(out); }

/// \brief Make a new child builder available to the UnionArray
///
/// \param[in] child the child builder
/// \param[in] field_name the name of the field in the union array type
/// if type inference is used
/// \return child index, which is the "type" argument that needs
/// to be passed to the "Append" method to add a new element to
/// the union array.
int8_t AppendChild(const std::shared_ptr<ArrayBuilder>& child,
const std::string& field_name = "");

private:
TypedBufferBuilder<int8_t> types_builder_;
std::vector<std::string> field_names_;
std::vector<int8_t> type_id_to_child_num_;
};

} // namespace arrow

0 comments on commit fd64c1b

Please sign in to comment.