Skip to content

Commit

Permalink
apacheGH-41055: [C++] Support flatten for combining nested list relat…
Browse files Browse the repository at this point in the history
…ed types (apache#41092)

### Rationale for this change
Support flattening arrays whose type combines several nested list-like types.

### What changes are included in this PR?
Add a recursive flatten function that automatically detects nested list-like types and keeps flattening them until a non-list type is reached.

### Are these changes tested?
Yes

### Are there any user-facing changes?
Yes, users can flatten an array that combines nested list or list-related types by using the `FlattenRecursively` API.

* GitHub Issue: apache#41055

Authored-by: ZhangHuiGui <[email protected]>
Signed-off-by: Felipe Oliveira Carvalho <[email protected]>
  • Loading branch information
ZhangHuiGui authored and tolleybot committed May 2, 2024
1 parent 8983532 commit a39b01d
Show file tree
Hide file tree
Showing 3 changed files with 144 additions and 1 deletion.
69 changes: 68 additions & 1 deletion cpp/src/arrow/array/array_list_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -735,7 +735,7 @@ class TestListArray : public ::testing::Test {
ArrayFromJSON(type, "[[1, 2], [3], [4], null, [5], [], [6]]"));
auto sliced_list_array =
std::dynamic_pointer_cast<ArrayType>(list_array->Slice(3, 4));
ASSERT_OK_AND_ASSIGN(auto flattened, list_array->Flatten());
ASSERT_OK_AND_ASSIGN(auto flattened, sliced_list_array->Flatten());
ASSERT_OK(flattened->ValidateFull());
// Note the difference between values() and Flatten().
EXPECT_TRUE(flattened->Equals(ArrayFromJSON(int32(), "[5, 6]")));
Expand Down Expand Up @@ -763,6 +763,52 @@ class TestListArray : public ::testing::Test {
<< flattened->ToString();
}

// Exercises FlattenRecursively() on arrays whose outer levels use the list
// type T of this fixture. Three scenarios are covered: two levels of T around
// int32, an input holding only a null outer element, and two levels of T
// around a fixed_size_list<int32, 2>.
void TestFlattenRecursively() {
auto inner_type = std::make_shared<T>(int32());
auto type = std::make_shared<T>(inner_type);

// Two levels of nesting: T<T<int32>>.
// The expected result below keeps null leaf values, while null and empty
// inner lists contribute no values at all.
auto nested_list_array = std::dynamic_pointer_cast<ArrayType>(ArrayFromJSON(type, R"([
[[0, 1, 2], null, [3, null]],
[null],
[[2, 9], [4], [], [6, 5]]
])"));
ASSERT_OK_AND_ASSIGN(auto flattened, nested_list_array->FlattenRecursively());
ASSERT_OK(flattened->ValidateFull());
ASSERT_EQ(10, flattened->length());
ASSERT_TRUE(
flattened->Equals(ArrayFromJSON(int32(), "[0, 1, 2, 3, null, 2, 9, 4, 6, 5]")));

// An input with no leaf values must still be flattened all the way down:
// the result type has to be the non-list leaf type (int32), not an
// intermediate list type.
nested_list_array =
std::dynamic_pointer_cast<ArrayType>(ArrayFromJSON(type, R"([null])"));
ASSERT_OK_AND_ASSIGN(flattened, nested_list_array->FlattenRecursively());
ASSERT_TRUE(flattened->type()->Equals(int32()));

// Three levels of nesting, mixing list kinds: T<T<fixed_size_list<int32, 2>>>.
// The five non-null fixed-size lists yield 5 * 2 = 10 values, three of which
// are null leaves; null lists at any level are expected to add nothing.
type = std::make_shared<T>(std::make_shared<T>(fixed_size_list(int32(), 2)));
nested_list_array = std::dynamic_pointer_cast<ArrayType>(ArrayFromJSON(type, R"([
[
[[null, 0]],
[[3, 7], null]
],
[
[[4, null], [5, 8]],
[[8, null]],
null
],
[
null
]
])"));
ASSERT_OK_AND_ASSIGN(flattened, nested_list_array->FlattenRecursively());
ASSERT_OK(flattened->ValidateFull());
ASSERT_EQ(10, flattened->length());
ASSERT_EQ(3, flattened->null_count());
ASSERT_TRUE(flattened->Equals(
ArrayFromJSON(int32(), "[null, 0, 3, 7, 4, null, 5, 8, 8, null]")));
}

Status ValidateOffsetsAndSizes(int64_t length, std::vector<offset_type> offsets,
std::vector<offset_type> sizes,
std::shared_ptr<Array> values, int64_t offset = 0) {
Expand Down Expand Up @@ -925,10 +971,12 @@ TYPED_TEST(TestListArray, BuilderPreserveFieldName) {
// Typed-test registrations for the Flatten-related checks. Each entry simply
// forwards to the fixture method of the same purpose, so the actual test logic
// lives in TestListArray and is shared by every type the suite is
// instantiated with.
TYPED_TEST(TestListArray, FlattenSimple) { this->TestFlattenSimple(); }
TYPED_TEST(TestListArray, FlattenNulls) { this->TestFlattenNulls(); }
TYPED_TEST(TestListArray, FlattenAllEmpty) { this->TestFlattenAllEmpty(); }
TYPED_TEST(TestListArray, FlattenSliced) { this->TestFlattenSliced(); }
TYPED_TEST(TestListArray, FlattenZeroLength) { this->TestFlattenZeroLength(); }
TYPED_TEST(TestListArray, TestFlattenNonEmptyBackingNulls) {
this->TestFlattenNonEmptyBackingNulls();
}
TYPED_TEST(TestListArray, FlattenRecursively) { this->TestFlattenRecursively(); }

TYPED_TEST(TestListArray, ValidateDimensions) { this->TestValidateDimensions(); }

Expand Down Expand Up @@ -1714,4 +1762,23 @@ TEST_F(TestFixedSizeListArray, Flatten) {
}
}

// Checks FlattenRecursively() on a fixed-size list nested inside another
// fixed-size list.
TEST_F(TestFixedSizeListArray, FlattenRecursively) {
// Nested fixed-size list-array: fixed_size_list(fixed_size_list(int32, 2), 2)
auto inner_type = fixed_size_list(value_type_, 2);
type_ = fixed_size_list(inner_type, 2);

// The last outer element holds two null inner lists; the expectations below
// require those to contribute no values, while null leaves inside non-null
// inner lists are kept.
auto values = std::dynamic_pointer_cast<FixedSizeListArray>(ArrayFromJSON(type_, R"([
[[0, 1], [null, 3]],
[[7, null], [2, 5]],
[null, null]
])"));
ASSERT_OK(values->ValidateFull());
ASSERT_OK_AND_ASSIGN(auto flattened, values->FlattenRecursively());
ASSERT_OK(flattened->ValidateFull());
// Four non-null inner lists of size 2 -> 8 values, 2 of them null.
ASSERT_EQ(8, flattened->length());
ASSERT_EQ(2, flattened->null_count());
AssertArraysEqual(*flattened,
*ArrayFromJSON(value_type_, "[0, 1, null, 3, 7, null, 2, 5]"));
}

} // namespace arrow
44 changes: 44 additions & 0 deletions cpp/src/arrow/array/array_nested.cc
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
#include "arrow/util/checked_cast.h"
#include "arrow/util/list_util.h"
#include "arrow/util/logging.h"
#include "arrow/util/unreachable.h"

namespace arrow {

Expand Down Expand Up @@ -469,6 +470,49 @@ inline void SetListData(VarLengthListLikeArray<TYPE>* self,
self->values_ = MakeArray(self->data_->child_data[0]);
}

// Keeps calling Flatten() on the array for as long as its type id satisfies
// is_list() or is_list_view(), and returns the first array for which neither
// holds. `memory_pool` is handed to every Flatten() call. If any Flatten()
// call fails, its error is returned immediately.
Result<std::shared_ptr<Array>> FlattenLogicalListRecursively(const Array& in_array,
                                                             MemoryPool* memory_pool) {
  // Slicing over the full length yields a std::shared_ptr<Array> to iterate on,
  // since only a const reference to the input is available here.
  std::shared_ptr<Array> current = in_array.Slice(0, in_array.length());
  while (true) {
    const Type::type id = current->type_id();
    if (!(is_list(id) || is_list_view(id))) {
      // Reached a non-list level: nothing left to flatten.
      return current;
    }
    // Dispatch on the concrete list kind so the matching Flatten() is invoked.
    switch (id) {
      case Type::LIST: {
        const auto& typed = checked_cast<const ListArray&>(*current);
        ARROW_ASSIGN_OR_RAISE(current, typed.Flatten(memory_pool));
        break;
      }
      case Type::LARGE_LIST: {
        const auto& typed = checked_cast<const LargeListArray&>(*current);
        ARROW_ASSIGN_OR_RAISE(current, typed.Flatten(memory_pool));
        break;
      }
      case Type::LIST_VIEW: {
        const auto& typed = checked_cast<const ListViewArray&>(*current);
        ARROW_ASSIGN_OR_RAISE(current, typed.Flatten(memory_pool));
        break;
      }
      case Type::LARGE_LIST_VIEW: {
        const auto& typed = checked_cast<const LargeListViewArray&>(*current);
        ARROW_ASSIGN_OR_RAISE(current, typed.Flatten(memory_pool));
        break;
      }
      case Type::FIXED_SIZE_LIST: {
        const auto& typed = checked_cast<const FixedSizeListArray&>(*current);
        ARROW_ASSIGN_OR_RAISE(current, typed.Flatten(memory_pool));
        break;
      }
      default:
        // The guard above only lets list / list-view ids through.
        Unreachable("unexpected non-list type");
    }
  }
}

} // namespace internal

// ----------------------------------------------------------------------
Expand Down
32 changes: 32 additions & 0 deletions cpp/src/arrow/array/array_nested.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,20 @@ void SetListData(VarLengthListLikeArray<TYPE>* self,
const std::shared_ptr<ArrayData>& data,
Type::type expected_type_id = TYPE::type_id);

/// \brief A version of Flatten that keeps recursively flattening until an array of
/// non-list values is reached.
///
/// Array types considered to be lists by this function:
/// - list
/// - large_list
/// - list_view
/// - large_list_view
/// - fixed_size_list
///
/// If `in_array` is not of one of these types, no flattening takes place and a
/// slice covering the whole input array is returned.
///
/// \param[in] in_array the array to flatten
/// \param[in] memory_pool the memory pool passed to each Flatten call
/// \return the flattened array of non-list type, or the error of the first
/// Flatten call that failed
///
/// \see ListArray::Flatten
ARROW_EXPORT Result<std::shared_ptr<Array>> FlattenLogicalListRecursively(
const Array& in_array, MemoryPool* memory_pool);

} // namespace internal

/// Base class for variable-sized list and list-view arrays, regardless of offset size.
Expand Down Expand Up @@ -103,6 +117,15 @@ class VarLengthListLikeArray : public Array {
return values_->Slice(value_offset(i), value_length(i));
}

/// \brief Flatten all nested list levels recursively until a non-list type is
/// reached, and return the resulting array of non-list type.
///
/// \param[in] memory_pool the memory pool forwarded to
/// internal::FlattenLogicalListRecursively
///
/// \see internal::FlattenLogicalListRecursively
Result<std::shared_ptr<Array>> FlattenRecursively(
MemoryPool* memory_pool = default_memory_pool()) const {
return internal::FlattenLogicalListRecursively(*this, memory_pool);
}

protected:
friend void internal::SetListData<TYPE>(VarLengthListLikeArray<TYPE>* self,
const std::shared_ptr<ArrayData>& data,
Expand Down Expand Up @@ -595,6 +618,15 @@ class ARROW_EXPORT FixedSizeListArray : public Array {
Result<std::shared_ptr<Array>> Flatten(
MemoryPool* memory_pool = default_memory_pool()) const;

/// \brief Flatten all nested list levels recursively until a non-list type is
/// reached, and return the resulting array of non-list type.
///
/// \param[in] memory_pool the memory pool forwarded to
/// internal::FlattenLogicalListRecursively
///
/// \see internal::FlattenLogicalListRecursively
Result<std::shared_ptr<Array>> FlattenRecursively(
MemoryPool* memory_pool = default_memory_pool()) const {
return internal::FlattenLogicalListRecursively(*this, memory_pool);
}

/// \brief Construct FixedSizeListArray from child value array and value_length
///
/// \param[in] values Array containing list values
Expand Down

0 comments on commit a39b01d

Please sign in to comment.