Skip to content

Commit

Permalink
apacheGH-37782: [C++] Add CanReferenceFieldsByNames method to `arro…
Browse files Browse the repository at this point in the history
…w::StructArray` (apache#37823)

### Rationale for this change

`arrow::Schema` has a method called `CanReferenceFieldsByNames` which callers can use prior to calling `GetFieldByName`. It would be nice if `arrow::StructArray` also had `CanReferenceFieldsByNames` as a method. 

I also think it would be nice to add a `CanReferenceFieldByName` method that accepts a `std::string` instead of a `std::vector<std::string>` to `StructArray` and `Schema`. That way, users wouldn't have to create a `std::vector` containing one `std::string` when they just have one field name.

### What changes are included in this PR?

1. Added `CanReferenceFieldsByNames` method to `StructArray`
2. Added `CanReferenceFieldByName` method to `StructArray` 
3. Added `CanReferenceFieldByName` method to `Schema`

### Are these changes tested?

Yes. I added unit tests for `CanReferenceFieldsByNames` and `CanReferenceFieldByName` to `array_struct_test.cc` and `type_test.cc`. 

### Are there any user-facing changes?

Yes. `CanReferenceFieldsByNames` and `CanReferenceFieldByName` can be called on a `StructArray`. Users can also call `CanReferenceFieldByName` on a `Schema`.

* Closes: apache#37782

Authored-by: Sarah Gilmore <[email protected]>
Signed-off-by: Sutou Kouhei <[email protected]>
  • Loading branch information
sgilmore10 authored and dgreiss committed Feb 17, 2024
1 parent 79886f1 commit 4e1ede5
Show file tree
Hide file tree
Showing 6 changed files with 104 additions and 5 deletions.
16 changes: 16 additions & 0 deletions cpp/src/arrow/array/array_nested.cc
Original file line number Diff line number Diff line change
Expand Up @@ -627,6 +627,22 @@ std::shared_ptr<Array> StructArray::GetFieldByName(const std::string& name) cons
return i == -1 ? nullptr : field(i);
}

// Check whether `name` can be used to reference a field of this struct array.
//
// The lookup is delegated to GetFieldByName(): a null result is reported as
// Status::Invalid, any non-null result as Status::OK().
Status StructArray::CanReferenceFieldByName(const std::string& name) const {
  const bool resolvable = GetFieldByName(name) != nullptr;
  if (resolvable) {
    return Status::OK();
  }
  return Status::Invalid("Field named '", name,
                         "' not found or not unique in the struct.");
}

// Check whether every entry of `names` can be used to reference a field of
// this struct array.
//
// Names are checked in order with CanReferenceFieldByName(); the first
// non-OK status is propagated. An empty `names` yields Status::OK().
Status StructArray::CanReferenceFieldsByNames(
    const std::vector<std::string>& names) const {
  for (const std::string& field_name : names) {
    ARROW_RETURN_NOT_OK(CanReferenceFieldByName(field_name));
  }
  return Status::OK();
}

Result<ArrayVector> StructArray::Flatten(MemoryPool* pool) const {
ArrayVector flattened;
flattened.resize(data_->child_data.size());
Expand Down
6 changes: 6 additions & 0 deletions cpp/src/arrow/array/array_nested.h
Original file line number Diff line number Diff line change
Expand Up @@ -404,6 +404,12 @@ class ARROW_EXPORT StructArray : public Array {
/// Returns null if name not found
std::shared_ptr<Array> GetFieldByName(const std::string& name) const;

/// \brief Indicate if field named `name` can be found unambiguously in the struct.
///
/// \param[in] name The field name to look up
/// \return Status::OK() if the name resolves to a field, otherwise
/// Status::Invalid (the name was not found or is not unique)
Status CanReferenceFieldByName(const std::string& name) const;

/// \brief Indicate if fields named `names` can be found unambiguously in the struct.
///
/// \param[in] names The field names to look up; checked in order
/// \return Status::OK() if every name resolves to a field (including when
/// `names` is empty), otherwise the Status::Invalid produced for the first
/// name that was not found or is not unique
Status CanReferenceFieldsByNames(const std::vector<std::string>& names) const;

/// \brief Flatten this array as a vector of arrays, one for each field
///
/// \param[in] pool The pool to allocate null bitmaps from, if necessary
Expand Down
52 changes: 52 additions & 0 deletions cpp/src/arrow/array/array_struct_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,58 @@ TEST(StructArray, FlattenOfSlice) {
ASSERT_OK(arr->ValidateFull());
}

// Exercises StructArray::CanReferenceFieldByName for unique, missing and
// duplicated field names.
TEST(StructArray, CanReferenceFieldByName) {
  // Four two-element children, one per struct field.
  const std::vector<std::shared_ptr<Array>> children = {
      ArrayFromJSON(int8(), "[4, 5]"), ArrayFromJSON(int16(), "[6, 7]"),
      ArrayFromJSON(int32(), "[8, 9]"), ArrayFromJSON(int64(), "[10, 11]")};

  // The name "f1" is deliberately given to both the second and the fourth field.
  const auto struct_type = struct_({field("f0", int8()), field("f1", int16()),
                                    field("f2", int32()), field("f1", int64())});

  const auto struct_array = std::make_shared<StructArray>(struct_type, 2, children);

  // Unique names are referenceable.
  ASSERT_OK(struct_array->CanReferenceFieldByName("f0"));
  ASSERT_OK(struct_array->CanReferenceFieldByName("f2"));

  // A name that is absent from the struct type is rejected.
  ASSERT_RAISES(Invalid, struct_array->CanReferenceFieldByName("nope"));

  // A name shared by two fields is rejected.
  ASSERT_RAISES(Invalid, struct_array->CanReferenceFieldByName("f1"));
}

// Exercises StructArray::CanReferenceFieldsByNames with lists of unique,
// missing and duplicated field names.
TEST(StructArray, CanReferenceFieldsByNames) {
  // Four two-element children, one per struct field.
  const std::vector<std::shared_ptr<Array>> children = {
      ArrayFromJSON(int8(), "[4, 5]"), ArrayFromJSON(int16(), "[6, 7]"),
      ArrayFromJSON(int32(), "[8, 9]"), ArrayFromJSON(int64(), "[10, 11]")};

  // The name "f1" is deliberately given to both the second and the fourth field.
  const auto struct_type = struct_({field("f0", int8()), field("f1", int16()),
                                    field("f2", int32()), field("f1", int64())});

  const auto struct_array = std::make_shared<StructArray>(struct_type, 2, children);

  // Unique names are accepted regardless of the order they are listed in.
  ASSERT_OK(struct_array->CanReferenceFieldsByNames({"f0", "f2"}));
  ASSERT_OK(struct_array->CanReferenceFieldsByNames({"f2", "f0"}));

  // Any missing name makes the whole request invalid.
  ASSERT_RAISES(Invalid, struct_array->CanReferenceFieldsByNames({"nope"}));
  ASSERT_RAISES(Invalid, struct_array->CanReferenceFieldsByNames({"f0", "nope"}));

  // Any duplicated name makes the whole request invalid.
  ASSERT_RAISES(Invalid, struct_array->CanReferenceFieldsByNames({"f1"}));
  ASSERT_RAISES(Invalid, struct_array->CanReferenceFieldsByNames({"f0", "f1"}));

  // A duplicated and a missing name together.
  ASSERT_RAISES(Invalid, struct_array->CanReferenceFieldsByNames({"f0", "f1", "nope"}));
}

// ----------------------------------------------------------------------------------
// Struct test
class TestStructBuilder : public ::testing::Test {
Expand Down
14 changes: 9 additions & 5 deletions cpp/src/arrow/type.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1847,14 +1847,18 @@ std::vector<int> Schema::GetAllFieldIndices(const std::string& name) const {
return result;
}

// Check whether `name` can be used to reference a field of this schema.
//
// The lookup is delegated to GetFieldByName(): a null result is reported as
// Status::Invalid, any non-null result as Status::OK().
Status Schema::CanReferenceFieldByName(const std::string& name) const {
  const bool resolvable = GetFieldByName(name) != nullptr;
  if (resolvable) {
    return Status::OK();
  }
  return Status::Invalid("Field named '", name,
                         "' not found or not unique in the schema.");
}

// Check whether every entry of `names` can be used to reference a field of
// this schema.
//
// Names are checked in order with CanReferenceFieldByName(); the first
// non-OK status is propagated. An empty `names` yields Status::OK().
Status Schema::CanReferenceFieldsByNames(const std::vector<std::string>& names) const {
  for (const auto& name : names) {
    // CanReferenceFieldByName() already performs the lookup and builds the
    // error status, so no separate inline GetFieldByName() check is needed.
    ARROW_RETURN_NOT_OK(CanReferenceFieldByName(name));
  }
  return Status::OK();
}

Expand Down
3 changes: 3 additions & 0 deletions cpp/src/arrow/type.h
Original file line number Diff line number Diff line change
Expand Up @@ -2048,6 +2048,9 @@ class ARROW_EXPORT Schema : public detail::Fingerprintable,
/// Return the indices of all fields having this name
std::vector<int> GetAllFieldIndices(const std::string& name) const;

/// \brief Indicate if field named `name` can be found unambiguously in the schema.
///
/// \param[in] name The field name to look up
/// \return Status::OK() if the name resolves to a field, otherwise
/// Status::Invalid (the name was not found or is not unique)
Status CanReferenceFieldByName(const std::string& name) const;

/// \brief Indicate if fields named `names` can be found unambiguously in the schema.
///
/// \param[in] names The field names to look up; checked in order
/// \return Status::OK() if every name resolves to a field (including when
/// `names` is empty), otherwise Status::Invalid naming the first entry that
/// was not found or is not unique
Status CanReferenceFieldsByNames(const std::vector<std::string>& names) const;

Expand Down
18 changes: 18 additions & 0 deletions cpp/src/arrow/type_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -548,6 +548,24 @@ TEST_F(TestSchema, GetFieldDuplicates) {
ASSERT_EQ(results.size(), 0);
}

// Exercises Schema::CanReferenceFieldByName for unique, missing and
// duplicated field names.
TEST_F(TestSchema, CanReferenceFieldByName) {
  // The name "f1" is deliberately given to both the second and the fourth field.
  const auto dup_schema =
      ::arrow::schema({field("f0", int32()), field("f1", uint8(), false),
                       field("f2", utf8()), field("f1", list(int16()))});

  // Unique names are referenceable.
  ASSERT_OK(dup_schema->CanReferenceFieldByName("f0"));
  ASSERT_OK(dup_schema->CanReferenceFieldByName("f2"));

  // A name that is absent from the schema is rejected.
  ASSERT_RAISES(Invalid, dup_schema->CanReferenceFieldByName("nope"));

  // A name shared by two fields is rejected.
  ASSERT_RAISES(Invalid, dup_schema->CanReferenceFieldByName("f1"));
}

TEST_F(TestSchema, CanReferenceFieldsByNames) {
auto f0 = field("f0", int32());
auto f1 = field("f1", uint8(), false);
Expand Down

0 comments on commit 4e1ede5

Please sign in to comment.