From e80aed8744c6efd8165858402fd3a50436d308fd Mon Sep 17 00:00:00 2001 From: Sarah Gilmore Date: Thu, 21 Sep 2023 10:53:28 -0400 Subject: [PATCH 1/6] 1. Add CanReferenceFieldByName to StructArray 2. Add CanReferenceFieldsByNames to StructArray --- cpp/src/arrow/array/array_nested.cc | 16 ++++++++++++++++ cpp/src/arrow/array/array_nested.h | 6 ++++++ 2 files changed, 22 insertions(+) diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc index df60074c78470..5e22cae480a48 100644 --- a/cpp/src/arrow/array/array_nested.cc +++ b/cpp/src/arrow/array/array_nested.cc @@ -627,6 +627,22 @@ std::shared_ptr StructArray::GetFieldByName(const std::string& name) cons return i == -1 ? nullptr : field(i); } +Status StructArray::CanReferenceFieldByName(const std::string& name) const { + if (GetFieldByName(name) == nullptr) { + return Status::Invalid("Field named '", name, + "' not found or not unique in the struct."); + } + return Status::OK(); +} + +Status StructArray::CanReferenceFieldsByNames( + const std::vector& names) const { + for (const auto& name: names) { + ARROW_RETURN_NOT_OK(CanReferenceFieldByName(name)); + } + return Status::OK(); +} + Result StructArray::Flatten(MemoryPool* pool) const { ArrayVector flattened; flattened.resize(data_->child_data.size()); diff --git a/cpp/src/arrow/array/array_nested.h b/cpp/src/arrow/array/array_nested.h index 47c1db039ccc9..8d5cc95fec00d 100644 --- a/cpp/src/arrow/array/array_nested.h +++ b/cpp/src/arrow/array/array_nested.h @@ -404,6 +404,12 @@ class ARROW_EXPORT StructArray : public Array { /// Returns null if name not found std::shared_ptr GetFieldByName(const std::string& name) const; + /// Indicate if field named `name` can be found unambiguously in the struct. + Status CanReferenceFieldByName(const std::string& name) const; + + /// Indicate if fields named `names` can be found unambiguously in the struct. 
+ Status CanReferenceFieldsByNames(const std::vector& names) const; + /// \brief Flatten this array as a vector of arrays, one for each field /// /// \param[in] pool The pool to allocate null bitmaps from, if necessary From e2a821ddefe1409a5c425ddfadd2217890eeb794 Mon Sep 17 00:00:00 2001 From: Sarah Gilmore Date: Thu, 21 Sep 2023 14:09:33 -0400 Subject: [PATCH 2/6] Add CanReferenceFieldByName method to Schema --- cpp/src/arrow/type.cc | 14 +++++++++----- cpp/src/arrow/type.h | 3 +++ 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 3d294a3fa8642..295559ce2dba3 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -1847,14 +1847,18 @@ std::vector Schema::GetAllFieldIndices(const std::string& name) const { return result; } +Status Schema::CanReferenceFieldByName(const std::string& name) const { + if (GetFieldByName(name) == nullptr) { + return Status::Invalid("Field named '", name, + "' not found or not unique in the schema."); + } + return Status::OK(); +} + Status Schema::CanReferenceFieldsByNames(const std::vector& names) const { for (const auto& name : names) { - if (GetFieldByName(name) == nullptr) { - return Status::Invalid("Field named '", name, - "' not found or not unique in the schema."); - } + ARROW_RETURN_NOT_OK(CanReferenceFieldByName(name)); } - return Status::OK(); } diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 718540d449226..19910979287cc 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -2048,6 +2048,9 @@ class ARROW_EXPORT Schema : public detail::Fingerprintable, /// Return the indices of all fields having this name std::vector GetAllFieldIndices(const std::string& name) const; + /// Indicate if field named `name` can be found unambiguously in the schema. + Status CanReferenceFieldByName(const std::string& name) const; + /// Indicate if fields named `names` can be found unambiguously in the schema. 
Status CanReferenceFieldsByNames(const std::vector& names) const; From 2b062b10987374f78a8ebc8f6eee8d5a32e1f245 Mon Sep 17 00:00:00 2001 From: Sarah Gilmore Date: Thu, 21 Sep 2023 14:21:36 -0400 Subject: [PATCH 3/6] Add unit tests for Schema::CanReferenceFieldByName --- cpp/src/arrow/type_test.cc | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/cpp/src/arrow/type_test.cc b/cpp/src/arrow/type_test.cc index c55b33b4151e4..3dbefdcf0c564 100644 --- a/cpp/src/arrow/type_test.cc +++ b/cpp/src/arrow/type_test.cc @@ -548,6 +548,24 @@ TEST_F(TestSchema, GetFieldDuplicates) { ASSERT_EQ(results.size(), 0); } +TEST_F(TestSchema, CanReferenceFieldByName) { + auto f0 = field("f0", int32()); + auto f1 = field("f1", uint8(), false); + auto f2 = field("f2", utf8()); + auto f3 = field("f1", list(int16())); + + auto schema = ::arrow::schema({f0, f1, f2, f3}); + + ASSERT_OK(schema->CanReferenceFieldByName("f0")); + ASSERT_OK(schema->CanReferenceFieldByName("f2")); + + // Not found + ASSERT_RAISES(Invalid, schema->CanReferenceFieldByName("nope")); + + // Duplicates + ASSERT_RAISES(Invalid, schema->CanReferenceFieldByName("f1")); +} + TEST_F(TestSchema, CanReferenceFieldsByNames) { auto f0 = field("f0", int32()); auto f1 = field("f1", uint8(), false); From 95fa14b1909a90d105ff3f79795ec4036b9141ea Mon Sep 17 00:00:00 2001 From: Sarah Gilmore Date: Thu, 21 Sep 2023 14:40:25 -0400 Subject: [PATCH 4/6] Add unit tests for StructArray::CanReferenceFieldByName and StructArray::CanReferenceFieldsByNames --- cpp/src/arrow/array/array_struct_test.cc | 52 ++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/cpp/src/arrow/array/array_struct_test.cc b/cpp/src/arrow/array/array_struct_test.cc index 318c83860e009..73d53a7efa59b 100644 --- a/cpp/src/arrow/array/array_struct_test.cc +++ b/cpp/src/arrow/array/array_struct_test.cc @@ -303,6 +303,58 @@ TEST(StructArray, FlattenOfSlice) { ASSERT_OK(arr->ValidateFull()); } +TEST(StructArray, CanReferenceFieldByName) { + 
auto a = ArrayFromJSON(int8(), "[4, 5]"); + auto b = ArrayFromJSON(int16(), "[6, 7]"); + auto c = ArrayFromJSON(int32(), "[8, 9]"); + auto d = ArrayFromJSON(int64(), "[10, 11]"); + auto children = std::vector>{a, b, c, d}; + + auto f0 = field("f0", int8()); + auto f1 = field("f1", int16()); + auto f2 = field("f2", int32()); + auto f3 = field("f1", int64()); + auto type = struct_({f0, f1, f2, f3}); + + auto arr = std::make_shared(type, 2, children); + + ASSERT_OK(arr->CanReferenceFieldByName("f0")); + ASSERT_OK(arr->CanReferenceFieldByName("f2")); + // Not found + ASSERT_RAISES(Invalid, arr->CanReferenceFieldByName("nope")); + + // Duplicates + ASSERT_RAISES(Invalid, arr->CanReferenceFieldByName("f1")); +} + +TEST(StructArray, CanReferenceFieldsByNames) { + auto a = ArrayFromJSON(int8(), "[4, 5]"); + auto b = ArrayFromJSON(int16(), "[6, 7]"); + auto c = ArrayFromJSON(int32(), "[8, 9]"); + auto d = ArrayFromJSON(int64(), "[10, 11]"); + auto children = std::vector>{a, b, c, d}; + + auto f0 = field("f0", int8()); + auto f1 = field("f1", int16()); + auto f2 = field("f2", int32()); + auto f3 = field("f1", int64()); + auto type = struct_({f0, f1, f2, f3}); + + auto arr = std::make_shared(type, 2, children); + + ASSERT_OK(arr->CanReferenceFieldsByNames({"f0", "f2"})); + ASSERT_OK(arr->CanReferenceFieldsByNames({"f2", "f0"})); + + // Not found + ASSERT_RAISES(Invalid, arr->CanReferenceFieldsByNames({"nope"})); + ASSERT_RAISES(Invalid, arr->CanReferenceFieldsByNames({"f0", "nope"})); + // Duplicates + ASSERT_RAISES(Invalid, arr->CanReferenceFieldsByNames({"f1"})); + ASSERT_RAISES(Invalid, arr->CanReferenceFieldsByNames({"f0", "f1"})); + // Both + ASSERT_RAISES(Invalid, arr->CanReferenceFieldsByNames({"f0", "f1", "nope"})); +} + // ---------------------------------------------------------------------------------- // Struct test class TestStructBuilder : public ::testing::Test { From 6f8cba7e55400a0a6d53ad653cacf8f566ac086c Mon Sep 17 00:00:00 2001 From: Sarah Gilmore Date: 
Thu, 21 Sep 2023 16:28:01 -0400 Subject: [PATCH 5/6] Fix format errors --- cpp/src/arrow/array/array_nested.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc index 5e22cae480a48..d8308c824953a 100644 --- a/cpp/src/arrow/array/array_nested.cc +++ b/cpp/src/arrow/array/array_nested.cc @@ -636,11 +636,11 @@ Status StructArray::CanReferenceFieldByName(const std::string& name) const { } Status StructArray::CanReferenceFieldsByNames( - const std::vector& names) const { - for (const auto& name: names) { - ARROW_RETURN_NOT_OK(CanReferenceFieldByName(name)); - } - return Status::OK(); + const std::vector& names) const { + for (const auto& name : names) { + ARROW_RETURN_NOT_OK(CanReferenceFieldByName(name)); + } + return Status::OK(); } Result StructArray::Flatten(MemoryPool* pool) const { From da5524f27be20ecda5c9ee028477475b0f9f81f1 Mon Sep 17 00:00:00 2001 From: Sarah Gilmore Date: Thu, 21 Sep 2023 16:40:01 -0400 Subject: [PATCH 6/6] fix indentation --- cpp/src/arrow/type.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 295559ce2dba3..47bf52660ffe9 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -1851,7 +1851,7 @@ Status Schema::CanReferenceFieldByName(const std::string& name) const { if (GetFieldByName(name) == nullptr) { return Status::Invalid("Field named '", name, "' not found or not unique in the schema."); - } + } return Status::OK(); }