From ca936cc5b19763428693dd1ce01392a183c4ffa1 Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Tue, 5 Mar 2024 14:56:49 +0100 Subject: [PATCH 1/4] Add TensorFromJSON helper function --- cpp/src/arrow/record_batch_test.cc | 51 +++++++++-------------------- cpp/src/arrow/testing/gtest_util.cc | 10 ++++++ cpp/src/arrow/testing/gtest_util.h | 7 ++++ 3 files changed, 33 insertions(+), 35 deletions(-) diff --git a/cpp/src/arrow/record_batch_test.cc b/cpp/src/arrow/record_batch_test.cc index 05a20aa487abc..5ae04cfd1419e 100644 --- a/cpp/src/arrow/record_batch_test.cc +++ b/cpp/src/arrow/record_batch_test.cc @@ -705,17 +705,12 @@ TEST_F(TestRecordBatch, ToTensorSupportedNaN) { std::vector shape = {9, 2}; const int64_t f32_size = sizeof(float); std::vector f_strides = {f32_size, f32_size * shape[0]}; - std::vector f_values = { - static_cast(NAN), 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, - static_cast(NAN), 60, 70, 80, 90}; - auto data = Buffer::Wrap(f_values); - - std::shared_ptr tensor_expected; - ASSERT_OK_AND_ASSIGN(tensor_expected, Tensor::Make(float32(), data, shape, f_strides)); + std::shared_ptr tensor_expected = TensorFromJSON( + float32(), shape, + "[NaN, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, NaN, 60, 70, 80, 90]", f_strides); EXPECT_FALSE(tensor_expected->Equals(*tensor)); EXPECT_TRUE(tensor_expected->Equals(*tensor, EqualOptions().nans_equal(true))); - CheckTensor(tensor, 18, shape, f_strides); } @@ -752,15 +747,11 @@ TYPED_TEST_P(TestBatchToTensor, SupportedTypes) { std::vector shape = {9, 3}; std::vector f_strides = {unit_size, unit_size * shape[0]}; - std::vector f_values = {1, 2, 3, 4, 5, 6, 7, 8, 9, - 10, 20, 30, 40, 50, 60, 70, 80, 90, - 100, 100, 100, 100, 100, 100, 100, 100, 100}; - auto data = Buffer::Wrap(f_values); - - std::shared_ptr tensor_expected; - ASSERT_OK_AND_ASSIGN( - tensor_expected, - Tensor::Make(TypeTraits::type_singleton(), data, shape, f_strides)); + std::shared_ptr tensor_expected = TensorFromJSON( + TypeTraits::type_singleton(), shape, + "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 60, 70, " + "80, 90, 100, 100, 100, 100, 100, 100, 100, 100, 100]", + f_strides); EXPECT_TRUE(tensor_expected->Equals(*tensor)); CheckTensor(tensor, 27, shape, f_strides); @@ -773,15 +764,11 @@ TYPED_TEST_P(TestBatchToTensor, SupportedTypes) { std::vector shape_sliced = {8, 3}; std::vector f_strides_sliced = {unit_size, unit_size * shape_sliced[0]}; - std::vector f_values_sliced = {2, 3, 4, 5, 6, 7, 8, 9, - 20, 30, 40, 50, 60, 70, 80, 90, - 100, 100, 100, 100, 100, 100, 100, 100}; - auto data_sliced = Buffer::Wrap(f_values_sliced); - - std::shared_ptr tensor_expected_sliced; - ASSERT_OK_AND_ASSIGN(tensor_expected_sliced, - Tensor::Make(TypeTraits::type_singleton(), data_sliced, - shape_sliced, f_strides_sliced)); + std::shared_ptr tensor_expected_sliced = + TensorFromJSON(TypeTraits::type_singleton(), shape_sliced, + "[2, 3, 4, 5, 6, 7, 8, 9, 20, 30, 40, 50, 60, " + "70, 80, 90, 100, 100, 100, 100, 100, 100, 100, 100]", + f_strides_sliced); EXPECT_TRUE(tensor_expected_sliced->Equals(*tensor_sliced)); CheckTensor(tensor_expected_sliced, 24, shape_sliced, f_strides_sliced); @@ -793,15 +780,9 @@ TYPED_TEST_P(TestBatchToTensor, SupportedTypes) { std::vector shape_sliced_1 = {5, 3}; std::vector f_strides_sliced_1 = {unit_size, unit_size * shape_sliced_1[0]}; - std::vector f_values_sliced_1 = { - 2, 3, 4, 5, 6, 20, 30, 40, 50, 60, 100, 100, 100, 100, 100, - }; - auto data_sliced_1 = Buffer::Wrap(f_values_sliced_1); - - std::shared_ptr tensor_expected_sliced_1; - ASSERT_OK_AND_ASSIGN(tensor_expected_sliced_1, - Tensor::Make(TypeTraits::type_singleton(), data_sliced_1, - shape_sliced_1, f_strides_sliced_1)); + std::shared_ptr tensor_expected_sliced_1 = TensorFromJSON( + TypeTraits::type_singleton(), shape_sliced_1, + "[2, 3, 4, 5, 6, 20, 30, 40, 50, 60, 100, 100, 100, 100, 100]", f_strides_sliced_1); EXPECT_TRUE(tensor_expected_sliced_1->Equals(*tensor_sliced_1)); CheckTensor(tensor_expected_sliced_1, 15, shape_sliced_1, f_strides_sliced_1); diff --git a/cpp/src/arrow/testing/gtest_util.cc b/cpp/src/arrow/testing/gtest_util.cc index 37865948882da..0e01f81e4b347 100644 --- a/cpp/src/arrow/testing/gtest_util.cc +++ b/cpp/src/arrow/testing/gtest_util.cc @@ -53,6 +53,7 @@ #include "arrow/pretty_print.h" #include "arrow/status.h" #include "arrow/table.h" +#include "arrow/tensor.h" #include "arrow/type.h" #include "arrow/util/checked_cast.h" #include "arrow/util/config.h" @@ -425,6 +426,15 @@ std::shared_ptr TableFromJSON(const std::shared_ptr& schema, return *Table::FromRecordBatches(schema, std::move(batches)); } +std::shared_ptr TensorFromJSON(const std::shared_ptr& type, + const std::vector& shape, + std::string_view json, + const std::vector& strides, + const std::vector& dim_names) { + std::shared_ptr array = ArrayFromJSON(type, json); + return *Tensor::Make(type, array->data()->buffers[1], shape, strides, dim_names); +} + Result> RunEndEncodeTableColumns( const Table& table, const std::vector& column_indices) { const int num_columns = table.num_columns(); diff --git a/cpp/src/arrow/testing/gtest_util.h b/cpp/src/arrow/testing/gtest_util.h index 916067d85b753..2480e66fcfca2 100644 --- a/cpp/src/arrow/testing/gtest_util.h +++ b/cpp/src/arrow/testing/gtest_util.h @@ -354,6 +354,13 @@ ARROW_TESTING_EXPORT std::shared_ptr
TableFromJSON(const std::shared_ptr&, const std::vector& json); +ARROW_TESTING_EXPORT +std::shared_ptr TensorFromJSON(const std::shared_ptr& type, + const std::vector& shape, + std::string_view json, + const std::vector& strides = {}, + const std::vector& dim_names = {}); + ARROW_TESTING_EXPORT Result> RunEndEncodeTableColumns( const Table& table, const std::vector& column_indices); From 3f8c82090b1a2284635a4d30b5a3e2c57355e6ec Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Wed, 6 Mar 2024 16:30:35 +0100 Subject: [PATCH 2/4] Update cpp/src/arrow/testing/gtest_util.h Co-authored-by: Rok Mihevc --- cpp/src/arrow/testing/gtest_util.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/testing/gtest_util.h b/cpp/src/arrow/testing/gtest_util.h index 2480e66fcfca2..6cd48618e950e 100644 --- a/cpp/src/arrow/testing/gtest_util.h +++ b/cpp/src/arrow/testing/gtest_util.h @@ -356,10 +356,10 @@ std::shared_ptr
TableFromJSON(const std::shared_ptr&, ARROW_TESTING_EXPORT std::shared_ptr TensorFromJSON(const std::shared_ptr& type, - const std::vector& shape, - std::string_view json, - const std::vector& strides = {}, - const std::vector& dim_names = {}); + std::string_view data, + std::string_view shape, + std::string_view dim_names = "[]", + std::string_view strides = "[]"); ARROW_TESTING_EXPORT Result> RunEndEncodeTableColumns( From fe164a921e0f04696608a07ed28e94717283a1d6 Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Wed, 13 Mar 2024 11:14:25 +0100 Subject: [PATCH 3/4] Add extra signature with json input for shape, dim_names and strides --- cpp/src/arrow/record_batch_test.cc | 17 ++++++----- cpp/src/arrow/testing/gtest_util.cc | 37 ++++++++++++++++++++++-- cpp/src/arrow/testing/gtest_util.h | 12 ++++++-- cpp/src/arrow/testing/gtest_util_test.cc | 37 ++++++++++++++++++++++++ 4 files changed, 90 insertions(+), 13 deletions(-) diff --git a/cpp/src/arrow/record_batch_test.cc b/cpp/src/arrow/record_batch_test.cc index 5ae04cfd1419e..36b9d0ae06cb6 100644 --- a/cpp/src/arrow/record_batch_test.cc +++ b/cpp/src/arrow/record_batch_test.cc @@ -706,8 +706,8 @@ TEST_F(TestRecordBatch, ToTensorSupportedNaN) { const int64_t f32_size = sizeof(float); std::vector f_strides = {f32_size, f32_size * shape[0]}; std::shared_ptr tensor_expected = TensorFromJSON( - float32(), shape, - "[NaN, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, NaN, 60, 70, 80, 90]", f_strides); + float32(), "[NaN, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, NaN, 60, 70, 80, 90]", + shape, f_strides); EXPECT_FALSE(tensor_expected->Equals(*tensor)); EXPECT_TRUE(tensor_expected->Equals(*tensor, EqualOptions().nans_equal(true))); @@ -748,10 +748,10 @@ TYPED_TEST_P(TestBatchToTensor, SupportedTypes) { std::vector shape = {9, 3}; std::vector f_strides = {unit_size, unit_size * shape[0]}; std::shared_ptr tensor_expected = TensorFromJSON( - TypeTraits::type_singleton(), shape, + TypeTraits::type_singleton(), "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 60, 70, " "80, 90, 100, 100, 100, 100, 100, 100, 100, 100, 100]", - f_strides); + shape, f_strides); EXPECT_TRUE(tensor_expected->Equals(*tensor)); CheckTensor(tensor, 27, shape, f_strides); @@ -765,10 +765,10 @@ TYPED_TEST_P(TestBatchToTensor, SupportedTypes) { std::vector shape_sliced = {8, 3}; std::vector f_strides_sliced = {unit_size, unit_size * shape_sliced[0]}; std::shared_ptr tensor_expected_sliced = - TensorFromJSON(TypeTraits::type_singleton(), shape_sliced, + TensorFromJSON(TypeTraits::type_singleton(), "[2, 3, 4, 5, 6, 7, 8, 9, 20, 30, 40, 50, 60, " "70, 80, 90, 100, 100, 100, 100, 100, 100, 100, 100]", - f_strides_sliced); + shape_sliced, f_strides_sliced); EXPECT_TRUE(tensor_expected_sliced->Equals(*tensor_sliced)); CheckTensor(tensor_expected_sliced, 24, shape_sliced, f_strides_sliced); @@ -781,8 +781,9 @@ TYPED_TEST_P(TestBatchToTensor, SupportedTypes) { std::vector shape_sliced_1 = {5, 3}; std::vector f_strides_sliced_1 = {unit_size, unit_size * shape_sliced_1[0]}; std::shared_ptr tensor_expected_sliced_1 = TensorFromJSON( - TypeTraits::type_singleton(), shape_sliced_1, - "[2, 3, 4, 5, 6, 20, 30, 40, 50, 60, 100, 100, 100, 100, 100]", f_strides_sliced_1); + TypeTraits::type_singleton(), + "[2, 3, 4, 5, 6, 20, 30, 40, 50, 60, 100, 100, 100, 100, 100]", + shape_sliced_1, f_strides_sliced_1); EXPECT_TRUE(tensor_expected_sliced_1->Equals(*tensor_sliced_1)); CheckTensor(tensor_expected_sliced_1, 15, shape_sliced_1, f_strides_sliced_1); diff --git a/cpp/src/arrow/testing/gtest_util.cc b/cpp/src/arrow/testing/gtest_util.cc index 0e01f81e4b347..95de16c715f19 100644 --- a/cpp/src/arrow/testing/gtest_util.cc +++ b/cpp/src/arrow/testing/gtest_util.cc @@ -50,6 +50,7 @@ #include "arrow/compute/api_vector.h" #include "arrow/datum.h" #include "arrow/ipc/json_simple.h" +#include "arrow/json/rapidjson_defs.h" // IWYU pragma: keep #include "arrow/pretty_print.h" #include "arrow/status.h" #include "arrow/table.h" @@ -63,6 +64,10 @@ #include "arrow/util/thread_pool.h" #include "arrow/util/windows_compatibility.h" +#include + +namespace rj = arrow::rapidjson; + namespace arrow { using internal::checked_cast; @@ -427,11 +432,39 @@ std::shared_ptr
TableFromJSON(const std::shared_ptr& schema, } std::shared_ptr TensorFromJSON(const std::shared_ptr& type, + std::string_view data, std::string_view shape, + std::string_view strides, + std::string_view dim_names) { + std::shared_ptr array = ArrayFromJSON(type, data); + + rj::Document json_shape; + json_shape.Parse(shape.data(), shape.length()); + std::vector shape_vector; + for (auto& x : json_shape.GetArray()) { + shape_vector.emplace_back(x.GetInt64()); + } + rj::Document json_strides; + json_strides.Parse(strides.data(), strides.length()); + std::vector strides_vector; + for (auto& x : json_strides.GetArray()) { + strides_vector.emplace_back(x.GetInt64()); + } + rj::Document json_dim_names; + json_dim_names.Parse(dim_names.data(), dim_names.length()); + std::vector dim_names_vector; + for (auto& x : json_dim_names.GetArray()) { + dim_names_vector.emplace_back(x.GetString()); + } + return *Tensor::Make(type, array->data()->buffers[1], shape_vector, strides_vector, + dim_names_vector); +} + +std::shared_ptr TensorFromJSON(const std::shared_ptr& type, + std::string_view data, const std::vector& shape, - std::string_view json, const std::vector& strides, const std::vector& dim_names) { - std::shared_ptr array = ArrayFromJSON(type, json); + std::shared_ptr array = ArrayFromJSON(type, data); return *Tensor::Make(type, array->data()->buffers[1], shape, strides, dim_names); } diff --git a/cpp/src/arrow/testing/gtest_util.h b/cpp/src/arrow/testing/gtest_util.h index 6cd48618e950e..85b4c1f1f0138 100644 --- a/cpp/src/arrow/testing/gtest_util.h +++ b/cpp/src/arrow/testing/gtest_util.h @@ -354,12 +354,18 @@ ARROW_TESTING_EXPORT std::shared_ptr
TableFromJSON(const std::shared_ptr&, const std::vector& json); +ARROW_TESTING_EXPORT +std::shared_ptr TensorFromJSON(const std::shared_ptr& type, + std::string_view data, std::string_view shape, + std::string_view strides = "[]", + std::string_view dim_names = "[]"); + ARROW_TESTING_EXPORT std::shared_ptr TensorFromJSON(const std::shared_ptr& type, std::string_view data, - std::string_view shape, - std::string_view dim_names = "[]", - std::string_view strides = "[]"); + const std::vector& shape, + const std::vector& strides = {}, + const std::vector& dim_names = {}); ARROW_TESTING_EXPORT Result> RunEndEncodeTableColumns( diff --git a/cpp/src/arrow/testing/gtest_util_test.cc b/cpp/src/arrow/testing/gtest_util_test.cc index 14c17a972aa06..9b4514197d776 100644 --- a/cpp/src/arrow/testing/gtest_util_test.cc +++ b/cpp/src/arrow/testing/gtest_util_test.cc @@ -21,6 +21,7 @@ #include "arrow/array/builder_decimal.h" #include "arrow/datum.h" #include "arrow/record_batch.h" +#include "arrow/tensor.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/random.h" #include "arrow/type.h" @@ -134,4 +135,40 @@ TEST_F(TestAssertContainsNaN, DatumEqual) { AssertDatumsEqual(expected_chunked, actual_chunked); } +class TestTensorFromJSON : public ::testing::Test {}; + +TEST_F(TestTensorFromJSON, FromJSONAndArray) { + std::vector shape = {9, 2}; + const int64_t i64_size = sizeof(int64_t); + std::vector f_strides = {i64_size, i64_size * shape[0]}; + std::vector f_values = {1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 20, 30, 40, 50, 60, 70, 80, 90}; + auto data = Buffer::Wrap(f_values); + + std::shared_ptr tensor_expected; + ASSERT_OK_AND_ASSIGN(tensor_expected, Tensor::Make(int64(), data, shape, f_strides)); + + std::shared_ptr result = TensorFromJSON( + int64(), "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 60, 70, 80, 90]", + shape, f_strides); + + EXPECT_TRUE(tensor_expected->Equals(*result)); +} + +TEST_F(TestTensorFromJSON, FromJSON) { + std::vector shape = {9, 2}; + std::vector values = {1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 20, 30, 40, 50, 60, 70, 80, 90}; + auto data = Buffer::Wrap(values); + + std::shared_ptr tensor_expected; + ASSERT_OK_AND_ASSIGN(tensor_expected, Tensor::Make(int64(), data, shape)); + + std::shared_ptr result = TensorFromJSON( + int64(), "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 60, 70, 80, 90]", + "[9, 2]"); + + EXPECT_TRUE(tensor_expected->Equals(*result)); +} + } // namespace arrow From 27c94f34aeaf8eeabebde6addcb2febeea104ab3 Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Wed, 13 Mar 2024 11:24:36 +0100 Subject: [PATCH 4/4] Fix linter error --- cpp/src/arrow/record_batch_test.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/record_batch_test.cc b/cpp/src/arrow/record_batch_test.cc index 36b9d0ae06cb6..db68a9a93790d 100644 --- a/cpp/src/arrow/record_batch_test.cc +++ b/cpp/src/arrow/record_batch_test.cc @@ -780,10 +780,10 @@ TYPED_TEST_P(TestBatchToTensor, SupportedTypes) { std::vector shape_sliced_1 = {5, 3}; std::vector f_strides_sliced_1 = {unit_size, unit_size * shape_sliced_1[0]}; - std::shared_ptr tensor_expected_sliced_1 = TensorFromJSON( - TypeTraits::type_singleton(), - "[2, 3, 4, 5, 6, 20, 30, 40, 50, 60, 100, 100, 100, 100, 100]", - shape_sliced_1, f_strides_sliced_1); + std::shared_ptr tensor_expected_sliced_1 = + TensorFromJSON(TypeTraits::type_singleton(), + "[2, 3, 4, 5, 6, 20, 30, 40, 50, 60, 100, 100, 100, 100, 100]", + shape_sliced_1, f_strides_sliced_1); EXPECT_TRUE(tensor_expected_sliced_1->Equals(*tensor_sliced_1)); CheckTensor(tensor_expected_sliced_1, 15, shape_sliced_1, f_strides_sliced_1);