Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-40297: [C++] Add TensorFromJSON helper function #40365

Merged
merged 4 commits into from
Mar 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 17 additions & 35 deletions cpp/src/arrow/record_batch_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -705,17 +705,12 @@ TEST_F(TestRecordBatch, ToTensorSupportedNaN) {
std::vector<int64_t> shape = {9, 2};
const int64_t f32_size = sizeof(float);
std::vector<int64_t> f_strides = {f32_size, f32_size * shape[0]};
std::vector<float> f_values = {
static_cast<float>(NAN), 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40,
static_cast<float>(NAN), 60, 70, 80, 90};
auto data = Buffer::Wrap(f_values);

std::shared_ptr<Tensor> tensor_expected;
ASSERT_OK_AND_ASSIGN(tensor_expected, Tensor::Make(float32(), data, shape, f_strides));
std::shared_ptr<Tensor> tensor_expected = TensorFromJSON(
float32(), "[NaN, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, NaN, 60, 70, 80, 90]",
shape, f_strides);

EXPECT_FALSE(tensor_expected->Equals(*tensor));
EXPECT_TRUE(tensor_expected->Equals(*tensor, EqualOptions().nans_equal(true)));

CheckTensor<FloatType>(tensor, 18, shape, f_strides);
}

Expand Down Expand Up @@ -752,15 +747,11 @@ TYPED_TEST_P(TestBatchToTensor, SupportedTypes) {

std::vector<int64_t> shape = {9, 3};
std::vector<int64_t> f_strides = {unit_size, unit_size * shape[0]};
std::vector<c_data_type> f_values = {1, 2, 3, 4, 5, 6, 7, 8, 9,
10, 20, 30, 40, 50, 60, 70, 80, 90,
100, 100, 100, 100, 100, 100, 100, 100, 100};
auto data = Buffer::Wrap(f_values);

std::shared_ptr<Tensor> tensor_expected;
ASSERT_OK_AND_ASSIGN(
tensor_expected,
Tensor::Make(TypeTraits<DataType>::type_singleton(), data, shape, f_strides));
std::shared_ptr<Tensor> tensor_expected = TensorFromJSON(
TypeTraits<DataType>::type_singleton(),
"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 60, 70, "
"80, 90, 100, 100, 100, 100, 100, 100, 100, 100, 100]",
shape, f_strides);

EXPECT_TRUE(tensor_expected->Equals(*tensor));
CheckTensor<DataType>(tensor, 27, shape, f_strides);
Expand All @@ -773,15 +764,11 @@ TYPED_TEST_P(TestBatchToTensor, SupportedTypes) {

std::vector<int64_t> shape_sliced = {8, 3};
std::vector<int64_t> f_strides_sliced = {unit_size, unit_size * shape_sliced[0]};
std::vector<c_data_type> f_values_sliced = {2, 3, 4, 5, 6, 7, 8, 9,
20, 30, 40, 50, 60, 70, 80, 90,
100, 100, 100, 100, 100, 100, 100, 100};
auto data_sliced = Buffer::Wrap(f_values_sliced);

std::shared_ptr<Tensor> tensor_expected_sliced;
ASSERT_OK_AND_ASSIGN(tensor_expected_sliced,
Tensor::Make(TypeTraits<DataType>::type_singleton(), data_sliced,
shape_sliced, f_strides_sliced));
std::shared_ptr<Tensor> tensor_expected_sliced =
TensorFromJSON(TypeTraits<DataType>::type_singleton(),
"[2, 3, 4, 5, 6, 7, 8, 9, 20, 30, 40, 50, 60, "
"70, 80, 90, 100, 100, 100, 100, 100, 100, 100, 100]",
shape_sliced, f_strides_sliced);

EXPECT_TRUE(tensor_expected_sliced->Equals(*tensor_sliced));
CheckTensor<DataType>(tensor_expected_sliced, 24, shape_sliced, f_strides_sliced);
Expand All @@ -793,15 +780,10 @@ TYPED_TEST_P(TestBatchToTensor, SupportedTypes) {

std::vector<int64_t> shape_sliced_1 = {5, 3};
std::vector<int64_t> f_strides_sliced_1 = {unit_size, unit_size * shape_sliced_1[0]};
std::vector<c_data_type> f_values_sliced_1 = {
2, 3, 4, 5, 6, 20, 30, 40, 50, 60, 100, 100, 100, 100, 100,
};
auto data_sliced_1 = Buffer::Wrap(f_values_sliced_1);

std::shared_ptr<Tensor> tensor_expected_sliced_1;
ASSERT_OK_AND_ASSIGN(tensor_expected_sliced_1,
Tensor::Make(TypeTraits<DataType>::type_singleton(), data_sliced_1,
shape_sliced_1, f_strides_sliced_1));
std::shared_ptr<Tensor> tensor_expected_sliced_1 =
TensorFromJSON(TypeTraits<DataType>::type_singleton(),
"[2, 3, 4, 5, 6, 20, 30, 40, 50, 60, 100, 100, 100, 100, 100]",
shape_sliced_1, f_strides_sliced_1);

EXPECT_TRUE(tensor_expected_sliced_1->Equals(*tensor_sliced_1));
CheckTensor<DataType>(tensor_expected_sliced_1, 15, shape_sliced_1, f_strides_sliced_1);
Expand Down
43 changes: 43 additions & 0 deletions cpp/src/arrow/testing/gtest_util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,11 @@
#include "arrow/compute/api_vector.h"
#include "arrow/datum.h"
#include "arrow/ipc/json_simple.h"
#include "arrow/json/rapidjson_defs.h" // IWYU pragma: keep
#include "arrow/pretty_print.h"
#include "arrow/status.h"
#include "arrow/table.h"
#include "arrow/tensor.h"
#include "arrow/type.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/config.h"
Expand All @@ -62,6 +64,10 @@
#include "arrow/util/thread_pool.h"
#include "arrow/util/windows_compatibility.h"

#include <rapidjson/document.h>

namespace rj = arrow::rapidjson;

namespace arrow {

using internal::checked_cast;
Expand Down Expand Up @@ -425,6 +431,43 @@ std::shared_ptr<Table> TableFromJSON(const std::shared_ptr<Schema>& schema,
return *Table::FromRecordBatches(schema, std::move(batches));
}

std::shared_ptr<Tensor> TensorFromJSON(const std::shared_ptr<DataType>& type,
std::string_view data, std::string_view shape,
std::string_view strides,
std::string_view dim_names) {
std::shared_ptr<Array> array = ArrayFromJSON(type, data);

rj::Document json_shape;
json_shape.Parse(shape.data(), shape.length());
std::vector<int64_t> shape_vector;
for (auto& x : json_shape.GetArray()) {
shape_vector.emplace_back(x.GetInt64());
}
rj::Document json_strides;
json_strides.Parse(strides.data(), strides.length());
std::vector<int64_t> strides_vector;
for (auto& x : json_strides.GetArray()) {
strides_vector.emplace_back(x.GetInt64());
}
rj::Document json_dim_names;
json_dim_names.Parse(dim_names.data(), dim_names.length());
std::vector<std::string> dim_names_vector;
for (auto& x : json_dim_names.GetArray()) {
dim_names_vector.emplace_back(x.GetString());
}
return *Tensor::Make(type, array->data()->buffers[1], shape_vector, strides_vector,
dim_names_vector);
}

// Test helper: builds a Tensor from JSON-encoded values plus an already-parsed shape,
// strides and dimension names.
//
// `data` is handed to ArrayFromJSON together with `type`; the second buffer
// (buffers[1]) of the resulting array is used as the tensor's data buffer. `shape`,
// `strides` and `dim_names` are forwarded unchanged to Tensor::Make, whose result is
// dereferenced without an explicit status check.
std::shared_ptr<Tensor> TensorFromJSON(const std::shared_ptr<DataType>& type,
                                       std::string_view data,
                                       const std::vector<int64_t>& shape,
                                       const std::vector<int64_t>& strides,
                                       const std::vector<std::string>& dim_names) {
  const std::shared_ptr<Array> values = ArrayFromJSON(type, data);
  const auto& values_buffer = values->data()->buffers[1];
  return *Tensor::Make(type, values_buffer, shape, strides, dim_names);
}

Result<std::shared_ptr<Table>> RunEndEncodeTableColumns(
const Table& table, const std::vector<int>& column_indices) {
const int num_columns = table.num_columns();
Expand Down
13 changes: 13 additions & 0 deletions cpp/src/arrow/testing/gtest_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,19 @@ ARROW_TESTING_EXPORT
std::shared_ptr<Table> TableFromJSON(const std::shared_ptr<Schema>&,
const std::vector<std::string>& json);

// TensorFromJSON: construct a Tensor of the given `type` entirely from JSON text.
//
// `data` is passed to ArrayFromJSON together with `type` to produce the tensor
// values. `shape` and `strides` are JSON arrays of integers and `dim_names` is a
// JSON array of strings. `strides` and `dim_names` default to "[]", in which case
// empty lists are forwarded to Tensor::Make.
ARROW_TESTING_EXPORT
std::shared_ptr<Tensor> TensorFromJSON(const std::shared_ptr<DataType>& type,
std::string_view data, std::string_view shape,
std::string_view strides = "[]",
std::string_view dim_names = "[]");

// TensorFromJSON: construct a Tensor of the given `type` from JSON-encoded values
// and an already-parsed layout.
//
// `data` is passed to ArrayFromJSON together with `type` to produce the tensor
// values. `shape`, `strides` and `dim_names` are forwarded unchanged to
// Tensor::Make; `strides` and `dim_names` default to empty vectors.
ARROW_TESTING_EXPORT
std::shared_ptr<Tensor> TensorFromJSON(const std::shared_ptr<DataType>& type,
std::string_view data,
const std::vector<int64_t>& shape,
const std::vector<int64_t>& strides = {},
const std::vector<std::string>& dim_names = {});

ARROW_TESTING_EXPORT
Result<std::shared_ptr<Table>> RunEndEncodeTableColumns(
const Table& table, const std::vector<int>& column_indices);
Expand Down
37 changes: 37 additions & 0 deletions cpp/src/arrow/testing/gtest_util_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include "arrow/array/builder_decimal.h"
#include "arrow/datum.h"
#include "arrow/record_batch.h"
#include "arrow/tensor.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/testing/random.h"
#include "arrow/type.h"
Expand Down Expand Up @@ -134,4 +135,40 @@ TEST_F(TestAssertContainsNaN, DatumEqual) {
AssertDatumsEqual(expected_chunked, actual_chunked);
}

// Empty fixture that groups the tests for the TensorFromJSON helpers.
class TestTensorFromJSON : public ::testing::Test {};

// Checks the overload taking shape and strides as vectors: the tensor it returns
// must equal one built with Tensor::Make over a wrapped in-memory buffer.
TEST_F(TestTensorFromJSON, FromJSONAndArray) {
  const int64_t item_size = sizeof(int64_t);
  const std::vector<int64_t> dims = {9, 2};
  // The first dimension advances by one element, the second by a whole column.
  const std::vector<int64_t> strides = {item_size, item_size * dims[0]};

  // Reference tensor; `raw_values` must stay alive because Buffer::Wrap is given the
  // vector itself.
  std::vector<int64_t> raw_values = {1,  2,  3,  4,  5,  6,  7,  8,  9,
                                     10, 20, 30, 40, 50, 60, 70, 80, 90};
  ASSERT_OK_AND_ASSIGN(std::shared_ptr<Tensor> expected,
                       Tensor::Make(int64(), Buffer::Wrap(raw_values), dims, strides));

  const std::shared_ptr<Tensor> actual = TensorFromJSON(
      int64(), "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 60, 70, 80, 90]", dims,
      strides);

  EXPECT_TRUE(expected->Equals(*actual));
}

// Checks the all-JSON overload: values and shape are given as JSON strings, strides
// and dimension names are left at their defaults, and the result must equal a tensor
// built with Tensor::Make from the same values and shape.
TEST_F(TestTensorFromJSON, FromJSON) {
  // Reference tensor; `raw_values` must stay alive because Buffer::Wrap is given the
  // vector itself.
  std::vector<int64_t> raw_values = {1,  2,  3,  4,  5,  6,  7,  8,  9,
                                     10, 20, 30, 40, 50, 60, 70, 80, 90};
  const std::vector<int64_t> dims = {9, 2};
  ASSERT_OK_AND_ASSIGN(std::shared_ptr<Tensor> expected,
                       Tensor::Make(int64(), Buffer::Wrap(raw_values), dims));

  const std::shared_ptr<Tensor> actual = TensorFromJSON(
      int64(), "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 60, 70, 80, 90]",
      "[9, 2]");

  EXPECT_TRUE(expected->Equals(*actual));
}

} // namespace arrow
Loading