From 37cac971d3bd58a2b60ddd21c526851ec83ee8b3 Mon Sep 17 00:00:00 2001 From: JiaKe Date: Thu, 25 Aug 2022 07:56:19 +0000 Subject: [PATCH] Upgrade substrait to 0.7.0 (#148) * upgrade substrait to 0.7.0 * fix the uts --- .../arrow/engine_substrait_consumption.cc | 2 +- .../engine/substrait/expression_internal.cc | 38 ++++++++++++++----- .../engine/substrait/relation_internal.cc | 21 +++++----- cpp/src/arrow/engine/substrait/serde_test.cc | 14 ++++--- .../arrow/engine/substrait/type_internal.cc | 13 +++++-- cpp/thirdparty/versions.txt | 4 +- 6 files changed, 59 insertions(+), 33 deletions(-) diff --git a/cpp/examples/arrow/engine_substrait_consumption.cc b/cpp/examples/arrow/engine_substrait_consumption.cc index 6a296706ed654..ce5eb160e85b9 100644 --- a/cpp/examples/arrow/engine_substrait_consumption.cc +++ b/cpp/examples/arrow/engine_substrait_consumption.cc @@ -87,7 +87,7 @@ arrow::Future> GetSubstraitFromServer( "items": [ { "uri_file": "file://FILENAME_PLACEHOLDER", - "format": "FILE_FORMAT_PARQUET" + "parquet": {} } ] } diff --git a/cpp/src/arrow/engine/substrait/expression_internal.cc b/cpp/src/arrow/engine/substrait/expression_internal.cc index 2dd3725f7bcab..dbc739f8fcd4d 100644 --- a/cpp/src/arrow/engine/substrait/expression_internal.cc +++ b/cpp/src/arrow/engine/substrait/expression_internal.cc @@ -160,17 +160,35 @@ Result FromProto(const substrait::Expression& expr, ARROW_ASSIGN_OR_RAISE(auto decoded_function, ext_set.DecodeFunction(scalar_fn.function_reference())); - std::vector arguments(scalar_fn.args_size()); - for (int i = 0; i < scalar_fn.args_size(); ++i) { - ARROW_ASSIGN_OR_RAISE(arguments[i], FromProto(scalar_fn.args(i), ext_set)); + std::vector arguments(scalar_fn.arguments_size()); + for (int i = 0; i < scalar_fn.arguments_size(); ++i) { + const auto& argument = scalar_fn.arguments(i); + switch (argument.arg_type_case()) { + case substrait::FunctionArgument::kValue: { + ARROW_ASSIGN_OR_RAISE(arguments[i], FromProto(argument.value(), ext_set)); + break; + } + default: + return Status::NotImplemented( + "only value arguments are currently supported for functions"); + } } - + if (decoded_function.name.to_string() == "alias") { - if (scalar_fn.args_size() != 1) { + if (scalar_fn.arguments_size() != 1) { return arrow::Status::Invalid("Alias should have exact 1 arg, but got " + - std::to_string(scalar_fn.args_size())); + std::to_string(scalar_fn.arguments_size())); + } + + const auto& argument = scalar_fn.arguments(0); + switch (argument.arg_type_case()) { + case substrait::FunctionArgument::kValue: { + return FromProto(argument.value(), ext_set); + } + default: + return Status::NotImplemented( + "only value arguments are currently supported for functions"); } - return FromProto(scalar_fn.args().at(0), ext_set); } if (decoded_function.name.to_string() == "is_in") { const auto& in_list = @@ -907,9 +925,11 @@ Result> ToProto(const compute::Expression auto scalar_fn = internal::make_unique(); scalar_fn->set_function_reference(anchor); - scalar_fn->mutable_args()->Reserve(static_cast(arguments.size())); + scalar_fn->mutable_arguments()->Reserve(static_cast(arguments.size())); for (auto& arg : arguments) { - scalar_fn->mutable_args()->AddAllocated(arg.release()); + auto argument = internal::make_unique(); + argument->set_allocated_value(arg.release()); + scalar_fn->mutable_arguments()->AddAllocated(argument.release()); } out->set_allocated_scalar_function(scalar_fn.release()); diff --git a/cpp/src/arrow/engine/substrait/relation_internal.cc b/cpp/src/arrow/engine/substrait/relation_internal.cc index 094eac2656970..2de3be04dd0e9 100644 --- a/cpp/src/arrow/engine/substrait/relation_internal.cc +++ b/cpp/src/arrow/engine/substrait/relation_internal.cc @@ -110,17 +110,16 @@ Result FromProto(const substrait::Rel& rel, "path_type other than uri_file"); } - if (item.format() == - substrait::ReadRel::LocalFiles::FileOrFiles::FILE_FORMAT_PARQUET) { - format = std::make_shared(); - } else if (util::string_view{item.uri_file()}.ends_with(".arrow")) { - format = std::make_shared(); - } else if (util::string_view{item.uri_file()}.ends_with(".feather")) { - format = std::make_shared(); - } else { - return Status::NotImplemented( - "substrait::ReadRel::LocalFiles::FileOrFiles::format " - "other than FILE_FORMAT_PARQUET"); + switch (item.file_format_case()) { + case substrait::ReadRel_LocalFiles_FileOrFiles::kParquet: + format = std::make_shared(); + break; + case substrait::ReadRel_LocalFiles_FileOrFiles::kArrow: + format = std::make_shared(); + break; + default: + return Status::NotImplemented( + "unknown substrait::ReadRel::LocalFiles::FileOrFiles::file_format"); } if (!util::string_view{item.uri_file()}.starts_with("file:///")) { diff --git a/cpp/src/arrow/engine/substrait/serde_test.cc b/cpp/src/arrow/engine/substrait/serde_test.cc index 300a6c528bd2e..bddc8915a4d51 100644 --- a/cpp/src/arrow/engine/substrait/serde_test.cc +++ b/cpp/src/arrow/engine/substrait/serde_test.cc @@ -180,11 +180,12 @@ TEST(Substrait, SupportedExtensionTypes) { auto anchor = ext_set.num_types(); EXPECT_THAT(ext_set.EncodeType(*expected_type), ResultWith(Eq(anchor))); + ASSERT_OK_AND_ASSIGN( auto buf, internal::SubstraitFromJSON( - "Type", "{\"user_defined_type_reference\": " + std::to_string(anchor) + "}")); - + "Type", "{\"user_defined\": { \"type_reference\": " + std::to_string(anchor) + + ", \"nullability\": \"NULLABILITY_NULLABLE\" } }")); ASSERT_OK_AND_ASSIGN(auto type, DeserializeType(*buf, ext_set)); EXPECT_EQ(*type, *expected_type); @@ -258,8 +259,9 @@ TEST(Substrait, NamedStruct) { } TEST(Substrait, NoEquivalentArrowType) { - ASSERT_OK_AND_ASSIGN(auto buf, internal::SubstraitFromJSON( - "Type", R"({"user_defined_type_reference": 99})")); + ASSERT_OK_AND_ASSIGN( + auto buf, + internal::SubstraitFromJSON("Type", R"({"user_defined": {"type_reference": 99}})")); ExtensionSet empty; ASSERT_THAT( DeserializeType(*buf, empty), @@ -629,11 +631,11 @@ TEST(Substrait, ReadRel) { "items": [ { "uri_file": "file:///tmp/dat1.parquet", - "format": "FILE_FORMAT_PARQUET" + "parquet": {} }, { "uri_file": "file:///tmp/dat2.parquet", - "format": "FILE_FORMAT_PARQUET" + "parquet": {} } ] } diff --git a/cpp/src/arrow/engine/substrait/type_internal.cc b/cpp/src/arrow/engine/substrait/type_internal.cc index c1dac97b6821a..fd15da182884a 100644 --- a/cpp/src/arrow/engine/substrait/type_internal.cc +++ b/cpp/src/arrow/engine/substrait/type_internal.cc @@ -206,10 +206,11 @@ Result, bool>> FromProto( field("value", std::move(value_nullable.first), value_nullable.second)); } - case substrait::Type::kUserDefinedTypeReference: { - uint32_t anchor = type.user_defined_type_reference(); + case substrait::Type::kUserDefined: { + const auto& user_defined = type.user_defined(); + uint32_t anchor = user_defined.type_reference(); ARROW_ASSIGN_OR_RAISE(auto type_record, ext_set.DecodeType(anchor)); - return std::make_pair(std::move(type_record.type), true); + return std::make_pair(std::move(type_record.type), IsNullable(user_defined)); } default: @@ -394,7 +395,11 @@ struct DataTypeToProtoImpl { template Status EncodeUserDefined(const T& t) { ARROW_ASSIGN_OR_RAISE(auto anchor, ext_set_->EncodeType(t)); - type_->set_user_defined_type_reference(anchor); + auto user_defined = internal::make_unique<::substrait::Type_UserDefined>(); + user_defined->set_type_reference(anchor); + user_defined->set_nullability(nullable_ ? ::substrait::Type::NULLABILITY_NULLABLE + : ::substrait::Type::NULLABILITY_REQUIRED); + type_->set_allocated_user_defined(user_defined.release()); return Status::OK(); } diff --git a/cpp/thirdparty/versions.txt b/cpp/thirdparty/versions.txt index 22eb12c9f0990..0da63e4cf6028 100644 --- a/cpp/thirdparty/versions.txt +++ b/cpp/thirdparty/versions.txt @@ -87,8 +87,8 @@ ARROW_SNAPPY_BUILD_SHA256_CHECKSUM=75c1fbb3d618dd3a0483bff0e26d0a92b495bbe5059c8 # There is a bug in GCC < 4.9 with Snappy 1.1.9, so revert to 1.1.8 for those (ARROW-14661) ARROW_SNAPPY_OLD_BUILD_VERSION=1.1.8 ARROW_SNAPPY_OLD_BUILD_SHA256_CHECKSUM=16b677f07832a612b0836178db7f374e414f94657c138e6993cbfc5dcc58651f -ARROW_SUBSTRAIT_BUILD_VERSION=f7a74dd8 -ARROW_SUBSTRAIT_BUILD_SHA256_CHECKSUM=7a969afca305135c4933cffa4316ee6c6c9ee38813348c62401ff9a552cf6fdb +ARROW_SUBSTRAIT_BUILD_VERSION=v0.7.0 +ARROW_SUBSTRAIT_BUILD_SHA256_CHECKSUM=15657168b0158e26b2b2e3b19887e7118126284e4094abf2a2a3edddca9d33c2 ARROW_THRIFT_BUILD_VERSION=0.13.0 ARROW_THRIFT_BUILD_SHA256_CHECKSUM=7ad348b88033af46ce49148097afe354d513c1fca7c607b59c33ebb6064b5179 ARROW_UTF8PROC_BUILD_VERSION=v2.7.0