diff --git a/cpp/src/arrow/util/uri.cc b/cpp/src/arrow/util/uri.cc index e73c983c59863..780dc29696178 100644 --- a/cpp/src/arrow/util/uri.cc +++ b/cpp/src/arrow/util/uri.cc @@ -145,6 +145,8 @@ bool Uri::has_port() const { return IsTextRangeSet(impl_->uri_.portText); } int32_t Uri::port() const { return impl_->port_; } +bool Uri::has_user_info() const { return IsTextRangeSet(impl_->uri_.userInfo); } + std::string Uri::user_info() const { return TextRangeToString(impl_->uri_.userInfo); } std::string Uri::username() const { @@ -228,6 +230,10 @@ Result>> Uri::query_items() cons return items; } +bool Uri::has_fragment() const { return IsTextRangeSet(impl_->uri_.fragment); } + +std::string Uri::fragment() const { return TextRangeToString(impl_->uri_.fragment); } + const std::string& Uri::ToString() const { return impl_->string_rep_; } Status Uri::Parse(const std::string& uri_string) { diff --git a/cpp/src/arrow/util/uri.h b/cpp/src/arrow/util/uri.h index 67e51f96c2660..78412d4fc666d 100644 --- a/cpp/src/arrow/util/uri.h +++ b/cpp/src/arrow/util/uri.h @@ -62,6 +62,8 @@ class ARROW_EXPORT Uri { /// number component. int32_t port() const; + /// Whether the URI has a userInfo. + bool has_user_info() const; /// The userInfo specified in the URI. std::string user_info() const; /// The username specified in the URI. @@ -83,6 +85,11 @@ class ARROW_EXPORT Uri { /// and a missing value, such in "a&b=1" vs. "a=&b=1". Result>> query_items() const; + /// Whether the URI has an explicit fragment. + bool has_fragment() const; + /// The fragment specified in the URI. + std::string fragment() const; + /// Get the string representation of this URI. 
const std::string& ToString() const; diff --git a/cpp/src/gandiva/gdv_function_stubs.h b/cpp/src/gandiva/gdv_function_stubs.h index 5613c75eec4de..dc24a4215fe4f 100644 --- a/cpp/src/gandiva/gdv_function_stubs.h +++ b/cpp/src/gandiva/gdv_function_stubs.h @@ -78,6 +78,9 @@ const char* gdv_fn_parse_url_utf8_utf8_utf8( const char* part, int32_t part_len, bool in2_valid, const char* pattern, int32_t pattern_len, bool in3_valid, bool* out_valid, int32_t* out_length); +const char* gdv_fn_substr_index_utf8_utf8_int32(int64_t ptr, int64_t holder_ptr, const char* input, int in_len, + const char* delim, int delim_len, int count, int32_t* out_len); + int64_t gdv_fn_to_date_utf8_utf8_int32(int64_t context, int64_t ptr, const char* data, int data_len, bool in1_validity, const char* pattern, int pattern_len, diff --git a/cpp/src/gandiva/parse_url_holder.h b/cpp/src/gandiva/parse_url_holder.h index 17ed5f80176af..40ccfb945b3fe 100644 --- a/cpp/src/gandiva/parse_url_holder.h +++ b/cpp/src/gandiva/parse_url_holder.h @@ -18,6 +18,7 @@ #pragma once #include +#include #include #include @@ -58,7 +59,12 @@ namespace gandiva { } else if (part_string == "PATH") { out = uri.path(); } else if (part_string == "QUERY") { - out = uri.query_string(); + // Consistent with vanilla Spark: return null when the URI has no query. + if (uri.has_query()) { + out = uri.query_string(); + } else { + return nullptr; + } } else if (part_string == "PROTOCOL") { out = uri.scheme(); } else if (part_string == "FILE") { @@ -68,22 +74,28 @@ namespace gandiva { out = uri.path(); } } else if (part_string == "AUTHORITY") { - if (uri.has_port()) { - out = uri.host() + ":" + uri.port_text(); + if (uri.has_user_info()) { + out = uri.user_info() + "@" + uri.host(); } else { out = uri.host(); } + if (uri.has_port()) { + out = out + ":" + uri.port_text(); + } } else if (part_string == "USERINFO") { out = uri.user_info(); + } else if (part_string == "REF") { + // Consistent with vanilla Spark: return null when the URI has no fragment. + if (uri.has_fragment()) { + out = uri.fragment(); + } else { + return 
nullptr; + } } else { return nullptr; } *out_length = static_cast(out.length()); - if (*out_length == 0) { - return nullptr; - } - char *result_buffer = reinterpret_cast(ctx->arena()->Allocate(*out_length)); if (result_buffer == NULLPTR) { ctx->set_error_msg("Could not allocate memory for result! Wrong result may be returned!"); @@ -95,7 +107,6 @@ namespace gandiva { return result_buffer; } - // We only support plain pattern string here. const char * operator()( ExecutionContext *ctx, const char * url, int32_t url_len, const char * part, int32_t part_len, @@ -114,23 +125,21 @@ namespace gandiva { if (part_string != "QUERY" || !uri.has_query()) { return nullptr; } else { - std::unordered_map queries; - const auto items = std::move(uri.query_items()).ValueUnsafe(); - for (const auto& query : items) { - queries.emplace(query.first, query.second); + RE2 re2("(&|^)" + pattern_string + "=([^&]*)"); + int groups_num = re2.NumberOfCapturingGroups(); + RE2::Arg *args[groups_num]; + for (int i = 0; i < groups_num; i++) { + args[i] = new RE2::Arg; } - - auto out_query = queries.find(pattern_string); - if (out_query == queries.end()) { + *(args[1]) = &out; + // Use re2 instead of pattern_ for better performance. NOTE(review): pattern_string is concatenated into the regex unescaped, and no delete for the RE2::Arg objects allocated above is visible in this hunk; confirm both are intended. + bool matched = RE2::PartialMatchN(uri.query_string(), re2, args, groups_num); + if (!matched) { + *out_length = 0; return nullptr; } - out = out_query->second; *out_length = static_cast(out.length()); - if (*out_length == 0) { - return nullptr; - } - char *result_buffer = reinterpret_cast(ctx->arena()->Allocate(*out_length)); if (result_buffer == NULLPTR) { ctx->set_error_msg("Could not allocate memory for result! 
Wrong result may be returned!"); diff --git a/cpp/src/gandiva/parse_url_holder_test.cc b/cpp/src/gandiva/parse_url_holder_test.cc index d09f3367c3cdd..38ec7e851f391 100644 --- a/cpp/src/gandiva/parse_url_holder_test.cc +++ b/cpp/src/gandiva/parse_url_holder_test.cc @@ -33,7 +33,7 @@ namespace gandiva { std::shared_ptr part_url_holder; auto status = ParseUrlHolder::Make(&part_url_holder); auto &parse_url = * part_url_holder; - std::string input_string = "https://userinfo@arrow.apache.org:8080/path?query=1"; + std::string input_string = "https://userinfo@arrow.apache.org:8080/path?query=1#fragment"; int32_t out_length = 0; // HOST @@ -87,8 +87,8 @@ namespace gandiva { &execution_context_, input_string.c_str(), static_cast(input_string.length()), part_string.c_str(), static_cast(part_string.length()), &out_length); std::string ret6_as_str(ret6, out_length); - EXPECT_EQ(out_length, 21); - EXPECT_EQ(ret6_as_str, "arrow.apache.org:8080"); + EXPECT_EQ(out_length, 30); + EXPECT_EQ(ret6_as_str, "userinfo@arrow.apache.org:8080"); // USERINFO part_string = "USERINFO"; @@ -99,19 +99,44 @@ namespace gandiva { EXPECT_EQ(out_length, 8); EXPECT_EQ(ret7_as_str, "userinfo"); + // REF + part_string = "REF"; + const char *ret8 = parse_url( + &execution_context_, input_string.c_str(), static_cast(input_string.length()), + part_string.c_str(), static_cast(part_string.length()), &out_length); + std::string ret8_as_str(ret8, out_length); + EXPECT_EQ(out_length, 8); + EXPECT_EQ(ret8_as_str, "fragment"); + + // REF present but empty + input_string = "http://user:pass@host/?#"; + const char *ret18 = parse_url( + &execution_context_, input_string.c_str(), static_cast(input_string.length()), + part_string.c_str(), static_cast(part_string.length()), &out_length); + std::string ret18_as_str(ret18, out_length); + EXPECT_EQ(out_length, 0); + EXPECT_EQ(ret18_as_str, ""); + + // REF does not exist + input_string = "http://user:pass@host"; + const char *ret19 = parse_url( + &execution_context_, input_string.c_str(), 
static_cast(input_string.length()), + part_string.c_str(), static_cast(part_string.length()), &out_length); + EXPECT_EQ(ret19, nullptr); + // Invalid part part_string = "HOST_AND_PORT"; - const char *ret8 = parse_url( + const char *ret9 = parse_url( &execution_context_, input_string.c_str(), static_cast(input_string.length()), part_string.c_str(), static_cast(part_string.length()), &out_length); - EXPECT_EQ(ret8, nullptr); + EXPECT_EQ(ret9, nullptr); // Invalid url input_string = "abc-abc"; - const char *ret9 = parse_url( + const char *ret10 = parse_url( &execution_context_, input_string.c_str(), static_cast(input_string.length()), part_string.c_str(), static_cast(part_string.length()), &out_length); - EXPECT_EQ(ret9, nullptr); + EXPECT_EQ(ret10, nullptr); } TEST_F(TestParseUrlHolder, TestParseUrlWithQueryPattern) { @@ -141,6 +166,17 @@ namespace gandiva { EXPECT_EQ(out_length, 3); EXPECT_EQ(ret2_as_str, "1_1"); + // Test encoded query + input_string = "https://use%20r:pas%20s@example.com/dir%20/pa%20th.HTML?query=x%20y&q2=2#Ref%20two"; + query_string = "query"; + const char *ret13 = parse_url( + &execution_context_, input_string.c_str(), static_cast(input_string.length()), + part_string.c_str(), static_cast(part_string.length()), + query_string.c_str(), static_cast(query_string.length()), &out_length); + std::string ret13_as_str(ret13, out_length); + EXPECT_EQ(out_length, 5); + EXPECT_EQ(ret13_as_str, "x%20y"); + // Invalid pattern query_string = "query_pattern"; const char *ret3 = parse_url(