Skip to content

Commit

Permalink
apacheGH-40968: [C++][Gandiva] add RE2::Options set_dot_nl(true) for …
Browse files Browse the repository at this point in the history
…Like f (#80)

* apacheGH-40968: [C++][Gandiva] add RE2::Options set_dot_nl(true) for Like function (apache#40970) (#68)

Gandiva function "LIKE" does not always work correctly when the string contains \n.
String value:
`[function_name: "Space1.protect"\nargs: "passenger_count"\ncolumn_name: "passenger_count" ]`
Pattern '%Space1%' nor '%Space1.%' do not match.

added flag set_dot_nl(true) to LikeHolder

add unit tests.

Yes

**This PR includes breaking changes to public APIs.**

* GitHub Issue: apache#40968

Lead-authored-by: Ivan Chesnov <[email protected]>

Signed-off-by: Sutou Kouhei <[email protected]>

* apacheGH-43119: [CI][Packaging] Update manylinux 2014 CentOS repos that have been deprecated (apache#43121)

Jobs are failing to find mirrorlist.centos.org

Updating repos based on solution from: apache#43119 (comment)

Via archery

No
* GitHub Issue: apache#43119

Lead-authored-by: Raúl Cumplido <[email protected]>
Co-authored-by: Sutou Kouhei <[email protected]>
Co-authored-by: Sutou Kouhei <[email protected]>
Signed-off-by: Raúl Cumplido <[email protected]>

---------

Signed-off-by: Sutou Kouhei <[email protected]>
Signed-off-by: Raúl Cumplido <[email protected]>
Co-authored-by: Raúl Cumplido <[email protected]>
Co-authored-by: Sutou Kouhei <[email protected]>
Co-authored-by: Sutou Kouhei <[email protected]>
  • Loading branch information
4 people authored Jul 12, 2024
1 parent d2e16bf commit 0890149
Show file tree
Hide file tree
Showing 5 changed files with 64 additions and 14 deletions.
14 changes: 14 additions & 0 deletions ci/docker/centos-7-cpp.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,25 @@

FROM centos:centos7

# Update mirrors to use vault.centos.org as CentOS 7
# is EOL since 2024-06-30
RUN sed -i \
-e 's/^mirrorlist/#mirrorlist/' \
-e 's/^#baseurl/baseurl/' \
-e 's/mirror\.centos\.org/vault.centos.org/' \
/etc/yum.repos.d/*.repo

# devtoolset is required for C++17
RUN \
yum install -y \
centos-release-scl \
epel-release && \
sed -i \
-e 's/^mirrorlist/#mirrorlist/' \
-e 's/^#baseurl/baseurl/' \
-e 's/^# baseurl/baseurl/' \
-e 's/mirror\.centos\.org/vault.centos.org/' \
/etc/yum.repos.d/CentOS-SCLo-scl*.repo && \
yum install -y \
cmake3 \
curl \
Expand Down
15 changes: 15 additions & 0 deletions ci/docker/python-wheel-manylinux-201x.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,21 @@ ARG manylinux

ENV MANYLINUX_VERSION=${manylinux}

# Ensure dnf is installed, especially for the manylinux2014 base
RUN if [ "${MANYLINUX_VERSION}" = "2014" ]; then \
sed -i \
-e 's/^mirrorlist/#mirrorlist/' \
-e 's/^#baseurl/baseurl/' \
-e 's/mirror\.centos\.org/vault.centos.org/' \
/etc/yum.repos.d/*.repo; \
if [ "${arch}" != "amd64" ]; then \
sed -i \
-e 's,vault\.centos\.org/centos,vault.centos.org/altarch,' \
/etc/yum.repos.d/CentOS-SCLo-scl-rh.repo; \
fi; \
fi
RUN yum install -y dnf

# Install basic dependencies
RUN yum install -y git flex curl autoconf zip perl-IPC-Cmd wget

Expand Down
13 changes: 8 additions & 5 deletions cpp/src/gandiva/regex_functions_holder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -99,13 +99,14 @@ Status LikeHolder::Make(const FunctionNode& node, std::shared_ptr<LikeHolder>* h
"'like' function requires a string literal as the second parameter"));

RE2::Options regex_op;
regex_op.set_dot_nl(true);
if (node.descriptor()->name() == "ilike") {
regex_op.set_case_sensitive(false); // set case-insensitive for ilike function.

return Make(std::get<std::string>(literal->holder()), holder, regex_op);
}
if (node.children().size() == 2) {
return Make(std::get<std::string>(literal->holder()), holder);
return Make(std::get<std::string>(literal->holder()), holder, regex_op);
} else {
auto escape_char = dynamic_cast<LiteralNode*>(node.children().at(2).get());
ARROW_RETURN_IF(
Expand All @@ -118,7 +119,7 @@ Status LikeHolder::Make(const FunctionNode& node, std::shared_ptr<LikeHolder>* h
Status::Invalid(
"'like' function requires a string literal as the third parameter"));
return Make(std::get<std::string>(literal->holder()),
std::get<std::string>(escape_char->holder()), holder);
std::get<std::string>(escape_char->holder()), holder, regex_op);
}
}

Expand All @@ -127,7 +128,9 @@ Status LikeHolder::Make(const std::string& sql_pattern,
std::string pcre_pattern;
ARROW_RETURN_NOT_OK(RegexUtil::SqlLikePatternToPcre(sql_pattern, pcre_pattern));

auto lholder = std::shared_ptr<LikeHolder>(new LikeHolder(pcre_pattern));
RE2::Options regex_op;
regex_op.set_dot_nl(true);
auto lholder = std::shared_ptr<LikeHolder>(new LikeHolder(pcre_pattern, regex_op));
ARROW_RETURN_IF(!lholder->regex_.ok(),
Status::Invalid("Building RE2 pattern '", pcre_pattern,
"' failed with: ", lholder->regex_.error()));
Expand All @@ -137,7 +140,7 @@ Status LikeHolder::Make(const std::string& sql_pattern,
}

Status LikeHolder::Make(const std::string& sql_pattern, const std::string& escape_char,
std::shared_ptr<LikeHolder>* holder) {
std::shared_ptr<LikeHolder>* holder, RE2::Options regex_op) {
ARROW_RETURN_IF(escape_char.length() > 1,
Status::Invalid("The length of escape char ", escape_char,
" in 'like' function is greater than 1"));
Expand All @@ -149,7 +152,7 @@ Status LikeHolder::Make(const std::string& sql_pattern, const std::string& escap
ARROW_RETURN_NOT_OK(RegexUtil::SqlLikePatternToPcre(sql_pattern, pcre_pattern));
}

auto lholder = std::shared_ptr<LikeHolder>(new LikeHolder(pcre_pattern));
auto lholder = std::shared_ptr<LikeHolder>(new LikeHolder(pcre_pattern, regex_op));
ARROW_RETURN_IF(!lholder->regex_.ok(),
Status::Invalid("Building RE2 pattern '", pcre_pattern,
"' failed with: ", lholder->regex_.error()));
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/gandiva/regex_functions_holder.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ class GANDIVA_EXPORT LikeHolder : public FunctionHolder {
static Status Make(const std::string& sql_pattern, std::shared_ptr<LikeHolder>* holder);

static Status Make(const std::string& sql_pattern, const std::string& escape_char,
std::shared_ptr<LikeHolder>* holder);
std::shared_ptr<LikeHolder>* holder, RE2::Options regex_op);

static Status Make(const std::string& sql_pattern, std::shared_ptr<LikeHolder>* holder,
RE2::Options regex_op);
Expand Down
34 changes: 26 additions & 8 deletions cpp/src/gandiva/regex_functions_holder_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ namespace gandiva {
class TestLikeHolder : public ::testing::Test {
public:
RE2::Options regex_op;
void SetUp() { regex_op.set_dot_nl(true); }

FunctionNode BuildLike(std::string pattern) {
auto field = std::make_shared<FieldNode>(arrow::field("in", arrow::utf8()));
auto pattern_node =
Expand Down Expand Up @@ -75,6 +77,15 @@ TEST_F(TestLikeHolder, TestMatchOne) {
EXPECT_FALSE(like("dabc"));
}

TEST_F(TestLikeHolder, TestPcreSpecialWithNewLine) {
std::shared_ptr<LikeHolder> like_holder;
auto status = LikeHolder::Make("%Space1.%", &like_holder, regex_op);

auto& like = *like_holder;
EXPECT_TRUE(
like("[name: \"Space1.protect\"\nargs: \"count\"\ncolumn_name: \"pass_count\"]"));
}

TEST_F(TestLikeHolder, TestPcreSpecial) {
std::shared_ptr<LikeHolder> like_holder;

Expand Down Expand Up @@ -104,17 +115,25 @@ TEST_F(TestLikeHolder, TestDot) {
EXPECT_FALSE(like("abcd"));
}

TEST_F(TestLikeHolder, TestMatchWithNewLine) {
std::shared_ptr<LikeHolder> like_holder;
auto status = LikeHolder::Make("%abc%", "\\", &like_holder, regex_op);

auto& like = *like_holder;
EXPECT_TRUE(like("abc\nd"));
}

TEST_F(TestLikeHolder, TestMatchSubString) {
std::shared_ptr<LikeHolder> like_holder;

auto status = LikeHolder::Make("%abc%", "\\", &like_holder);
auto status = LikeHolder::Make("%abc%", "\\", &like_holder, regex_op);
EXPECT_EQ(status.ok(), true) << status.message();

auto& like = *like_holder;
EXPECT_TRUE(like("abc"));
EXPECT_FALSE(like("xxabdc"));

status = LikeHolder::Make("%ab-.^$*+?()[]{}|—/c\\%%", "\\", &like_holder);
status = LikeHolder::Make("%ab-.^$*+?()[]{}|—/c\\%%", "\\", &like_holder, regex_op);
EXPECT_EQ(status.ok(), true) << status.message();

auto& like_reserved_char = *like_holder;
Expand Down Expand Up @@ -192,7 +211,7 @@ TEST_F(TestLikeHolder, TestOptimise) {
TEST_F(TestLikeHolder, TestMatchOneEscape) {
std::shared_ptr<LikeHolder> like_holder;

auto status = LikeHolder::Make("ab\\_", "\\", &like_holder);
auto status = LikeHolder::Make("ab\\_", "\\", &like_holder, regex_op);
EXPECT_EQ(status.ok(), true) << status.message();

auto& like = *like_holder;
Expand All @@ -209,8 +228,7 @@ TEST_F(TestLikeHolder, TestMatchOneEscape) {
TEST_F(TestLikeHolder, TestMatchManyEscape) {
std::shared_ptr<LikeHolder> like_holder;

auto status = LikeHolder::Make("ab\\%", "\\", &like_holder);
EXPECT_EQ(status.ok(), true) << status.message();
auto status = LikeHolder::Make("ab\\%", "\\", &like_holder, regex_op);

auto& like = *like_holder;

Expand All @@ -226,7 +244,7 @@ TEST_F(TestLikeHolder, TestMatchManyEscape) {
TEST_F(TestLikeHolder, TestMatchEscape) {
std::shared_ptr<LikeHolder> like_holder;

auto status = LikeHolder::Make("ab\\\\", "\\", &like_holder);
auto status = LikeHolder::Make("ab\\\\", "\\", &like_holder, regex_op);
EXPECT_EQ(status.ok(), true) << status.message();

auto& like = *like_holder;
Expand All @@ -239,7 +257,7 @@ TEST_F(TestLikeHolder, TestMatchEscape) {
TEST_F(TestLikeHolder, TestEmptyEscapeChar) {
std::shared_ptr<LikeHolder> like_holder;

auto status = LikeHolder::Make("ab\\_", "", &like_holder);
auto status = LikeHolder::Make("ab\\_", "", &like_holder, regex_op);
EXPECT_EQ(status.ok(), true) << status.message();

auto& like = *like_holder;
Expand All @@ -254,7 +272,7 @@ TEST_F(TestLikeHolder, TestEmptyEscapeChar) {
TEST_F(TestLikeHolder, TestMultipleEscapeChar) {
std::shared_ptr<LikeHolder> like_holder;

auto status = LikeHolder::Make("ab\\_", "\\\\", &like_holder);
auto status = LikeHolder::Make("ab\\_", "\\\\", &like_holder, regex_op);
EXPECT_EQ(status.ok(), false) << status.message();
}

Expand Down

0 comments on commit 0890149

Please sign in to comment.