From c82d3f1e4e7d21d6df8f77bb5a8a02f13734b345 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 16 Nov 2021 09:45:16 -0500 Subject: [PATCH 1/2] Fix regex non-multiline EOL/$ matching strings ending with a new-line --- cpp/src/strings/regex/regex.inl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cpp/src/strings/regex/regex.inl b/cpp/src/strings/regex/regex.inl index 66e99756615..bc0679993d0 100644 --- a/cpp/src/strings/regex/regex.inl +++ b/cpp/src/strings/regex/regex.inl @@ -276,7 +276,10 @@ __device__ inline int32_t reprog_device::regexec( } break; case EOL: - if (last_character || (inst->u1.c == '$' && c == '\n')) { + if (last_character || + (c == '\n' && (inst->u1.c == '$' || + // edge case where \n appears at the end of the string + pos + 1 == dstr.length()))) { id_activate = inst->u2.next_id; expanded = true; } From 3013f3d48b05747f58a1e41d4e2952a9407080f7 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 17 Nov 2021 09:17:44 -0500 Subject: [PATCH 2/2] add gtest and pytest --- cpp/tests/strings/contains_tests.cpp | 17 +++++++++-------- python/cudf/cudf/tests/test_string.py | 5 +++-- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index 3c11444e4b5..229f9e4cc82 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -302,28 +302,29 @@ TEST_F(StringsContainsTests, CountTest) TEST_F(StringsContainsTests, MultiLine) { - auto input = cudf::test::strings_column_wrapper({"abc\nfff\nabc", "fff\nabc\nlll", "abc", ""}); - auto view = cudf::strings_column_view(input); + auto input = + cudf::test::strings_column_wrapper({"abc\nfff\nabc", "fff\nabc\nlll", "abc", "", "abc\n"}); + auto view = cudf::strings_column_view(input); auto results = cudf::strings::contains_re(view, "^abc$", cudf::strings::regex_flags::MULTILINE); - auto expected_contains = cudf::test::fixed_width_column_wrapper({1, 1, 1, 0}); + auto expected_contains = cudf::test::fixed_width_column_wrapper({1, 1, 1, 0, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_contains); results = cudf::strings::contains_re(view, "^abc$"); - expected_contains = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0}); + expected_contains = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_contains); results = cudf::strings::matches_re(view, "^abc$", cudf::strings::regex_flags::MULTILINE); - auto expected_matches = cudf::test::fixed_width_column_wrapper({1, 0, 1, 0}); + auto expected_matches = cudf::test::fixed_width_column_wrapper({1, 0, 1, 0, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_matches); results = cudf::strings::matches_re(view, "^abc$"); - expected_matches = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0}); + expected_matches = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_matches); results = cudf::strings::count_re(view, "^abc$", cudf::strings::regex_flags::MULTILINE); - auto expected_count = cudf::test::fixed_width_column_wrapper({2, 1, 1, 0}); + auto expected_count = cudf::test::fixed_width_column_wrapper({2, 1, 1, 0, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); results = cudf::strings::count_re(view, "^abc$"); - expected_count = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0}); + expected_count = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); } diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index c75eb91a335..cf52c4684c8 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -1746,12 +1746,13 @@ def test_string_wrap(data, width): ["A B", "1.5", "3,000"], ["23", "³", "⅕", ""], [" ", "\t\r\n ", ""], - ["$", "B", "Aab$", "$$ca", "C$B$", "cat"], + ["$", "B", "Aab$", "$$ca", "C$B$", "cat", "cat\n"], ["line\nto be wrapped", "another\nline\nto be wrapped"], ], ) @pytest.mark.parametrize( - "pat", ["a", " ", "\t", "another", "0", r"\$", "^line$", "line.*be"] + "pat", + ["a", " ", "\t", "another", "0", r"\$", "^line$", "line.*be", "cat$"], ) @pytest.mark.parametrize("flags", [0, re.MULTILINE, re.DOTALL]) def test_string_count(data, pat, flags):