diff --git a/cpp/doxygen/regex.md b/cpp/doxygen/regex.md index 8d206f245dc..6d1c91a5752 100644 --- a/cpp/doxygen/regex.md +++ b/cpp/doxygen/regex.md @@ -17,6 +17,12 @@ The details are based on features documented at https://www.regular-expressions. **Note:** The alternation character is the pipe character `|` and not the character included in the tables on this page. There is an issue including the pipe character inside the table markdown that is rendered by doxygen. +By default, only the `\n` character is recognized as a line break. The [cudf::strings::regex_flags::EXT_NEWLINE](@ref cudf::strings::regex_flags) increases the set of line break characters to include: +- Paragraph separator (Unicode: `2029`, UTF-8: `E280A9`) +- Line separator (Unicode: `2028`, UTF-8: `E280A8`) +- Next line (Unicode: `0085`, UTF-8: `C285`) +- Carriage return (Unicode: `000D`, UTF-8: `0D`) + **Invalid regex patterns will result in undefined behavior**. This includes but is not limited to the following: - Unescaped special characters (listed in the third row of the Characters table below) when they are intended to match as literals. - Unmatched paired special characters like `()`, `[]`, and `{}`. diff --git a/cpp/include/cudf/strings/regex/flags.hpp b/cpp/include/cudf/strings/regex/flags.hpp index f7108129dee..4f3fc7086f2 100644 --- a/cpp/include/cudf/strings/regex/flags.hpp +++ b/cpp/include/cudf/strings/regex/flags.hpp @@ -35,10 +35,11 @@ namespace strings { * and to match the Python flag values. */ enum regex_flags : uint32_t { - DEFAULT = 0, ///< default - MULTILINE = 8, ///< the '^' and '$' honor new-line characters - DOTALL = 16, ///< the '.' matching includes new-line characters - ASCII = 256 ///< use only ASCII when matching built-in character classes + DEFAULT = 0, ///< default + MULTILINE = 8, ///< the '^' and '$' honor new-line characters + DOTALL = 16, ///< the '.' matching includes new-line characters + ASCII = 256, ///< use only ASCII when matching built-in character classes + EXT_NEWLINE = 512 ///< new-line matches extended characters }; /** @@ -74,6 +75,17 @@ constexpr bool is_ascii(regex_flags const f) return (f & regex_flags::ASCII) == regex_flags::ASCII; } +/** + * @brief Returns true if the given flags contain EXT_NEWLINE + * + * @param f Regex flags to check + * @return true if `f` includes EXT_NEWLINE + */ +constexpr bool is_ext_newline(regex_flags const f) +{ + return (f & regex_flags::EXT_NEWLINE) == regex_flags::EXT_NEWLINE; +} + /** * @brief Capture groups setting * diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index abb26d7ccb4..14695c3bb27 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -191,9 +191,14 @@ __device__ inline string_view::const_iterator& string_view::const_iterator::oper __device__ inline string_view::const_iterator& string_view::const_iterator::operator--() { - if (byte_pos > 0) - while (strings::detail::bytes_in_utf8_byte(static_cast(p[--byte_pos])) == 0) - ; + if (byte_pos > 0) { + if (byte_pos == char_pos) { + --byte_pos; + } else { + while (strings::detail::bytes_in_utf8_byte(static_cast(p[--byte_pos])) == 0) + ; + } + } --char_pos; return *this; } diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index adf650a4f27..7c4c89bd3fb 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -539,15 +539,26 @@ class regex_parser { : static_cast(LBRA); case ')': return RBRA; case '^': { - _chr = is_multiline(_flags) ? chr : '\n'; + if (is_ext_newline(_flags)) { + _chr = is_multiline(_flags) ? 'S' : 'N'; + } else { + _chr = is_multiline(_flags) ? chr : '\n'; + } return BOL; } case '$': { - _chr = is_multiline(_flags) ? chr : '\n'; + if (is_ext_newline(_flags)) { + _chr = is_multiline(_flags) ? 'S' : 'N'; + } else { + _chr = is_multiline(_flags) ? chr : '\n'; + } return EOL; } case '[': return build_cclass(); - case '.': return dot_type; + case '.': { + _chr = is_ext_newline(_flags) ? 'N' : chr; + return dot_type; + } } if (std::find(quantifiers.begin(), quantifiers.end(), static_cast(chr)) == @@ -959,7 +970,7 @@ class regex_compiler { _prog.inst_at(inst_id).u1.cls_id = class_id; } else if (token == CHAR) { _prog.inst_at(inst_id).u1.c = yy; - } else if (token == BOL || token == EOL) { + } else if (token == BOL || token == EOL || token == ANY) { _prog.inst_at(inst_id).u1.c = yy; } push_and(inst_id, inst_id); @@ -1194,7 +1205,7 @@ void reprog::print(regex_flags const flags) case STAR: printf(" STAR next=%d", inst.u2.next_id); break; case PLUS: printf(" PLUS next=%d", inst.u2.next_id); break; case QUEST: printf(" QUEST next=%d", inst.u2.next_id); break; - case ANY: printf(" ANY next=%d", inst.u2.next_id); break; + case ANY: printf(" ANY '%c', next=%d", inst.u1.c, inst.u2.next_id); break; case ANYNL: printf(" ANYNL next=%d", inst.u2.next_id); break; case NOP: printf(" NOP next=%d", inst.u2.next_id); break; case BOL: { diff --git a/cpp/src/strings/regex/regex.inl b/cpp/src/strings/regex/regex.inl index 3b899e4edc1..e34a1e12015 100644 --- a/cpp/src/strings/regex/regex.inl +++ b/cpp/src/strings/regex/regex.inl @@ -126,6 +126,16 @@ __device__ __forceinline__ void reprog_device::reljunk::swaplist() list2 = tmp; } +/** + * @brief Check for supported new-line characters + * + * '\n, \r, \u0085, \u2028, or \u2029' + */ +constexpr bool is_newline(char32_t const ch) +{ + return (ch == '\n' || ch == '\r' || ch == 0x00c285 || ch == 0x00e280a8 || ch == 0x00e280a9); +} + /** * @brief Utility to check a specific character against this class instance. * @@ -258,11 +268,14 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const if (checkstart) { auto startchar = static_cast(jnk.startchar); switch (jnk.starttype) { - case BOL: - if (pos == 0) break; - if (jnk.startchar != '^') { return cuda::std::nullopt; } + case BOL: { + if (pos == 0) { break; } + if (startchar != '^' && startchar != 'S') { return cuda::std::nullopt; } + if (startchar != '\n') { break; } --itr; startchar = static_cast('\n'); + [[fallthrough]]; + } case CHAR: { auto const find_itr = find_char(startchar, dstr, itr); if (find_itr.byte_offset() >= dstr.size_bytes()) { return cuda::std::nullopt; } @@ -312,26 +325,34 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const id_activate = inst.u2.next_id; expanded = true; break; - case BOL: - if ((pos == 0) || ((inst.u1.c == '^') && (dstr[pos - 1] == '\n'))) { + case BOL: { + auto titr = itr; + auto const prev_c = pos > 0 ? *(--titr) : 0; + if ((pos == 0) || ((inst.u1.c == '^') && (prev_c == '\n')) || + ((inst.u1.c == 'S') && (is_newline(prev_c)))) { id_activate = inst.u2.next_id; expanded = true; } break; - case EOL: + } + case EOL: { // after the last character OR: // - for MULTILINE, if current character is new-line // - for non-MULTILINE, the very last character of the string can also be a new-line + bool const nl = (inst.u1.c == 'S' || inst.u1.c == 'N') ? is_newline(c) : (c == '\n'); if (last_character || - ((c == '\n') && (inst.u1.c != 'Z') && - ((inst.u1.c == '$') || (itr.byte_offset() + 1 == dstr.size_bytes())))) { + (nl && (inst.u1.c != 'Z') && + ((inst.u1.c == '$' || inst.u1.c == 'S') || + (itr.byte_offset() + bytes_in_char_utf8(c) == dstr.size_bytes())))) { id_activate = inst.u2.next_id; expanded = true; } break; + } case BOW: case NBOW: { - auto const prev_c = pos > 0 ? dstr[pos - 1] : 0; + auto titr = itr; + auto const prev_c = pos > 0 ? *(--titr) : 0; auto const word_class = reclass_device{CCLASS_W}; bool const curr_is_word = word_class.is_match(c, _codepoint_flags); bool const prev_is_word = word_class.is_match(prev_c, _codepoint_flags); @@ -366,9 +387,10 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const case CHAR: if (inst.u1.c == c) id_activate = inst.u2.next_id; break; - case ANY: - if (c != '\n') id_activate = inst.u2.next_id; - break; + case ANY: { + if ((c == '\n') || ((inst.u1.c == 'N') && is_newline(c))) { break; } + [[fallthrough]]; + } case ANYNL: id_activate = inst.u2.next_id; break; case NCCLASS: case CCLASS: { diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index c816316d0ff..acf850c7a66 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "special_chars.h" + #include #include #include @@ -613,6 +615,63 @@ TEST_F(StringsContainsTests, MultiLine) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); } +TEST_F(StringsContainsTests, SpecialNewLines) +{ + auto input = cudf::test::strings_column_wrapper({"zzé" LINE_SEPARATOR "qqq" NEXT_LINE "zzé", + "qqq\rzzé" LINE_SEPARATOR "lll", + "zzé", + "", + "zzé" PARAGRAPH_SEPARATOR, + "abc\nzzé" NEXT_LINE}); + auto view = cudf::strings_column_view(input); + + auto pattern = std::string("^zzé$"); + auto prog = + cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::EXT_NEWLINE); + auto ml_flags = static_cast(cudf::strings::regex_flags::EXT_NEWLINE | + cudf::strings::regex_flags::MULTILINE); + auto prog_ml = cudf::strings::regex_program::create(pattern, ml_flags); + + auto expected = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 1, 0}); + auto results = cudf::strings::contains_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + expected = cudf::test::fixed_width_column_wrapper({1, 1, 1, 0, 1, 1}); + results = cudf::strings::contains_re(view, *prog_ml); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + expected = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 1, 0}); + results = cudf::strings::matches_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + expected = cudf::test::fixed_width_column_wrapper({1, 0, 1, 0, 1, 0}); + results = cudf::strings::matches_re(view, *prog_ml); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + auto counts = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 1, 0}); + results = cudf::strings::count_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, counts); + counts = cudf::test::fixed_width_column_wrapper({2, 1, 1, 0, 1, 1}); + results = cudf::strings::count_re(view, *prog_ml); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, counts); + + pattern = std::string("q.*l"); + prog = cudf::strings::regex_program::create(pattern); + expected = cudf::test::fixed_width_column_wrapper({0, 1, 0, 0, 0, 0}); + results = cudf::strings::contains_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + // inst ANY will stop matching on first 'newline' and so should not match anything here + prog = cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::EXT_NEWLINE); + expected = cudf::test::fixed_width_column_wrapper({0, 0, 0, 0, 0, 0}); + results = cudf::strings::contains_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + // including the DOTALL flag accepts the newline characters + auto dot_flags = static_cast(cudf::strings::regex_flags::EXT_NEWLINE | + cudf::strings::regex_flags::DOTALL); + prog = cudf::strings::regex_program::create(pattern, dot_flags); + expected = cudf::test::fixed_width_column_wrapper({0, 1, 0, 0, 0, 0}); + results = cudf::strings::contains_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + TEST_F(StringsContainsTests, EndOfString) { auto input = cudf::test::strings_column_wrapper( diff --git a/cpp/tests/strings/extract_tests.cpp b/cpp/tests/strings/extract_tests.cpp index b26cbd5a549..1491da758d5 100644 --- a/cpp/tests/strings/extract_tests.cpp +++ b/cpp/tests/strings/extract_tests.cpp @@ -14,9 +14,12 @@ * limitations under the License. */ +#include "special_chars.h" + #include #include #include +#include #include #include @@ -200,6 +203,43 @@ TEST_F(StringsExtractTests, DotAll) CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); } +TEST_F(StringsExtractTests, SpecialNewLines) +{ + auto input = cudf::test::strings_column_wrapper({"zzé" NEXT_LINE "qqq" LINE_SEPARATOR "zzé", + "qqq" LINE_SEPARATOR "zzé\rlll", + "zzé", + "", + "zzé" NEXT_LINE, + "abc" PARAGRAPH_SEPARATOR "zzé\n"}); + auto view = cudf::strings_column_view(input); + + auto prog = + cudf::strings::regex_program::create("(^zzé$)", cudf::strings::regex_flags::EXT_NEWLINE); + auto results = cudf::strings::extract(view, *prog); + auto expected = + cudf::test::strings_column_wrapper({"", "", "zzé", "", "zzé", ""}, {0, 0, 1, 0, 1, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected); + + auto both_flags = static_cast( + cudf::strings::regex_flags::EXT_NEWLINE | cudf::strings::regex_flags::MULTILINE); + auto prog_ml = cudf::strings::regex_program::create("^(zzé)$", both_flags); + results = cudf::strings::extract(view, *prog_ml); + expected = + cudf::test::strings_column_wrapper({"zzé", "zzé", "zzé", "", "zzé", "zzé"}, {1, 1, 1, 0, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected); + + prog = cudf::strings::regex_program::create("q(q.*l)l"); + expected = cudf::test::strings_column_wrapper({"", "qq" LINE_SEPARATOR "zzé\rll", "", "", "", ""}, + {0, 1, 0, 0, 0, 0}); + results = cudf::strings::extract(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected); + // expect no matches here since the newline(s) interrupts the pattern + prog = cudf::strings::regex_program::create("q(q.*l)l", cudf::strings::regex_flags::EXT_NEWLINE); + expected = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, {0, 0, 0, 0, 0, 0}); + results = cudf::strings::extract(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected); +} + TEST_F(StringsExtractTests, EmptyExtractTest) { std::vector h_strings{nullptr, "AAA", "AAA_A", "AAA_AAA_", "A__", ""}; diff --git a/cpp/tests/strings/findall_tests.cpp b/cpp/tests/strings/findall_tests.cpp index 4582dcb1e38..47606b9b3ed 100644 --- a/cpp/tests/strings/findall_tests.cpp +++ b/cpp/tests/strings/findall_tests.cpp @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "special_chars.h" + #include #include #include @@ -80,6 +82,32 @@ TEST_F(StringsFindallTests, DotAll) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); } +TEST_F(StringsFindallTests, SpecialNewLines) +{ + auto input = cudf::test::strings_column_wrapper({"zzé" PARAGRAPH_SEPARATOR "qqq\nzzé", + "qqq\nzzé" PARAGRAPH_SEPARATOR "lll", + "zzé", + "", + "zzé\r", + "zzé" LINE_SEPARATOR "zzé" NEXT_LINE}); + auto view = cudf::strings_column_view(input); + + auto prog = + cudf::strings::regex_program::create("(^zzé$)", cudf::strings::regex_flags::EXT_NEWLINE); + auto results = cudf::strings::findall(view, *prog); + using LCW = cudf::test::lists_column_wrapper; + LCW expected({LCW{}, LCW{}, LCW{"zzé"}, LCW{}, LCW{"zzé"}, LCW{}}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + + auto both_flags = static_cast( + cudf::strings::regex_flags::EXT_NEWLINE | cudf::strings::regex_flags::MULTILINE); + auto prog_ml = cudf::strings::regex_program::create("^(zzé)$", both_flags); + results = cudf::strings::findall(view, *prog_ml); + LCW expected_ml( + {LCW{"zzé", "zzé"}, LCW{"zzé"}, LCW{"zzé"}, LCW{}, LCW{"zzé"}, LCW{"zzé", "zzé"}}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected_ml); +} + TEST_F(StringsFindallTests, MediumRegex) { // This results in 15 regex instructions and falls in the 'medium' range. diff --git a/cpp/tests/strings/replace_regex_tests.cpp b/cpp/tests/strings/replace_regex_tests.cpp index 8c0482653fb..9847d8d6bb5 100644 --- a/cpp/tests/strings/replace_regex_tests.cpp +++ b/cpp/tests/strings/replace_regex_tests.cpp @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "special_chars.h" + #include #include #include @@ -245,6 +247,53 @@ TEST_F(StringsReplaceRegexTest, Multiline) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, br_expected); } +TEST_F(StringsReplaceRegexTest, SpecialNewLines) +{ + auto input = cudf::test::strings_column_wrapper({"zzé" NEXT_LINE "qqq" NEXT_LINE "zzé", + "qqq" NEXT_LINE "zzé" NEXT_LINE "lll", + "zzé", + "", + "zzé" PARAGRAPH_SEPARATOR, + "abc\rzzé\r"}); + auto view = cudf::strings_column_view(input); + auto repl = cudf::string_scalar("_"); + auto pattern = std::string("^zzé$"); + auto prog = + cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::EXT_NEWLINE); + auto results = cudf::strings::replace_re(view, *prog, repl); + auto expected = cudf::test::strings_column_wrapper({"zzé" NEXT_LINE "qqq" NEXT_LINE "zzé", + "qqq" NEXT_LINE "zzé" NEXT_LINE "lll", + "_", + "", + "_" PARAGRAPH_SEPARATOR, + "abc\rzzé\r"}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + + auto both_flags = static_cast( + cudf::strings::regex_flags::EXT_NEWLINE | cudf::strings::regex_flags::MULTILINE); + auto prog_ml = cudf::strings::regex_program::create(pattern, both_flags); + results = cudf::strings::replace_re(view, *prog_ml, repl); + expected = cudf::test::strings_column_wrapper({"_" NEXT_LINE "qqq" NEXT_LINE "_", + "qqq" NEXT_LINE "_" NEXT_LINE "lll", + "_", + "", + "_" PARAGRAPH_SEPARATOR, + "abc\r_\r"}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + + auto repl_template = std::string("[\\1]"); + pattern = std::string("(^zzé$)"); + prog = cudf::strings::regex_program::create(pattern, both_flags); + results = cudf::strings::replace_with_backrefs(view, *prog, repl_template); + expected = cudf::test::strings_column_wrapper({"[zzé]" NEXT_LINE "qqq" NEXT_LINE "[zzé]", + "qqq" NEXT_LINE "[zzé]" NEXT_LINE "lll", + "[zzé]", + "", + "[zzé]" PARAGRAPH_SEPARATOR, + "abc\r[zzé]\r"}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); +} + TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexTest) { std::vector h_strings{"the quick brown fox jumps over the lazy dog", diff --git a/cpp/tests/strings/special_chars.h b/cpp/tests/strings/special_chars.h new file mode 100644 index 00000000000..0d630f6bb52 --- /dev/null +++ b/cpp/tests/strings/special_chars.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +namespace cudf::test { + +// special new-line characters for use with regex_flags::EXT_NEWLINE +#define NEXT_LINE "\xC2\x85" +#define LINE_SEPARATOR "\xE2\x80\xA8" +#define PARAGRAPH_SEPARATOR "\xE2\x80\xA9" + +} // namespace cudf::test