From 793c7dc87c708e792b0ca801741bfeaec9b9c2e6 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 10 Jun 2024 14:07:11 -0400 Subject: [PATCH 01/11] Support multiple new-line characters in regex APIs --- cpp/include/cudf/strings/string_view.cuh | 13 ++++++--- cpp/src/strings/regex/regex.inl | 34 +++++++++++++++------- cpp/tests/strings/contains_tests.cpp | 37 ++++++++++++++++++++++++ 3 files changed, 69 insertions(+), 15 deletions(-) diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index 74df1ea1887..f74746a400d 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -190,9 +190,14 @@ __device__ inline string_view::const_iterator& string_view::const_iterator::oper __device__ inline string_view::const_iterator& string_view::const_iterator::operator--() { - if (byte_pos > 0) - while (strings::detail::bytes_in_utf8_byte(static_cast(p[--byte_pos])) == 0) - ; + if (byte_pos > 0) { + if (byte_pos == char_pos) { + --byte_pos; + } else { + while (strings::detail::bytes_in_utf8_byte(static_cast(p[--byte_pos])) == 0) + ; + } + } --char_pos; return *this; } diff --git a/cpp/src/strings/regex/regex.inl b/cpp/src/strings/regex/regex.inl index 10e06505094..f69887f3aa1 100644 --- a/cpp/src/strings/regex/regex.inl +++ b/cpp/src/strings/regex/regex.inl @@ -127,6 +127,16 @@ __device__ __forceinline__ void reprog_device::reljunk::swaplist() list2 = tmp; } +/** + * @brief Check for supported new-line characters + * + * '\n, \r, \u0085, \u2028, or \u2029' + */ +__device__ __forceinline__ bool is_newline(char32_t const ch) +{ + return (ch == '\n' || ch == '\r' || ch == 0x00c285 || ch == 0x00e280a8 || ch == 0x00e280a9); +} + /** * @brief Utility to check a specific character against this class instance. * @@ -262,12 +272,10 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const case BOL: if (pos == 0) break; if (jnk.startchar != '^') { return thrust::nullopt; } - --itr; - startchar = static_cast('\n'); + break; case CHAR: { - auto const find_itr = find_char(startchar, dstr, itr); - if (find_itr.byte_offset() >= dstr.size_bytes()) { return thrust::nullopt; } - itr = find_itr + (jnk.starttype == BOL); + itr = find_char(startchar, dstr, itr); + if (itr.byte_offset() >= dstr.size_bytes()) { return thrust::nullopt; } pos = itr.position(); break; } @@ -313,26 +321,30 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const id_activate = inst.u2.next_id; expanded = true; break; - case BOL: - if ((pos == 0) || ((inst.u1.c == '^') && (dstr[pos - 1] == '\n'))) { + case BOL: { + auto titr = itr; + if ((pos == 0) || ((inst.u1.c == '^') && (is_newline(*(--titr))))) { id_activate = inst.u2.next_id; expanded = true; } break; + } case EOL: // after the last character OR: // - for MULTILINE, if current character is new-line // - for non-MULTILINE, the very last character of the string can also be a new-line if (last_character || - ((c == '\n') && (inst.u1.c != 'Z') && - ((inst.u1.c == '$') || (itr.byte_offset() + 1 == dstr.size_bytes())))) { + (is_newline(c) && (inst.u1.c != 'Z') && + ((inst.u1.c == '$') || + (itr.byte_offset() + bytes_in_char_utf8(c) == dstr.size_bytes())))) { id_activate = inst.u2.next_id; expanded = true; } break; case BOW: case NBOW: { - auto const prev_c = pos > 0 ? dstr[pos - 1] : 0; + auto titr = itr; + auto const prev_c = pos > 0 ? *(--titr) : 0; auto const word_class = reclass_device{CCLASS_W}; bool const curr_is_word = word_class.is_match(c, _codepoint_flags); bool const prev_is_word = word_class.is_match(prev_c, _codepoint_flags); @@ -368,7 +380,7 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const if (inst.u1.c == c) id_activate = inst.u2.next_id; break; case ANY: - if (c != '\n') id_activate = inst.u2.next_id; + if (!is_newline(c)) { id_activate = inst.u2.next_id; } break; case ANYNL: id_activate = inst.u2.next_id; break; case NCCLASS: diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index 2d9e2035e5e..41e330905e3 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -612,6 +612,43 @@ TEST_F(StringsContainsTests, MultiLine) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); } +TEST_F(StringsContainsTests, SpecialNewLines) +{ + auto input = cudf::test::strings_column_wrapper({"zzé\xE2\x80\xA8qqq\xC2\x85zzé", + "qqq\xC2\x85zzé\xE2\x80\xA8lll", + "zzé", + "", + "zzé\xC2\x85", + "zze\xE2\x80\xA9zzé\xC2\x85"}); + auto view = cudf::strings_column_view(input); + + auto pattern = std::string("^zzé$"); + auto prog = cudf::strings::regex_program::create(pattern); + auto prog_ml = + cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::MULTILINE); + + auto expected_contains = cudf::test::fixed_width_column_wrapper({1, 1, 1, 0, 1, 1}); + auto results = cudf::strings::contains_re(view, *prog_ml); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_contains); + expected_contains = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 1, 0}); + results = cudf::strings::contains_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_contains); + + auto expected_matches = cudf::test::fixed_width_column_wrapper({1, 0, 1, 0, 1, 0}); + results = cudf::strings::matches_re(view, *prog_ml); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_matches); + expected_matches = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 1, 0}); + results = cudf::strings::matches_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_matches); + + auto expected_count = cudf::test::fixed_width_column_wrapper({2, 1, 1, 0, 1, 1}); + results = cudf::strings::count_re(view, *prog_ml); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); + expected_count = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 1, 0}); + results = cudf::strings::count_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); +} + TEST_F(StringsContainsTests, EndOfString) { auto input = cudf::test::strings_column_wrapper( From 71c063a3c953d587b6fc4f589904db468b0f4d7a Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 24 Jul 2024 16:19:47 -0400 Subject: [PATCH 02/11] add new flag --- cpp/include/cudf/strings/regex/flags.hpp | 22 +++++++++++++++++----- cpp/src/strings/regex/regcomp.cpp | 8 ++++++++ cpp/src/strings/regex/regex.inl | 16 ++++++++++++---- cpp/tests/strings/contains_tests.cpp | 8 +++++--- 4 files changed, 42 insertions(+), 12 deletions(-) diff --git a/cpp/include/cudf/strings/regex/flags.hpp b/cpp/include/cudf/strings/regex/flags.hpp index 44ca68439e7..e99bebd56ad 100644 --- a/cpp/include/cudf/strings/regex/flags.hpp +++ b/cpp/include/cudf/strings/regex/flags.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -33,10 +33,11 @@ namespace strings { * and to match the Python flag values. */ enum regex_flags : uint32_t { - DEFAULT = 0, ///< default - MULTILINE = 8, ///< the '^' and '$' honor new-line characters - DOTALL = 16, ///< the '.' matching includes new-line characters - ASCII = 256 ///< use only ASCII when matching built-in character classes + DEFAULT = 0, ///< default + MULTILINE = 8, ///< the '^' and '$' honor new-line characters + DOTALL = 16, ///< the '.' matching includes new-line characters + ASCII = 256, ///< use only ASCII when matching built-in character classes + EXT_NEWLINE = 512 ///< new-line matches extended characters }; /** @@ -72,6 +73,17 @@ constexpr bool is_ascii(regex_flags const f) return (f & regex_flags::ASCII) == regex_flags::ASCII; } +/** + * @brief Returns true if the given flags contain EXT_NEWLINE + * + * @param f Regex flags to check + * @return true if `f` includes EXT_NEWLINE + */ +constexpr bool is_ext_newline(regex_flags const f) +{ + return (f & regex_flags::EXT_NEWLINE) == regex_flags::EXT_NEWLINE; +} + /** * @brief Capture groups setting * diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index adf650a4f27..529a6cd84be 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -539,11 +539,19 @@ class regex_parser { : static_cast(LBRA); case ')': return RBRA; case '^': { + // if (is_ext_newline(_flags)) { + // _chr = is_multiline(_flags) ? 'S' : 'N'; + // } else { _chr = is_multiline(_flags) ? chr : '\n'; + //} return BOL; } case '$': { + // if (is_ext_newline(_flags)) { + // _chr = is_multiline(_flags) ? 'S' : 'N'; + // } else { _chr = is_multiline(_flags) ? chr : '\n'; + //} return EOL; } case '[': return build_cclass(); diff --git a/cpp/src/strings/regex/regex.inl b/cpp/src/strings/regex/regex.inl index cfb8b6f4206..082e0eb2459 100644 --- a/cpp/src/strings/regex/regex.inl +++ b/cpp/src/strings/regex/regex.inl @@ -321,25 +321,33 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const expanded = true; break; case BOL: { - auto titr = itr; - if ((pos == 0) || ((inst.u1.c == '^') && (is_newline(*(--titr))))) { + auto titr = itr; + auto const prev_ch = *(--titr); + if ((pos == 0) || ((inst.u1.c == '^') && (is_newline(prev_ch)))) { id_activate = inst.u2.next_id; expanded = true; } + // if ((pos == 0) || ((inst.u1.c == '^') && (prev_ch == '\n')) || + // ((inst.u1.c == 'S') && (is_newline(prev_ch)))) { + // id_activate = inst.u2.next_id; + // expanded = true; + // } break; } - case EOL: + case EOL: { // after the last character OR: // - for MULTILINE, if current character is new-line // - for non-MULTILINE, the very last character of the string can also be a new-line + // bool bnl = (inst.u1.c == 'S' || inst.u1.c == 'N') ? is_newline(c) : (c == '\n'); if (last_character || (is_newline(c) && (inst.u1.c != 'Z') && - ((inst.u1.c == '$') || + ((inst.u1.c == '$') || // || inst.u1.c == 'S' (itr.byte_offset() + bytes_in_char_utf8(c) == dstr.size_bytes())))) { id_activate = inst.u2.next_id; expanded = true; } break; + } case BOW: case NBOW: { auto titr = itr; diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index c36aba01c69..f24f26ef319 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -623,9 +623,11 @@ TEST_F(StringsContainsTests, SpecialNewLines) auto view = cudf::strings_column_view(input); auto pattern = std::string("^zzé$"); - auto prog = cudf::strings::regex_program::create(pattern); - auto prog_ml = - cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::MULTILINE); + auto prog = + cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::EXT_NEWLINE); + auto both_flags = static_cast( + cudf::strings::regex_flags::EXT_NEWLINE | cudf::strings::regex_flags::MULTILINE); + auto prog_ml = cudf::strings::regex_program::create(pattern, both_flags); auto expected_contains = cudf::test::fixed_width_column_wrapper({1, 1, 1, 0, 1, 1}); auto results = cudf::strings::contains_re(view, *prog_ml); From 72e222aa1b9dac5c6a79935a1e83fbcbf0e0c75a Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 26 Jul 2024 11:00:23 -0400 Subject: [PATCH 03/11] update state engine for ext newlines --- cpp/src/strings/regex/regcomp.cpp | 20 +++++++-------- cpp/src/strings/regex/regex.inl | 20 ++++++--------- cpp/tests/strings/contains_tests.cpp | 37 ++++++++++++++-------------- 3 files changed, 37 insertions(+), 40 deletions(-) diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index 529a6cd84be..5c31eb94853 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -539,19 +539,19 @@ class regex_parser { : static_cast(LBRA); case ')': return RBRA; case '^': { - // if (is_ext_newline(_flags)) { - // _chr = is_multiline(_flags) ? 'S' : 'N'; - // } else { - _chr = is_multiline(_flags) ? chr : '\n'; - //} + if (is_ext_newline(_flags)) { + _chr = is_multiline(_flags) ? 'S' : 'N'; + } else { + _chr = is_multiline(_flags) ? chr : '\n'; + } return BOL; } case '$': { - // if (is_ext_newline(_flags)) { - // _chr = is_multiline(_flags) ? 'S' : 'N'; - // } else { - _chr = is_multiline(_flags) ? chr : '\n'; - //} + if (is_ext_newline(_flags)) { + _chr = is_multiline(_flags) ? 'S' : 'N'; + } else { + _chr = is_multiline(_flags) ? chr : '\n'; + } return EOL; } case '[': return build_cclass(); diff --git a/cpp/src/strings/regex/regex.inl b/cpp/src/strings/regex/regex.inl index 082e0eb2459..b78ee3ae774 100644 --- a/cpp/src/strings/regex/regex.inl +++ b/cpp/src/strings/regex/regex.inl @@ -270,7 +270,7 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const switch (jnk.starttype) { case BOL: if (pos == 0) break; - if (jnk.startchar != '^') { return thrust::nullopt; } + if (jnk.startchar != '^' && jnk.startchar != 'S') { return thrust::nullopt; } break; case CHAR: { itr = find_char(startchar, dstr, itr); @@ -321,27 +321,23 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const expanded = true; break; case BOL: { - auto titr = itr; - auto const prev_ch = *(--titr); - if ((pos == 0) || ((inst.u1.c == '^') && (is_newline(prev_ch)))) { + auto titr = itr; + auto const prev_c = pos > 0 ? *(--titr) : 0; + if ((pos == 0) || ((inst.u1.c == '^') && (prev_c == '\n')) || + ((inst.u1.c == 'S') && (is_newline(prev_c)))) { id_activate = inst.u2.next_id; expanded = true; } - // if ((pos == 0) || ((inst.u1.c == '^') && (prev_ch == '\n')) || - // ((inst.u1.c == 'S') && (is_newline(prev_ch)))) { - // id_activate = inst.u2.next_id; - // expanded = true; - // } break; } case EOL: { // after the last character OR: // - for MULTILINE, if current character is new-line // - for non-MULTILINE, the very last character of the string can also be a new-line - // bool bnl = (inst.u1.c == 'S' || inst.u1.c == 'N') ? is_newline(c) : (c == '\n'); + bool const nl = (inst.u1.c == 'S' || inst.u1.c == 'N') ? is_newline(c) : (c == '\n'); if (last_character || - (is_newline(c) && (inst.u1.c != 'Z') && - ((inst.u1.c == '$') || // || inst.u1.c == 'S' + (nl && (inst.u1.c != 'Z') && + ((inst.u1.c == '$' || inst.u1.c == 'S') || (itr.byte_offset() + bytes_in_char_utf8(c) == dstr.size_bytes())))) { id_activate = inst.u2.next_id; expanded = true; diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index f24f26ef319..2eb5a61f3c4 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -629,26 +630,26 @@ TEST_F(StringsContainsTests, SpecialNewLines) cudf::strings::regex_flags::EXT_NEWLINE | cudf::strings::regex_flags::MULTILINE); auto prog_ml = cudf::strings::regex_program::create(pattern, both_flags); - auto expected_contains = cudf::test::fixed_width_column_wrapper({1, 1, 1, 0, 1, 1}); - auto results = cudf::strings::contains_re(view, *prog_ml); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_contains); - expected_contains = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 1, 0}); - results = cudf::strings::contains_re(view, *prog); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_contains); + auto expected = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 1, 0}); + auto results = cudf::strings::contains_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + expected = cudf::test::fixed_width_column_wrapper({1, 1, 1, 0, 1, 1}); + results = cudf::strings::contains_re(view, *prog_ml); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); - auto expected_matches = cudf::test::fixed_width_column_wrapper({1, 0, 1, 0, 1, 0}); - results = cudf::strings::matches_re(view, *prog_ml); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_matches); - expected_matches = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 1, 0}); - results = cudf::strings::matches_re(view, *prog); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_matches); + expected = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 1, 0}); + results = cudf::strings::matches_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + expected = cudf::test::fixed_width_column_wrapper({1, 0, 1, 0, 1, 0}); + results = cudf::strings::matches_re(view, *prog_ml); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); - auto expected_count = cudf::test::fixed_width_column_wrapper({2, 1, 1, 0, 1, 1}); - results = cudf::strings::count_re(view, *prog_ml); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); - expected_count = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 1, 0}); - results = cudf::strings::count_re(view, *prog); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); + auto counts = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 1, 0}); + results = cudf::strings::count_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, counts); + counts = cudf::test::fixed_width_column_wrapper({2, 1, 1, 0, 1, 1}); + results = cudf::strings::count_re(view, *prog_ml); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, counts); } TEST_F(StringsContainsTests, EndOfString) From e3425a632b0712bf242e3f3759c6231eeac656f5 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 29 Jul 2024 14:07:26 -0400 Subject: [PATCH 04/11] add support for ANY inst --- cpp/benchmarks/string/contains.cpp | 2 +- cpp/src/strings/regex/regcomp.cpp | 9 ++++++--- cpp/src/strings/regex/regex.inl | 11 ++++++----- cpp/tests/strings/contains_tests.cpp | 12 +++++++++++- 4 files changed, 24 insertions(+), 10 deletions(-) diff --git a/cpp/benchmarks/string/contains.cpp b/cpp/benchmarks/string/contains.cpp index ae6c8b844c8..80752110090 100644 --- a/cpp/benchmarks/string/contains.cpp +++ b/cpp/benchmarks/string/contains.cpp @@ -80,7 +80,7 @@ std::unique_ptr build_input_column(cudf::size_type n_rows, } // longer pattern lengths demand more working memory per string -std::string patterns[] = {"^\\d+ [a-z]+", "[A-Z ]+\\d+ +\\d+[A-Z]+\\d+$", "5W43"}; +std::string patterns[] = {"^\\d+ [a-z]+", "[A-Z ]+\\d+ +\\d+[A-Z]+\\d+$", "5W43$"}; static void bench_contains(nvbench::state& state) { diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index 5c31eb94853..7c4c89bd3fb 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -555,7 +555,10 @@ class regex_parser { return EOL; } case '[': return build_cclass(); - case '.': return dot_type; + case '.': { + _chr = is_ext_newline(_flags) ? 'N' : chr; + return dot_type; + } } if (std::find(quantifiers.begin(), quantifiers.end(), static_cast(chr)) == @@ -967,7 +970,7 @@ class regex_compiler { _prog.inst_at(inst_id).u1.cls_id = class_id; } else if (token == CHAR) { _prog.inst_at(inst_id).u1.c = yy; - } else if (token == BOL || token == EOL) { + } else if (token == BOL || token == EOL || token == ANY) { _prog.inst_at(inst_id).u1.c = yy; } push_and(inst_id, inst_id); @@ -1202,7 +1205,7 @@ void reprog::print(regex_flags const flags) case STAR: printf(" STAR next=%d", inst.u2.next_id); break; case PLUS: printf(" PLUS next=%d", inst.u2.next_id); break; case QUEST: printf(" QUEST next=%d", inst.u2.next_id); break; - case ANY: printf(" ANY next=%d", inst.u2.next_id); break; + case ANY: printf(" ANY '%c', next=%d", inst.u1.c, inst.u2.next_id); break; case ANYNL: printf(" ANYNL next=%d", inst.u2.next_id); break; case NOP: printf(" NOP next=%d", inst.u2.next_id); break; case BOL: { diff --git a/cpp/src/strings/regex/regex.inl b/cpp/src/strings/regex/regex.inl index b78ee3ae774..ea8c6bec3ab 100644 --- a/cpp/src/strings/regex/regex.inl +++ b/cpp/src/strings/regex/regex.inl @@ -131,7 +131,7 @@ __device__ __forceinline__ void reprog_device::reljunk::swaplist() * * '\n, \r, \u0085, \u2028, or \u2029' */ -__device__ __forceinline__ bool is_newline(char32_t const ch) +constexpr bool is_newline(char32_t const ch) { return (ch == '\n' || ch == '\r' || ch == 0x00c285 || ch == 0x00e280a8 || ch == 0x00e280a9); } @@ -382,11 +382,12 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const case CHAR: if (inst.u1.c == c) id_activate = inst.u2.next_id; break; - case ANY: - if (!is_newline(c)) { id_activate = inst.u2.next_id; } - break; + case ANY: { + if ((c == '\n') || ((inst.u1.c == 'N') && is_newline(c))) { break; } + [[fallthrough]]; + } case ANYNL: id_activate = inst.u2.next_id; break; - case NCCLASS: + case NCCLASS: [[fallthrough]]; case CCLASS: { auto const cls = get_class(inst.u1.cls_id); if (cls.is_match(static_cast(c), _codepoint_flags) == (inst.type == CCLASS)) { diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index 2eb5a61f3c4..8965a1b78a6 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include @@ -650,6 +649,17 @@ TEST_F(StringsContainsTests, SpecialNewLines) counts = cudf::test::fixed_width_column_wrapper({2, 1, 1, 0, 1, 1}); results = cudf::strings::count_re(view, *prog_ml); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, counts); + + pattern = std::string("q.*l"); + prog = cudf::strings::regex_program::create(pattern); + expected = cudf::test::fixed_width_column_wrapper({0, 1, 0, 0, 0, 0}); + results = cudf::strings::contains_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + // inst ANY will stop matching on first 'newline' and so should not match anything here + prog = cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::EXT_NEWLINE); + expected = cudf::test::fixed_width_column_wrapper({0, 0, 0, 0, 0, 0}); + results = cudf::strings::contains_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } TEST_F(StringsContainsTests, EndOfString) From d82fe08e528eaf5a8dcc9ff4d7ac8070f9dd424c Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 5 Aug 2024 14:16:56 -0400 Subject: [PATCH 05/11] add gtest for extract --- cpp/tests/strings/extract_tests.cpp | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/cpp/tests/strings/extract_tests.cpp b/cpp/tests/strings/extract_tests.cpp index b26cbd5a549..e56eb683a1c 100644 --- a/cpp/tests/strings/extract_tests.cpp +++ b/cpp/tests/strings/extract_tests.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -200,6 +201,32 @@ TEST_F(StringsExtractTests, DotAll) CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); } +TEST_F(StringsExtractTests, SpecialNewLines) +{ + auto input = cudf::test::strings_column_wrapper({"zzé\xE2\x80\xA8qqq\xC2\x85zzé", + "qqq\xC2\x85zzé\xE2\x80\xA8lll", + "zzé", + "", + "zzé\xC2\x85", + "zze\xE2\x80\xA9zzé\xC2\x85"}); + auto view = cudf::strings_column_view(input); + + auto prog = + cudf::strings::regex_program::create("(^zzé$)", cudf::strings::regex_flags::EXT_NEWLINE); + auto results = cudf::strings::extract(view, *prog); + auto expected = + cudf::test::strings_column_wrapper({"", "", "zzé", "", "zzé", ""}, {0, 0, 1, 0, 1, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected); + + auto both_flags = static_cast( + cudf::strings::regex_flags::EXT_NEWLINE | cudf::strings::regex_flags::MULTILINE); + auto prog_ml = cudf::strings::regex_program::create("^(zzé)$", both_flags); + results = cudf::strings::extract(view, *prog_ml); + expected = + cudf::test::strings_column_wrapper({"zzé", "zzé", "zzé", "", "zzé", "zzé"}, {1, 1, 1, 0, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected); +} + TEST_F(StringsExtractTests, EmptyExtractTest) { std::vector h_strings{nullptr, "AAA", "AAA_A", "AAA_AAA_", "A__", ""}; From b396e7562794a8477322486fe156bfd66a925978 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 6 Aug 2024 17:47:01 -0400 Subject: [PATCH 06/11] adds more gtests: extract, findall --- cpp/tests/strings/findall_tests.cpp | 25 +++++++++++++++++ cpp/tests/strings/replace_regex_tests.cpp | 34 +++++++++++++++++++++++ 2 files changed, 59 insertions(+) diff --git a/cpp/tests/strings/findall_tests.cpp b/cpp/tests/strings/findall_tests.cpp index 4582dcb1e38..898e1793a10 100644 --- a/cpp/tests/strings/findall_tests.cpp +++ b/cpp/tests/strings/findall_tests.cpp @@ -80,6 +80,31 @@ TEST_F(StringsFindallTests, DotAll) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); } +TEST_F(StringsFindallTests, SpecialNewLines) +{ + auto input = cudf::test::strings_column_wrapper({"zzé\xE2\x80\xA8qqq\xC2\x85zzé", + "qqq\xC2\x85zzé\xE2\x80\xA8lll", + "zzé", + "", + "zzé\xC2\x85", + "zze\xE2\x80\xA9zzé\xC2\x85"}); + auto view = cudf::strings_column_view(input); + + auto prog = + cudf::strings::regex_program::create("(^zzé$)", cudf::strings::regex_flags::EXT_NEWLINE); + auto results = cudf::strings::findall(view, *prog); + using LCW = cudf::test::lists_column_wrapper; + LCW expected({LCW{}, LCW{}, LCW{"zzé"}, LCW{}, LCW{"zzé"}, LCW{}}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); + + auto both_flags = static_cast( + cudf::strings::regex_flags::EXT_NEWLINE | cudf::strings::regex_flags::MULTILINE); + auto prog_ml = cudf::strings::regex_program::create("^(zzé)$", both_flags); + results = cudf::strings::findall(view, *prog_ml); + LCW expected_ml({LCW{"zzé", "zzé"}, LCW{"zzé"}, LCW{"zzé"}, LCW{}, LCW{"zzé"}, LCW{"zzé"}}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected_ml); +} + TEST_F(StringsFindallTests, MediumRegex) { // This results in 15 regex instructions and falls in the 'medium' range. diff --git a/cpp/tests/strings/replace_regex_tests.cpp b/cpp/tests/strings/replace_regex_tests.cpp index 8c0482653fb..49f02a5da5d 100644 --- a/cpp/tests/strings/replace_regex_tests.cpp +++ b/cpp/tests/strings/replace_regex_tests.cpp @@ -245,6 +245,40 @@ TEST_F(StringsReplaceRegexTest, Multiline) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, br_expected); } +TEST_F(StringsReplaceRegexTest, SpecialNewLines) +{ + auto input = cudf::test::strings_column_wrapper({"zzé\xE2\x80\xA8qqq\xC2\x85zzé", + "qqq\xC2\x85zzé\xE2\x80\xA8lll", + "zzé", + "", + "zzé\xC2\x85", + "abc\xE2\x80\xA9zzé\xC2\x85"}); + auto view = cudf::strings_column_view(input); + auto repl = cudf::string_scalar("_"); + auto prog = + cudf::strings::regex_program::create("^zzé$", cudf::strings::regex_flags::EXT_NEWLINE); + auto results = cudf::strings::replace_re(view, *prog, repl); + auto expected = cudf::test::strings_column_wrapper({"zzé\xE2\x80\xA8qqq\xC2\x85zzé", + "qqq\xC2\x85zzé\xE2\x80\xA8lll", + "_", + "", + "_\xC2\x85", + "abc\xE2\x80\xA9zzé\xC2\x85"}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + + auto both_flags = static_cast( + cudf::strings::regex_flags::EXT_NEWLINE | cudf::strings::regex_flags::MULTILINE); + auto prog_ml = cudf::strings::regex_program::create("^zzé$", both_flags); + results = cudf::strings::replace_re(view, *prog_ml, repl); + expected = cudf::test::strings_column_wrapper({"_\xE2\x80\xA8qqq\xC2\x85_", + "qqq\xC2\x85_\xE2\x80\xA8lll", + "_", + "", + "_\xC2\x85", + "abc\xE2\x80\xA9_\xC2\x85"}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); +} + TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexTest) { std::vector h_strings{"the quick brown fox jumps over the lazy dog", From b41989f96d56107469b5491751c137974d7737bf Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 7 Aug 2024 13:26:15 -0400 Subject: [PATCH 07/11] add special_chars.h and more gtests --- cpp/src/strings/regex/regex.inl | 8 ++-- cpp/tests/strings/contains_tests.cpp | 10 +++-- cpp/tests/strings/extract_tests.cpp | 10 +++-- cpp/tests/strings/findall_tests.cpp | 13 +++--- cpp/tests/strings/replace_regex_tests.cpp | 51 +++++++++++++++-------- cpp/tests/strings/special_chars.h | 25 +++++++++++ 6 files changed, 83 insertions(+), 34 deletions(-) create mode 100644 cpp/tests/strings/special_chars.h diff --git a/cpp/src/strings/regex/regex.inl b/cpp/src/strings/regex/regex.inl index ea8c6bec3ab..48042fad47a 100644 --- a/cpp/src/strings/regex/regex.inl +++ b/cpp/src/strings/regex/regex.inl @@ -269,9 +269,11 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const auto startchar = static_cast(jnk.startchar); switch (jnk.starttype) { case BOL: - if (pos == 0) break; - if (jnk.startchar != '^' && jnk.startchar != 'S') { return thrust::nullopt; } - break; + if (pos == 0) { break; } + if (startchar != '^' && startchar != 'S') { return thrust::nullopt; } + if (startchar != '\n') { break; } + --itr; + startchar = static_cast('\n'); case CHAR: { itr = find_char(startchar, dstr, itr); if (itr.byte_offset() >= dstr.size_bytes()) { return thrust::nullopt; } diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index 8965a1b78a6..22b960a0f20 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "special_chars.h" + #include #include #include @@ -614,12 +616,12 @@ TEST_F(StringsContainsTests, MultiLine) TEST_F(StringsContainsTests, SpecialNewLines) { - auto input = cudf::test::strings_column_wrapper({"zzé\xE2\x80\xA8qqq\xC2\x85zzé", - "qqq\xC2\x85zzé\xE2\x80\xA8lll", + auto input = cudf::test::strings_column_wrapper({"zzé" LINE_SEPARATOR "qqq" NEXT_LINE "zzé", + "qqq" NEXT_LINE "zzé" LINE_SEPARATOR "lll", "zzé", "", - "zzé\xC2\x85", - "zze\xE2\x80\xA9zzé\xC2\x85"}); + "zzé" PARAGRAPH_SEPARATOR, + "abc\nzzé" NEXT_LINE}); auto view = cudf::strings_column_view(input); auto pattern = std::string("^zzé$"); diff --git a/cpp/tests/strings/extract_tests.cpp b/cpp/tests/strings/extract_tests.cpp index e56eb683a1c..40768282df4 100644 --- a/cpp/tests/strings/extract_tests.cpp +++ b/cpp/tests/strings/extract_tests.cpp @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "special_chars.h" + #include #include #include @@ -203,12 +205,12 @@ TEST_F(StringsExtractTests, DotAll) TEST_F(StringsExtractTests, SpecialNewLines) { - auto input = cudf::test::strings_column_wrapper({"zzé\xE2\x80\xA8qqq\xC2\x85zzé", - "qqq\xC2\x85zzé\xE2\x80\xA8lll", + auto input = cudf::test::strings_column_wrapper({"zzé" NEXT_LINE "qqq" LINE_SEPARATOR "zzé", + "qqq" LINE_SEPARATOR "zzé" NEXT_LINE "lll", "zzé", "", - "zzé\xC2\x85", - "zze\xE2\x80\xA9zzé\xC2\x85"}); + "zzé" NEXT_LINE, + "abc" PARAGRAPH_SEPARATOR "zzé\n"}); auto view = cudf::strings_column_view(input); auto prog = diff --git a/cpp/tests/strings/findall_tests.cpp b/cpp/tests/strings/findall_tests.cpp index 898e1793a10..cf6bae1b187 100644 --- a/cpp/tests/strings/findall_tests.cpp +++ b/cpp/tests/strings/findall_tests.cpp @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "special_chars.h" + #include #include #include @@ -82,12 +84,12 @@ TEST_F(StringsFindallTests, DotAll) TEST_F(StringsFindallTests, SpecialNewLines) { - auto input = cudf::test::strings_column_wrapper({"zzé\xE2\x80\xA8qqq\xC2\x85zzé", - "qqq\xC2\x85zzé\xE2\x80\xA8lll", + auto input = cudf::test::strings_column_wrapper({"zzé" PARAGRAPH_SEPARATOR "qqq\nzzé", + "qqq\nzzé" PARAGRAPH_SEPARATOR "lll", "zzé", "", - "zzé\xC2\x85", - "zze\xE2\x80\xA9zzé\xC2\x85"}); + "zzé" LINE_SEPARATOR, + "zzé" LINE_SEPARATOR "zzé" NEXT_LINE}); auto view = cudf::strings_column_view(input); auto prog = @@ -101,7 +103,8 @@ TEST_F(StringsFindallTests, SpecialNewLines) cudf::strings::regex_flags::EXT_NEWLINE | cudf::strings::regex_flags::MULTILINE); auto prog_ml = cudf::strings::regex_program::create("^(zzé)$", both_flags); results = cudf::strings::findall(view, *prog_ml); - LCW expected_ml({LCW{"zzé", "zzé"}, LCW{"zzé"}, LCW{"zzé"}, LCW{}, LCW{"zzé"}, LCW{"zzé"}}); + LCW expected_ml( + {LCW{"zzé", "zzé"}, LCW{"zzé"}, LCW{"zzé"}, LCW{}, LCW{"zzé"}, LCW{"zzé", "zzé"}}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected_ml); } diff --git a/cpp/tests/strings/replace_regex_tests.cpp b/cpp/tests/strings/replace_regex_tests.cpp index 49f02a5da5d..59547f6325c 100644 --- a/cpp/tests/strings/replace_regex_tests.cpp +++ b/cpp/tests/strings/replace_regex_tests.cpp @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "special_chars.h" + #include #include #include @@ -247,35 +249,48 @@ TEST_F(StringsReplaceRegexTest, Multiline) TEST_F(StringsReplaceRegexTest, SpecialNewLines) { - auto input = cudf::test::strings_column_wrapper({"zzé\xE2\x80\xA8qqq\xC2\x85zzé", - "qqq\xC2\x85zzé\xE2\x80\xA8lll", - "zzé", - "", - "zzé\xC2\x85", - "abc\xE2\x80\xA9zzé\xC2\x85"}); - auto view = cudf::strings_column_view(input); - auto repl = cudf::string_scalar("_"); + auto input = cudf::test::strings_column_wrapper({"zzé" NEXT_LINE "qqq" NEXT_LINE "zzé", + "qqq" NEXT_LINE "zzé" NEXT_LINE "lll", + "zzé", + "", + "zzé" PARAGRAPH_SEPARATOR, + "abc" NEXT_LINE "zzé" NEXT_LINE}); + auto view = cudf::strings_column_view(input); + auto repl = cudf::string_scalar("_"); + auto pattern = std::string("^zzé$"); auto prog = - cudf::strings::regex_program::create("^zzé$", cudf::strings::regex_flags::EXT_NEWLINE); + cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::EXT_NEWLINE); auto results = cudf::strings::replace_re(view, *prog, repl); - auto expected = cudf::test::strings_column_wrapper({"zzé\xE2\x80\xA8qqq\xC2\x85zzé", - "qqq\xC2\x85zzé\xE2\x80\xA8lll", + auto expected = cudf::test::strings_column_wrapper({"zzé" NEXT_LINE "qqq" NEXT_LINE "zzé", + "qqq" NEXT_LINE "zzé" NEXT_LINE "lll", "_", "", - "_\xC2\x85", - "abc\xE2\x80\xA9zzé\xC2\x85"}); + "_" PARAGRAPH_SEPARATOR, + "abc" NEXT_LINE "zzé" NEXT_LINE}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); auto both_flags = static_cast( cudf::strings::regex_flags::EXT_NEWLINE | cudf::strings::regex_flags::MULTILINE); - auto prog_ml = cudf::strings::regex_program::create("^zzé$", both_flags); + auto prog_ml = cudf::strings::regex_program::create(pattern, both_flags); results = cudf::strings::replace_re(view, *prog_ml, repl); - expected = cudf::test::strings_column_wrapper({"_\xE2\x80\xA8qqq\xC2\x85_", - "qqq\xC2\x85_\xE2\x80\xA8lll", + expected = cudf::test::strings_column_wrapper({"_" NEXT_LINE "qqq" NEXT_LINE "_", + "qqq" NEXT_LINE "_" NEXT_LINE "lll", "_", "", - "_\xC2\x85", - "abc\xE2\x80\xA9_\xC2\x85"}); + "_" PARAGRAPH_SEPARATOR, + "abc" NEXT_LINE "_" NEXT_LINE}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + + auto repl_template = std::string("[\\1]"); + pattern = std::string("(^zzé$)"); + prog = cudf::strings::regex_program::create(pattern, both_flags); + results = cudf::strings::replace_with_backrefs(view, *prog, repl_template); + expected = cudf::test::strings_column_wrapper({"[zzé]" NEXT_LINE "qqq" NEXT_LINE "[zzé]", + "qqq" NEXT_LINE "[zzé]" NEXT_LINE "lll", + "[zzé]", + "", + "[zzé]" PARAGRAPH_SEPARATOR, + "abc" NEXT_LINE "[zzé]" NEXT_LINE}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); } diff --git a/cpp/tests/strings/special_chars.h b/cpp/tests/strings/special_chars.h new file mode 100644 index 00000000000..0d630f6bb52 --- /dev/null +++ b/cpp/tests/strings/special_chars.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +namespace cudf::test { + +// special new-line characters for use with regex_flags::EXT_NEWLINE +#define NEXT_LINE "\xC2\x85" +#define LINE_SEPARATOR "\xE2\x80\xA8" +#define PARAGRAPH_SEPARATOR "\xE2\x80\xA9" + +} // namespace cudf::test From 2c144f9a0232fb7d5a5208593679ade1f464fc1c Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 7 Aug 2024 13:51:54 -0400 Subject: [PATCH 08/11] fix BOL/CHAR logic --- cpp/benchmarks/string/contains.cpp | 2 +- cpp/src/strings/regex/regex.inl | 9 ++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/cpp/benchmarks/string/contains.cpp b/cpp/benchmarks/string/contains.cpp index 80752110090..ae6c8b844c8 100644 --- a/cpp/benchmarks/string/contains.cpp +++ b/cpp/benchmarks/string/contains.cpp @@ -80,7 +80,7 @@ std::unique_ptr build_input_column(cudf::size_type n_rows, } // longer pattern lengths demand more working memory per string -std::string patterns[] = {"^\\d+ [a-z]+", "[A-Z ]+\\d+ +\\d+[A-Z]+\\d+$", "5W43$"}; +std::string patterns[] = {"^\\d+ [a-z]+", "[A-Z ]+\\d+ +\\d+[A-Z]+\\d+$", "5W43"}; static void bench_contains(nvbench::state& state) { diff --git a/cpp/src/strings/regex/regex.inl b/cpp/src/strings/regex/regex.inl index 48042fad47a..9a567f78a10 100644 --- a/cpp/src/strings/regex/regex.inl +++ b/cpp/src/strings/regex/regex.inl @@ -268,15 +268,18 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const if (checkstart) { auto startchar = static_cast(jnk.startchar); switch (jnk.starttype) { - case BOL: + case BOL: { if (pos == 0) { break; } if (startchar != '^' && startchar != 'S') { return thrust::nullopt; } if (startchar != '\n') { break; } --itr; startchar = static_cast('\n'); + [[fallthrough]]; + } case CHAR: { - itr = find_char(startchar, dstr, itr); - if (itr.byte_offset() >= dstr.size_bytes()) { return thrust::nullopt; } + auto const find_itr = find_char(startchar, dstr, itr); + if (find_itr.byte_offset() >= dstr.size_bytes()) { return thrust::nullopt; } + itr = find_itr + (jnk.starttype == BOL); pos = itr.position(); break; } From 7c10de48dec316fc771883eb3c2db4110b64b15f Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 8 Aug 2024 09:21:33 -0400 Subject: [PATCH 09/11] add dotall test for completeness --- cpp/src/strings/regex/regex.inl | 2 +- cpp/tests/strings/contains_tests.cpp | 29 +++++++++++++++++----------- cpp/tests/strings/extract_tests.cpp | 11 +++++++++++ cpp/tests/strings/findall_tests.cpp | 4 ++-- 4 files changed, 32 insertions(+), 14 deletions(-) diff --git a/cpp/src/strings/regex/regex.inl b/cpp/src/strings/regex/regex.inl index 9a567f78a10..805d9489524 100644 --- a/cpp/src/strings/regex/regex.inl +++ b/cpp/src/strings/regex/regex.inl @@ -392,7 +392,7 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const [[fallthrough]]; } case ANYNL: id_activate = inst.u2.next_id; break; - case NCCLASS: [[fallthrough]]; + case NCCLASS: case CCLASS: { auto const cls = get_class(inst.u1.cls_id); if (cls.is_match(static_cast(c), _codepoint_flags) == (inst.type == CCLASS)) { diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index 22b960a0f20..876afdc2413 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -627,41 +627,48 @@ TEST_F(StringsContainsTests, SpecialNewLines) auto pattern = std::string("^zzé$"); auto prog = cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::EXT_NEWLINE); - auto both_flags = static_cast( - cudf::strings::regex_flags::EXT_NEWLINE | cudf::strings::regex_flags::MULTILINE); - auto prog_ml = cudf::strings::regex_program::create(pattern, both_flags); + auto ml_flags = static_cast(cudf::strings::regex_flags::EXT_NEWLINE | + cudf::strings::regex_flags::MULTILINE); + auto prog_ml = cudf::strings::regex_program::create(pattern, ml_flags); auto expected = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 1, 0}); auto results = cudf::strings::contains_re(view, *prog); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); expected = cudf::test::fixed_width_column_wrapper({1, 1, 1, 0, 1, 1}); results = cudf::strings::contains_re(view, *prog_ml); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); expected = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 1, 0}); results = cudf::strings::matches_re(view, *prog); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); expected = cudf::test::fixed_width_column_wrapper({1, 0, 1, 0, 1, 0}); results = cudf::strings::matches_re(view, *prog_ml); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); auto counts = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 1, 0}); results = cudf::strings::count_re(view, *prog); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, counts); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, counts); counts = cudf::test::fixed_width_column_wrapper({2, 1, 1, 0, 1, 1}); results = cudf::strings::count_re(view, *prog_ml); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, counts); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, counts); pattern = std::string("q.*l"); prog = cudf::strings::regex_program::create(pattern); expected = cudf::test::fixed_width_column_wrapper({0, 1, 0, 0, 0, 0}); results = cudf::strings::contains_re(view, *prog); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); // inst ANY will stop matching on first 'newline' and so should not match anything here prog = cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::EXT_NEWLINE); expected = cudf::test::fixed_width_column_wrapper({0, 0, 0, 0, 0, 0}); results = cudf::strings::contains_re(view, *prog); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + // including the DOTALL flag accepts the newline characters + auto dot_flags = static_cast(cudf::strings::regex_flags::EXT_NEWLINE | + cudf::strings::regex_flags::DOTALL); + prog = cudf::strings::regex_program::create(pattern, dot_flags); + expected = cudf::test::fixed_width_column_wrapper({0, 1, 0, 0, 0, 0}); + results = cudf::strings::contains_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } TEST_F(StringsContainsTests, EndOfString) diff --git a/cpp/tests/strings/extract_tests.cpp b/cpp/tests/strings/extract_tests.cpp index 40768282df4..6afbb1a060a 100644 --- a/cpp/tests/strings/extract_tests.cpp +++ b/cpp/tests/strings/extract_tests.cpp @@ -227,6 +227,17 @@ TEST_F(StringsExtractTests, SpecialNewLines) expected = cudf::test::strings_column_wrapper({"zzé", "zzé", "zzé", "", "zzé", "zzé"}, {1, 1, 1, 0, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected); + + prog = cudf::strings::regex_program::create("q(q.*l)l"); + expected = cudf::test::strings_column_wrapper( + {"", "qq" LINE_SEPARATOR "zzé" NEXT_LINE "ll", "", "", "", ""}, {0, 1, 0, 0, 0, 0}); + results = cudf::strings::extract(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected); + // expect no matches here since the newline(s) interrupts the pattern + prog = cudf::strings::regex_program::create("q(q.*l)l", cudf::strings::regex_flags::EXT_NEWLINE); + expected = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, {0, 0, 0, 0, 0, 0}); + results = cudf::strings::extract(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected); } TEST_F(StringsExtractTests, EmptyExtractTest) diff --git a/cpp/tests/strings/findall_tests.cpp b/cpp/tests/strings/findall_tests.cpp index cf6bae1b187..936146d43ba 100644 --- a/cpp/tests/strings/findall_tests.cpp +++ b/cpp/tests/strings/findall_tests.cpp @@ -97,7 +97,7 @@ TEST_F(StringsFindallTests, SpecialNewLines) auto results = cudf::strings::findall(view, *prog); using LCW = cudf::test::lists_column_wrapper; LCW expected({LCW{}, LCW{}, LCW{"zzé"}, LCW{}, LCW{"zzé"}, LCW{}}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); auto both_flags = static_cast( cudf::strings::regex_flags::EXT_NEWLINE | cudf::strings::regex_flags::MULTILINE); @@ -105,7 +105,7 @@ TEST_F(StringsFindallTests, SpecialNewLines) results = cudf::strings::findall(view, *prog_ml); LCW expected_ml( {LCW{"zzé", "zzé"}, LCW{"zzé"}, LCW{"zzé"}, LCW{}, LCW{"zzé"}, LCW{"zzé", "zzé"}}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected_ml); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected_ml); } TEST_F(StringsFindallTests, MediumRegex) From 85ebbe5967c0d8aced7ea2b85f0da25de1c1291c Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 9 Aug 2024 10:13:05 -0400 Subject: [PATCH 10/11] update tests; update regex.md doc --- cpp/doxygen/regex.md | 6 ++++++ cpp/tests/strings/contains_tests.cpp | 2 +- cpp/tests/strings/extract_tests.cpp | 8 ++++---- cpp/tests/strings/findall_tests.cpp | 2 +- cpp/tests/strings/replace_regex_tests.cpp | 8 ++++---- 5 files changed, 16 insertions(+), 10 deletions(-) diff --git a/cpp/doxygen/regex.md b/cpp/doxygen/regex.md index 8d206f245dc..4f1e25a5c71 100644 --- a/cpp/doxygen/regex.md +++ b/cpp/doxygen/regex.md @@ -17,6 +17,12 @@ The details are based on features documented at https://www.regular-expressions. **Note:** The alternation character is the pipe character `|` and not the character included in the tables on this page. There is an issue including the pipe character inside the table markdown that is rendered by doxygen. +Only the `\n` character is recognized as a line break. The [cudf::strings::regex_flags::EXT_NEWLINE](@ref cudf::strings::regex_flags) increases the set of line break characters to include: +- Paragraph separator (Unicode: `2029`, UTF-8: `E280A9`) +- Line separator (Unicode: `2028`, UTF-8: `E280A8`) +- Next line (Unicode: `0085`, UTF-8: `C285`) +- Carriage return (Unicode: `000D`, UTF-8: `0D`) + **Invalid regex patterns will result in undefined behavior**. This includes but is not limited to the following: - Unescaped special characters (listed in the third row of the Characters table below) when they are intended to match as literals. - Unmatched paired special characters like `()`, `[]`, and `{}`. diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index 876afdc2413..a591df7daf8 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -617,7 +617,7 @@ TEST_F(StringsContainsTests, MultiLine) TEST_F(StringsContainsTests, SpecialNewLines) { auto input = cudf::test::strings_column_wrapper({"zzé" LINE_SEPARATOR "qqq" NEXT_LINE "zzé", - "qqq" NEXT_LINE "zzé" LINE_SEPARATOR "lll", + "qqq\rzzé" LINE_SEPARATOR "lll", "zzé", "", "zzé" PARAGRAPH_SEPARATOR, diff --git a/cpp/tests/strings/extract_tests.cpp b/cpp/tests/strings/extract_tests.cpp index 6afbb1a060a..1491da758d5 100644 --- a/cpp/tests/strings/extract_tests.cpp +++ b/cpp/tests/strings/extract_tests.cpp @@ -206,7 +206,7 @@ TEST_F(StringsExtractTests, DotAll) TEST_F(StringsExtractTests, SpecialNewLines) { auto input = cudf::test::strings_column_wrapper({"zzé" NEXT_LINE "qqq" LINE_SEPARATOR "zzé", - "qqq" LINE_SEPARATOR "zzé" NEXT_LINE "lll", + "qqq" LINE_SEPARATOR "zzé\rlll", "zzé", "", "zzé" NEXT_LINE, @@ -228,9 +228,9 @@ TEST_F(StringsExtractTests, SpecialNewLines) cudf::test::strings_column_wrapper({"zzé", "zzé", "zzé", "", "zzé", "zzé"}, {1, 1, 1, 0, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected); - prog = cudf::strings::regex_program::create("q(q.*l)l"); - expected = cudf::test::strings_column_wrapper( - {"", "qq" LINE_SEPARATOR "zzé" NEXT_LINE "ll", "", "", "", ""}, {0, 1, 0, 0, 0, 0}); + prog = cudf::strings::regex_program::create("q(q.*l)l"); + expected = cudf::test::strings_column_wrapper({"", "qq" LINE_SEPARATOR "zzé\rll", "", "", "", ""}, + {0, 1, 0, 0, 0, 0}); results = cudf::strings::extract(view, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected); // expect no matches here since the newline(s) interrupts the pattern diff --git a/cpp/tests/strings/findall_tests.cpp b/cpp/tests/strings/findall_tests.cpp index 936146d43ba..47606b9b3ed 100644 --- a/cpp/tests/strings/findall_tests.cpp +++ b/cpp/tests/strings/findall_tests.cpp @@ -88,7 +88,7 @@ TEST_F(StringsFindallTests, SpecialNewLines) "qqq\nzzé" PARAGRAPH_SEPARATOR "lll", "zzé", "", - "zzé" LINE_SEPARATOR, + "zzé\r", "zzé" LINE_SEPARATOR "zzé" NEXT_LINE}); auto view = cudf::strings_column_view(input); diff --git a/cpp/tests/strings/replace_regex_tests.cpp b/cpp/tests/strings/replace_regex_tests.cpp index 59547f6325c..9847d8d6bb5 100644 --- a/cpp/tests/strings/replace_regex_tests.cpp +++ b/cpp/tests/strings/replace_regex_tests.cpp @@ -254,7 +254,7 @@ TEST_F(StringsReplaceRegexTest, SpecialNewLines) "zzé", "", "zzé" PARAGRAPH_SEPARATOR, - "abc" NEXT_LINE "zzé" NEXT_LINE}); + "abc\rzzé\r"}); auto view = cudf::strings_column_view(input); auto repl = cudf::string_scalar("_"); auto pattern = std::string("^zzé$"); @@ -266,7 +266,7 @@ TEST_F(StringsReplaceRegexTest, SpecialNewLines) "_", "", "_" PARAGRAPH_SEPARATOR, - "abc" NEXT_LINE "zzé" NEXT_LINE}); + "abc\rzzé\r"}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); auto both_flags = static_cast( @@ -278,7 +278,7 @@ TEST_F(StringsReplaceRegexTest, SpecialNewLines) "_", "", "_" PARAGRAPH_SEPARATOR, - "abc" NEXT_LINE "_" NEXT_LINE}); + "abc\r_\r"}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); auto repl_template = std::string("[\\1]"); @@ -290,7 +290,7 @@ TEST_F(StringsReplaceRegexTest, SpecialNewLines) "[zzé]", "", "[zzé]" PARAGRAPH_SEPARATOR, - "abc" NEXT_LINE "[zzé]" NEXT_LINE}); + "abc\r[zzé]\r"}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); } From 612061f315d54a6577f8dcd52fd4fcfc9e665d2e Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 29 Aug 2024 08:25:09 -0400 Subject: [PATCH 11/11] fix wording in .md file --- cpp/doxygen/regex.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/doxygen/regex.md b/cpp/doxygen/regex.md index 4f1e25a5c71..6d1c91a5752 100644 --- a/cpp/doxygen/regex.md +++ b/cpp/doxygen/regex.md @@ -17,7 +17,7 @@ The details are based on features documented at https://www.regular-expressions. **Note:** The alternation character is the pipe character `|` and not the character included in the tables on this page. There is an issue including the pipe character inside the table markdown that is rendered by doxygen. -Only the `\n` character is recognized as a line break. The [cudf::strings::regex_flags::EXT_NEWLINE](@ref cudf::strings::regex_flags) increases the set of line break characters to include: +By default, only the `\n` character is recognized as a line break. The [cudf::strings::regex_flags::EXT_NEWLINE](@ref cudf::strings::regex_flags) increases the set of line break characters to include: - Paragraph separator (Unicode: `2029`, UTF-8: `E280A9`) - Line separator (Unicode: `2028`, UTF-8: `E280A8`) - Next line (Unicode: `0085`, UTF-8: `C285`)