Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support multiple new-line characters in regex APIs #15961

Merged
merged 65 commits into from
Sep 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
65 commits
Select commit Hold shift + click to select a range
793c7dc
Support multiple new-line characters in regex APIs
davidwendt Jun 10, 2024
4f98848
Merge branch 'branch-24.08' into regex-new-lines
davidwendt Jun 10, 2024
1db46f7
Merge branch 'branch-24.08' into regex-new-lines
davidwendt Jun 12, 2024
10efa85
Merge branch 'branch-24.08' into regex-new-lines
davidwendt Jun 12, 2024
833cfaa
Merge branch 'branch-24.08' into regex-new-lines
davidwendt Jun 14, 2024
896e3e8
Merge branch 'branch-24.08' into regex-new-lines
davidwendt Jun 24, 2024
421a276
Merge branch 'branch-24.08' into regex-new-lines
davidwendt Jun 25, 2024
4717059
Merge branch 'branch-24.08' into regex-new-lines
davidwendt Jun 27, 2024
6517432
Merge branch 'branch-24.08' into regex-new-lines
davidwendt Jun 28, 2024
f429f83
Merge branch 'branch-24.08' into regex-new-lines
davidwendt Jul 2, 2024
cb36e73
Merge branch 'branch-24.08' into regex-new-lines
davidwendt Jul 10, 2024
aec936d
Merge branch 'branch-24.08' into regex-new-lines
davidwendt Jul 11, 2024
a2a79a3
Merge branch 'branch-24.08' into regex-new-lines
davidwendt Jul 11, 2024
bd08443
Merge branch 'branch-24.08' into regex-new-lines
davidwendt Jul 15, 2024
1f6da03
Merge branch 'branch-24.08' into regex-new-lines
davidwendt Jul 17, 2024
f543f30
Merge branch 'branch-24.08' into regex-new-lines
davidwendt Jul 18, 2024
22d5f66
Merge branch 'branch-24.08' into regex-new-lines
davidwendt Jul 18, 2024
d93a4b6
Merge branch 'branch-24.08' into regex-new-lines
davidwendt Jul 22, 2024
c701004
Merge branch 'branch-24.10' into regex-new-lines
davidwendt Jul 22, 2024
8d472ba
Merge branch 'branch-24.10' into regex-new-lines
davidwendt Jul 23, 2024
d693d43
Merge branch 'branch-24.10' into regex-new-lines
davidwendt Jul 24, 2024
71c063a
add new flag
davidwendt Jul 24, 2024
14e1c78
Merge branch 'branch-24.10' into regex-new-lines
davidwendt Jul 26, 2024
72e222a
update state engine for ext newlines
davidwendt Jul 26, 2024
57f3567
Merge branch 'branch-24.10' into regex-new-lines
davidwendt Jul 29, 2024
e3425a6
add support for ANY inst
davidwendt Jul 29, 2024
9ebf087
Merge branch 'branch-24.10' into regex-new-lines
davidwendt Jul 29, 2024
b789fc1
Merge branch 'branch-24.10' into regex-new-lines
davidwendt Jul 30, 2024
6a0eae3
Merge branch 'branch-24.10' into regex-new-lines
davidwendt Jul 31, 2024
920ed87
Merge branch 'branch-24.10' into regex-new-lines
davidwendt Aug 5, 2024
d82fe08
add gtest for extract
davidwendt Aug 5, 2024
f23c8d8
Merge branch 'branch-24.10' into regex-new-lines
davidwendt Aug 6, 2024
8e83b99
Merge branch 'branch-24.10' into regex-new-lines
davidwendt Aug 6, 2024
b396e75
adds more gtests: extract, findall
davidwendt Aug 6, 2024
58e0f95
Merge branch 'branch-24.10' into regex-new-lines
davidwendt Aug 6, 2024
d7c4dec
Merge branch 'regex-new-lines' of github.com:davidwendt/cudf into reg…
davidwendt Aug 7, 2024
e974cb6
Merge branch 'branch-24.10' into regex-new-lines
davidwendt Aug 7, 2024
b41989f
add special_chars.h and more gtests
davidwendt Aug 7, 2024
b5befac
Merge branch 'branch-24.10' into regex-new-lines
davidwendt Aug 7, 2024
2c144f9
fix BOL/CHAR logic
davidwendt Aug 7, 2024
7c41318
Merge branch 'branch-24.10' into regex-new-lines
davidwendt Aug 8, 2024
7c10de4
add dotall test for completeness
davidwendt Aug 8, 2024
feaae6d
Merge branch 'branch-24.10' into regex-new-lines
davidwendt Aug 8, 2024
d9b5481
Merge branch 'branch-24.10' into regex-new-lines
davidwendt Aug 9, 2024
85ebbe5
update tests; update regex.md doc
davidwendt Aug 9, 2024
7eec095
Merge branch 'regex-new-lines' of github.com:davidwendt/cudf into reg…
davidwendt Aug 9, 2024
f4b28ab
Merge branch 'branch-24.10' into regex-new-lines
davidwendt Aug 9, 2024
797004e
Merge branch 'branch-24.10' into regex-new-lines
davidwendt Aug 14, 2024
4da7ef5
Merge branch 'branch-24.10' into regex-new-lines
davidwendt Aug 14, 2024
143e396
Merge branch 'branch-24.10' into regex-new-lines
davidwendt Aug 15, 2024
75c3643
Merge branch 'branch-24.10' into regex-new-lines
davidwendt Aug 16, 2024
bbf28c3
Merge branch 'branch-24.10' into regex-new-lines
davidwendt Aug 19, 2024
0477392
fix merge conflict
davidwendt Aug 20, 2024
58003b5
Merge branch 'regex-new-lines' of github.com:davidwendt/cudf into reg…
davidwendt Aug 20, 2024
6f9a55b
Merge branch 'branch-24.10' into regex-new-lines
davidwendt Aug 23, 2024
858dfd9
Merge branch 'branch-24.10' into regex-new-lines
davidwendt Aug 23, 2024
b71fbb8
Merge branch 'branch-24.10' into regex-new-lines
davidwendt Aug 27, 2024
364ed09
Merge branch 'branch-24.10' into regex-new-lines
davidwendt Aug 27, 2024
ac1e5cd
Merge branch 'regex-new-lines' of github.com:davidwendt/cudf into reg…
davidwendt Aug 28, 2024
c05c3ac
Merge branch 'branch-24.10' into regex-new-lines
davidwendt Aug 28, 2024
b230e0f
Merge branch 'branch-24.10' into regex-new-lines
davidwendt Aug 29, 2024
612061f
fix wording in .md file
davidwendt Aug 29, 2024
fd9fcaf
Merge branch 'branch-24.10' into regex-new-lines
davidwendt Sep 3, 2024
5a78495
Merge branch 'branch-24.10' into regex-new-lines
davidwendt Sep 10, 2024
53c925c
Merge branch 'branch-24.10' into regex-new-lines
davidwendt Sep 11, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions cpp/doxygen/regex.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,12 @@ The details are based on features documented at https://www.regular-expressions.

**Note:** The alternation character is the pipe character `|` and not the character included in the tables on this page. There is an issue including the pipe character inside the table markdown that is rendered by doxygen.

By default, only the `\n` character is recognized as a line break. Setting the [cudf::strings::regex_flags::EXT_NEWLINE](@ref cudf::strings::regex_flags) flag extends the set of line break characters to also include:
- Paragraph separator (Unicode: `2029`, UTF-8: `E280A9`)
- Line separator (Unicode: `2028`, UTF-8: `E280A8`)
- Next line (Unicode: `0085`, UTF-8: `C285`)
- Carriage return (Unicode: `000D`, UTF-8: `0D`)

**Invalid regex patterns will result in undefined behavior**. This includes but is not limited to the following:
- Unescaped special characters (listed in the third row of the Characters table below) when they are intended to match as literals.
- Unmatched paired special characters like `()`, `[]`, and `{}`.
Expand Down
20 changes: 16 additions & 4 deletions cpp/include/cudf/strings/regex/flags.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,11 @@ namespace strings {
* and to match the Python flag values.
*/
enum regex_flags : uint32_t {
DEFAULT = 0, ///< default
MULTILINE = 8, ///< the '^' and '$' honor new-line characters
DOTALL = 16, ///< the '.' matching includes new-line characters
ASCII = 256 ///< use only ASCII when matching built-in character classes
DEFAULT = 0, ///< default
MULTILINE = 8, ///< the '^' and '$' honor new-line characters
DOTALL = 16, ///< the '.' matching includes new-line characters
ASCII = 256, ///< use only ASCII when matching built-in character classes
EXT_NEWLINE = 512 ///< new-line matches extended characters
};

/**
Expand Down Expand Up @@ -74,6 +75,17 @@ constexpr bool is_ascii(regex_flags const f)
return (f & regex_flags::ASCII) == regex_flags::ASCII;
}

/**
 * @brief Checks whether the EXT_NEWLINE bit is set in the given flags
 *
 * @param f Regex flags to check
 * @return true if `f` includes EXT_NEWLINE
 */
constexpr bool is_ext_newline(regex_flags const f)
{
  // Isolate the EXT_NEWLINE bit(s) and require all of them to be present in `f`.
  auto const mask = static_cast<uint32_t>(regex_flags::EXT_NEWLINE);
  return (static_cast<uint32_t>(f) & mask) == mask;
}

/**
* @brief Capture groups setting
*
Expand Down
11 changes: 8 additions & 3 deletions cpp/include/cudf/strings/string_view.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -191,9 +191,14 @@ __device__ inline string_view::const_iterator& string_view::const_iterator::oper

__device__ inline string_view::const_iterator& string_view::const_iterator::operator--()
{
if (byte_pos > 0)
while (strings::detail::bytes_in_utf8_byte(static_cast<uint8_t>(p[--byte_pos])) == 0)
;
if (byte_pos > 0) {
if (byte_pos == char_pos) {
--byte_pos;
} else {
Comment on lines +195 to +197
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This optimization speeds up this operation in ASCII strings -- ones with no multi-byte chars.

while (strings::detail::bytes_in_utf8_byte(static_cast<uint8_t>(p[--byte_pos])) == 0)
;
}
}
--char_pos;
return *this;
}
Expand Down
21 changes: 16 additions & 5 deletions cpp/src/strings/regex/regcomp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -539,15 +539,26 @@ class regex_parser {
: static_cast<int32_t>(LBRA);
case ')': return RBRA;
case '^': {
_chr = is_multiline(_flags) ? chr : '\n';
if (is_ext_newline(_flags)) {
_chr = is_multiline(_flags) ? 'S' : 'N';
} else {
_chr = is_multiline(_flags) ? chr : '\n';
}
return BOL;
}
case '$': {
_chr = is_multiline(_flags) ? chr : '\n';
if (is_ext_newline(_flags)) {
_chr = is_multiline(_flags) ? 'S' : 'N';
} else {
_chr = is_multiline(_flags) ? chr : '\n';
}
return EOL;
}
case '[': return build_cclass();
case '.': return dot_type;
case '.': {
_chr = is_ext_newline(_flags) ? 'N' : chr;
return dot_type;
}
}

if (std::find(quantifiers.begin(), quantifiers.end(), static_cast<char>(chr)) ==
Expand Down Expand Up @@ -959,7 +970,7 @@ class regex_compiler {
_prog.inst_at(inst_id).u1.cls_id = class_id;
} else if (token == CHAR) {
_prog.inst_at(inst_id).u1.c = yy;
} else if (token == BOL || token == EOL) {
} else if (token == BOL || token == EOL || token == ANY) {
_prog.inst_at(inst_id).u1.c = yy;
}
push_and(inst_id, inst_id);
Expand Down Expand Up @@ -1194,7 +1205,7 @@ void reprog::print(regex_flags const flags)
case STAR: printf(" STAR next=%d", inst.u2.next_id); break;
case PLUS: printf(" PLUS next=%d", inst.u2.next_id); break;
case QUEST: printf(" QUEST next=%d", inst.u2.next_id); break;
case ANY: printf(" ANY next=%d", inst.u2.next_id); break;
case ANY: printf(" ANY '%c', next=%d", inst.u1.c, inst.u2.next_id); break;
case ANYNL: printf(" ANYNL next=%d", inst.u2.next_id); break;
case NOP: printf(" NOP next=%d", inst.u2.next_id); break;
case BOL: {
Expand Down
46 changes: 34 additions & 12 deletions cpp/src/strings/regex/regex.inl
vuule marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,16 @@ __device__ __forceinline__ void reprog_device::reljunk::swaplist()
list2 = tmp;
}

/**
 * @brief Tests a character against the set of supported new-line characters
 *
 * The accepted values are `\n`, `\r`, and the multi-byte characters
 * NEXT LINE (U+0085), LINE SEPARATOR (U+2028) and PARAGRAPH SEPARATOR (U+2029).
 * The multi-byte entries are compared as their UTF-8 byte sequences packed
 * into an integer (0xC285, 0xE280A8, 0xE280A9), not as Unicode code points.
 *
 * @param ch Character value to check
 * @return true if `ch` is one of the values listed above
 */
constexpr bool is_newline(char32_t const ch)
{
  switch (ch) {
    case U'\n':       // line feed
    case U'\r':       // carriage return
    case 0x00c285:    // next line, UTF-8 bytes C2 85
    case 0x00e280a8:  // line separator, UTF-8 bytes E2 80 A8
    case 0x00e280a9:  // paragraph separator, UTF-8 bytes E2 80 A9
      return true;
    default: return false;
  }
}

/**
* @brief Utility to check a specific character against this class instance.
*
Expand Down Expand Up @@ -258,11 +268,14 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const
if (checkstart) {
auto startchar = static_cast<char_utf8>(jnk.startchar);
switch (jnk.starttype) {
case BOL:
if (pos == 0) break;
if (jnk.startchar != '^') { return cuda::std::nullopt; }
case BOL: {
if (pos == 0) { break; }
if (startchar != '^' && startchar != 'S') { return cuda::std::nullopt; }
if (startchar != '\n') { break; }
--itr;
startchar = static_cast<char_utf8>('\n');
[[fallthrough]];
}
case CHAR: {
auto const find_itr = find_char(startchar, dstr, itr);
if (find_itr.byte_offset() >= dstr.size_bytes()) { return cuda::std::nullopt; }
Expand Down Expand Up @@ -312,26 +325,34 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const
id_activate = inst.u2.next_id;
expanded = true;
break;
case BOL:
if ((pos == 0) || ((inst.u1.c == '^') && (dstr[pos - 1] == '\n'))) {
case BOL: {
auto titr = itr;
auto const prev_c = pos > 0 ? *(--titr) : 0;
if ((pos == 0) || ((inst.u1.c == '^') && (prev_c == '\n')) ||
((inst.u1.c == 'S') && (is_newline(prev_c)))) {
id_activate = inst.u2.next_id;
expanded = true;
}
break;
case EOL:
}
case EOL: {
// after the last character OR:
// - for MULTILINE, if current character is new-line
// - for non-MULTILINE, the very last character of the string can also be a new-line
bool const nl = (inst.u1.c == 'S' || inst.u1.c == 'N') ? is_newline(c) : (c == '\n');
if (last_character ||
((c == '\n') && (inst.u1.c != 'Z') &&
((inst.u1.c == '$') || (itr.byte_offset() + 1 == dstr.size_bytes())))) {
(nl && (inst.u1.c != 'Z') &&
((inst.u1.c == '$' || inst.u1.c == 'S') ||
(itr.byte_offset() + bytes_in_char_utf8(c) == dstr.size_bytes())))) {
id_activate = inst.u2.next_id;
expanded = true;
}
break;
}
case BOW:
case NBOW: {
auto const prev_c = pos > 0 ? dstr[pos - 1] : 0;
auto titr = itr;
auto const prev_c = pos > 0 ? *(--titr) : 0;
auto const word_class = reclass_device{CCLASS_W};
bool const curr_is_word = word_class.is_match(c, _codepoint_flags);
bool const prev_is_word = word_class.is_match(prev_c, _codepoint_flags);
Expand Down Expand Up @@ -366,9 +387,10 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const
case CHAR:
if (inst.u1.c == c) id_activate = inst.u2.next_id;
break;
case ANY:
if (c != '\n') id_activate = inst.u2.next_id;
break;
case ANY: {
if ((c == '\n') || ((inst.u1.c == 'N') && is_newline(c))) { break; }
[[fallthrough]];
}
case ANYNL: id_activate = inst.u2.next_id; break;
case NCCLASS:
case CCLASS: {
Expand Down
59 changes: 59 additions & 0 deletions cpp/tests/strings/contains_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
* limitations under the License.
*/

#include "special_chars.h"

#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_utilities.hpp>
#include <cudf_test/column_wrapper.hpp>
Expand Down Expand Up @@ -613,6 +615,63 @@ TEST_F(StringsContainsTests, MultiLine)
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count);
}

// Exercises contains_re, matches_re and count_re with the EXT_NEWLINE flag, both on its own
// and combined with MULTILINE or DOTALL.
TEST_F(StringsContainsTests, SpecialNewLines)
{
// LINE_SEPARATOR, NEXT_LINE and PARAGRAPH_SEPARATOR come from special_chars.h (not visible
// here) -- presumably string literals holding the UTF-8 bytes of U+2028, U+0085 and U+2029.
// Rows: 0 = three lines, 1 = three lines, 2 = single line, 3 = empty string,
//       4 = single line with a trailing separator, 5 = two lines with a trailing separator.
auto input = cudf::test::strings_column_wrapper({"zzé" LINE_SEPARATOR "qqq" NEXT_LINE "zzé",
"qqq\rzzé" LINE_SEPARATOR "lll",
"zzé",
"",
"zzé" PARAGRAPH_SEPARATOR,
"abc\nzzé" NEXT_LINE});
auto view = cudf::strings_column_view(input);

// The same pattern is compiled twice: EXT_NEWLINE only, and EXT_NEWLINE | MULTILINE.
auto pattern = std::string("^zzé$");
auto prog =
cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::EXT_NEWLINE);
auto ml_flags = static_cast<cudf::strings::regex_flags>(cudf::strings::regex_flags::EXT_NEWLINE |
cudf::strings::regex_flags::MULTILINE);
auto prog_ml = cudf::strings::regex_program::create(pattern, ml_flags);

// Without MULTILINE only rows 2 and 4 are expected to match: the whole string must be "zzé",
// optionally followed by one trailing new-line character (row 4).
auto expected = cudf::test::fixed_width_column_wrapper<bool>({0, 0, 1, 0, 1, 0});
auto results = cudf::strings::contains_re(view, *prog);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
// With MULTILINE every row that has a line consisting of "zzé" is expected to match.
expected = cudf::test::fixed_width_column_wrapper<bool>({1, 1, 1, 0, 1, 1});
results = cudf::strings::contains_re(view, *prog_ml);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);

// matches_re: same expectations as contains_re when MULTILINE is not set.
expected = cudf::test::fixed_width_column_wrapper<bool>({0, 0, 1, 0, 1, 0});
results = cudf::strings::matches_re(view, *prog);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
// With MULTILINE, rows 1 and 5 are expected NOT to match: their "zzé" line is not the first
// line (presumably because matches_re only accepts a match at the start of the string).
expected = cudf::test::fixed_width_column_wrapper<bool>({1, 0, 1, 0, 1, 0});
results = cudf::strings::matches_re(view, *prog_ml);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);

// count_re: number of matches per row; row 0 has two "zzé" lines under MULTILINE.
auto counts = cudf::test::fixed_width_column_wrapper<int32_t>({0, 0, 1, 0, 1, 0});
results = cudf::strings::count_re(view, *prog);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, counts);
counts = cudf::test::fixed_width_column_wrapper<int32_t>({2, 1, 1, 0, 1, 1});
results = cudf::strings::count_re(view, *prog_ml);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, counts);

// '.' handling: with default flags the '\r' and LINE_SEPARATOR in row 1 are ordinary
// characters, so "q.*l" is expected to match across them.
pattern = std::string("q.*l");
prog = cudf::strings::regex_program::create(pattern);
expected = cudf::test::fixed_width_column_wrapper<bool>({0, 1, 0, 0, 0, 0});
results = cudf::strings::contains_re(view, *prog);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
// inst ANY will stop matching on first 'newline' and so should not match anything here
prog = cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::EXT_NEWLINE);
expected = cudf::test::fixed_width_column_wrapper<bool>({0, 0, 0, 0, 0, 0});
results = cudf::strings::contains_re(view, *prog);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
// including the DOTALL flag accepts the newline characters
auto dot_flags = static_cast<cudf::strings::regex_flags>(cudf::strings::regex_flags::EXT_NEWLINE |
cudf::strings::regex_flags::DOTALL);
prog = cudf::strings::regex_program::create(pattern, dot_flags);
expected = cudf::test::fixed_width_column_wrapper<bool>({0, 1, 0, 0, 0, 0});
results = cudf::strings::contains_re(view, *prog);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}

TEST_F(StringsContainsTests, EndOfString)
{
auto input = cudf::test::strings_column_wrapper(
Expand Down
40 changes: 40 additions & 0 deletions cpp/tests/strings/extract_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,12 @@
* limitations under the License.
*/

#include "special_chars.h"

#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_utilities.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <cudf_test/debug_utilities.hpp>
#include <cudf_test/table_utilities.hpp>

#include <cudf/detail/iterator.cuh>
Expand Down Expand Up @@ -200,6 +203,43 @@ TEST_F(StringsExtractTests, DotAll)
CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected);
}

// Exercises extract with the EXT_NEWLINE flag, on its own and combined with MULTILINE, and
// checks how '.' treats the extended new-line characters.
TEST_F(StringsExtractTests, SpecialNewLines)
{
// NEXT_LINE, LINE_SEPARATOR and PARAGRAPH_SEPARATOR come from special_chars.h (not visible
// here) -- presumably string literals holding the UTF-8 bytes of U+0085, U+2028 and U+2029.
auto input = cudf::test::strings_column_wrapper({"zzé" NEXT_LINE "qqq" LINE_SEPARATOR "zzé",
"qqq" LINE_SEPARATOR "zzé\rlll",
"zzé",
"",
"zzé" NEXT_LINE,
"abc" PARAGRAPH_SEPARATOR "zzé\n"});
auto view = cudf::strings_column_view(input);

// EXT_NEWLINE only: rows 2 and 4 are the only ones expected to produce a (valid) result --
// the whole string is "zzé", optionally followed by one trailing new-line character.
auto prog =
cudf::strings::regex_program::create("(^zzé$)", cudf::strings::regex_flags::EXT_NEWLINE);
auto results = cudf::strings::extract(view, *prog);
auto expected =
cudf::test::strings_column_wrapper({"", "", "zzé", "", "zzé", ""}, {0, 0, 1, 0, 1, 0});
CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected);

// EXT_NEWLINE | MULTILINE: every row except the empty one is expected to yield "zzé",
// since each contains a line made up of exactly "zzé".
auto both_flags = static_cast<cudf::strings::regex_flags>(
cudf::strings::regex_flags::EXT_NEWLINE | cudf::strings::regex_flags::MULTILINE);
auto prog_ml = cudf::strings::regex_program::create("^(zzé)$", both_flags);
results = cudf::strings::extract(view, *prog_ml);
expected =
cudf::test::strings_column_wrapper({"zzé", "zzé", "zzé", "", "zzé", "zzé"}, {1, 1, 1, 0, 1, 1});
CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected);

// Default flags: LINE_SEPARATOR and '\r' are ordinary characters for '.', so the group in
// row 1 is expected to capture across them.
prog = cudf::strings::regex_program::create("q(q.*l)l");
expected = cudf::test::strings_column_wrapper({"", "qq" LINE_SEPARATOR "zzé\rll", "", "", "", ""},
{0, 1, 0, 0, 0, 0});
results = cudf::strings::extract(view, *prog);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected);
// expect no matches here since the newline(s) interrupts the pattern
prog = cudf::strings::regex_program::create("q(q.*l)l", cudf::strings::regex_flags::EXT_NEWLINE);
expected = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, {0, 0, 0, 0, 0, 0});
results = cudf::strings::extract(view, *prog);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected);
}

TEST_F(StringsExtractTests, EmptyExtractTest)
{
std::vector<char const*> h_strings{nullptr, "AAA", "AAA_A", "AAA_AAA_", "A__", ""};
Expand Down
28 changes: 28 additions & 0 deletions cpp/tests/strings/findall_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
* limitations under the License.
*/

#include "special_chars.h"

#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_utilities.hpp>
#include <cudf_test/column_wrapper.hpp>
Expand Down Expand Up @@ -80,6 +82,32 @@ TEST_F(StringsFindallTests, DotAll)
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected);
}

// Exercises findall with the EXT_NEWLINE flag, on its own and combined with MULTILINE.
TEST_F(StringsFindallTests, SpecialNewLines)
{
// PARAGRAPH_SEPARATOR, LINE_SEPARATOR and NEXT_LINE come from special_chars.h (not visible
// here) -- presumably string literals holding the UTF-8 bytes of U+2029, U+2028 and U+0085.
auto input = cudf::test::strings_column_wrapper({"zzé" PARAGRAPH_SEPARATOR "qqq\nzzé",
"qqq\nzzé" PARAGRAPH_SEPARATOR "lll",
"zzé",
"",
"zzé\r",
"zzé" LINE_SEPARATOR "zzé" NEXT_LINE});
auto view = cudf::strings_column_view(input);

// EXT_NEWLINE only: a match is expected just for rows 2 and 4, where the whole string is
// "zzé" optionally followed by one trailing new-line character ('\r' in row 4).
auto prog =
cudf::strings::regex_program::create("(^zzé$)", cudf::strings::regex_flags::EXT_NEWLINE);
auto results = cudf::strings::findall(view, *prog);
using LCW = cudf::test::lists_column_wrapper<cudf::string_view>;
LCW expected({LCW{}, LCW{}, LCW{"zzé"}, LCW{}, LCW{"zzé"}, LCW{}});
CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected);

// EXT_NEWLINE | MULTILINE: one entry is expected per line that consists of exactly "zzé",
// so rows 0 and 5 yield two entries each.
auto both_flags = static_cast<cudf::strings::regex_flags>(
cudf::strings::regex_flags::EXT_NEWLINE | cudf::strings::regex_flags::MULTILINE);
auto prog_ml = cudf::strings::regex_program::create("^(zzé)$", both_flags);
results = cudf::strings::findall(view, *prog_ml);
LCW expected_ml(
{LCW{"zzé", "zzé"}, LCW{"zzé"}, LCW{"zzé"}, LCW{}, LCW{"zzé"}, LCW{"zzé", "zzé"}});
CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected_ml);
}

TEST_F(StringsFindallTests, MediumRegex)
{
// This results in 15 regex instructions and falls in the 'medium' range.
Expand Down
Loading
Loading