diff --git a/java/src/main/java/ai/rapids/cudf/RegexFlag.java b/java/src/main/java/ai/rapids/cudf/RegexFlag.java index 7ed8e0354c9..68a3856f37d 100644 --- a/java/src/main/java/ai/rapids/cudf/RegexFlag.java +++ b/java/src/main/java/ai/rapids/cudf/RegexFlag.java @@ -28,7 +28,16 @@ public enum RegexFlag { DEFAULT(0), // default MULTILINE(8), // the '^' and '$' honor new-line characters DOTALL(16), // the '.' matching includes new-line characters - ASCII(256); // use only ASCII when matching built-in character classes + ASCII(256), // use only ASCII when matching built-in character classes + /** + * EXT_NEWLINE(512): Extends line delimiters to include the following Unicode characters + * - NEXT_LINE ('\u0085') + * - LINE_SEPARATOR ('\u2028') + * - PARAGRAPH_SEPARATOR ('\u2029') + * - CARRIAGE_RETURN ('\r') + * - NEW_LINE ('\n') + */ + EXT_NEWLINE(512); final int nativeId; // Native id, for use with libcudf. private RegexFlag(int nativeId) { // Only constant values should be used diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 708744569df..14c290b300a 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -31,6 +31,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; +import java.util.EnumSet; import java.util.List; import java.util.Optional; import java.util.concurrent.atomic.AtomicInteger; @@ -3877,6 +3878,43 @@ void testExtractRe() { } } + @Test +void testExtractReWithMultiLineDelimiters() { + String NEXT_LINE = "\u0085"; + String LINE_SEPARATOR = "\u2028"; + String PARAGRAPH_SEPARATOR = "\u2029"; + String CARRIAGE_RETURN = "\r"; + String NEW_LINE = "\n"; + + try (ColumnVector input = ColumnVector.fromStrings( + "boo:" + NEXT_LINE + "boo::" + LINE_SEPARATOR + "boo:::", + "boo:::" + LINE_SEPARATOR + "zzé" + CARRIAGE_RETURN + "lll", + "boo::", + "", + "boo::" + NEW_LINE, + "boo::" + CARRIAGE_RETURN, + "boo:" + NEXT_LINE + "boo::" + PARAGRAPH_SEPARATOR, + "boo:" + NEW_LINE + "boo::" + LINE_SEPARATOR, + "boo:" + NEXT_LINE + "boo::" + NEXT_LINE); + Table expected_ext_newline = new Table.TestBuilder() + .column("boo:::", null, "boo::", null, "boo::", "boo::", "boo::", "boo::", "boo::") + .build(); + Table expected_default = new Table.TestBuilder() + .column("boo:::", null, "boo::", null, "boo::", null, null, null, null) + .build()) { + + // Regex pattern to match 'boo:' followed by one or more colons at the end of the string + try (Table found = input.extractRe(new RegexProgram("(boo:+)$", EnumSet.of(RegexFlag.EXT_NEWLINE)))) { + assertColumnsAreEqual(expected_ext_newline.getColumns()[0], found.getColumns()[0]); + } + + try (Table found = input.extractRe(new RegexProgram("(boo:+)$", EnumSet.of(RegexFlag.DEFAULT)))) { + assertColumnsAreEqual(expected_default.getColumns()[0], found.getColumns()[0]); + } + } + } + + @Test void testExtractAllRecord() { String pattern = "([ab])(\\d)";