Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add JNI Support for Multi-line Delimiters and Include Test #17139

Merged
merged 10 commits into from
Oct 23, 2024
Merged
11 changes: 10 additions & 1 deletion java/src/main/java/ai/rapids/cudf/RegexFlag.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,16 @@ public enum RegexFlag {
DEFAULT(0), // default
MULTILINE(8), // the '^' and '$' honor new-line characters
DOTALL(16), // the '.' matching includes new-line characters
ASCII(256); // use only ASCII when matching built-in character classes
ASCII(256), // use only ASCII when matching built-in character classes
/**
* EXT_NEWLINE(512): Extends line delimiters to include the following Unicode characters
* - NEXT_LINE ('\u0085')
* - LINE_SEPARATOR ('\u2028')
* - PARAGRAPH_SEPARATOR ('\u2029')
* - CARRIAGE_RETURN ('\r')
* - NEW_LINE ('\n')
*/
EXT_NEWLINE(512);

final int nativeId; // Native id, for use with libcudf.
private RegexFlag(int nativeId) { // Only constant values should be used
Expand Down
38 changes: 38 additions & 0 deletions java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.EnumSet;
import java.util.List;
import java.util.Optional;
import java.util.concurrent.atomic.AtomicInteger;
Expand Down Expand Up @@ -3877,6 +3878,43 @@ void testExtractRe() {
}
}

// Checks extractRe with an end-anchored pattern against strings that end in (or contain)
// the line delimiters NEXT LINE, LINE SEPARATOR, PARAGRAPH SEPARATOR, carriage return and
// new-line. The same input is extracted once with RegexFlag.EXT_NEWLINE and once with
// RegexFlag.DEFAULT, and each result is compared with its own expected column.
@Test
void testExtractReWithMultiLineDelimiters() {
  final String nextLine = "\u0085";
  final String lineSeparator = "\u2028";
  final String paragraphSeparator = "\u2029";
  final String carriageReturn = "\r";
  final String newLine = "\n";

  // Captures "boo" followed by one or more colons, anchored with '$'.
  final String pattern = "(boo:+)$";

  try (ColumnVector input = ColumnVector.fromStrings(
           "boo:" + nextLine + "boo::" + lineSeparator + "boo:::",
           "boo:::" + lineSeparator + "zzé" + carriageReturn + "lll",
           "boo::",
           "",
           "boo::" + newLine,
           "boo::" + carriageReturn,
           "boo:" + nextLine + "boo::" + paragraphSeparator,
           "boo:" + newLine + "boo::" + lineSeparator,
           "boo:" + nextLine + "boo::" + nextLine);
       // Expected with EXT_NEWLINE: rows ending in any of the listed delimiters yield "boo::".
       Table expectedExtNewline = new Table.TestBuilder()
           .column("boo:::", null, "boo::", null, "boo::", "boo::", "boo::", "boo::", "boo::")
           .build();
       // Expected with DEFAULT: of the rows ending in a delimiter, only the one ending
       // in new-line yields a match; the others are null.
       Table expectedDefault = new Table.TestBuilder()
           .column("boo:::", null, "boo::", null, "boo::", null, null, null, null)
           .build()) {

    RegexProgram extNewlineProgram =
        new RegexProgram(pattern, EnumSet.of(RegexFlag.EXT_NEWLINE));
    try (Table extracted = input.extractRe(extNewlineProgram)) {
      assertColumnsAreEqual(expectedExtNewline.getColumns()[0], extracted.getColumns()[0]);
    }

    RegexProgram defaultProgram =
        new RegexProgram(pattern, EnumSet.of(RegexFlag.DEFAULT));
    try (Table extracted = input.extractRe(defaultProgram)) {
      assertColumnsAreEqual(expectedDefault.getColumns()[0], extracted.getColumns()[0]);
    }
  }
}


@Test
void testExtractAllRecord() {
String pattern = "([ab])(\\d)";
Expand Down
Loading