From 3944733101fdad9923b4598cdcd8a2b838f5e67f Mon Sep 17 00:00:00 2001 From: Suraj Aralihalli Date: Mon, 21 Oct 2024 11:42:46 -0700 Subject: [PATCH 1/9] update regex program and include simple test Signed-off-by: Suraj Aralihalli --- java/src/main/java/ai/rapids/cudf/RegexFlag.java | 3 ++- .../test/java/ai/rapids/cudf/ColumnVectorTest.java | 13 +++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/java/src/main/java/ai/rapids/cudf/RegexFlag.java b/java/src/main/java/ai/rapids/cudf/RegexFlag.java index 7ed8e0354c9..b27623c846d 100644 --- a/java/src/main/java/ai/rapids/cudf/RegexFlag.java +++ b/java/src/main/java/ai/rapids/cudf/RegexFlag.java @@ -28,7 +28,8 @@ public enum RegexFlag { DEFAULT(0), // default MULTILINE(8), // the '^' and '$' honor new-line characters DOTALL(16), // the '.' matching includes new-line characters - ASCII(256); // use only ASCII when matching built-in character classes + ASCII(256), // use only ASCII when matching built-in character classes + EXT_NEWLINE(512); // new-line matches extended characters final int nativeId; // Native id, for use with libcudf. 
private RegexFlag(int nativeId) { // Only constant values should be used diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 708744569df..4ec643bbd54 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -3877,6 +3877,19 @@ void testExtractRe() { } } + @Test + void testExtractReWithNewline() { + try (ColumnVector input = ColumnVector.fromStrings("boo:", "boo::", "boo::::", null); + Table expected = new Table.TestBuilder() + .column("boo:", "boo::", "boo::::", null) // Full match as per the pattern + .build()) { + // Keep the original regex pattern + try (Table found = input.extractRe(new RegexProgram("(boo:+)$", RegexFlag.EXT_NEWLINE))) { + assertColumnsAreEqual(expected, found->view().column(0)); + } + } + } + @Test void testExtractAllRecord() { String pattern = "([ab])(\\d)"; From 9dd744e1ff09c2407dfdb6f00f87d033ba1707dd Mon Sep 17 00:00:00 2001 From: Suraj Aralihalli Date: Mon, 21 Oct 2024 13:22:58 -0700 Subject: [PATCH 2/9] make column Signed-off-by: Suraj Aralihalli --- java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 4ec643bbd54..e5ef87af7cb 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -3885,7 +3885,7 @@ void testExtractReWithNewline() { .build()) { // Keep the original regex pattern try (Table found = input.extractRe(new RegexProgram("(boo:+)$", RegexFlag.EXT_NEWLINE))) { - assertColumnsAreEqual(expected, found->view().column(0)); + assertColumnsAreEqual(expected->view().column(0), found->view().column(0)); } } } From 9e80ef2716da76ce9b7de74cfb132a550a5321d2 Mon Sep 17 00:00:00 2001 From: Suraj Aralihalli Date: Mon, 21 Oct 
2024 13:30:16 -0700 Subject: [PATCH 3/9] fix bug Signed-off-by: Suraj Aralihalli --- java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index e5ef87af7cb..4a352358530 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -31,6 +31,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; +import java.util.EnumSet; import java.util.List; import java.util.Optional; import java.util.concurrent.atomic.AtomicInteger; @@ -3884,8 +3885,8 @@ void testExtractReWithNewline() { .column("boo:", "boo::", "boo::::", null) // Full match as per the pattern .build()) { // Keep the original regex pattern - try (Table found = input.extractRe(new RegexProgram("(boo:+)$", RegexFlag.EXT_NEWLINE))) { - assertColumnsAreEqual(expected->view().column(0), found->view().column(0)); + try (Table found = input.extractRe(new RegexProgram("(boo:+)$", EnumSet.of(RegexFlag.EXT_NEWLINE)))) { + assertColumnsAreEqual(expected.getColumns()[0],found.getColumns()[0]); } } } From 9190b239280841a24a6a0a78c57f51af66622fc8 Mon Sep 17 00:00:00 2001 From: Suraj Aralihalli Date: Mon, 21 Oct 2024 15:51:34 -0700 Subject: [PATCH 4/9] update test Signed-off-by: Suraj Aralihalli --- .../java/ai/rapids/cudf/ColumnVectorTest.java | 29 ++++++++++++++----- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 4a352358530..2a306baf6a7 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -3879,18 +3879,33 @@ void testExtractRe() { } @Test - void testExtractReWithNewline() { - try (ColumnVector input = 
ColumnVector.fromStrings("boo:", "boo::", "boo::::", null); +void testExtractReWithNewline() { + String NEXT_LINE = "\u0085"; + String LINE_SEPARATOR = "\u2028"; + String PARAGRAPH_SEPARATOR = "\u2029"; + String CARRIAGE_RETURN = "\r"; + + try (ColumnVector input = ColumnVector.fromStrings( + "boo:" + NEXT_LINE + "boo::" + LINE_SEPARATOR + "boo:::", + "boo:::" + LINE_SEPARATOR + "zzé\rlll", + "boo::", + "", + "boo::\n", + "boo:" + NEXT_LINE + "boo::" + PARAGRAPH_SEPARATOR, + "boo:\nboo::" + LINE_SEPARATOR, + "boo:" + NEXT_LINE + "boo::" + NEXT_LINE); Table expected = new Table.TestBuilder() - .column("boo:", "boo::", "boo::::", null) // Full match as per the pattern + .column("boo:::", null, "boo::", null, "boo::", "boo::", "boo::", "boo::") // Expected full matches .build()) { - // Keep the original regex pattern - try (Table found = input.extractRe(new RegexProgram("(boo:+)$", EnumSet.of(RegexFlag.EXT_NEWLINE)))) { - assertColumnsAreEqual(expected.getColumns()[0],found.getColumns()[0]); - } + + // Regex pattern to match 'boo:' followed by one or more colons at the end of the string + try (Table found = input.extractRe(new RegexProgram("(boo:+)$", EnumSet.of(RegexFlag.EXT_NEWLINE)))) { + assertColumnsAreEqual(expected.getColumns()[0], found.getColumns()[0]); + } } } + @Test void testExtractAllRecord() { String pattern = "([ab])(\\d)"; From 28e11fdedddbd1b8d3aff5730ed07604d6eeb4a8 Mon Sep 17 00:00:00 2001 From: Suraj Aralihalli Date: Mon, 21 Oct 2024 15:54:20 -0700 Subject: [PATCH 5/9] update test fail Signed-off-by: Suraj Aralihalli --- java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 2a306baf6a7..e16f96d1963 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -3895,7 +3895,7 @@ void 
testExtractReWithNewline() { "boo:\nboo::" + LINE_SEPARATOR, "boo:" + NEXT_LINE + "boo::" + NEXT_LINE); Table expected = new Table.TestBuilder() - .column("boo:::", null, "boo::", null, "boo::", "boo::", "boo::", "boo::") // Expected full matches + .column("boo:::", null, "boo::", null, "boo::", "boo::", "boo::", "boo:::") // Expected full matches .build()) { // Regex pattern to match 'boo:' followed by one or more colons at the end of the string From 9e71a739c8f7c4fa8a4ccfef0aa03ecedfea55fa Mon Sep 17 00:00:00 2001 From: Suraj Aralihalli Date: Mon, 21 Oct 2024 16:06:29 -0700 Subject: [PATCH 6/9] add test for default Signed-off-by: Suraj Aralihalli --- .../java/ai/rapids/cudf/ColumnVectorTest.java | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index e16f96d1963..2aaba6cc6b9 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -3884,23 +3884,32 @@ void testExtractReWithNewline() { String LINE_SEPARATOR = "\u2028"; String PARAGRAPH_SEPARATOR = "\u2029"; String CARRIAGE_RETURN = "\r"; + String NEW_LINE = "\r"; try (ColumnVector input = ColumnVector.fromStrings( "boo:" + NEXT_LINE + "boo::" + LINE_SEPARATOR + "boo:::", "boo:::" + LINE_SEPARATOR + "zzé\rlll", "boo::", "", - "boo::\n", + "boo::" + NEW_LINE, + "boo::" + CARRIAGE_RETURN, "boo:" + NEXT_LINE + "boo::" + PARAGRAPH_SEPARATOR, - "boo:\nboo::" + LINE_SEPARATOR, + "boo:" + NEW_LINE + "boo::" + LINE_SEPARATOR, "boo:" + NEXT_LINE + "boo::" + NEXT_LINE); - Table expected = new Table.TestBuilder() - .column("boo:::", null, "boo::", null, "boo::", "boo::", "boo::", "boo:::") // Expected full matches + Table expected_ext_newline = new Table.TestBuilder() + .column("boo:::", null, "boo::", null, "boo::", "boo::", "boo::", "boo::", "boo::") + .build(); + Table expected_default = new 
Table.TestBuilder() + .column("boo:::", null, "boo::", null, "boo::", null, null, null, null) .build()) { // Regex pattern to match 'boo:' followed by one or more colons at the end of the string try (Table found = input.extractRe(new RegexProgram("(boo:+)$", EnumSet.of(RegexFlag.EXT_NEWLINE)))) { - assertColumnsAreEqual(expected.getColumns()[0], found.getColumns()[0]); + assertColumnsAreEqual(expected_ext_newline.getColumns()[0], found.getColumns()[0]); + } + + try (Table found = input.extractRe(new RegexProgram("(boo:+)$", EnumSet.of(RegexFlag.DEFAULT)))) { + assertColumnsAreEqual(expected_default.getColumns()[0], found.getColumns()[0]); } } } From 663ad930a7f05b2a33ffe5a0a9421a7669a63664 Mon Sep 17 00:00:00 2001 From: Suraj Aralihalli Date: Mon, 21 Oct 2024 16:11:36 -0700 Subject: [PATCH 7/9] fix test Signed-off-by: Suraj Aralihalli --- java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 2aaba6cc6b9..fa42cadcd7f 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -3884,11 +3884,11 @@ void testExtractReWithNewline() { String LINE_SEPARATOR = "\u2028"; String PARAGRAPH_SEPARATOR = "\u2029"; String CARRIAGE_RETURN = "\r"; - String NEW_LINE = "\r"; + String NEW_LINE = "\n"; try (ColumnVector input = ColumnVector.fromStrings( "boo:" + NEXT_LINE + "boo::" + LINE_SEPARATOR + "boo:::", - "boo:::" + LINE_SEPARATOR + "zzé\rlll", + "boo:::" + LINE_SEPARATOR + "zzé" + CARRIAGE_RETURN + "lll", "boo::", "", "boo::" + NEW_LINE, From e63f036c456c33eda66e8abc6763c92fdb622f5a Mon Sep 17 00:00:00 2001 From: Suraj Aralihalli Date: Mon, 21 Oct 2024 16:30:49 -0700 Subject: [PATCH 8/9] rename test Signed-off-by: Suraj Aralihalli --- java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index fa42cadcd7f..14c290b300a 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -3879,7 +3879,7 @@ void testExtractRe() { } @Test -void testExtractReWithNewline() { +void testExtractReWithMultiLineDelimiters() { String NEXT_LINE = "\u0085"; String LINE_SEPARATOR = "\u2028"; String PARAGRAPH_SEPARATOR = "\u2029"; From f48523336bc797e6b90db885775c5820c10e5b5f Mon Sep 17 00:00:00 2001 From: Suraj Aralihalli Date: Tue, 22 Oct 2024 13:55:01 -0700 Subject: [PATCH 9/9] update docs Signed-off-by: Suraj Aralihalli --- java/src/main/java/ai/rapids/cudf/RegexFlag.java | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/java/src/main/java/ai/rapids/cudf/RegexFlag.java b/java/src/main/java/ai/rapids/cudf/RegexFlag.java index b27623c846d..68a3856f37d 100644 --- a/java/src/main/java/ai/rapids/cudf/RegexFlag.java +++ b/java/src/main/java/ai/rapids/cudf/RegexFlag.java @@ -29,7 +29,15 @@ public enum RegexFlag { MULTILINE(8), // the '^' and '$' honor new-line characters DOTALL(16), // the '.' matching includes new-line characters ASCII(256), // use only ASCII when matching built-in character classes - EXT_NEWLINE(512); // new-line matches extended characters + /** + * EXT_NEWLINE(512): Extends line delimiters to include the following Unicode characters: + * - NEXT_LINE (U+0085) + * - LINE_SEPARATOR (U+2028) + * - PARAGRAPH_SEPARATOR (U+2029) + * - CARRIAGE_RETURN ('\r') + * - NEW_LINE ('\n') + */ + EXT_NEWLINE(512); final int nativeId; // Native id, for use with libcudf. private RegexFlag(int nativeId) { // Only constant values should be used