-
Notifications
You must be signed in to change notification settings - Fork 234
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fall back to CPU for unsupported regular expression edge cases with end of line/string anchors and newlines #5610
Changes from 21 commits
79883d4
1e6f126
43011cd
46471ef
224bfb2
74dd695
d98ff09
969a045
3a10384
f9ec1fb
d856a15
4d4d250
a87fb5d
78be837
df337b0
0e128f3
2e9f4e0
ef9fc5d
c6c010d
4aa0be8
04b2619
53330ef
a641fad
a364dba
8ea6c4b
d9a414f
12607b7
2fd9251
7a81dba
7cc007a
33cf006
d83ac04
ef22adc
9765ab8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,6 +19,8 @@ import java.sql.SQLException | |
|
||
import scala.collection.mutable.ListBuffer | ||
|
||
import com.nvidia.spark.rapids.RegexParser.toReadableString | ||
|
||
/** | ||
* Regular expression parser based on a Pratt Parser design. | ||
* | ||
|
@@ -509,6 +511,21 @@ object RegexParser { | |
true | ||
} | ||
} | ||
|
||
def toReadableString(x: String): String = { | ||
x.map { | ||
case '\r' => "\\r" | ||
case '\n' => "\\n" | ||
case '\t' => "\\t" | ||
case '\f' => "\\f" | ||
case '\u000b' => "\\u000b" | ||
case '\u0085' => "\\u0085" | ||
case '\u2028' => "\\u2028" | ||
case '\u2029' => "\\u2029" | ||
case other => other | ||
}.mkString | ||
} | ||
|
||
} | ||
|
||
sealed trait RegexMode | ||
|
@@ -559,7 +576,7 @@ class CudfRegexTranspiler(mode: RegexMode) { | |
val replacement = repl.map(s => new RegexParser(s).parseReplacement(countCaptureGroups(regex))) | ||
|
||
// validate that the regex is supported by cuDF | ||
val cudfRegex = rewrite(regex, replacement, None) | ||
val cudfRegex = transpile(regex, replacement, None) | ||
// write out to regex string, performing minor transformations | ||
// such as adding additional escaping | ||
(cudfRegex.toRegexString, replacement.map(_.toRegexString)) | ||
|
@@ -696,6 +713,90 @@ class CudfRegexTranspiler(mode: RegexMode) { | |
} | ||
} | ||
|
||
private def transpile(regex: RegexAST, replacement: Option[RegexReplacement], | ||
previous: Option[RegexAST]): RegexAST = { | ||
|
||
def containsBeginAnchor(regex: RegexAST): Boolean = { | ||
contains(regex, { | ||
case RegexChar('^') | RegexEscaped('A') => true | ||
case _ => false | ||
}) | ||
} | ||
|
||
def containsEndAnchor(regex: RegexAST): Boolean = { | ||
contains(regex, { | ||
case RegexChar('$') | RegexEscaped('z') | RegexEscaped('Z') => true | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. One thing we need to do (probably not in this PR) is to not check inside character classes to fix false positives such as There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Another case would be something like There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Opened #5659 to track this |
||
case _ => false | ||
}) | ||
} | ||
|
||
def containsNewline(regex: RegexAST): Boolean = { | ||
contains(regex, { | ||
case RegexChar('\r') | RegexEscaped('r') => true | ||
case RegexChar('\n') | RegexEscaped('n') => true | ||
case RegexChar('\f') | RegexEscaped('f') => true | ||
andygrove marked this conversation as resolved.
Show resolved
Hide resolved
|
||
case RegexChar('\u0085') | RegexChar('\u2028') | RegexChar('\u2029') => true | ||
case RegexEscaped('s') | RegexEscaped('v') | RegexEscaped('R') => true | ||
case RegexEscaped('W') | RegexEscaped('D') => | ||
// these would get transpiled to negated character classes | ||
// that include newlines | ||
true | ||
case RegexCharacterClass(true, _) => true | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: there is an edge case here that can prevent this from being a newline, and that being a negated character class that includes all new line characters. I see no need to handle this at the moment, but should be noted in case this potentially comes up in some testing scenarios. |
||
case _ => false | ||
}) | ||
} | ||
|
||
def containsEmpty(regex: RegexAST): Boolean = { | ||
contains(regex, { | ||
case RegexRepetition(_, term) => term match { | ||
case SimpleQuantifier('*') | SimpleQuantifier('?') => true | ||
case QuantifierFixedLength(0) => true | ||
case QuantifierVariableLength(0, _) => true | ||
case _ => false | ||
} | ||
case _ => false | ||
}) | ||
} | ||
|
||
def checkPair(r1: RegexAST, r2: RegexAST): Unit = { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: would be nice for this to have a more descriptive name, like "checkEndAnchorContext", so it's more obvious to the reader what this is checking. |
||
if ((containsEndAnchor(r1) && | ||
(containsNewline(r2) || containsEmpty(r2) || containsBeginAnchor(r2))) || | ||
(containsEndAnchor(r2) && | ||
(containsNewline(r1) || containsEmpty(r1) || containsBeginAnchor(r1)))) { | ||
throw new RegexUnsupportedException( | ||
s"End of line/string anchor is not supported in this context: " + | ||
s"${toReadableString(r1.toRegexString)}" + | ||
s"${toReadableString(r2.toRegexString)}") | ||
} | ||
} | ||
|
||
def checkUnsupported(regex: RegexAST): Unit = { | ||
regex match { | ||
andygrove marked this conversation as resolved.
Show resolved
Hide resolved
|
||
case RegexSequence(parts) => | ||
// check each pair of regex ast nodes for unsupported combinations | ||
// of end string/line anchors and newlines or optional items | ||
for (i <- 1 until parts.length) { | ||
checkPair(parts(i - 1), parts(i)) | ||
} | ||
case RegexChoice(l, r) => | ||
checkUnsupported(l) | ||
checkUnsupported(r) | ||
case RegexGroup(_, term) => checkUnsupported(term) | ||
case RegexRepetition(ast, _) => checkUnsupported(ast) | ||
case RegexCharacterClass(_, components) => | ||
for (i <- 1 until components.length) { | ||
checkPair(components(i - 1), components(i)) | ||
} | ||
case _ => | ||
// ignore | ||
} | ||
} | ||
|
||
checkUnsupported(regex) | ||
|
||
rewrite(regex, replacement, previous) | ||
} | ||
|
||
private def rewrite(regex: RegexAST, replacement: Option[RegexReplacement], | ||
previous: Option[RegexAST]): RegexAST = { | ||
regex match { | ||
|
@@ -1162,6 +1263,21 @@ class CudfRegexTranspiler(mode: RegexMode) { | |
} | ||
} | ||
|
||
private def contains(regex: RegexAST, f: RegexAST => Boolean): Boolean = { | ||
if (f(regex)) { | ||
true | ||
} else { | ||
regex match { | ||
case RegexSequence(parts) => parts.exists(x => contains(x, f)) | ||
case RegexGroup(_, term) => contains(term, f) | ||
case RegexChoice(l, r) => contains(l, f) || contains(r, f) | ||
case RegexRepetition(term, _) => contains(term, f) | ||
case RegexCharacterClass(_, chars) => chars.exists(ch => contains(ch, f)) | ||
case leaf => f(leaf) | ||
} | ||
} | ||
} | ||
|
||
private def isBeginOrEndLineAnchor(regex: RegexAST): Boolean = regex match { | ||
case RegexSequence(parts) => parts.nonEmpty && parts.forall(isBeginOrEndLineAnchor) | ||
case RegexGroup(_, term) => isBeginOrEndLineAnchor(term) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit:
replacement
andprevious
is never used here, just passed on torewrite
. It may be more descriptive to remove these, rename this method tocheckUnsupported
(take the code out of the closure), and then call this andrewrite
insidetranspile