-
Notifications
You must be signed in to change notification settings - Fork 28.4k
Commit
…nsistent with old sql parser behavior ## What changes were proposed in this pull request? The new SQL parser is introduced into Spark 2.0. All string literals are unescaped in parser. Seems it bring an issue regarding the regex pattern string. The following codes can reproduce it: val data = Seq("\u0020\u0021\u0023", "abc") val df = data.toDF() // 1st usage: works in 1.6 // Let parser parse pattern string val rlike1 = df.filter("value rlike '^\\x20[\\x20-\\x23]+$'") // 2nd usage: works in 1.6, 2.x // Call Column.rlike so the pattern string is a literal which doesn't go through parser val rlike2 = df.filter($"value".rlike("^\\x20[\\x20-\\x23]+$")) // In 2.x, we need add backslashes to make regex pattern parsed correctly val rlike3 = df.filter("value rlike '^\\\\x20[\\\\x20-\\\\x23]+$'") Follow the discussion in #17736, this patch adds a config to fallback to 1.6 string literal parsing and mitigate migration issue. ## How was this patch tested? Jenkins tests. Please review http://spark.apache.org/contributing.html before opening a pull request. Author: Liang-Chi Hsieh <[email protected]> Closes #17887 from viirya/add-config-fallback-string-parsing.
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,6 +23,7 @@ import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, _} | |
import org.apache.spark.sql.catalyst.expressions._ | ||
import org.apache.spark.sql.catalyst.expressions.aggregate.{First, Last} | ||
import org.apache.spark.sql.catalyst.plans.PlanTest | ||
import org.apache.spark.sql.internal.SQLConf | ||
import org.apache.spark.sql.types._ | ||
import org.apache.spark.unsafe.types.CalendarInterval | ||
|
||
|
@@ -39,12 +40,17 @@ class ExpressionParserSuite extends PlanTest { | |
import org.apache.spark.sql.catalyst.dsl.expressions._ | ||
import org.apache.spark.sql.catalyst.dsl.plans._ | ||
|
||
def assertEqual(sqlCommand: String, e: Expression): Unit = { | ||
compareExpressions(parseExpression(sqlCommand), e) | ||
val defaultParser = CatalystSqlParser | ||
|
||
def assertEqual( | ||
sqlCommand: String, | ||
e: Expression, | ||
parser: ParserInterface = defaultParser): Unit = { | ||
compareExpressions(parser.parseExpression(sqlCommand), e) | ||
} | ||
|
||
def intercept(sqlCommand: String, messages: String*): Unit = { | ||
val e = intercept[ParseException](parseExpression(sqlCommand)) | ||
val e = intercept[ParseException](defaultParser.parseExpression(sqlCommand)) | ||
messages.foreach { message => | ||
assert(e.message.contains(message)) | ||
} | ||
|
@@ -101,7 +107,7 @@ class ExpressionParserSuite extends PlanTest { | |
test("long binary logical expressions") { | ||
def testVeryBinaryExpression(op: String, clazz: Class[_]): Unit = { | ||
val sql = (1 to 1000).map(x => s"$x == $x").mkString(op) | ||
val e = parseExpression(sql) | ||
val e = defaultParser.parseExpression(sql) | ||
assert(e.collect { case _: EqualTo => true }.size === 1000) | ||
assert(e.collect { case x if clazz.isInstance(x) => true }.size === 999) | ||
} | ||
|
@@ -160,6 +166,15 @@ class ExpressionParserSuite extends PlanTest { | |
assertEqual("a not regexp 'pattern%'", !('a rlike "pattern%")) | ||
} | ||
|
||
test("like expressions with ESCAPED_STRING_LITERALS = true") { | ||
val conf = new SQLConf() | ||
conf.setConfString(SQLConf.ESCAPED_STRING_LITERALS.key, "true") | ||
val parser = new CatalystSqlParser(conf) | ||
assertEqual("a rlike '^\\x20[\\x20-\\x23]+$'", 'a rlike "^\\x20[\\x20-\\x23]+$", parser) | ||
assertEqual("a rlike 'pattern\\\\'", 'a rlike "pattern\\\\", parser) | ||
assertEqual("a rlike 'pattern\\t\\n'", 'a rlike "pattern\\t\\n", parser) | ||
} | ||
|
||
test("is null expressions") { | ||
assertEqual("a is null", 'a.isNull) | ||
assertEqual("a is not null", 'a.isNotNull) | ||
|
@@ -418,38 +433,79 @@ class ExpressionParserSuite extends PlanTest { | |
} | ||
|
||
test("strings") { | ||
// Single Strings. | ||
assertEqual("\"hello\"", "hello") | ||
assertEqual("'hello'", "hello") | ||
|
||
// Multi-Strings. | ||
assertEqual("\"hello\" 'world'", "helloworld") | ||
assertEqual("'hello' \" \" 'world'", "hello world") | ||
|
||
// 'LIKE' string literals. Notice that an escaped '%' is the same as an escaped '\' and a | ||
// regular '%'; to get the correct result you need to add another escaped '\'. | ||
// TODO figure out if we shouldn't change the ParseUtils.unescapeSQLString method? | ||
assertEqual("'pattern%'", "pattern%") | ||
assertEqual("'no-pattern\\%'", "no-pattern\\%") | ||
assertEqual("'pattern\\\\%'", "pattern\\%") | ||
assertEqual("'pattern\\\\\\%'", "pattern\\\\%") | ||
|
||
// Escaped characters. | ||
// See: http://dev.mysql.com/doc/refman/5.7/en/string-literals.html | ||
assertEqual("'\\0'", "\u0000") // ASCII NUL (X'00') | ||
assertEqual("'\\''", "\'") // Single quote | ||
assertEqual("'\\\"'", "\"") // Double quote | ||
assertEqual("'\\b'", "\b") // Backspace | ||
assertEqual("'\\n'", "\n") // Newline | ||
assertEqual("'\\r'", "\r") // Carriage return | ||
assertEqual("'\\t'", "\t") // Tab character | ||
assertEqual("'\\Z'", "\u001A") // ASCII 26 - CTRL + Z (EOF on windows) | ||
|
||
// Octals | ||
assertEqual("'\\110\\145\\154\\154\\157\\041'", "Hello!") | ||
|
||
// Unicode | ||
assertEqual("'\\u0057\\u006F\\u0072\\u006C\\u0064\\u0020\\u003A\\u0029'", "World :)") | ||
Seq(true, false).foreach { escape => | ||
val conf = new SQLConf() | ||
conf.setConfString(SQLConf.ESCAPED_STRING_LITERALS.key, escape.toString) | ||
val parser = new CatalystSqlParser(conf) | ||
|
||
// tests that have same result whatever the conf is | ||
// Single Strings. | ||
assertEqual("\"hello\"", "hello", parser) | ||
assertEqual("'hello'", "hello", parser) | ||
|
||
// Multi-Strings. | ||
assertEqual("\"hello\" 'world'", "helloworld", parser) | ||
assertEqual("'hello' \" \" 'world'", "hello world", parser) | ||
|
||
// 'LIKE' string literals. Notice that an escaped '%' is the same as an escaped '\' and a | ||
// regular '%'; to get the correct result you need to add another escaped '\'. | ||
// TODO figure out if we shouldn't change the ParseUtils.unescapeSQLString method? | ||
assertEqual("'pattern%'", "pattern%", parser) | ||
assertEqual("'no-pattern\\%'", "no-pattern\\%", parser) | ||
|
||
// tests that have different result regarding the conf | ||
if (escape) { | ||
// When SQLConf.ESCAPED_STRING_LITERALS is enabled, string literal parsing fallbacks to | ||
// Spark 1.6 behavior. | ||
|
||
// 'LIKE' string literals. | ||
assertEqual("'pattern\\\\%'", "pattern\\\\%", parser) | ||
assertEqual("'pattern\\\\\\%'", "pattern\\\\\\%", parser) | ||
|
||
// Escaped characters. | ||
assertEqual("'\0'", "\u0000", parser) // ASCII NUL (X'00') | ||
|
||
// Note: Single quote follows 1.6 parsing behavior when ESCAPED_STRING_LITERALS is enabled. | ||
val e = intercept[ParseException](parser.parseExpression("'\''")) | ||
assert(e.message.contains("extraneous input '''")) | ||
|
||
assertEqual("'\"'", "\"", parser) // Double quote | ||
assertEqual("'\b'", "\b", parser) // Backspace | ||
assertEqual("'\n'", "\n", parser) // Newline | ||
assertEqual("'\r'", "\r", parser) // Carriage return | ||
assertEqual("'\t'", "\t", parser) // Tab character | ||
|
||
// Octals | ||
assertEqual("'\110\145\154\154\157\041'", "Hello!", parser) | ||
This comment has been minimized.
Sorry, something went wrong.
This comment has been minimized.
Sorry, something went wrong.
viirya
Author
Member
|
||
// Unicode | ||
assertEqual("'\u0057\u006F\u0072\u006C\u0064\u0020\u003A\u0029'", "World :)", parser) | ||
} else { | ||
// Default behavior | ||
|
||
// 'LIKE' string literals. | ||
assertEqual("'pattern\\\\%'", "pattern\\%", parser) | ||
assertEqual("'pattern\\\\\\%'", "pattern\\\\%", parser) | ||
|
||
// Escaped characters. | ||
// See: http://dev.mysql.com/doc/refman/5.7/en/string-literals.html | ||
assertEqual("'\\0'", "\u0000", parser) // ASCII NUL (X'00') | ||
assertEqual("'\\''", "\'", parser) // Single quote | ||
assertEqual("'\\\"'", "\"", parser) // Double quote | ||
assertEqual("'\\b'", "\b", parser) // Backspace | ||
assertEqual("'\\n'", "\n", parser) // Newline | ||
assertEqual("'\\r'", "\r", parser) // Carriage return | ||
assertEqual("'\\t'", "\t", parser) // Tab character | ||
assertEqual("'\\Z'", "\u001A", parser) // ASCII 26 - CTRL + Z (EOF on windows) | ||
|
||
// Octals | ||
assertEqual("'\\110\\145\\154\\154\\157\\041'", "Hello!", parser) | ||
|
||
// Unicode | ||
assertEqual("'\\u0057\\u006F\\u0072\\u006C\\u0064\\u0020\\u003A\\u0029'", "World :)", | ||
parser) | ||
} | ||
|
||
} | ||
} | ||
|
||
test("intervals") { | ||
|
@viirya did you mean to write
\
instead of\\
here, unlike in the else branch below?\110
is a deprecated way to just generate the letter "H" and is parsed by scalac so this test doesn't seem to do anything as it is. It's meant to test that the parser can deal with the escapes? I can adjust it to\\110
etc if so in my patch to update to Scala 2.11+ support only because that's what generates the warning.