Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[GLUTEN-5341][VL][TEST] Fix SPARK-42782: Hive compatibility check for get_json_object #5467

Merged
merged 3 commits into from
Apr 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/velox-backend-limitations.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ In certain cases, Gluten result may be different from Vanilla spark.
#### JSON functions
Velox only supports strings surrounded by double quotes, not single quotes, in JSON data. If single quotes are used, Gluten will produce incorrect results.

Velox does not support the `[*]` wildcard in the path passed to the `get_json_object` function; such calls return null instead.

#### Parquet read conf
Gluten supports `spark.files.ignoreCorruptFiles`, which defaults to false; if it is set to true, the behavior is the same as when it is set to false.
Gluten ignores `spark.sql.parquet.datetimeRebaseModeInRead`, it only returns what write in parquet file. It does not consider the difference between legacy
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ class VeloxTestSettings extends BackendTestSettings {
enableSuite[GlutenHashExpressionsSuite]
enableSuite[GlutenIntervalExpressionsSuite]
enableSuite[GlutenJsonFunctionsSuite]
// Disable for Spark3.5.
// The [*] wildcard in get_json_object paths is not supported in Velox.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there any issue to track this, if not please open one, thanks.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, I will open one, this is the PR where get_json_object support was added facebookincubator/velox@20e4678

.exclude("SPARK-42782: Hive compatibility check for get_json_object")
// Velox does not support single quotes in get_json_object function.
.exclude("function get_json_object - support single quotes")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,89 @@
*/
package org.apache.spark.sql

class GlutenJsonFunctionsSuite extends JsonFunctionsSuite with GlutenSQLTestsTrait {}
/**
 * Gluten variant of [[JsonFunctionsSuite]].
 *
 * Adds a Gluten-specific copy of the "SPARK-42782" `get_json_object` Hive compatibility
 * check in which the cases marked "not supported in velox" / "crashes in velox" are
 * commented out, so the remaining cases can still be exercised.
 */
class GlutenJsonFunctionsSuite extends JsonFunctionsSuite with GlutenSQLTestsTrait {
  import testImplicits._

  testGluten("SPARK-42782: Hive compatibility check for get_json_object") {

    /**
     * Evaluates `get_json_object(value, '<path>')` against a single-row DataFrame built from
     * `json` and checks that the only result row equals `exp`.
     *
     * @param json the JSON document placed in the `value` column
     * @param path the JSON path passed to `get_json_object`
     * @param exp  the expected result; `null` when no value is expected
     */
    def runTest(json: String, path: String, exp: String): Unit = {
      checkAnswer(Seq(json).toDF().selectExpr(s"get_json_object(value, '$path')"), Row(exp))
    }

    val book0 = "{\"author\":\"Nigel Rees\",\"title\":\"Sayings of the Century\"" +
      ",\"category\":\"reference\",\"price\":8.95}"
    val backet0 = "[1,2,{\"b\":\"y\",\"a\":\"x\"}]"
    val backet = "[" + backet0 + ",[3,4],[5,6]]"
    // Only referenced by the disabled "$.store.basket[*][*]" case below; kept so that case can
    // be re-enabled without further changes.
    val backetFlat = backet0.dropRight(1) + ",3,4,5,6]"

    val book = "[" + book0 + ",{\"author\":\"Herman Melville\",\"title\":\"Moby Dick\"," +
      "\"category\":\"fiction\",\"price\":8.99" +
      ",\"isbn\":\"0-553-21311-3\"},{\"author\":\"J. R. R. Tolkien\"" +
      ",\"title\":\"The Lord of the Rings\",\"category\":\"fiction\"" +
      ",\"reader\":[{\"age\":25,\"name\":\"bob\"},{\"age\":26,\"name\":\"jack\"}]" +
      ",\"price\":22.99,\"isbn\":\"0-395-19395-8\"}]"

    val json = "{\"store\":{\"fruit\":[{\"weight\":8,\"type\":\"apple\"}," +
      "{\"weight\":9,\"type\":\"pear\"}],\"basket\":" + backet + ",\"book\":" + book +
      ",\"bicycle\":{\"price\":19.95,\"color\":\"red\"}}" +
      ",\"email\":\"amy@only_for_json_udf_test.net\"" +
      ",\"owner\":\"amy\",\"zip code\":\"94025\",\"fb:testid\":\"1234\"}"

    // Basic test
    runTest(json, "$.owner", "amy")
    runTest(json, "$.store.bicycle", "{\"price\":19.95,\"color\":\"red\"}")
    runTest(json, "$.store.book", book)
    runTest(json, "$.store.book[0]", book0)
    // runTest(json, "$.store.book[*]", book) - not supported in velox
    runTest(json, "$.store.book[0].category", "reference")
    // runTest(json, "$.store.book[*].category",
    // "[\"reference\",\"fiction\",\"fiction\"]") - not supported in velox
    // runTest(json, "$.store.book[*].reader[0].age", "25") - not supported in velox
    // runTest(json, "$.store.book[*].reader[*].age", "[25,26]") - not supported in velox
    runTest(json, "$.store.basket[0][1]", "2")
    // runTest(json, "$.store.basket[*]", backet) - not supported in velox
    // runTest(json, "$.store.basket[*][0]", "[1,3,5]") - not supported in velox
    // runTest(json, "$.store.basket[0][*]", backet0) - not supported in velox
    // runTest(json, "$.store.basket[*][*]", backetFlat) - not supported in velox
    runTest(json, "$.store.basket[0][2].b", "y")
    // runTest(json, "$.store.basket[0][*].b", "[\"y\"]") - not supported in velox
    runTest(json, "$.non_exist_key", null)
    runTest(json, "$.store.book[10]", null)
    runTest(json, "$.store.book[0].non_exist_key", null)
    // runTest(json, "$.store.basket[*].non_exist_key", null) - not supported in velox
    // runTest(json, "$.store.basket[0][*].non_exist_key", null) - not supported in velox
    // runTest(json, "$.store.basket[*][*].non_exist_key", null) - not supported in velox
    runTest(json, "$.zip code", "94025")
    runTest(json, "$.fb:testid", "1234")
    // runTest("{\"a\":\"b\nc\"}", "$.a", "b\nc") - not supported in velox

    // Test root array
    runTest("[1,2,3]", "$[0]", "1")
    runTest("[1,2,3]", "$.[0]", null) // Not supported in spark and velox
    runTest("[1,2,3]", "$.[1]", null) // Not supported in spark and velox
    runTest("[1,2,3]", "$[1]", "2")

    runTest("[1,2,3]", "$[3]", null)
    runTest("[1,2,3]", "$.[*]", null) // Not supported in spark and velox
    // runTest("[1,2,3]", "$[*]", "[1,2,3]") - not supported in velox
    // runTest("[1,2,3]", "$", "[1,2,3]") - not supported in velox (crashes)
    runTest("[{\"k1\":\"v1\"},{\"k2\":\"v2\"},{\"k3\":\"v3\"}]", "$[2]", "{\"k3\":\"v3\"}")
    runTest("[{\"k1\":\"v1\"},{\"k2\":\"v2\"},{\"k3\":\"v3\"}]", "$[2].k3", "v3")
    runTest("[{\"k1\":[{\"k11\":[1,2,3]}]}]", "$[0].k1[0].k11[1]", "2")
    runTest("[{\"k1\":[{\"k11\":[1,2,3]}]}]", "$[0].k1[0].k11", "[1,2,3]")
    runTest("[{\"k1\":[{\"k11\":[1,2,3]}]}]", "$[0].k1[0]", "{\"k11\":[1,2,3]}")
    runTest("[{\"k1\":[{\"k11\":[1,2,3]}]}]", "$[0].k1", "[{\"k11\":[1,2,3]}]")
    runTest("[{\"k1\":[{\"k11\":[1,2,3]}]}]", "$[0]", "{\"k1\":[{\"k11\":[1,2,3]}]}")
    runTest("[[1,2,3],[4,5,6],[7,8,9]]", "$[1]", "[4,5,6]")
    runTest("[[1,2,3],[4,5,6],[7,8,9]]", "$[1][0]", "4")
    runTest("[\"a\",\"b\"]", "$[1]", "b")
    runTest("[[\"a\",\"b\"]]", "$[0][1]", "b")

    // Paths that do not start with the usual "$" root, or stop right after it.
    // NOTE(review): "[0]" has no leading "$" yet a value is expected here; confirm this
    // matches the vanilla Spark expectation.
    runTest("[1,2,3]", "[0]", "1")
    // runTest("[1,2,3]", "$0", null) crashes in velox
    runTest("[1,2,3]", "0", null)
    runTest("[1,2,3]", "$.", null)

    // runTest("{\"a\":4}", "$", "{\"a\":4}") crashes in velox
  }
}
Loading