Skip to content

Commit

Permalink
[GLUTEN-5341][VL] Fix SPARK-42782: Hive compatibility check for get_j…
Browse files Browse the repository at this point in the history
…son_object (apache#5467)
  • Loading branch information
ayushi-agarwal authored and Preetesh2110 committed Apr 25, 2024
1 parent 77a297d commit 5e60cb4
Show file tree
Hide file tree
Showing 3 changed files with 89 additions and 2 deletions.
2 changes: 2 additions & 0 deletions docs/velox-backend-limitations.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ In certain cases, Gluten result may be different from Vanilla spark.
#### JSON functions
Velox only supports strings surrounded by double quotes, not single quotes, in JSON data. If single quotes are used, Gluten will produce incorrect results.

Velox doesn't support the `[*]` wildcard in the path passed to the `get_json_object` function; it returns null instead.

#### Parquet read conf
Gluten supports `spark.files.ignoreCorruptFiles` with a default of false; if it is set to true, the behavior is the same as when it is false.
Gluten ignores `spark.sql.parquet.datetimeRebaseModeInRead`; it only returns what is written in the Parquet file. It does not consider the difference between legacy
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ class VeloxTestSettings extends BackendTestSettings {
enableSuite[GlutenHigherOrderFunctionsSuite]
enableSuite[GlutenIntervalExpressionsSuite]
enableSuite[GlutenJsonFunctionsSuite]
// Disable for Spark3.5.
// Velox does not support the `*` wildcard in get_json_object paths.
.exclude("SPARK-42782: Hive compatibility check for get_json_object")
// Velox does not support single quotes in get_json_object function.
.exclude("function get_json_object - support single quotes")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,89 @@
*/
package org.apache.spark.sql

/** Gluten counterpart of `JsonFunctionsSuite`: inherits its tests and mixes in `GlutenSQLTestsTrait`. */
class GlutenJsonFunctionsSuite extends JsonFunctionsSuite with GlutenSQLTestsTrait
/**
 * Gluten counterpart of `JsonFunctionsSuite`.
 *
 * Inherits the tests of `JsonFunctionsSuite` and adds a Gluten-specific version of the
 * SPARK-42782 Hive-compatibility check for `get_json_object`. Cases that the Velox backend is
 * noted as not supporting (e.g. the `[*]` wildcard in the path) or as crashing on are kept as
 * commented-out assertions so they can be re-enabled later.
 */
class GlutenJsonFunctionsSuite extends JsonFunctionsSuite with GlutenSQLTestsTrait {
  import testImplicits._

  testGluten("SPARK-42782: Hive compatibility check for get_json_object ") {

    /**
     * Evaluates `get_json_object(value, path)` on a single-row DataFrame holding `json` and
     * checks that the only result row equals `exp`.
     *
     * @param json JSON document placed in the single `value` column
     * @param path JSON path, interpolated into the SQL expression inside single quotes
     * @param exp  expected result; `null` means the function is expected to return SQL NULL
     */
    // Defined before its first use: a forward reference to a local def stops compiling as soon
    // as a `val` is introduced between the call site and the definition.
    def runTest(json: String, path: String, exp: String): Unit = {
      val actual = Seq(json).toDF().selectExpr(s"get_json_object(value, '$path')")
      checkAnswer(actual, Row(exp))
    }

    // Building blocks of the JSON document under test.
    val book0 = "{\"author\":\"Nigel Rees\",\"title\":\"Sayings of the Century\"" +
      ",\"category\":\"reference\",\"price\":8.95}"
    val basket0 = "[1,2,{\"b\":\"y\",\"a\":\"x\"}]"
    val basket = "[" + basket0 + ",[3,4],[5,6]]"

    val book = "[" + book0 + ",{\"author\":\"Herman Melville\",\"title\":\"Moby Dick\"," +
      "\"category\":\"fiction\",\"price\":8.99" +
      ",\"isbn\":\"0-553-21311-3\"},{\"author\":\"J. R. R. Tolkien\"" +
      ",\"title\":\"The Lord of the Rings\",\"category\":\"fiction\"" +
      ",\"reader\":[{\"age\":25,\"name\":\"bob\"},{\"age\":26,\"name\":\"jack\"}]" +
      ",\"price\":22.99,\"isbn\":\"0-395-19395-8\"}]"

    val json = "{\"store\":{\"fruit\":[{\"weight\":8,\"type\":\"apple\"}," +
      "{\"weight\":9,\"type\":\"pear\"}],\"basket\":" + basket + ",\"book\":" + book +
      ",\"bicycle\":{\"price\":19.95,\"color\":\"red\"}}" +
      ",\"email\":\"amy@only_for_json_udf_test.net\"" +
      ",\"owner\":\"amy\",\"zip code\":\"94025\",\"fb:testid\":\"1234\"}"

    // Basic test
    runTest(json, "$.owner", "amy")
    runTest(json, "$.store.bicycle", "{\"price\":19.95,\"color\":\"red\"}")
    runTest(json, "$.store.book", book)
    runTest(json, "$.store.book[0]", book0)
    // runTest(json, "$.store.book[*]", book) - not supported in velox
    runTest(json, "$.store.book[0].category", "reference")
    // runTest(json, "$.store.book[*].category",
    //   "[\"reference\",\"fiction\",\"fiction\"]") - not supported in velox
    // runTest(json, "$.store.book[*].reader[0].age", "25") - not supported in velox
    // runTest(json, "$.store.book[*].reader[*].age", "[25,26]") - not supported in velox
    runTest(json, "$.store.basket[0][1]", "2")
    // runTest(json, "$.store.basket[*]", basket) - not supported in velox
    // runTest(json, "$.store.basket[*][0]", "[1,3,5]") - not supported in velox
    // runTest(json, "$.store.basket[0][*]", basket0) - not supported in velox
    // runTest(json, "$.store.basket[*][*]",
    //   "[1,2,{\"b\":\"y\",\"a\":\"x\"},3,4,5,6]") - not supported in velox
    runTest(json, "$.store.basket[0][2].b", "y")
    // runTest(json, "$.store.basket[0][*].b", "[\"y\"]") - not supported in velox
    runTest(json, "$.non_exist_key", null)
    runTest(json, "$.store.book[10]", null)
    runTest(json, "$.store.book[0].non_exist_key", null)
    // runTest(json, "$.store.basket[*].non_exist_key", null) - not supported in velox
    // runTest(json, "$.store.basket[0][*].non_exist_key", null) - not supported in velox
    // runTest(json, "$.store.basket[*][*].non_exist_key", null) - not supported in velox
    runTest(json, "$.zip code", "94025")
    runTest(json, "$.fb:testid", "1234")
    // runTest("{\"a\":\"b\nc\"}", "$.a", "b\nc") - not supported in velox

    // Test root array
    runTest("[1,2,3]", "$[0]", "1")
    runTest("[1,2,3]", "$.[0]", null) // Not supported in spark and velox
    runTest("[1,2,3]", "$.[1]", null) // Not supported in spark and velox
    runTest("[1,2,3]", "$[1]", "2")

    runTest("[1,2,3]", "$[3]", null)
    runTest("[1,2,3]", "$.[*]", null) // Not supported in spark and velox
    // runTest("[1,2,3]", "$[*]", "[1,2,3]") - not supported in velox
    runTest("[{\"k1\":\"v1\"},{\"k2\":\"v2\"},{\"k3\":\"v3\"}]", "$[2]", "{\"k3\":\"v3\"}")
    runTest("[{\"k1\":\"v1\"},{\"k2\":\"v2\"},{\"k3\":\"v3\"}]", "$[2].k3", "v3")
    runTest("[{\"k1\":[{\"k11\":[1,2,3]}]}]", "$[0].k1[0].k11[1]", "2")
    runTest("[{\"k1\":[{\"k11\":[1,2,3]}]}]", "$[0].k1[0].k11", "[1,2,3]")
    runTest("[{\"k1\":[{\"k11\":[1,2,3]}]}]", "$[0].k1[0]", "{\"k11\":[1,2,3]}")
    runTest("[{\"k1\":[{\"k11\":[1,2,3]}]}]", "$[0].k1", "[{\"k11\":[1,2,3]}]")
    runTest("[{\"k1\":[{\"k11\":[1,2,3]}]}]", "$[0]", "{\"k1\":[{\"k11\":[1,2,3]}]}")
    runTest("[[1,2,3],[4,5,6],[7,8,9]]", "$[1]", "[4,5,6]")
    runTest("[[1,2,3],[4,5,6],[7,8,9]]", "$[1][0]", "4")
    runTest("[\"a\",\"b\"]", "$[1]", "b")
    runTest("[[\"a\",\"b\"]]", "$[0][1]", "b")

    // Malformed or non-standard paths
    runTest("[1,2,3]", "[0]", "1")
    // runTest("[1,2,3]", "$0", null) crashes in velox
    runTest("[1,2,3]", "0", null)
    runTest("[1,2,3]", "$.", null)

    // Bare root path "$"
    // runTest("[1,2,3]", "$", "[1,2,3]") crashes in velox
    // runTest("{\"a\":4}", "$", "{\"a\":4}") crashes in velox
  }
}

0 comments on commit 5e60cb4

Please sign in to comment.