Skip to content

Commit

Permalink
[SPARK-23094] Fix invalid character handling in JsonDataSource
Browse files Browse the repository at this point in the history
## What changes were proposed in this pull request?

There were two related fixes regarding `from_json`, `get_json_object` and `json_tuple` ([Fix #1](apache@c8803c0),
 [Fix #2](apache@86174ea)), but they weren't comprehensive it seems. I wanted to extend those fixes to all the parsers, and add tests for each case.

## How was this patch tested?

Regression tests

Author: Burak Yavuz <[email protected]>

Closes apache#20302 from brkyvz/json-invfix.
  • Loading branch information
brkyvz authored and HyukjinKwon committed Jan 18, 2018
1 parent f568e9c commit e01919e
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,11 @@ private[sql] object CreateJacksonParser extends Serializable {
}

def text(jsonFactory: JsonFactory, record: Text): JsonParser = {
jsonFactory.createParser(record.getBytes, 0, record.getLength)
val bain = new ByteArrayInputStream(record.getBytes, 0, record.getLength)
jsonFactory.createParser(new InputStreamReader(bain, "UTF-8"))
}

def inputStream(jsonFactory: JsonFactory, record: InputStream): JsonParser = {
jsonFactory.createParser(record)
jsonFactory.createParser(new InputStreamReader(record, "UTF-8"))
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ import org.apache.spark.sql.types._
class JsonHadoopFsRelationSuite extends HadoopFsRelationTest {
override val dataSourceName: String = "json"

private val badJson = "\u0000\u0000\u0000A\u0001AAA"

// JSON does not write data of NullType and does not play well with BinaryType.
override protected def supportsDataType(dataType: DataType): Boolean = dataType match {
case _: NullType => false
Expand Down Expand Up @@ -105,4 +107,36 @@ class JsonHadoopFsRelationSuite extends HadoopFsRelationTest {
)
}
}

test("invalid json with leading nulls - from file (multiLine=true)") {
import testImplicits._
withTempDir { tempDir =>
val path = tempDir.getAbsolutePath
Seq(badJson, """{"a":1}""").toDS().write.mode("overwrite").text(path)
val expected = s"""$badJson\n{"a":1}\n"""
val schema = new StructType().add("a", IntegerType).add("_corrupt_record", StringType)
val df =
spark.read.format(dataSourceName).option("multiLine", true).schema(schema).load(path)
checkAnswer(df, Row(null, expected))
}
}

test("invalid json with leading nulls - from file (multiLine=false)") {
import testImplicits._
withTempDir { tempDir =>
val path = tempDir.getAbsolutePath
Seq(badJson, """{"a":1}""").toDS().write.mode("overwrite").text(path)
val schema = new StructType().add("a", IntegerType).add("_corrupt_record", StringType)
val df =
spark.read.format(dataSourceName).option("multiLine", false).schema(schema).load(path)
checkAnswer(df, Seq(Row(1, null), Row(null, badJson)))
}
}

test("invalid json with leading nulls - from dataset") {
import testImplicits._
checkAnswer(
spark.read.json(Seq(badJson).toDS()),
Row(badJson))
}
}

0 comments on commit e01919e

Please sign in to comment.