From 46bdf7d0497b8e23ca61964f896495d9c350ef6c Mon Sep 17 00:00:00 2001
From: Eric Maynard
Date: Mon, 6 May 2024 13:56:42 -0700
Subject: [PATCH 1/8] initial commit

---
 .../sql/catalyst/json/JacksonParser.scala     | 29 +++++++++++++++----
 .../datasources/json/JsonSuite.scala          | 18 ++++++++++++
 2 files changed, 42 insertions(+), 5 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala
index eadd0a4f8ab9e..745cf2c207873 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala
@@ -280,13 +280,32 @@ class JacksonParser(
         case VALUE_STRING =>
           UTF8String.fromString(parser.getText)
 
-        case _ =>
+        case other =>
           // Note that it always tries to convert the data as string without the case of failure.
-          val writer = new ByteArrayOutputStream()
-          Utils.tryWithResource(factory.createGenerator(writer, JsonEncoding.UTF8)) {
-            generator => generator.copyCurrentStructure(parser)
+          val startLocation = parser.getTokenLocation
+          startLocation.contentReference().getRawContent match {
+            case byteArray: Array[Byte] =>
+              other match {
+                case START_OBJECT =>
+                  parser.skipChildren()
+                case START_ARRAY =>
+                  parser.skipChildren()
+                case _ =>
+                  // Do nothing in this case; we've already read the token
+              }
+              val endLocation = parser.currentLocation.getByteOffset
+
+              UTF8String.fromBytes(
+                byteArray,
+                startLocation.getByteOffset.toInt,
+                endLocation.toInt - (startLocation.getByteOffset.toInt))
+            case _ =>
+              val writer = new ByteArrayOutputStream()
+              Utils.tryWithResource(factory.createGenerator(writer, JsonEncoding.UTF8)) {
+                generator => generator.copyCurrentStructure(parser)
+              }
+              UTF8String.fromBytes(writer.toByteArray)
           }
-          UTF8String.fromBytes(writer.toByteArray)
         }
 
       case TimestampType =>

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
index c17a25be8e2ae..b0400d32459cc 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
@@ -3865,6 +3865,24 @@ abstract class JsonSuite
       }
     }
   }
+
+  test("SPARK-48148: decimal precision is preserved when object is read as string") {
+    withTempPath { path =>
+
+      val granularFloat = "-999.99999999999999999999999999999999995"
+      val jsonString = s"""{"data": {"v": ${granularFloat}}}, {"data": {"v": ${granularFloat}}}]"""
+
+      Seq(jsonString).toDF()
+        .repartition(1)
+        .write
+        .text(path.getAbsolutePath)
+
+      val df = spark.read.schema("data STRING").json(path.getAbsolutePath)
+
+      val expected = s"""{"v": ${granularFloat}}"""
+      checkAnswer(df, Seq(Row(expected)))
+    }
+  }
 }
 
 class JsonV1Suite extends JsonSuite {

From 79e8457879e1b6388b0794d6c63f648fe8180e77 Mon Sep 17 00:00:00 2001
From: Eric Maynard
Date: Wed, 8 May 2024 14:18:41 -0700
Subject: [PATCH 2/8] add flag

---
 .../sql/catalyst/json/JacksonParser.scala     |  4 +-
 .../apache/spark/sql/internal/SQLConf.scala   |  9 +++
 .../datasources/json/JsonSuite.scala          | 60 +++++++++++++++----
 3 files changed, 59 insertions(+), 14 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala
index 745cf2c207873..5676bf68b80b1 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala
@@ -284,7 +284,7 @@ class JacksonParser(
           // Note that it always tries to convert the data as string without the case of failure.
           val startLocation = parser.getTokenLocation
           startLocation.contentReference().getRawContent match {
-            case byteArray: Array[Byte] =>
+            case byteArray: Array[Byte] if exactStringParsing =>
               other match {
                 case START_OBJECT =>
                   parser.skipChildren()
@@ -448,6 +448,8 @@ class JacksonParser(
 
   private val allowEmptyString = SQLConf.get.getConf(SQLConf.LEGACY_ALLOW_EMPTY_STRING_IN_JSON)
 
+  private val exactStringParsing = SQLConf.get.getConf(SQLConf.JSON_EXACT_STRING_PARSING)
+
   /**
    * This function throws an exception for failed conversion. For empty string on data types
    * except for string and binary types, this also throws an exception.

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index df75985043d0d..4ef669e6209da 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -4249,6 +4249,15 @@ object SQLConf {
     .booleanConf
     .createWithDefault(true)
 
+  val JSON_EXACT_STRING_PARSING =
+    buildConf("spark.sql.json.enableExactStringParsing")
+      .internal()
+      .doc("When set to true, string columns extracted from JSON objects will be extracted " +
+        "exactly as they appear in the input string, with no changes")
+      .version("4.0.0")
+      .booleanConf
+      .createWithDefault(true)
+
   val LEGACY_CSV_ENABLE_DATE_TIME_PARSING_FALLBACK =
     buildConf("spark.sql.legacy.csv.enableDateTimeParsingFallback")
       .internal()

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
index b0400d32459cc..eb7081aa003bd 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
@@ -3866,21 +3866,55 @@ abstract class JsonSuite
       }
     }
 
-  test("SPARK-48148: decimal precision is preserved when object is read as string") {
+  test("SPARK-48148: values are unchanged when read as string") {
     withTempPath { path =>
+      def extractData(
+          jsonString: String,
+          expectedInexactData: Seq[String],
+          expectedExactData: Seq[String],
+          multiLine: Boolean = false): Unit = {
+        Seq(jsonString).toDF()
+          .repartition(1)
+          .write
+          .mode("overwrite")
+          .text(path.getAbsolutePath)
+
+        withClue("Exact string parsing") {
+          withSQLConf(SQLConf.JSON_EXACT_STRING_PARSING.key -> "false") {
+            val df = spark.read
+              .schema("data STRING")
+              .option("multiLine", multiLine.toString)
+              .json(path.getAbsolutePath)
+            checkAnswer(df, expectedInexactData.map(d => Row(d)))
+          }
+        }
 
-      val granularFloat = "-999.99999999999999999999999999999999995"
-      val jsonString = s"""{"data": {"v": ${granularFloat}}}, {"data": {"v": ${granularFloat}}}]"""
-
-      Seq(jsonString).toDF()
-        .repartition(1)
-        .write
-        .text(path.getAbsolutePath)
-
-      val df = spark.read.schema("data STRING").json(path.getAbsolutePath)
-
-      val expected = s"""{"v": ${granularFloat}}"""
-      checkAnswer(df, Seq(Row(expected)))
+        withClue("Inexact string parsing") {
+          withSQLConf(SQLConf.JSON_EXACT_STRING_PARSING.key -> "true") {
+            val df = spark.read
+              .schema("data STRING")
+              .option("multiLine", multiLine.toString)
+              .json(path.getAbsolutePath)
+            checkAnswer(df, expectedExactData.map(d => Row(d)))
+          }
+        }
+      }
+      extractData(
+        s"""{"data": {"white": "space"}}""",
+        expectedInexactData = Seq(s"""{"white":"space"}"""),
+        expectedExactData = Seq(s"""{"white": "space"}""")
+      )
+//      extractData(s"""{"data": ["white", "space"]}""", Seq(s"""["white", "space"]"""))
+//
+//
+//      extractData(
+//        s"""{"data": {"white":\n"space"}}""",
+//        Seq(s"""{"white":\n"space"}"""),
+//        multiLine = true)
+//
+//
+//      val granularFloat = "-999.99999999999999999999999999999999995"
+//      extractData(s"""{"data": {"v": ${granularFloat}}}""", Seq(s"""{"v": ${granularFloat}}"""))
     }
   }
 }

From 07083fde0776c2d681a8e2da2cb9b928fced0a55 Mon Sep 17 00:00:00 2001
From: Eric Maynard
Date: Wed, 8 May 2024 14:40:04 -0700
Subject: [PATCH 3/8] improve tests

---
 .../datasources/json/JsonSuite.scala          | 36 +++++++++++--------
 1 file changed, 21 insertions(+), 15 deletions(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
index eb7081aa003bd..bdea5beed3b11 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
@@ -3880,22 +3880,22 @@ abstract class JsonSuite
           .text(path.getAbsolutePath)
 
         withClue("Exact string parsing") {
-          withSQLConf(SQLConf.JSON_EXACT_STRING_PARSING.key -> "false") {
+          withSQLConf(SQLConf.JSON_EXACT_STRING_PARSING.key -> "true") {
             val df = spark.read
               .schema("data STRING")
               .option("multiLine", multiLine.toString)
               .json(path.getAbsolutePath)
-            checkAnswer(df, expectedInexactData.map(d => Row(d)))
+            checkAnswer(df, expectedExactData.map(d => Row(d)))
           }
         }
 
         withClue("Inexact string parsing") {
-          withSQLConf(SQLConf.JSON_EXACT_STRING_PARSING.key -> "true") {
+          withSQLConf(SQLConf.JSON_EXACT_STRING_PARSING.key -> "false") {
             val df = spark.read
               .schema("data STRING")
               .option("multiLine", multiLine.toString)
              .json(path.getAbsolutePath)
-            checkAnswer(df, expectedExactData.map(d => Row(d)))
+            checkAnswer(df, expectedInexactData.map(d => Row(d)))
           }
         }
       }
@@ -3904,17 +3904,23 @@ abstract class JsonSuite
         expectedInexactData = Seq(s"""{"white":"space"}"""),
         expectedExactData = Seq(s"""{"white": "space"}""")
       )
-//      extractData(s"""{"data": ["white", "space"]}""", Seq(s"""["white", "space"]"""))
-//
-//
-//      extractData(
-//        s"""{"data": {"white":\n"space"}}""",
-//        Seq(s"""{"white":\n"space"}"""),
-//        multiLine = true)
-//
-//
-//      val granularFloat = "-999.99999999999999999999999999999999995"
-//      extractData(s"""{"data": {"v": ${granularFloat}}}""", Seq(s"""{"v": ${granularFloat}}"""))
+      extractData(
+        s"""{"data": ["white", "space"]}""",
+        expectedInexactData = Seq(s"""["white","space"}]"""),
+        expectedExactData = Seq(s"""["white": "space"]""")
+      )
+      val granularFloat = "-999.99999999999999999999999999999999995"
+      extractData(
+        s"""{"data": {"v": ${granularFloat}}""",
+        expectedInexactData = Seq(s"""{"v": 1000.0}"""),
+        expectedExactData = Seq(s"""{"v": ${granularFloat}}""")
+      )
+      extractData(
+        s"""{"data": {"white":\n"space"}}""",
+        expectedInexactData = Seq(s"""{"white":"space"}"""),
+        expectedExactData = Seq(s"""{"white":\n"space"}"""),
+        multiLine = true
+      )
     }
   }
 }

From 2f697da3638ae980a2f441020c20caf424464b61 Mon Sep 17 00:00:00 2001
From: Eric Maynard
Date: Wed, 8 May 2024 15:22:21 -0700
Subject: [PATCH 4/8] should be stable

---
 .../sql/execution/datasources/json/JsonSuite.scala | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
index bdea5beed3b11..3319e30ce6889 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
@@ -3906,19 +3906,20 @@ abstract class JsonSuite
       )
       extractData(
         s"""{"data": ["white", "space"]}""",
-        expectedInexactData = Seq(s"""["white","space"}]"""),
-        expectedExactData = Seq(s"""["white": "space"]""")
+        expectedInexactData = Seq(s"""["white","space"]"""),
+        expectedExactData = Seq(s"""["white", "space"]""")
       )
       val granularFloat = "-999.99999999999999999999999999999999995"
       extractData(
-        s"""{"data": {"v": ${granularFloat}}""",
-        expectedInexactData = Seq(s"""{"v": 1000.0}"""),
+        s"""{"data": {"v": ${granularFloat}}}""",
+        expectedInexactData = Seq(s"""{"v":-1000.0}"""),
         expectedExactData = Seq(s"""{"v": ${granularFloat}}""")
       )
+      // In multiLine, we fall back to the inexact method:
       extractData(
         s"""{"data": {"white":\n"space"}}""",
         expectedInexactData = Seq(s"""{"white":"space"}"""),
-        expectedExactData = Seq(s"""{"white":\n"space"}"""),
+        expectedExactData = Seq(s"""{"white":"space"}"""),
         multiLine = true
       )

From a5c37612370d9177519f6847ffe22576d09a44f8 Mon Sep 17 00:00:00 2001
From: Hyukjin Kwon
Date: Thu, 9 May 2024 10:11:10 +0900
Subject: [PATCH 5/8] Apply suggestions from code review

---
 .../datasources/json/JsonSuite.scala          | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
index 3319e30ce6889..06e3891483a09 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
@@ -3900,26 +3900,26 @@ abstract class JsonSuite
        }
      }
       extractData(
-        s"""{"data": {"white": "space"}}""",
-        expectedInexactData = Seq(s"""{"white":"space"}"""),
-        expectedExactData = Seq(s"""{"white": "space"}""")
+        """{"data": {"white": "space"}}""",
+        expectedInexactData = Seq("""{"white":"space"}"""),
+        expectedExactData = Seq("""{"white": "space"}""")
       )
       extractData(
-        s"""{"data": ["white", "space"]}""",
-        expectedInexactData = Seq(s"""["white","space"]"""),
-        expectedExactData = Seq(s"""["white", "space"]""")
+        """{"data": ["white", "space"]}""",
+        expectedInexactData = Seq("""["white","space"]"""),
+        expectedExactData = Seq("""["white", "space"]""")
       )
       val granularFloat = "-999.99999999999999999999999999999999995"
       extractData(
         s"""{"data": {"v": ${granularFloat}}}""",
-        expectedInexactData = Seq(s"""{"v":-1000.0}"""),
+        expectedInexactData = Seq("""{"v":-1000.0}"""),
         expectedExactData = Seq(s"""{"v": ${granularFloat}}""")
       )
       // In multiLine, we fall back to the inexact method:
       extractData(
-        s"""{"data": {"white":\n"space"}}""",
-        expectedInexactData = Seq(s"""{"white":"space"}"""),
-        expectedExactData = Seq(s"""{"white":"space"}"""),
+        """{"data": {"white":\n"space"}}""",
+        expectedInexactData = Seq("""{"white":"space"}"""),
+        expectedExactData = Seq("""{"white":"space"}"""),
         multiLine = true
       )

From 9a78a8d81d9442e51ce1595c58ace271a819cfe5 Mon Sep 17 00:00:00 2001
From: Eric Maynard
Date: Thu, 9 May 2024 11:09:35 -0700
Subject: [PATCH 6/8] stable; fixed multiline

---
 .../sql/catalyst/json/JacksonParser.scala     | 50 ++++++++++++++-----
 .../datasources/json/JsonSuite.scala          | 37 +++++++-------
 2 files changed, 55 insertions(+), 32 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala
index 5676bf68b80b1..b50cde50b4cd2 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala
@@ -24,8 +24,9 @@ import scala.collection.mutable.ArrayBuffer
 import scala.util.control.NonFatal
 
 import com.fasterxml.jackson.core._
-
+import org.apache.hadoop.fs.PositionedReadable
 import org.apache.spark.SparkUpgradeException
+
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.catalyst.{InternalRow, NoopFilters, StructFilters}
 import org.apache.spark.sql.catalyst.expressions._
@@ -275,30 +276,50 @@ class JacksonParser(
       }
     }
 
-    case StringType =>
-      (parser: JsonParser) => parseJsonToken[UTF8String](parser, dataType) {
+    case StringType => (parser: JsonParser) => {
+      val includeSourceInLocation = JsonParser.Feature.INCLUDE_SOURCE_IN_LOCATION
+      val originalMask = if (includeSourceInLocation.enabledIn(parser.getFeatureMask)) {
+        1
+      } else {
+        0
+      }
+      parser.overrideStdFeatures(includeSourceInLocation.getMask, includeSourceInLocation.getMask)
+      val result = parseJsonToken[UTF8String](parser, dataType) {
         case VALUE_STRING =>
           UTF8String.fromString(parser.getText)
 
         case other =>
           // Note that it always tries to convert the data as string without the case of failure.
-          val startLocation = parser.getTokenLocation
+          val startLocation = parser.currentTokenLocation()
+          def skipAhead(): Unit = {
+            other match {
+              case START_OBJECT =>
+                parser.skipChildren()
+              case START_ARRAY =>
+                parser.skipChildren()
+              case _ =>
+                // Do nothing in this case; we've already read the token
+            }
+          }
+
+          // PositionedReadable
           startLocation.contentReference().getRawContent match {
             case byteArray: Array[Byte] if exactStringParsing =>
-              other match {
-                case START_OBJECT =>
-                  parser.skipChildren()
-                case START_ARRAY =>
-                  parser.skipChildren()
-                case _ =>
-                  // Do nothing in this case; we've already read the token
-              }
+              skipAhead()
               val endLocation = parser.currentLocation.getByteOffset
 
               UTF8String.fromBytes(
                 byteArray,
                 startLocation.getByteOffset.toInt,
                 endLocation.toInt - (startLocation.getByteOffset.toInt))
+            case positionedReadable: PositionedReadable if exactStringParsing =>
+              skipAhead()
+              val endLocation = parser.currentLocation.getByteOffset
+
+              val size = endLocation.toInt - (startLocation.getByteOffset.toInt)
+              val buffer = new Array[Byte](size)
+              positionedReadable.read(startLocation.getByteOffset, buffer, 0, size)
+              UTF8String.fromBytes(buffer, 0, size)
             case _ =>
               val writer = new ByteArrayOutputStream()
               Utils.tryWithResource(factory.createGenerator(writer, JsonEncoding.UTF8)) {
                 generator => generator.copyCurrentStructure(parser)
               }
               UTF8String.fromBytes(writer.toByteArray)
           }
-      }
+      }
+      parser.overrideStdFeatures(includeSourceInLocation.getMask, originalMask)
+      result
+    }
 
     case TimestampType =>
       (parser: JsonParser) => parseJsonToken[java.lang.Long](parser, dataType) {

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
index 06e3891483a09..026dd7390124b 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
@@ -3899,27 +3899,26 @@ abstract class JsonSuite
         }
       }
     }
+//      extractData(
+//        """{"data": {"white": "space"}}""",
+//        expectedInexactData = Seq("""{"white":"space"}"""),
+//        expectedExactData = Seq("""{"white": "space"}""")
+//      )
+//      extractData(
+//        """{"data": ["white", "space"]}""",
+//        expectedInexactData = Seq("""["white","space"]"""),
+//        expectedExactData = Seq("""["white", "space"]""")
+//      )
+//      val granularFloat = "-999.99999999999999999999999999999999995"
+//      extractData(
+//        s"""{"data": {"v": ${granularFloat}}}""",
+//        expectedInexactData = Seq("""{"v":-1000.0}"""),
+//        expectedExactData = Seq(s"""{"v": ${granularFloat}}""")
+//      )
       extractData(
-        """{"data": {"white": "space"}}""",
+        s"""{"data": {"white":\n"space"}}""",
         expectedInexactData = Seq("""{"white":"space"}"""),
-        expectedExactData = Seq("""{"white": "space"}""")
-      )
-      extractData(
-        """{"data": ["white", "space"]}""",
-        expectedInexactData = Seq("""["white","space"]"""),
-        expectedExactData = Seq("""["white", "space"]""")
-      )
-      val granularFloat = "-999.99999999999999999999999999999999995"
-      extractData(
-        s"""{"data": {"v": ${granularFloat}}}""",
-        expectedInexactData = Seq("""{"v":-1000.0}"""),
-        expectedExactData = Seq(s"""{"v": ${granularFloat}}""")
-      )
-      // In multiLine, we fall back to the inexact method:
-      extractData(
-        """{"data": {"white":\n"space"}}""",
-        expectedInexactData = Seq("""{"white":"space"}"""),
-        expectedExactData = Seq("""{"white":"space"}"""),
+        expectedExactData = Seq(s"""{"white":\n"space"}"""),
         multiLine = true
       )
     }
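[Note on the mechanism in PATCH 6] Jackson only keeps a reference to the raw input when INCLUDE_SOURCE_IN_LOCATION is enabled, which is why the converter force-enables the feature around parseJsonToken and restores the caller's mask afterwards. The sketch below exercises the same byte-offset slicing against plain Jackson, with no Spark classes involved; it is a minimal sketch, assuming Jackson 2.13+ (for currentTokenLocation() and contentReference()) and byte-array input, since getByteOffset is only meaningful for byte-backed sources. The object name and the hard-coded token stepping are illustrative, not part of the patch.

    import java.nio.charset.StandardCharsets
    import com.fasterxml.jackson.core.{JsonFactory, JsonParser}

    object ExactSliceSketch {
      def main(args: Array[String]): Unit = {
        val bytes = """{"data": {"white": "space"}}""".getBytes(StandardCharsets.UTF_8)

        // Without INCLUDE_SOURCE_IN_LOCATION, contentReference().getRawContent
        // does not expose the original input bytes.
        val factory = new JsonFactory()
        factory.enable(JsonParser.Feature.INCLUDE_SOURCE_IN_LOCATION)

        val parser = factory.createParser(bytes)
        parser.nextToken() // START_OBJECT (root)
        parser.nextToken() // FIELD_NAME "data"
        parser.nextToken() // START_OBJECT (the value we want verbatim)

        val startLocation = parser.currentTokenLocation()
        parser.skipChildren() // advance through the matching END_OBJECT
        val start = startLocation.getByteOffset
        val end = parser.currentLocation().getByteOffset

        startLocation.contentReference().getRawContent match {
          case raw: Array[Byte] =>
            // Slicing the raw bytes preserves interior whitespace and full
            // numeric precision; nothing is re-serialized.
            println(new String(raw, start.toInt, (end - start).toInt, StandardCharsets.UTF_8))
          case _ =>
            // A real parser would fall back to copyCurrentStructure here,
            // as the patch's last case does.
            println("raw content unavailable")
        }
      }
    }

skipChildren() fast-forwards the parser to the matching END_OBJECT or END_ARRAY without materializing intermediate tokens, so the exact path's only per-value cost is a single byte copy; it is the copyCurrentStructure re-serialization in the fallback branch that normalizes whitespace and rounds high-precision decimals.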
From e9cb9d2dfa8d42be240c48d08360265e8f318cb7 Mon Sep 17 00:00:00 2001
From: Eric Maynard
Date: Thu, 9 May 2024 11:11:35 -0700
Subject: [PATCH 7/8] resolve conflicts

---
 .../org/apache/spark/sql/catalyst/json/JacksonParser.scala | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala
index 83d2d77b70989..f649d78eda442 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala
@@ -276,8 +276,7 @@ class JacksonParser(
       }
     }
 
-<<<<<<< HEAD
-    case StringType => (parser: JsonParser) => {
+    case _: StringType => (parser: JsonParser) => {
       val includeSourceInLocation = JsonParser.Feature.INCLUDE_SOURCE_IN_LOCATION
       val originalMask = if (includeSourceInLocation.enabledIn(parser.getFeatureMask)) {
         1
@@ -286,10 +285,6 @@ class JacksonParser(
       }
       parser.overrideStdFeatures(includeSourceInLocation.getMask, includeSourceInLocation.getMask)
       val result = parseJsonToken[UTF8String](parser, dataType) {
-=======
-    case _: StringType =>
-      (parser: JsonParser) => parseJsonToken[UTF8String](parser, dataType) {
->>>>>>> e1fb1d7e063af7e8eb6e992c800902aff6e19e15
         case VALUE_STRING =>
           UTF8String.fromString(parser.getText)

From fc77ed0210a30d0ad43433ba356a34b4eb4ce05d Mon Sep 17 00:00:00 2001
From: Eric Maynard
Date: Thu, 9 May 2024 11:13:33 -0700
Subject: [PATCH 8/8] polish

---
 .../sql/catalyst/json/JacksonParser.scala     |  4 ++-
 .../datasources/json/JsonSuite.scala          | 32 +++++++++----------
 2 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala
index f649d78eda442..b2c302fbbbe31 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala
@@ -25,8 +25,8 @@ import scala.util.control.NonFatal
 
 import com.fasterxml.jackson.core._
 import org.apache.hadoop.fs.PositionedReadable
-import org.apache.spark.SparkUpgradeException
 
+import org.apache.spark.SparkUpgradeException
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.catalyst.{InternalRow, NoopFilters, StructFilters}
 import org.apache.spark.sql.catalyst.expressions._
@@ -277,6 +277,7 @@ class JacksonParser(
     }
 
     case _: StringType => (parser: JsonParser) => {
+      // This must be enabled if we will retrieve the bytes directly from the raw content:
       val includeSourceInLocation = JsonParser.Feature.INCLUDE_SOURCE_IN_LOCATION
       val originalMask = if (includeSourceInLocation.enabledIn(parser.getFeatureMask)) {
         1
@@ -328,6 +329,7 @@ class JacksonParser(
               UTF8String.fromBytes(writer.toByteArray)
           }
         }
+      // Reset back to the original configuration:
      parser.overrideStdFeatures(includeSourceInLocation.getMask, originalMask)
      result
    }

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
index 026dd7390124b..3d0eedd2f689c 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
@@ -3899,22 +3899,22 @@ abstract class JsonSuite
         }
       }
     }
-//      extractData(
-//        """{"data": {"white": "space"}}""",
-//        expectedInexactData = Seq("""{"white":"space"}"""),
-//        expectedExactData = Seq("""{"white": "space"}""")
-//      )
-//      extractData(
-//        """{"data": ["white", "space"]}""",
-//        expectedInexactData = Seq("""["white","space"]"""),
-//        expectedExactData = Seq("""["white", "space"]""")
-//      )
-//      val granularFloat = "-999.99999999999999999999999999999999995"
-//      extractData(
-//        s"""{"data": {"v": ${granularFloat}}}""",
-//        expectedInexactData = Seq("""{"v":-1000.0}"""),
-//        expectedExactData = Seq(s"""{"v": ${granularFloat}}""")
-//      )
+      extractData(
+        """{"data": {"white": "space"}}""",
+        expectedInexactData = Seq("""{"white":"space"}"""),
+        expectedExactData = Seq("""{"white": "space"}""")
+      )
+      extractData(
+        """{"data": ["white", "space"]}""",
+        expectedInexactData = Seq("""["white","space"]"""),
+        expectedExactData = Seq("""["white", "space"]""")
+      )
+      val granularFloat = "-999.99999999999999999999999999999999995"
+      extractData(
+        s"""{"data": {"v": ${granularFloat}}}""",
+        expectedInexactData = Seq("""{"v":-1000.0}"""),
+        expectedExactData = Seq(s"""{"v": ${granularFloat}}""")
+      )
       extractData(
         s"""{"data": {"white":\n"space"}}""",
         expectedInexactData = Seq("""{"white":"space"}"""),
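[Note on end-to-end behavior] Taken together, the eight patches make the flag observable from any session. The following is a hypothetical spark-shell transcript, assuming a build with this series applied; the configuration key comes from PATCH 2, and the commented output mirrors the expectations encoded in the final test.

    import spark.implicits._

    val ds = Seq("""{"data": {"v": -999.99999999999999999999999999999999995}}""").toDS()

    // Exact parsing (the default added by this series): the nested object is
    // sliced straight out of the raw input, so every digit survives.
    spark.conf.set("spark.sql.json.enableExactStringParsing", "true")
    spark.read.schema("data STRING").json(ds).show(false)
    // {"v": -999.99999999999999999999999999999999995}

    // Legacy behavior: the value is round-tripped through Jackson as a double.
    spark.conf.set("spark.sql.json.enableExactStringParsing", "false")
    spark.read.schema("data STRING").json(ds).show(false)
    // {"v":-1000.0}

Because the column is declared as STRING, the parser hits the StringType converter for a non-string token and takes either the byte-slicing path or the copyCurrentStructure fallback; in multiLine mode the input is a PositionedReadable stream rather than a byte array, which is the case PATCH 6 adds.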