Skip to content

Commit

Permalink
Specify recoverWithNull when reading JSON files (#9304)
Browse files Browse the repository at this point in the history
Signed-off-by: Andy Grove <[email protected]>
  • Loading branch information
andygrove authored Oct 26, 2023
1 parent 1177b8f commit cf109c2
Show file tree
Hide file tree
Showing 10 changed files with 109 additions and 2 deletions.
37 changes: 37 additions & 0 deletions integration_tests/src/main/python/json_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,43 @@ def test_basic_json_read(std_input_path, filename, schema, read_func, allow_non_
"allowNumericLeadingZeros": allow_numeric_leading_zeros}),
conf=updated_conf)

@ignore_order
@pytest.mark.parametrize('filename', [
'malformed1.ndjson',
'malformed2.ndjson',
'malformed3.ndjson',
'malformed4.ndjson'
])
@pytest.mark.parametrize('read_func', [read_json_df, read_json_sql])
@pytest.mark.parametrize('schema', [_int_schema])
@pytest.mark.parametrize('v1_enabled_list', ["", "json"])
def test_read_invalid_json(spark_tmp_table_factory, std_input_path, read_func, filename, schema, v1_enabled_list):
conf = copy_and_update(_enable_all_types_conf, {'spark.sql.sources.useV1SourceList': v1_enabled_list})
assert_gpu_and_cpu_are_equal_collect(
read_func(std_input_path + '/' + filename,
schema,
spark_tmp_table_factory,
{}),
conf=conf)

@pytest.mark.parametrize('filename', [
'mixed-primitives.ndjson',
'mixed-primitives-nested.ndjson',
'simple-nested.ndjson',
pytest.param('mixed-nested.ndjson', marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/9353'))
])
@pytest.mark.parametrize('read_func', [read_json_df, read_json_sql])
@pytest.mark.parametrize('schema', [_int_schema])
@pytest.mark.parametrize('v1_enabled_list', ["", "json"])
def test_read_valid_json(spark_tmp_table_factory, std_input_path, read_func, filename, schema, v1_enabled_list):
conf = copy_and_update(_enable_all_types_conf, {'spark.sql.sources.useV1SourceList': v1_enabled_list})
assert_gpu_and_cpu_are_equal_collect(
read_func(std_input_path + '/' + filename,
schema,
spark_tmp_table_factory,
{}),
conf=conf)

@approximate_float
@pytest.mark.parametrize('filename', [
'dates.json',
Expand Down
8 changes: 8 additions & 0 deletions integration_tests/src/test/resources/malformed1.ndjson
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{ "number": 1 }
{ , }
{ : 2 }
{ "number": 2 }
{ "number": }
not_json
{ 4 }
{ "number": 5 }
4 changes: 4 additions & 0 deletions integration_tests/src/test/resources/malformed2.ndjson
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{ "number": 1 }
{ "number": 2 }
{ "number": 3
{ "number": 4 }
4 changes: 4 additions & 0 deletions integration_tests/src/test/resources/malformed3.ndjson
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{ "number": 1 }
{ "number": 2 }
{ "number": 3 } { "number": 4 }
{ "number": 5 }
4 changes: 4 additions & 0 deletions integration_tests/src/test/resources/malformed4.ndjson
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{ "number": 1 }
{ "number": 2 }
{ "number": 3 } x
{ "number": 5 }
10 changes: 10 additions & 0 deletions integration_tests/src/test/resources/mixed-nested.ndjson
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{ "var0": true, "var1": 123.45, "var2": [{ "var0": 123.45, "var1": "hello", "var2": 123.45 }, { "var0": null, "var1": "hello", "var2": null }, 123.45] }
{ "var0": true, "var1": 123.45, "var2": [{ "var0": true, "var1": "hello", "var2": true }, "hello", true] }
{ "var0": null, "var1": true, "var2": "hello" }
{ "var0": "hello", "var1": null, "var2": [true, { "var0": true, "var1": "hello", "var2": null }, null] }
{ "var0": [null, true, null], "var1": "hello", "var2": [[123.45, true, 123.45], { "var0": true, "var1": true, "var2": "hello" }, { "var0": null, "var1": 123.45, "var2": 123.45 }] }
{ "var0": null, "var1": { "var0": [null, 123.45, true], "var1": "hello", "var2": "hello" }, "var2": 123.45 }
{ "var0": null, "var1": { "var0": null, "var1": "hello", "var2": 123.45 }, "var2": 123.45 }
{ "var0": true, "var1": true, "var2": [[123.45, null, 123.45], { "var0": true, "var1": null, "var2": 123.45 }, ["hello", true, "hello"]] }
{ "var0": true, "var1": [{ "var0": true, "var1": "hello", "var2": null }, null, [true, null, null]], "var2": [[123.45, null, true], 123.45, { "var0": null, "var1": 123.45, "var2": "hello" }] }
{ "var0": 123.45, "var1": "hello", "var2": 123.45 }
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{ "a": 123.45, "b": { "b": 123.45, "c": "hello", "d": 123.45 }, "c": { "a": 123.45, "b": "hello", "c": 123.45, "d": "hello" } }
{ "a": 123.45, "b": { "a": "hello", "b": 123.45, "c": "hello", "d": 123.45 }, "c": { "a": 123.45, "b": "hello", "c": 123.45, "d": "hello" } }
{ "a": 123.45, "b": { "a": "hello", "b": 123.45, "c": "hello", "d": 123.45 }, "c": { "a": 123.45, "b": "hello", "c": 123.45, "d": "hello" } }
{ "a": 123.45, "b": { "a": "hello", "b": 123.45, "c": "hello", "d": 123.45 }, "c": { "b": "hello", "c": 123.45, "e": "hello" } }

{ "a": 123.45, "b": { "a": "hello", "b": 123.45, "c": "hello", "d": 123.45 }, "c": { "a": 123.45, "b": "hello", "c": 123.45, "d": "hello" } }
{ "a": 123.45, "b": { "a": "hello", "b": "123.45", ",c": "hello", "d": 123.45 }, "c": { "a": 123.45, "b": "hello", "c": 123.45, "d": "hello" } }
{ "a": 123.45, "b": { "a": "hello", "b": 123.45, "c": "hello", "d": 123.45 }, "c": { "a": 123.45, "b": "hello", "c": 123.45, "d": "hello" } }

{ "a": 123.45 }
{ "b": { "a": "hello", "b": 123.45, "c": "hello", "d": 123.45 } }
{ "c": { "a": 123.45, "b": "hello", "c": 123.45, "d": "hello" } }
{ "a": 123.45, "b": { "a": "hello", "b": 123.45, "c": "hello", "d": true }, "c": { "a": 123.45, "b": "hello", "c": 123.45, "d": "hello" } }
{ "a": 123.45, "b": { "a": "hello", "b": 123.45, "c": "hello", "d": 123.45 }, "c": { "a": 123.45, "b": "hello", "c": 123.45, "d": "hello" } }
9 changes: 9 additions & 0 deletions integration_tests/src/test/resources/mixed-primitives.ndjson
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{ "var0": "hello", "var1": 3.141, "var2": true }
"var0": "hello", "var1": 3.141, "var2": true }
{ "var0": "hello", "var1": true, "var2": "hello" }
{ "var0": true, "var1": "hello", "hello" }
{ "var0": true, "var1": "hello", "var2": "hello" } { "var0": true, "var1": "hello", "var2": "hello" }
{ "var0": true, "var1": "hello", "var2" }
{ "var0": 123, "var1": true, "var2" true }
{ "var0": 123, "var1": true "var2": true
{ "var0": "hello", "var1": "hello", "var2": "hello" }
10 changes: 10 additions & 0 deletions integration_tests/src/test/resources/simple-nested.ndjson
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{ "a": 123.45, "b": { "a": "hello", "b": 123.45, "c": "hello", "d": 123.45 }, "c": { "a": 123.45, "b": "hello", "c": 123.45, "d": "hello" } }
{ "a": 123.45, "b": { "a": "hello", "b": 123.45, "c": "hello", "d": 123.45 }, "c": { "a": 123.45, "b": "hello", "c": 123.45, "d": "hello" } }
{ "a": 123.45, "b": { "a": "hello", "b": 123.45, "c": "hello", "d": 123.45 }, "c": { "a": 123.45, "b": "hello", "c": 123.45, "d": "hello" } }
{ "a": 123.45, "b": { "a": "hello", "b": 123.45, "c": "hello", "d": 123.45 }, "c": { "a": 123.45, "b": "hello", "c": 123.45, "d": "hello" } }
{ "a": 123.45, "b": { "a": "hello", "b": 123.45, "c": "hello", "d": 123.45 }, "c": { "a": 123.45, "b": "hello", "c": 123.45, "d": "hello" } }
{ "a": 123.45, "b": { "a": "hello", "b": 123.45, "c": "hello", "d": 123.45 }, "c": { "a": 123.45, "b": "hello", "c": 123.45, "d": "hello" } }
{ "a": 123.45, "b": { "a": "hello", "b": 123.45, "c": "hello", "d": 123.45 }, "c": { "a": 123.45, "b": "hello", "c": 123.45, "d": "hello" } }
{ "a": 123.45, "b": { "a": "hello", "b": 123.45, "c": "hello", "d": 123.45 }, "c": { "a": 123.45, "b": "hello", "c": 123.45, "d": "hello" } }
{ "a": 123.45, "b": { "a": "hello", "b": 123.45, "c": "hello", "d": 123.45 }, "c": { "a": 123.45, "b": "hello", "c": 123.45, "d": "hello" } }
{ "a": 123.45, "b": { "a": "hello", "b": 123.45, "c": "hello", "d": 123.45 }, "c": { "a": 123.45, "b": "hello", "c": 123.45, "d": "hello" } }
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,14 @@ object JsonPartitionReader {
RmmRapidsRetryIterator.withRetryNoSplit(dataBufferer.getBufferAndRelease) { dataBuffer =>
withResource(new NvtxWithMetrics(formatName + " decode",
NvtxColor.DARK_GREEN, decodeTime)) { _ =>
Table.readJSON(cudfSchema, jsonOpts, dataBuffer, 0, dataSize)
try {
Table.readJSON(cudfSchema, jsonOpts, dataBuffer, 0, dataSize)
} catch {
case e: AssertionError if e.getMessage == "CudfColumns can't be null or empty" =>
// this happens when every row in a JSON file is invalid (or we are
// trying to read a non-JSON file format as JSON)
throw new IOException(s"Error when processing file [$partFile]", e)
}
}
}
} catch {
Expand All @@ -294,7 +301,7 @@ class JsonPartitionReader(
maxBytesPerChunk, execMetrics, HostLineBuffererFactory) {

def buildJsonOptions(parsedOptions: JSONOptions): cudf.JSONOptions = {
val builder = cudf.JSONOptions.builder()
val builder = cudf.JSONOptions.builder().withRecoverWithNull(true)
builder.build
}

Expand Down

0 comments on commit cf109c2

Please sign in to comment.