From f800f5a2fa9a961699345e6febe740b4b8f4760e Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Wed, 24 Jan 2024 12:14:05 -0800 Subject: [PATCH] JSON single quote normalization API (#14729) The goal of this PR is to address [10004](https://github.com/rapidsai/cudf/issues/10004) by supporting parsing of JSON files containing single quotes for field/value strings. This is a follow-up work to the POC [PR 14545](https://github.com/rapidsai/cudf/pull/14545) Authors: - Shruti Shivakumar (https://github.com/shrshi) Approvers: - Andy Grove (https://github.com/andygrove) - Vyas Ramasubramani (https://github.com/vyasr) - Vukasin Milovanovic (https://github.com/vuule) - Elias Stehle (https://github.com/elstehle) - Robert (Bobby) Evans (https://github.com/revans2) URL: https://github.com/rapidsai/cudf/pull/14729 --- cpp/CMakeLists.txt | 1 + cpp/include/cudf/io/detail/json.hpp | 14 +- cpp/include/cudf/io/json.hpp | 31 +++ .../io/json/json_quote_normalization.cu} | 204 +++-------------- cpp/src/io/json/read_json.cu | 21 +- cpp/src/io/json/read_json.hpp | 2 +- cpp/tests/CMakeLists.txt | 2 +- .../io/json_quote_normalization_test.cpp | 215 ++++++++++++++++++ .../main/java/ai/rapids/cudf/JSONOptions.java | 15 ++ java/src/main/java/ai/rapids/cudf/Table.java | 11 +- java/src/main/native/src/TableJni.cpp | 44 ++-- .../test/java/ai/rapids/cudf/TableTest.java | 33 +++ java/src/test/js | 0 java/src/test/resources/single_quotes.json | 2 + 14 files changed, 401 insertions(+), 194 deletions(-) rename cpp/{tests/io/fst/quote_normalization_test.cu => src/io/json/json_quote_normalization.cu} (56%) create mode 100644 cpp/tests/io/json_quote_normalization_test.cpp create mode 100644 java/src/test/js create mode 100644 java/src/test/resources/single_quotes.json diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 90eaec6804a..3925ac55d6b 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -375,6 +375,7 @@ add_library( src/io/functions.cpp src/io/json/byte_range_info.cu 
src/io/json/json_column.cu + src/io/json/json_quote_normalization.cu src/io/json/json_tree.cu src/io/json/nested_json_gpu.cu src/io/json/read_json.cu diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index d0a9543397d..0eb0e17ea10 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -51,4 +51,16 @@ void write_json(data_sink* sink, json_writer_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); + +/** + * @brief Normalize single quotes to double quotes using FST + * + * @param inbuf Input device buffer + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource to use for device memory allocation + */ +rmm::device_uvector normalize_single_quotes(rmm::device_uvector&& inbuf, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + } // namespace cudf::io::json::detail diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 2a39a539cc7..f0c3d48ab7e 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -115,6 +115,9 @@ class json_reader_options { // Whether to keep the quote characters of string values bool _keep_quotes = false; + // Normalize single quotes + bool _normalize_single_quotes = false; + // Whether to recover after an invalid JSON line json_recovery_mode_t _recovery_mode = json_recovery_mode_t::FAIL; @@ -255,6 +258,13 @@ class json_reader_options { */ bool is_enabled_keep_quotes() const { return _keep_quotes; } + /** + * @brief Whether the reader should normalize single quotes around strings + * + * @returns true if the reader should normalize single quotes, false 
otherwise + */ + bool is_enabled_normalize_single_quotes() const { return _normalize_single_quotes; } + /** * @brief Queries the JSON reader's behavior on invalid JSON lines. * @@ -340,6 +350,14 @@ class json_reader_options { */ void enable_keep_quotes(bool val) { _keep_quotes = val; } + /** + * @brief Set whether the reader should enable normalization of single quotes around strings. + * + * @param val Boolean value to indicate whether the reader should normalize single quotes around + * strings + */ + void enable_normalize_single_quotes(bool val) { _normalize_single_quotes = val; } + /** * @brief Specifies the JSON reader's behavior on invalid JSON lines. * @@ -502,6 +520,19 @@ class json_reader_options_builder { return *this; } + /** + * @brief Set whether the reader should normalize single quotes around strings + * + * @param val Boolean value to indicate whether the reader should normalize single quotes + * of strings + * @return this for chaining + */ + json_reader_options_builder& normalize_single_quotes(bool val) + { + options._normalize_single_quotes = val; + return *this; + } + /** * @brief Specifies the JSON reader's behavior on invalid JSON lines. * diff --git a/cpp/tests/io/fst/quote_normalization_test.cu b/cpp/src/io/json/json_quote_normalization.cu similarity index 56% rename from cpp/tests/io/fst/quote_normalization_test.cu rename to cpp/src/io/json/json_quote_normalization.cu index d0794b8f17e..7c9466748cd 100644 --- a/cpp/tests/io/fst/quote_normalization_test.cu +++ b/cpp/src/io/json/json_quote_normalization.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,19 +15,13 @@ */ #include -#include -#include -#include -#include - -#include -#include +#include #include #include #include -#include +#include #include #include @@ -36,17 +30,16 @@ #include #include -namespace { +namespace cudf::io::json { + +using SymbolT = char; +using StateT = char; +// Type sufficiently large to index symbols within the input and output (may be unsigned) +using SymbolOffsetT = uint32_t; -// Type used to represent the atomic symbol type used within the finite-state machine -// TODO: type aliasing to be declared in a common header for better maintainability and -// pre-empt future bugs -using SymbolT = char; -using StateT = char; +namespace normalize_quotes { -// Type sufficiently large to index symbols within the input and output (may be unsigned) -using SymbolOffsetT = uint32_t; -enum class dfa_states : char { TT_OOS = 0U, TT_DQS, TT_SQS, TT_DEC, TT_SEC, TT_NUM_STATES }; +enum class dfa_states : StateT { TT_OOS = 0U, TT_DQS, TT_SQS, TT_DEC, TT_SEC, TT_NUM_STATES }; enum class dfa_symbol_group_id : uint32_t { DOUBLE_QUOTE_CHAR, ///< Quote character SG: " SINGLE_QUOTE_CHAR, ///< Quote character SG: ' @@ -62,7 +55,7 @@ constexpr auto TT_DQS = dfa_states::TT_DQS; constexpr auto TT_SQS = dfa_states::TT_SQS; constexpr auto TT_DEC = dfa_states::TT_DEC; constexpr auto TT_SEC = dfa_states::TT_SEC; -constexpr auto TT_NUM_STATES = static_cast(dfa_states::TT_NUM_STATES); +constexpr auto TT_NUM_STATES = static_cast(dfa_states::TT_NUM_STATES); constexpr auto NUM_SYMBOL_GROUPS = static_cast(dfa_symbol_group_id::NUM_SYMBOL_GROUPS); // The i-th string representing all the characters of a symbol group @@ -80,7 +73,7 @@ std::array, TT_NUM_STATES> const qna_s }}; // The DFA's starting state -constexpr char start_state = static_cast(TT_OOS); +constexpr auto start_state = static_cast(TT_OOS); struct TransduceToNormalizedQuotes { /** @@ -112,7 +105,7 @@ struct TransduceToNormalizedQuotes { // SEC | Sigma\{'} -> {\*} // Whether this transition translates to the escape sequence: \" - const bool outputs_escape_sequence = + bool const 
outputs_escape_sequence = (state_id == static_cast(dfa_states::TT_SQS)) && (match_id == static_cast(dfa_symbol_group_id::DOUBLE_QUOTE_CHAR)); // Case when a double quote needs to be replaced by the escape sequence: \" @@ -156,19 +149,19 @@ struct TransduceToNormalizedQuotes { SymbolT const read_symbol) const { // Whether this transition translates to the escape sequence: \" - const bool sqs_outputs_escape_sequence = + bool const sqs_outputs_escape_sequence = (state_id == static_cast(dfa_states::TT_SQS)) && (match_id == static_cast(dfa_symbol_group_id::DOUBLE_QUOTE_CHAR)); // Number of characters to output on this transition if (sqs_outputs_escape_sequence) { return 2; } // Whether this transition translates to the escape sequence \ or unescaped ' - const bool sec_outputs_escape_sequence = + bool const sec_outputs_escape_sequence = (state_id == static_cast(dfa_states::TT_SEC)) && (match_id != static_cast(dfa_symbol_group_id::SINGLE_QUOTE_CHAR)); // Number of characters to output on this transition if (sec_outputs_escape_sequence) { return 2; } // Whether this transition translates to no output - const bool sqs_outputs_nop = + bool const sqs_outputs_nop = (state_id == static_cast(dfa_states::TT_SQS)) && (match_id == static_cast(dfa_symbol_group_id::ESCAPE_CHAR)); // Number of characters to output on this transition @@ -177,156 +170,33 @@ struct TransduceToNormalizedQuotes { } }; -} // namespace +} // namespace normalize_quotes -// Base test fixture for tests -struct FstTest : public cudf::test::BaseFixture {}; +namespace detail { -void run_test(std::string& input, std::string& output) +rmm::device_uvector normalize_single_quotes(rmm::device_uvector&& inbuf, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - // Prepare cuda stream for data transfers & kernels - rmm::cuda_stream stream{}; - rmm::cuda_stream_view stream_view(stream); - - auto parser = cudf::io::fst::detail::make_fst( - cudf::io::fst::detail::make_symbol_group_lut(qna_sgs), - 
cudf::io::fst::detail::make_transition_table(qna_state_tt), - cudf::io::fst::detail::make_translation_functor(TransduceToNormalizedQuotes{}), + auto parser = fst::detail::make_fst( + fst::detail::make_symbol_group_lut(normalize_quotes::qna_sgs), + fst::detail::make_transition_table(normalize_quotes::qna_state_tt), + fst::detail::make_translation_functor(normalize_quotes::TransduceToNormalizedQuotes{}), stream); - auto d_input_scalar = cudf::make_string_scalar(input, stream_view); - auto& d_input = static_cast&>(*d_input_scalar); - - // Prepare input & output buffers - constexpr std::size_t single_item = 1; - cudf::detail::hostdevice_vector output_gpu(input.size() * 2, stream_view); - cudf::detail::hostdevice_vector output_gpu_size(single_item, stream_view); - - // Allocate device-side temporary storage & run algorithm - parser.Transduce(d_input.data(), - static_cast(d_input.size()), - output_gpu.device_ptr(), + rmm::device_uvector outbuf(inbuf.size() * 2, stream, mr); + rmm::device_scalar outbuf_size(stream, mr); + parser.Transduce(inbuf.data(), + static_cast(inbuf.size()), + outbuf.data(), thrust::make_discard_iterator(), - output_gpu_size.device_ptr(), - start_state, - stream_view); - - // Async copy results from device to host - output_gpu.device_to_host_async(stream_view); - output_gpu_size.device_to_host_async(stream_view); - - // Make sure results have been copied back to host - stream.synchronize(); - - // Verify results - ASSERT_EQ(output_gpu_size[0], output.size()); - CUDF_TEST_EXPECT_VECTOR_EQUAL(output_gpu, output, output.size()); -} - -TEST_F(FstTest, GroundTruth_QuoteNormalization1) -{ - std::string input = R"({"A":'TEST"'})"; - std::string output = R"({"A":"TEST\""})"; - run_test(input, output); -} - -TEST_F(FstTest, GroundTruth_QuoteNormalization2) -{ - std::string input = R"({'A':"TEST'"} ['OTHER STUFF'])"; - std::string output = R"({"A":"TEST'"} ["OTHER STUFF"])"; - run_test(input, output); -} - -TEST_F(FstTest, GroundTruth_QuoteNormalization3) -{ 
- std::string input = R"(['{"A": "B"}',"{'A': 'B'}"])"; - std::string output = R"(["{\"A\": \"B\"}","{'A': 'B'}"])"; - run_test(input, output); -} - -TEST_F(FstTest, GroundTruth_QuoteNormalization4) -{ - std::string input = R"({"ain't ain't a word and you ain't supposed to say it":'"""""""""""'})"; - std::string output = - R"({"ain't ain't a word and you ain't supposed to say it":"\"\"\"\"\"\"\"\"\"\"\""})"; - run_test(input, output); -} - -TEST_F(FstTest, GroundTruth_QuoteNormalization5) -{ - std::string input = R"({"\"'\"'\"'\"'":'"\'"\'"\'"\'"'})"; - std::string output = R"({"\"'\"'\"'\"'":"\"'\"'\"'\"'\""})"; - run_test(input, output); -} - -TEST_F(FstTest, GroundTruth_QuoteNormalization6) -{ - std::string input = R"([{"ABC':'CBA":'XYZ":"ZXY'}])"; - std::string output = R"([{"ABC':'CBA":"XYZ\":\"ZXY"}])"; - run_test(input, output); -} - -TEST_F(FstTest, GroundTruth_QuoteNormalization7) -{ - std::string input = R"(["\t","\\t","\\","\\\'\"\\\\","\n","\b"])"; - std::string output = R"(["\t","\\t","\\","\\\'\"\\\\","\n","\b"])"; - run_test(input, output); -} - -TEST_F(FstTest, GroundTruth_QuoteNormalization8) -{ - std::string input = R"(['\t','\\t','\\','\\\"\'\\\\','\n','\b','\u0012'])"; - std::string output = R"(["\t","\\t","\\","\\\"'\\\\","\n","\b","\u0012"])"; - run_test(input, output); -} - -TEST_F(FstTest, GroundTruth_QuoteNormalization_Invalid1) -{ - std::string input = R"(["THIS IS A TEST'])"; - std::string output = R"(["THIS IS A TEST'])"; - run_test(input, output); -} - -TEST_F(FstTest, GroundTruth_QuoteNormalization_Invalid2) -{ - std::string input = R"(['THIS IS A TEST"])"; - std::string output = R"(["THIS IS A TEST\"])"; - run_test(input, output); -} + outbuf_size.data(), + normalize_quotes::start_state, + stream); -TEST_F(FstTest, GroundTruth_QuoteNormalization_Invalid3) -{ - std::string input = R"({"MORE TEST'N":'RESUL})"; - std::string output = R"({"MORE TEST'N":"RESUL})"; - run_test(input, output); -} - -TEST_F(FstTest, 
GroundTruth_QuoteNormalization_Invalid4) -{ - std::string input = R"({"NUMBER":100'0,'STRING':'SOMETHING'})"; - std::string output = R"({"NUMBER":100"0,"STRING":"SOMETHING"})"; - run_test(input, output); -} - -TEST_F(FstTest, GroundTruth_QuoteNormalization_Invalid5) -{ - std::string input = R"({'NUMBER':100"0,"STRING":"SOMETHING"})"; - std::string output = R"({"NUMBER":100"0,"STRING":"SOMETHING"})"; - run_test(input, output); -} - -TEST_F(FstTest, GroundTruth_QuoteNormalization_Invalid6) -{ - std::string input = R"({'a':'\\''})"; - std::string output = R"({"a":"\\""})"; - run_test(input, output); -} - -TEST_F(FstTest, GroundTruth_QuoteNormalization_Invalid7) -{ - std::string input = R"(}'a': 'b'{)"; - std::string output = R"(}"a": "b"{)"; - run_test(input, output); + outbuf.resize(outbuf_size.value(stream), stream); + return outbuf; } -CUDF_TEST_PROGRAM_MAIN() +} // namespace detail +} // namespace cudf::io::json diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index 080da7800f4..2cfb5fa03c9 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -22,6 +22,7 @@ #include #include +#include #include #include @@ -45,6 +46,15 @@ size_t sources_size(host_span> const sources, }); } +/** + * @brief Read from array of data sources into RMM buffer + * + * @param sources Array of data sources + * @param compression Compression format of source + * @param range_offset Number of bytes to skip from source start + * @param range_size Number of bytes to read from source + * @param stream CUDA stream used for device memory operations and kernel launches + */ rmm::device_uvector ingest_raw_input(host_span> sources, compression_type compression, size_t range_offset, @@ -217,7 +227,14 @@ table_with_metadata read_json(host_span> sources, "Multiple inputs are supported only for JSON Lines format"); } - auto const buffer = get_record_range_raw_input(sources, reader_opts, stream); + auto buffer = get_record_range_raw_input(sources, reader_opts, stream); + + // If input JSON buffer has single quotes and option to normalize single quotes is enabled, + // invoke pre-processing FST + if (reader_opts.is_enabled_normalize_single_quotes()) { + buffer = + normalize_single_quotes(std::move(buffer), stream, rmm::mr::get_current_device_resource()); + } return device_parse_nested_json(buffer, reader_opts, stream, mr); // For debug purposes, use host_parse_nested_json() diff --git a/cpp/src/io/json/read_json.hpp b/cpp/src/io/json/read_json.hpp index db37e7abcdb..d05134fa837 100644 --- a/cpp/src/io/json/read_json.hpp +++ b/cpp/src/io/json/read_json.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index eee736613fe..24085eb5e10 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -313,13 +313,13 @@ ConfigureTest(JSON_TYPE_CAST_TEST io/json_type_cast_test.cu) ConfigureTest(NESTED_JSON_TEST io/nested_json_test.cpp io/json_tree.cpp) ConfigureTest(ARROW_IO_SOURCE_TEST io/arrow_io_source_test.cpp) ConfigureTest(MULTIBYTE_SPLIT_TEST io/text/multibyte_split_test.cpp) +ConfigureTest(JSON_QUOTE_NORMALIZATION io/json_quote_normalization_test.cpp) ConfigureTest( DATA_CHUNK_SOURCE_TEST io/text/data_chunk_source_test.cpp GPUS 1 PERCENT 30 ) target_link_libraries(DATA_CHUNK_SOURCE_TEST PRIVATE ZLIB::ZLIB) -ConfigureTest(QUOTE_NORMALIZATION_TEST io/fst/quote_normalization_test.cu) ConfigureTest(LOGICAL_STACK_TEST io/fst/logical_stack_test.cu) ConfigureTest(FST_TEST io/fst/fst_test.cu) ConfigureTest(TYPE_INFERENCE_TEST io/type_inference_test.cu) diff --git a/cpp/tests/io/json_quote_normalization_test.cpp b/cpp/tests/io/json_quote_normalization_test.cpp new file mode 100644 index 00000000000..50faea5e4d8 --- /dev/null +++ b/cpp/tests/io/json_quote_normalization_test.cpp @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +// Base test fixture for tests +struct JsonNormalizationTest : public cudf::test::BaseFixture {}; + +void run_test(const std::string& host_input, const std::string& expected_host_output) +{ + // RMM memory resource + std::shared_ptr rsc = + std::make_shared(); + + rmm::device_uvector device_input( + host_input.size(), cudf::test::get_default_stream(), rsc.get()); + CUDF_CUDA_TRY(cudaMemcpyAsync(device_input.data(), + host_input.data(), + host_input.size(), + cudaMemcpyHostToDevice, + cudf::test::get_default_stream().value())); + // Preprocessing FST + auto device_fst_output = cudf::io::json::detail::normalize_single_quotes( + std::move(device_input), cudf::test::get_default_stream(), rsc.get()); + + std::string preprocessed_host_output(device_fst_output.size(), 0); + CUDF_CUDA_TRY(cudaMemcpyAsync(preprocessed_host_output.data(), + device_fst_output.data(), + preprocessed_host_output.size(), + cudaMemcpyDeviceToHost, + cudf::test::get_default_stream().value())); + CUDF_TEST_EXPECT_VECTOR_EQUAL( + preprocessed_host_output, expected_host_output, preprocessed_host_output.size()); +} + +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization1) +{ + std::string input = R"({"A":'TEST"'})"; + std::string output = R"({"A":"TEST\""})"; + run_test(input, output); +} + +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization2) +{ + std::string input = R"({'A':"TEST'"} ['OTHER STUFF'])"; + std::string output = R"({"A":"TEST'"} ["OTHER STUFF"])"; + run_test(input, output); +} + +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization3) +{ + std::string input = R"(['{"A": "B"}',"{'A': 'B'}"])"; + std::string output = R"(["{\"A\": \"B\"}","{'A': 'B'}"])"; + run_test(input, output); +} + +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization4) +{ + std::string input = R"({"ain't ain't a word and you ain't supposed to say 
it":'"""""""""""'})"; + std::string output = + R"({"ain't ain't a word and you ain't supposed to say it":"\"\"\"\"\"\"\"\"\"\"\""})"; + run_test(input, output); +} + +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization5) +{ + std::string input = R"({"\"'\"'\"'\"'":'"\'"\'"\'"\'"'})"; + std::string output = R"({"\"'\"'\"'\"'":"\"'\"'\"'\"'\""})"; + run_test(input, output); +} + +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization6) +{ + std::string input = R"([{"ABC':'CBA":'XYZ":"ZXY'}])"; + std::string output = R"([{"ABC':'CBA":"XYZ\":\"ZXY"}])"; + run_test(input, output); +} + +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization7) +{ + std::string input = R"(["\t","\\t","\\","\\\'\"\\\\","\n","\b"])"; + std::string output = R"(["\t","\\t","\\","\\\'\"\\\\","\n","\b"])"; + run_test(input, output); +} + +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization8) +{ + std::string input = R"(['\t','\\t','\\','\\\"\'\\\\','\n','\b','\u0012'])"; + std::string output = R"(["\t","\\t","\\","\\\"'\\\\","\n","\b","\u0012"])"; + run_test(input, output); +} + +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid1) +{ + std::string input = R"(["THIS IS A TEST'])"; + std::string output = R"(["THIS IS A TEST'])"; + run_test(input, output); +} + +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid2) +{ + std::string input = R"(['THIS IS A TEST"])"; + std::string output = R"(["THIS IS A TEST\"])"; + run_test(input, output); +} + +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid3) +{ + std::string input = R"({"MORE TEST'N":'RESUL})"; + std::string output = R"({"MORE TEST'N":"RESUL})"; + run_test(input, output); +} + +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid4) +{ + std::string input = R"({"NUMBER":100'0,'STRING':'SOMETHING'})"; + std::string output = R"({"NUMBER":100"0,"STRING":"SOMETHING"})"; + run_test(input, output); +} + +TEST_F(JsonNormalizationTest, 
GroundTruth_QuoteNormalization_Invalid5) +{ + std::string input = R"({'NUMBER':100"0,"STRING":"SOMETHING"})"; + std::string output = R"({"NUMBER":100"0,"STRING":"SOMETHING"})"; + run_test(input, output); +} + +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid6) +{ + std::string input = R"({'a':'\\''})"; + std::string output = R"({"a":"\\""})"; + run_test(input, output); +} + +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid7) +{ + std::string input = R"(}'a': 'b'{)"; + std::string output = R"(}"a": "b"{)"; + run_test(input, output); +} + +TEST_F(JsonNormalizationTest, ReadJsonOption) +{ + // RMM memory resource + std::shared_ptr rsc = + std::make_shared(); + + // Test input + std::string const host_input = R"({"A":'TEST"'})"; + cudf::io::json_reader_options input_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{host_input.data(), host_input.size()}) + .lines(true) + .normalize_single_quotes(true); + + cudf::io::table_with_metadata processed_table = + cudf::io::read_json(input_options, cudf::test::get_default_stream(), rsc.get()); + + // Expected table + std::string const expected_input = R"({"A":"TEST\""})"; + cudf::io::json_reader_options expected_input_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{expected_input.data(), expected_input.size()}) + .lines(true); + + cudf::io::table_with_metadata expected_table = + cudf::io::read_json(expected_input_options, cudf::test::get_default_stream(), rsc.get()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table.tbl->view(), processed_table.tbl->view()); +} + +TEST_F(JsonNormalizationTest, ErrorCheck) +{ + // RMM memory resource + std::shared_ptr rsc = + std::make_shared(); + + // Test input + std::string const host_input = R"({"A":'TEST"'})"; + cudf::io::json_reader_options input_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{host_input.data(), host_input.size()}) + .lines(true); + + 
EXPECT_THROW(cudf::io::read_json(input_options, cudf::test::get_default_stream(), rsc.get()), + cudf::logic_error); +} + +CUDF_TEST_PROGRAM_MAIN() diff --git a/java/src/main/java/ai/rapids/cudf/JSONOptions.java b/java/src/main/java/ai/rapids/cudf/JSONOptions.java index 523d594f8ba..35165c18c7a 100644 --- a/java/src/main/java/ai/rapids/cudf/JSONOptions.java +++ b/java/src/main/java/ai/rapids/cudf/JSONOptions.java @@ -30,6 +30,7 @@ public final class JSONOptions extends ColumnFilterOptions { private final boolean dayFirst; private final boolean lines; private final boolean recoverWithNull; + private final boolean normalizeSingleQuotes; private final boolean mixedTypesAsStrings; private JSONOptions(Builder builder) { @@ -37,6 +38,7 @@ private JSONOptions(Builder builder) { dayFirst = builder.dayFirst; lines = builder.lines; recoverWithNull = builder.recoverWithNull; + normalizeSingleQuotes = builder.normalizeSingleQuotes; mixedTypesAsStrings = builder.mixedTypesAsStrings; } @@ -53,6 +55,10 @@ public boolean isRecoverWithNull() { return recoverWithNull; } + public boolean isNormalizeSingleQuotes() { + return normalizeSingleQuotes; + } + public boolean isMixedTypesAsStrings() { return mixedTypesAsStrings; } @@ -71,6 +77,7 @@ public static final class Builder extends ColumnFilterOptions.Builder= 0 && offset < buffer.length; return new TableWithMeta(readAndInferJSON(buffer.getAddress() + offset, len, opts.isDayFirst(), opts.isLines(), opts.isRecoverWithNull(), + opts.isNormalizeSingleQuotes(), opts.isMixedTypesAsStrings())); } @@ -1166,7 +1170,8 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b try (TableWithMeta twm = new TableWithMeta(readJSON(schema.getColumnNames(), schema.getTypeIds(), schema.getTypeScales(), null, buffer.getAddress() + offset, len, opts.isDayFirst(), opts.isLines(), - opts.isRecoverWithNull(), opts.isMixedTypesAsStrings()))) { + opts.isRecoverWithNull(), opts.isNormalizeSingleQuotes(), + 
opts.isMixedTypesAsStrings()))) { return gatherJSONColumns(schema, twm); } } @@ -1182,7 +1187,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds) { long dsHandle = DataSourceHelper.createWrapperDataSource(ds); try (TableWithMeta twm = new TableWithMeta(readJSONFromDataSource(schema.getColumnNames(), schema.getTypeIds(), schema.getTypeScales(), opts.isDayFirst(), opts.isLines(), - opts.isRecoverWithNull(), opts.isMixedTypesAsStrings(), dsHandle))) { + opts.isRecoverWithNull(), opts.isNormalizeSingleQuotes(), opts.isMixedTypesAsStrings(), dsHandle))) { return gatherJSONColumns(schema, twm); } finally { DataSourceHelper.destroyWrapperDataSource(dsHandle); diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 1ac15a3023c..cef18b245e7 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1392,7 +1392,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_endWriteCSVToBuffer(JNIEnv *env JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON( JNIEnv *env, jclass, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines, - jboolean recover_with_null, jboolean mixed_types_as_string) { + jboolean recover_with_null, jboolean normalize_single_quotes, jboolean mixed_types_as_string) { JNI_NULL_CHECK(env, buffer, "buffer cannot be null", 0); if (buffer_length <= 0) { @@ -1408,11 +1408,13 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON( auto const recovery_mode = recover_with_null ? 
cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL : cudf::io::json_recovery_mode_t::FAIL; - cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source) - .dayfirst(static_cast(day_first)) - .lines(static_cast(lines)) - .recovery_mode(recovery_mode) - .mixed_types_as_string(mixed_types_as_string); + cudf::io::json_reader_options_builder opts = + cudf::io::json_reader_options::builder(source) + .dayfirst(static_cast(day_first)) + .lines(static_cast(lines)) + .recovery_mode(recovery_mode) + .normalize_single_quotes(static_cast(normalize_single_quotes)) + .mixed_types_as_string(mixed_types_as_string); auto result = std::make_unique(cudf::io::read_json(opts.build())); @@ -1470,8 +1472,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_TableWithMeta_releaseTable(JNIE JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource( JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales, - jboolean day_first, jboolean lines, jboolean recover_with_null, jboolean mixed_types_as_string, - jlong ds_handle) { + jboolean day_first, jboolean lines, jboolean recover_with_null, + jboolean normalize_single_quotes, jboolean mixed_types_as_string, jlong ds_handle) { JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0); @@ -1503,11 +1505,13 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource( cudf::io::json_recovery_mode_t recovery_mode = recover_with_null ? 
cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL : cudf::io::json_recovery_mode_t::FAIL; - cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source) - .dayfirst(static_cast(day_first)) - .lines(static_cast(lines)) - .recovery_mode(recovery_mode) - .mixed_types_as_string(mixed_types_as_string); + cudf::io::json_reader_options_builder opts = + cudf::io::json_reader_options::builder(source) + .dayfirst(static_cast(day_first)) + .lines(static_cast(lines)) + .recovery_mode(recovery_mode) + .normalize_single_quotes(static_cast(normalize_single_quotes)) + .mixed_types_as_string(mixed_types_as_string); if (!n_col_names.is_null() && data_types.size() > 0) { if (n_col_names.size() != n_types.size()) { @@ -1539,7 +1543,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource( JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON( JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales, jstring inputfilepath, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines, - jboolean recover_with_null, jboolean mixed_types_as_string) { + jboolean recover_with_null, jboolean normalize_single_quotes, jboolean mixed_types_as_string) { bool read_buffer = true; if (buffer == 0) { @@ -1586,11 +1590,13 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON( cudf::io::json_recovery_mode_t recovery_mode = recover_with_null ? 
cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL : cudf::io::json_recovery_mode_t::FAIL; - cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source) - .dayfirst(static_cast(day_first)) - .lines(static_cast(lines)) - .recovery_mode(recovery_mode) - .mixed_types_as_string(mixed_types_as_string); + cudf::io::json_reader_options_builder opts = + cudf::io::json_reader_options::builder(source) + .dayfirst(static_cast(day_first)) + .lines(static_cast(lines)) + .recovery_mode(recovery_mode) + .normalize_single_quotes(static_cast(normalize_single_quotes)) + .mixed_types_as_string(mixed_types_as_string); if (!n_col_names.is_null() && data_types.size() > 0) { if (n_col_names.size() != n_types.size()) { diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 73002644858..f1c4d0803a3 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -87,6 +87,7 @@ public class TableTest extends CudfTestBase { private static final File TEST_SIMPLE_CSV_FILE = TestUtils.getResourceAsFile("simple.csv"); private static final File TEST_SIMPLE_JSON_FILE = TestUtils.getResourceAsFile("people.json"); private static final File TEST_JSON_ERROR_FILE = TestUtils.getResourceAsFile("people_with_invalid_lines.json"); + private static final File TEST_JSON_SINGLE_QUOTES_FILE = TestUtils.getResourceAsFile("single_quotes.json"); private static final File TEST_MIXED_TYPE_1_JSON = TestUtils.getResourceAsFile("mixed_types_1.json"); private static final File TEST_MIXED_TYPE_2_JSON = TestUtils.getResourceAsFile("mixed_types_2.json"); @@ -330,6 +331,23 @@ void testReadJSONFile() { } @Test + void testReadSingleQuotesJSONFile() throws IOException { + Schema schema = Schema.builder() + .column(DType.STRING, "A") + .build(); + JSONOptions opts = JSONOptions.builder() + .withLines(true) + .withNormalizeSingleQuotes(true) + .build(); + try (Table expected = new 
Table.TestBuilder() + .column("TEST\"", "TESTER'") + .build(); + MultiBufferDataSource source = sourceFrom(TEST_JSON_SINGLE_QUOTES_FILE); + Table table = Table.readJSON(schema, opts, source)) { + assertTablesAreEqual(expected, table); + } + } + void testReadMixedType2JSONFileFeatureDisabled() { Schema schema = Schema.builder() .column(DType.STRING, "a") @@ -377,6 +395,21 @@ void testReadMixedType2JSONFile() throws IOException { } } + @Test + void testReadSingleQuotesJSONFileFeatureDisabled() throws IOException { + Schema schema = Schema.builder() + .column(DType.STRING, "A") + .build(); + JSONOptions opts = JSONOptions.builder() + .withLines(true) + .withNormalizeSingleQuotes(false) + .build(); + try (MultiBufferDataSource source = sourceFrom(TEST_JSON_SINGLE_QUOTES_FILE)) { + assertThrows(CudfException.class, () -> + Table.readJSON(schema, opts, source)); + } + } + @Test void testReadJSONFromDataSource() throws IOException { Schema schema = Schema.builder() diff --git a/java/src/test/js b/java/src/test/js new file mode 100644 index 00000000000..e69de29bb2d diff --git a/java/src/test/resources/single_quotes.json b/java/src/test/resources/single_quotes.json new file mode 100644 index 00000000000..cb432fbc643 --- /dev/null +++ b/java/src/test/resources/single_quotes.json @@ -0,0 +1,2 @@ +{"A":'TEST"'} +{'A':"TESTER'"}