From a1120f6f1dc8447f3ff295fe0e6bd88166bec92f Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Tue, 9 Jan 2024 18:20:35 +0000 Subject: [PATCH 01/26] single quote normalization api --- cpp/src/io/json/json_quote_normalization.cu | 200 ++++++++++++++++++++ cpp/src/io/json/read_json.hpp | 1 + 2 files changed, 201 insertions(+) create mode 100644 cpp/src/io/json/json_quote_normalization.cu diff --git a/cpp/src/io/json/json_quote_normalization.cu b/cpp/src/io/json/json_quote_normalization.cu new file mode 100644 index 00000000000..321495a4bce --- /dev/null +++ b/cpp/src/io/json/json_quote_normalization.cu @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include + +namespace cudf::io::json { + + using SymbolT = char; + using StateT = char; + using SymbolOffsetT = uint32_t; + +namespace normalize_quotes { + + // Type sufficiently large to index symbols within the input and output (may be unsigned) + enum class dfa_states : char { TT_OOS = 0U, TT_DQS, TT_SQS, TT_DEC, TT_SEC, TT_NUM_STATES }; + enum class dfa_symbol_group_id : uint32_t { + DOUBLE_QUOTE_CHAR, ///< Quote character SG: " + SINGLE_QUOTE_CHAR, ///< Quote character SG: ' + ESCAPE_CHAR, ///< Escape character SG: '\' + NEWLINE_CHAR, ///< Newline character SG: '\n' + OTHER_SYMBOLS, ///< SG implicitly matching all other characters + NUM_SYMBOL_GROUPS ///< Total number of symbol groups + }; + + // Aliases for readability of the transition table + constexpr auto TT_OOS = dfa_states::TT_OOS; + constexpr auto TT_DQS = dfa_states::TT_DQS; + constexpr auto TT_SQS = dfa_states::TT_SQS; + constexpr auto TT_DEC = dfa_states::TT_DEC; + constexpr auto TT_SEC = dfa_states::TT_SEC; + constexpr auto TT_NUM_STATES = static_cast(dfa_states::TT_NUM_STATES); + constexpr auto NUM_SYMBOL_GROUPS = static_cast(dfa_symbol_group_id::NUM_SYMBOL_GROUPS); + + // The i-th string representing all the characters of a symbol group + std::array, NUM_SYMBOL_GROUPS - 1> const qna_sgs{ + {{'\"'}, {'\''}, {'\\'}, {'\n'}}}; + + // Transition table + std::array, TT_NUM_STATES> const qna_state_tt{{ + /* IN_STATE " ' \ \n OTHER */ + /* TT_OOS */ {{TT_DQS, TT_SQS, TT_OOS, TT_OOS, TT_OOS}}, + /* TT_DQS */ {{TT_OOS, TT_DQS, TT_DEC, TT_OOS, TT_DQS}}, + /* TT_SQS */ {{TT_SQS, TT_OOS, TT_SEC, TT_OOS, TT_SQS}}, + /* TT_DEC */ {{TT_DQS, TT_DQS, TT_DQS, TT_OOS, TT_DQS}}, + /* TT_SEC */ {{TT_SQS, TT_SQS, TT_SQS, TT_OOS, TT_SQS}}, + }}; + + // The DFA's starting state + constexpr char start_state = static_cast(TT_OOS); + + struct TransduceToNormalizedQuotes { + /** + * @brief Returns the -th output symbol on the transition (state_id, match_id). + */ + template + constexpr CUDF_HOST_DEVICE SymbolT operator()(StateT const state_id, + SymbolGroupT const match_id, + RelativeOffsetT const relative_offset, + SymbolT const read_symbol) const + { + // -------- TRANSLATION TABLE ------------ + // Let the alphabet set be Sigma + // --------------------------------------- + // ---------- NON-SPECIAL CASES: ---------- + // Output symbol same as input symbol + // state | read_symbol -> output_symbol + // DQS | Sigma -> Sigma + // DEC | Sigma -> Sigma + // OOS | Sigma\{'} -> Sigma\{'} + // SQS | Sigma\{', "} -> Sigma\{', "} + // ---------- SPECIAL CASES: -------------- + // Input symbol translates to output symbol + // OOS | {'} -> {"} + // SQS | {'} -> {"} + // SQS | {"} -> {\"} + // SQS | {\} -> + // SEC | {'} -> {'} + // SEC | Sigma\{'} -> {\*} + + // Whether this transition translates to the escape sequence: \" + const bool outputs_escape_sequence = + (state_id == static_cast(dfa_states::TT_SQS)) && + (match_id == static_cast(dfa_symbol_group_id::DOUBLE_QUOTE_CHAR)); + // Case when a double quote needs to be replaced by the escape sequence: \" + if (outputs_escape_sequence) { return (relative_offset == 0) ? '\\' : '"'; } + // Case when a single quote needs to be replaced by a double quote + if ((match_id == static_cast(dfa_symbol_group_id::SINGLE_QUOTE_CHAR)) && + ((state_id == static_cast(dfa_states::TT_SQS)) || + (state_id == static_cast(dfa_states::TT_OOS)))) { + return '"'; + } + // Case when the read symbol is an escape character - the actual translation for \ for some + // symbol is handled by transitions from SEC. For now, there is no output for this + // transition + if ((match_id == static_cast(dfa_symbol_group_id::ESCAPE_CHAR)) && + ((state_id == static_cast(dfa_states::TT_SQS)))) { + return 0; + } + // Case when an escaped single quote in an input single-quoted string needs to be replaced by an + // unescaped single quote + if ((match_id == static_cast(dfa_symbol_group_id::SINGLE_QUOTE_CHAR)) && + ((state_id == static_cast(dfa_states::TT_SEC)))) { + return '\''; + } + // Case when an escaped symbol that is not a single-quote needs to be replaced with \ + if (state_id == static_cast(dfa_states::TT_SEC)) { + return (relative_offset == 0) ? '\\' : read_symbol; + } + // In all other cases we simply output the input symbol + return read_symbol; + } + + /** + * @brief Returns the number of output characters for a given transition. During quote + * normalization, we always emit one output character (i.e., either the input character or the + * single quote-input replaced by a double quote), except when we need to escape a double quote + * that was previously inside a single-quoted string. + */ + template + constexpr CUDF_HOST_DEVICE int32_t operator()(StateT const state_id, + SymbolGroupT const match_id, + SymbolT const read_symbol) const + { + // Whether this transition translates to the escape sequence: \" + const bool sqs_outputs_escape_sequence = + (state_id == static_cast(dfa_states::TT_SQS)) && + (match_id == static_cast(dfa_symbol_group_id::DOUBLE_QUOTE_CHAR)); + // Number of characters to output on this transition + if (sqs_outputs_escape_sequence) { return 2; } + // Whether this transition translates to the escape sequence \ or unescaped ' + const bool sec_outputs_escape_sequence = + (state_id == static_cast(dfa_states::TT_SEC)) && + (match_id != static_cast(dfa_symbol_group_id::SINGLE_QUOTE_CHAR)); + // Number of characters to output on this transition + if (sec_outputs_escape_sequence) { return 2; } + // Whether this transition translates to no output + const bool sqs_outputs_nop = + (state_id == static_cast(dfa_states::TT_SQS)) && + (match_id == static_cast(dfa_symbol_group_id::ESCAPE_CHAR)); + // Number of characters to output on this transition + if (sqs_outputs_nop) { return 0; } + return 1; + } + }; + +} // namespace normalize_quotes + +namespace detail { + +std::unique_ptr normalize_quotes(const rmm::device_buffer &inbuf, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { + + auto parser = cudf::io::fst::detail::make_fst( + cudf::io::fst::detail::make_symbol_group_lut(qna_sgs), + cudf::io::fst::detail::make_transition_table(qna_state_tt), + cudf::io::fst::detail::make_translation_functor(TransduceToNormalizedQuotes{}), + stream); + + std::unique_ptr outbuf_ptr(inbuf.size() * 2, stream, mr); + parser.Transduce(inbuf.data(), + static_cast(inbuf.size()), + outbuf_ptr.data(), + thrust::make_discard_iterator(), + thrust::make_discard_iterator(), + normalize_quotes::start_state, + stream); + + return outbuf_ptr; +} + +} // namespace detail +} // namespace cudf::io::json diff --git a/cpp/src/io/json/read_json.hpp b/cpp/src/io/json/read_json.hpp index db37e7abcdb..f890529da98 100644 --- a/cpp/src/io/json/read_json.hpp +++ b/cpp/src/io/json/read_json.hpp @@ -42,4 +42,5 @@ size_type find_first_delimiter_in_chunk(host_span outbuf, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr); } // namespace cudf::io::json::detail From c6b0ba331d434a032d2e68b8ab3b987cfa251aab Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Wed, 10 Jan 2024 00:58:44 +0000 Subject: [PATCH 02/26] test for normalization api --- cpp/src/io/json/read_json.hpp | 4 +- .../io/json_quote_normalization_test.cpp | 66 +++++++++++++++++++ 2 files changed, 68 insertions(+), 2 deletions(-) create mode 100644 cpp/tests/io/json_quote_normalization_test.cpp diff --git a/cpp/src/io/json/read_json.hpp b/cpp/src/io/json/read_json.hpp index f890529da98..8d2d0c466a1 100644 --- a/cpp/src/io/json/read_json.hpp +++ b/cpp/src/io/json/read_json.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -42,5 +42,5 @@ size_type find_first_delimiter_in_chunk(host_span outbuf, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr); +std::unique_ptr normalize_quotes(const rmm::device_buffer &inbuf, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr); } // namespace cudf::io::json::detail diff --git a/cpp/tests/io/json_quote_normalization_test.cpp b/cpp/tests/io/json_quote_normalization_test.cpp new file mode 100644 index 00000000000..68c2b13233c --- /dev/null +++ b/cpp/tests/io/json_quote_normalization_test.cpp @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include + +// Base test fixture for tests +struct JsonNormalizationTest : public cudf::test::BaseFixture {}; + +TEST_F(JsonNormalizationTest, Valid) +{ + // Test input + std::string const input = R"({"A":'TEST"'})"; + auto device_input_ptr = cudf::make_string_scalar(input, cudf::test::get_default_stream()); + auto& device_input = static_cast&>(*device_input_ptr); + + // RMM memory resource + std::shared_ptr rsc = + std::make_shared(); + + auto device_fst_output_ptr = + normalize_quotes(device_input.data(), cudf::test::get_default_stream(), rsc.get()); + // Initialize parsing options (reading json lines) + cudf::io::json_reader_options input_options = cudf::io::json_reader_options::builder( + cudf::io::source_info{device_span(*device_fst_output_ptr)}); + + cudf::io::table_with_metadata processed_table = + cudf::io::read_json(input_options, cudf::test::get_default_stream(), rsc); +} + +CUDF_TEST_PROGRAM_MAIN() From d9a8acf677ae8a6ad404c3e6f77674000e3e0af5 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Thu, 11 Jan 2024 00:24:22 +0000 Subject: [PATCH 03/26] fixes to test --- cpp/CMakeLists.txt | 3 +- cpp/include/cudf/io/detail/json.hpp | 6 +- cpp/src/io/json/json_quote_normalization.cu | 296 +++++++++--------- cpp/src/io/json/read_json.hpp | 3 +- cpp/tests/CMakeLists.txt | 1 + .../io/json_quote_normalization_test.cpp | 47 ++- 6 files changed, 183 insertions(+), 173 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index a7c34ca489c..c0c721a830f 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -376,6 +376,7 @@ add_library( src/io/json/legacy/json_gpu.cu src/io/json/legacy/reader_impl.cu src/io/json/write_json.cu + src/io/json/json_quote_normalization.cu src/io/orc/aggregate_orc_metadata.cpp src/io/orc/dict_enc.cu src/io/orc/orc.cpp diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index d0a9543397d..ea72304807d 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -51,4 +51,8 @@ void write_json(data_sink* sink, json_writer_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); + +std::unique_ptr> normalize_quotes(const cudf::device_span& inbuf, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace cudf::io::json::detail diff --git a/cpp/src/io/json/json_quote_normalization.cu b/cpp/src/io/json/json_quote_normalization.cu index 321495a4bce..2ddd27b9ec3 100644 --- a/cpp/src/io/json/json_quote_normalization.cu +++ b/cpp/src/io/json/json_quote_normalization.cu @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -34,167 +35,172 @@ namespace cudf::io::json { - using SymbolT = char; - using StateT = char; - using SymbolOffsetT = uint32_t; +using SymbolT = char; +using StateT = char; +using SymbolOffsetT = uint32_t; namespace normalize_quotes { - // Type sufficiently large to index symbols within the input and output (may be unsigned) - enum class dfa_states : char { TT_OOS = 0U, TT_DQS, TT_SQS, TT_DEC, TT_SEC, TT_NUM_STATES }; - enum class dfa_symbol_group_id : uint32_t { - DOUBLE_QUOTE_CHAR, ///< Quote character SG: " - SINGLE_QUOTE_CHAR, ///< Quote character SG: ' - ESCAPE_CHAR, ///< Escape character SG: '\' - NEWLINE_CHAR, ///< Newline character SG: '\n' - OTHER_SYMBOLS, ///< SG implicitly matching all other characters - NUM_SYMBOL_GROUPS ///< Total number of symbol groups - }; - - // Aliases for readability of the transition table - constexpr auto TT_OOS = dfa_states::TT_OOS; - constexpr auto TT_DQS = dfa_states::TT_DQS; - constexpr auto TT_SQS = dfa_states::TT_SQS; - constexpr auto TT_DEC = dfa_states::TT_DEC; - constexpr auto TT_SEC = dfa_states::TT_SEC; - constexpr auto TT_NUM_STATES = static_cast(dfa_states::TT_NUM_STATES); - constexpr auto NUM_SYMBOL_GROUPS = static_cast(dfa_symbol_group_id::NUM_SYMBOL_GROUPS); - - // The i-th string representing all the characters of a symbol group - std::array, NUM_SYMBOL_GROUPS - 1> const qna_sgs{ - {{'\"'}, {'\''}, {'\\'}, {'\n'}}}; - - // Transition table - std::array, TT_NUM_STATES> const qna_state_tt{{ - /* IN_STATE " ' \ \n OTHER */ - /* TT_OOS */ {{TT_DQS, TT_SQS, TT_OOS, TT_OOS, TT_OOS}}, - /* TT_DQS */ {{TT_OOS, TT_DQS, TT_DEC, TT_OOS, TT_DQS}}, - /* TT_SQS */ {{TT_SQS, TT_OOS, TT_SEC, TT_OOS, TT_SQS}}, - /* TT_DEC */ {{TT_DQS, TT_DQS, TT_DQS, TT_OOS, TT_DQS}}, - /* TT_SEC */ {{TT_SQS, TT_SQS, TT_SQS, TT_OOS, TT_SQS}}, - }}; - - // The DFA's starting state - constexpr char start_state = static_cast(TT_OOS); - - struct TransduceToNormalizedQuotes { - /** - * @brief Returns the -th output symbol on the transition (state_id, match_id). - */ - template - constexpr CUDF_HOST_DEVICE SymbolT operator()(StateT const state_id, - SymbolGroupT const match_id, - RelativeOffsetT const relative_offset, - SymbolT const read_symbol) const - { - // -------- TRANSLATION TABLE ------------ - // Let the alphabet set be Sigma - // --------------------------------------- - // ---------- NON-SPECIAL CASES: ---------- - // Output symbol same as input symbol - // state | read_symbol -> output_symbol - // DQS | Sigma -> Sigma - // DEC | Sigma -> Sigma - // OOS | Sigma\{'} -> Sigma\{'} - // SQS | Sigma\{', "} -> Sigma\{', "} - // ---------- SPECIAL CASES: -------------- - // Input symbol translates to output symbol - // OOS | {'} -> {"} - // SQS | {'} -> {"} - // SQS | {"} -> {\"} - // SQS | {\} -> - // SEC | {'} -> {'} - // SEC | Sigma\{'} -> {\*} - - // Whether this transition translates to the escape sequence: \" - const bool outputs_escape_sequence = - (state_id == static_cast(dfa_states::TT_SQS)) && - (match_id == static_cast(dfa_symbol_group_id::DOUBLE_QUOTE_CHAR)); - // Case when a double quote needs to be replaced by the escape sequence: \" - if (outputs_escape_sequence) { return (relative_offset == 0) ? '\\' : '"'; } - // Case when a single quote needs to be replaced by a double quote - if ((match_id == static_cast(dfa_symbol_group_id::SINGLE_QUOTE_CHAR)) && - ((state_id == static_cast(dfa_states::TT_SQS)) || - (state_id == static_cast(dfa_states::TT_OOS)))) { - return '"'; - } - // Case when the read symbol is an escape character - the actual translation for \ for some - // symbol is handled by transitions from SEC. For now, there is no output for this - // transition - if ((match_id == static_cast(dfa_symbol_group_id::ESCAPE_CHAR)) && - ((state_id == static_cast(dfa_states::TT_SQS)))) { - return 0; - } - // Case when an escaped single quote in an input single-quoted string needs to be replaced by an - // unescaped single quote - if ((match_id == static_cast(dfa_symbol_group_id::SINGLE_QUOTE_CHAR)) && - ((state_id == static_cast(dfa_states::TT_SEC)))) { - return '\''; - } - // Case when an escaped symbol that is not a single-quote needs to be replaced with \ - if (state_id == static_cast(dfa_states::TT_SEC)) { - return (relative_offset == 0) ? '\\' : read_symbol; - } - // In all other cases we simply output the input symbol - return read_symbol; +// Type sufficiently large to index symbols within the input and output (may be unsigned) +enum class dfa_states : char { TT_OOS = 0U, TT_DQS, TT_SQS, TT_DEC, TT_SEC, TT_NUM_STATES }; +enum class dfa_symbol_group_id : uint32_t { + DOUBLE_QUOTE_CHAR, ///< Quote character SG: " + SINGLE_QUOTE_CHAR, ///< Quote character SG: ' + ESCAPE_CHAR, ///< Escape character SG: '\' + NEWLINE_CHAR, ///< Newline character SG: '\n' + OTHER_SYMBOLS, ///< SG implicitly matching all other characters + NUM_SYMBOL_GROUPS ///< Total number of symbol groups +}; + +// Aliases for readability of the transition table +constexpr auto TT_OOS = dfa_states::TT_OOS; +constexpr auto TT_DQS = dfa_states::TT_DQS; +constexpr auto TT_SQS = dfa_states::TT_SQS; +constexpr auto TT_DEC = dfa_states::TT_DEC; +constexpr auto TT_SEC = dfa_states::TT_SEC; +constexpr auto TT_NUM_STATES = static_cast(dfa_states::TT_NUM_STATES); +constexpr auto NUM_SYMBOL_GROUPS = static_cast(dfa_symbol_group_id::NUM_SYMBOL_GROUPS); + +// The i-th string representing all the characters of a symbol group +std::array, NUM_SYMBOL_GROUPS - 1> const qna_sgs{ + {{'\"'}, {'\''}, {'\\'}, {'\n'}}}; + +// Transition table +std::array, TT_NUM_STATES> const qna_state_tt{{ + /* IN_STATE " ' \ \n OTHER */ + /* TT_OOS */ {{TT_DQS, TT_SQS, TT_OOS, TT_OOS, TT_OOS}}, + /* TT_DQS */ {{TT_OOS, TT_DQS, TT_DEC, TT_OOS, TT_DQS}}, + /* TT_SQS */ {{TT_SQS, TT_OOS, TT_SEC, TT_OOS, TT_SQS}}, + /* TT_DEC */ {{TT_DQS, TT_DQS, TT_DQS, TT_OOS, TT_DQS}}, + /* TT_SEC */ {{TT_SQS, TT_SQS, TT_SQS, TT_OOS, TT_SQS}}, +}}; + +// The DFA's starting state +constexpr char start_state = static_cast(TT_OOS); + +struct TransduceToNormalizedQuotes { + /** + * @brief Returns the -th output symbol on the transition (state_id, match_id). + */ + template + constexpr CUDF_HOST_DEVICE SymbolT operator()(StateT const state_id, + SymbolGroupT const match_id, + RelativeOffsetT const relative_offset, + SymbolT const read_symbol) const + { + // -------- TRANSLATION TABLE ------------ + // Let the alphabet set be Sigma + // --------------------------------------- + // ---------- NON-SPECIAL CASES: ---------- + // Output symbol same as input symbol + // state | read_symbol -> output_symbol + // DQS | Sigma -> Sigma + // DEC | Sigma -> Sigma + // OOS | Sigma\{'} -> Sigma\{'} + // SQS | Sigma\{', "} -> Sigma\{', "} + // ---------- SPECIAL CASES: -------------- + // Input symbol translates to output symbol + // OOS | {'} -> {"} + // SQS | {'} -> {"} + // SQS | {"} -> {\"} + // SQS | {\} -> + // SEC | {'} -> {'} + // SEC | Sigma\{'} -> {\*} + + // Whether this transition translates to the escape sequence: \" + const bool outputs_escape_sequence = + (state_id == static_cast(dfa_states::TT_SQS)) && + (match_id == static_cast(dfa_symbol_group_id::DOUBLE_QUOTE_CHAR)); + // Case when a double quote needs to be replaced by the escape sequence: \" + if (outputs_escape_sequence) { return (relative_offset == 0) ? '\\' : '"'; } + // Case when a single quote needs to be replaced by a double quote + if ((match_id == static_cast(dfa_symbol_group_id::SINGLE_QUOTE_CHAR)) && + ((state_id == static_cast(dfa_states::TT_SQS)) || + (state_id == static_cast(dfa_states::TT_OOS)))) { + return '"'; } - - /** - * @brief Returns the number of output characters for a given transition. During quote - * normalization, we always emit one output character (i.e., either the input character or the - * single quote-input replaced by a double quote), except when we need to escape a double quote - * that was previously inside a single-quoted string. - */ - template - constexpr CUDF_HOST_DEVICE int32_t operator()(StateT const state_id, - SymbolGroupT const match_id, - SymbolT const read_symbol) const - { - // Whether this transition translates to the escape sequence: \" - const bool sqs_outputs_escape_sequence = - (state_id == static_cast(dfa_states::TT_SQS)) && - (match_id == static_cast(dfa_symbol_group_id::DOUBLE_QUOTE_CHAR)); - // Number of characters to output on this transition - if (sqs_outputs_escape_sequence) { return 2; } - // Whether this transition translates to the escape sequence \ or unescaped ' - const bool sec_outputs_escape_sequence = - (state_id == static_cast(dfa_states::TT_SEC)) && - (match_id != static_cast(dfa_symbol_group_id::SINGLE_QUOTE_CHAR)); - // Number of characters to output on this transition - if (sec_outputs_escape_sequence) { return 2; } - // Whether this transition translates to no output - const bool sqs_outputs_nop = - (state_id == static_cast(dfa_states::TT_SQS)) && - (match_id == static_cast(dfa_symbol_group_id::ESCAPE_CHAR)); - // Number of characters to output on this transition - if (sqs_outputs_nop) { return 0; } - return 1; + // Case when the read symbol is an escape character - the actual translation for \ for some + // symbol is handled by transitions from SEC. For now, there is no output for this + // transition + if ((match_id == static_cast(dfa_symbol_group_id::ESCAPE_CHAR)) && + ((state_id == static_cast(dfa_states::TT_SQS)))) { + return 0; } - }; - -} // namespace normalize_quotes + // Case when an escaped single quote in an input single-quoted string needs to be replaced by an + // unescaped single quote + if ((match_id == static_cast(dfa_symbol_group_id::SINGLE_QUOTE_CHAR)) && + ((state_id == static_cast(dfa_states::TT_SEC)))) { + return '\''; + } + // Case when an escaped symbol that is not a single-quote needs to be replaced with \ + if (state_id == static_cast(dfa_states::TT_SEC)) { + return (relative_offset == 0) ? '\\' : read_symbol; + } + // In all other cases we simply output the input symbol + return read_symbol; + } + + /** + * @brief Returns the number of output characters for a given transition. During quote + * normalization, we always emit one output character (i.e., either the input character or the + * single quote-input replaced by a double quote), except when we need to escape a double quote + * that was previously inside a single-quoted string. + */ + template + constexpr CUDF_HOST_DEVICE int32_t operator()(StateT const state_id, + SymbolGroupT const match_id, + SymbolT const read_symbol) const + { + // Whether this transition translates to the escape sequence: \" + const bool sqs_outputs_escape_sequence = + (state_id == static_cast(dfa_states::TT_SQS)) && + (match_id == static_cast(dfa_symbol_group_id::DOUBLE_QUOTE_CHAR)); + // Number of characters to output on this transition + if (sqs_outputs_escape_sequence) { return 2; } + // Whether this transition translates to the escape sequence \ or unescaped ' + const bool sec_outputs_escape_sequence = + (state_id == static_cast(dfa_states::TT_SEC)) && + (match_id != static_cast(dfa_symbol_group_id::SINGLE_QUOTE_CHAR)); + // Number of characters to output on this transition + if (sec_outputs_escape_sequence) { return 2; } + // Whether this transition translates to no output + const bool sqs_outputs_nop = + (state_id == static_cast(dfa_states::TT_SQS)) && + (match_id == static_cast(dfa_symbol_group_id::ESCAPE_CHAR)); + // Number of characters to output on this transition + if (sqs_outputs_nop) { return 0; } + return 1; + } +}; + +} // namespace normalize_quotes namespace detail { -std::unique_ptr normalize_quotes(const rmm::device_buffer &inbuf, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { - +std::unique_ptr> normalize_quotes( + const cudf::device_span& inbuf, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ auto parser = cudf::io::fst::detail::make_fst( - cudf::io::fst::detail::make_symbol_group_lut(qna_sgs), - cudf::io::fst::detail::make_transition_table(qna_state_tt), - cudf::io::fst::detail::make_translation_functor(TransduceToNormalizedQuotes{}), + cudf::io::fst::detail::make_symbol_group_lut(cudf::io::json::normalize_quotes::qna_sgs), + cudf::io::fst::detail::make_transition_table(cudf::io::json::normalize_quotes::qna_state_tt), + cudf::io::fst::detail::make_translation_functor( + cudf::io::json::normalize_quotes::TransduceToNormalizedQuotes{}), stream); - std::unique_ptr outbuf_ptr(inbuf.size() * 2, stream, mr); - parser.Transduce(inbuf.data(), + std::unique_ptr> outbuf_ptr = + std::make_unique>(inbuf.size() * 2, stream, mr); + parser.Transduce(reinterpret_cast(inbuf.data()), static_cast(inbuf.size()), - outbuf_ptr.data(), + outbuf_ptr->data(), thrust::make_discard_iterator(), thrust::make_discard_iterator(), - normalize_quotes::start_state, + cudf::io::json::normalize_quotes::start_state, stream); return outbuf_ptr; } -} // namespace detail -} // namespace cudf::io::json +} // namespace detail +} // namespace cudf::io::json diff --git a/cpp/src/io/json/read_json.hpp b/cpp/src/io/json/read_json.hpp index 8d2d0c466a1..d3acfa7ebc2 100644 --- a/cpp/src/io/json/read_json.hpp +++ b/cpp/src/io/json/read_json.hpp @@ -22,6 +22,7 @@ #include #include +#include #include #include @@ -41,6 +42,4 @@ size_type find_first_delimiter_in_chunk(host_span normalize_quotes(const rmm::device_buffer &inbuf, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr); } // namespace cudf::io::json::detail diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index d0abcc225d1..a1503e5b297 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -300,6 +300,7 @@ ConfigureTest(JSON_TYPE_CAST_TEST io/json_type_cast_test.cu) ConfigureTest(NESTED_JSON_TEST io/nested_json_test.cpp io/json_tree.cpp) ConfigureTest(ARROW_IO_SOURCE_TEST io/arrow_io_source_test.cpp) ConfigureTest(MULTIBYTE_SPLIT_TEST io/text/multibyte_split_test.cpp) +ConfigureTest(JSON_QUOTE_NORMALIZATION io/json_quote_normalization_test.cpp STREAM_MODE testing) ConfigureTest( DATA_CHUNK_SOURCE_TEST io/text/data_chunk_source_test.cpp GPUS 1 diff --git a/cpp/tests/io/json_quote_normalization_test.cpp b/cpp/tests/io/json_quote_normalization_test.cpp index 68c2b13233c..2a96c93071b 100644 --- a/cpp/tests/io/json_quote_normalization_test.cpp +++ b/cpp/tests/io/json_quote_normalization_test.cpp @@ -14,53 +14,52 @@ * limitations under the License. */ -#include -#include - -#include +#include #include -#include -#include #include +#include +#include #include +#include #include -#include -#include #include -#include -#include +#include +#include #include #include -#include -#include - #include // Base test fixture for tests struct JsonNormalizationTest : public cudf::test::BaseFixture {}; -TEST_F(JsonNormalizationTest, Valid) +TEST_F(JsonNormalizationTest, ValidOutput) { - // Test input - std::string const input = R"({"A":'TEST"'})"; - auto device_input_ptr = cudf::make_string_scalar(input, cudf::test::get_default_stream()); - auto& device_input = static_cast&>(*device_input_ptr); - // RMM memory resource std::shared_ptr rsc = std::make_shared(); - auto device_fst_output_ptr = - normalize_quotes(device_input.data(), cudf::test::get_default_stream(), rsc.get()); + // Test input + std::string const input = R"({"A":'TEST"'})"; + rmm::device_uvector device_input(input.size(), cudf::test::get_default_stream(), rsc.get()); + thrust::copy(input.begin(), input.end(), device_input.begin()); + auto device_input_span = cudf::device_span( + reinterpret_cast(device_input.data()), device_input.size()); + + // Preprocessing FST + auto device_fst_output_ptr = cudf::io::json::detail::normalize_quotes( + device_input_span, cudf::test::get_default_stream(), rsc.get()); + // Initialize parsing options (reading json lines) - cudf::io::json_reader_options input_options = cudf::io::json_reader_options::builder( - cudf::io::source_info{device_span(*device_fst_output_ptr)}); + auto device_fst_output_span = cudf::device_span( + reinterpret_cast(device_fst_output_ptr->data()), device_fst_output_ptr->size()); + cudf::io::json_reader_options input_options = + cudf::io::json_reader_options::builder(cudf::io::source_info{device_fst_output_span}); cudf::io::table_with_metadata processed_table = - cudf::io::read_json(input_options, cudf::test::get_default_stream(), rsc); + cudf::io::read_json(input_options, cudf::test::get_default_stream(), rsc.get()); } CUDF_TEST_PROGRAM_MAIN() From cfe89e69b546c2246b6d6320a3adf6e45b3931f8 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Thu, 11 Jan 2024 03:11:46 +0000 Subject: [PATCH 04/26] fix to tests --- cpp/tests/CMakeLists.txt | 2 +- .../io/json_quote_normalization_test.cpp | 29 ++++++++++++++----- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index a1503e5b297..3d926cdb968 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -300,7 +300,7 @@ ConfigureTest(JSON_TYPE_CAST_TEST io/json_type_cast_test.cu) ConfigureTest(NESTED_JSON_TEST io/nested_json_test.cpp io/json_tree.cpp) ConfigureTest(ARROW_IO_SOURCE_TEST io/arrow_io_source_test.cpp) ConfigureTest(MULTIBYTE_SPLIT_TEST io/text/multibyte_split_test.cpp) -ConfigureTest(JSON_QUOTE_NORMALIZATION io/json_quote_normalization_test.cpp STREAM_MODE testing) +ConfigureTest(JSON_QUOTE_NORMALIZATION io/json_quote_normalization_test.cpp) ConfigureTest( DATA_CHUNK_SOURCE_TEST io/text/data_chunk_source_test.cpp GPUS 1 diff --git a/cpp/tests/io/json_quote_normalization_test.cpp b/cpp/tests/io/json_quote_normalization_test.cpp index 2a96c93071b..99ca623663e 100644 --- a/cpp/tests/io/json_quote_normalization_test.cpp +++ b/cpp/tests/io/json_quote_normalization_test.cpp @@ -18,13 +18,15 @@ #include #include #include +#include +#include #include #include #include #include #include -#include +#include #include #include @@ -42,15 +44,20 @@ TEST_F(JsonNormalizationTest, ValidOutput) std::make_shared(); // Test input - std::string const input = R"({"A":'TEST"'})"; - rmm::device_uvector device_input(input.size(), cudf::test::get_default_stream(), rsc.get()); - thrust::copy(input.begin(), input.end(), device_input.begin()); + std::string const host_input = R"({"A":'TEST"'})"; + thrust::device_vector device_input(host_input.c_str(), + host_input.c_str() + host_input.size()); auto device_input_span = cudf::device_span( - reinterpret_cast(device_input.data()), device_input.size()); + reinterpret_cast(thrust::raw_pointer_cast(device_input.data())), + device_input.size()); // Preprocessing FST auto device_fst_output_ptr = cudf::io::json::detail::normalize_quotes( - device_input_span, cudf::test::get_default_stream(), rsc.get()); + device_input_span, cudf::get_default_stream(), rsc.get()); + /* + for(size_t i = 0; i < device_fst_output_ptr->size(); i++) + std::printf("%c", device_fst_output_ptr->element(i, cudf::get_default_stream())); + */ // Initialize parsing options (reading json lines) auto device_fst_output_span = cudf::device_span( @@ -59,7 +66,15 @@ TEST_F(JsonNormalizationTest, ValidOutput) cudf::io::json_reader_options::builder(cudf::io::source_info{device_fst_output_span}); cudf::io::table_with_metadata processed_table = - cudf::io::read_json(input_options, cudf::test::get_default_stream(), rsc.get()); + cudf::io::read_json(input_options, cudf::get_default_stream(), rsc.get()); + + // Expected table + std::string const expected_input = R"({"A":"TEST\""})"; + cudf::io::json_reader_options expected_input_options = cudf::io::json_reader_options::builder( + cudf::io::source_info{expected_input.data(), expected_input.size()}); + cudf::io::table_with_metadata expected_table = + cudf::io::read_json(expected_input_options, cudf::get_default_stream(), rsc.get()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table.tbl->view(), processed_table.tbl->view()); } CUDF_TEST_PROGRAM_MAIN() From b2ce13b1781e47df1a31fea20d2b6f9e739a77a1 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Thu, 11 Jan 2024 03:19:54 +0000 Subject: [PATCH 05/26] pre-commit formatting fixes --- cpp/include/cudf/io/detail/json.hpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index ea72304807d..b6106938c30 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -52,7 +52,8 @@ void write_json(data_sink* sink, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); -std::unique_ptr> normalize_quotes(const cudf::device_span& inbuf, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); +std::unique_ptr> normalize_quotes( + const cudf::device_span& inbuf, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace cudf::io::json::detail From 2134cf8ca0a9a2c43dfba77495d6b7f876fe7239 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Thu, 11 Jan 2024 18:54:26 +0000 Subject: [PATCH 06/26] finally, the test passes --- cpp/src/io/json/json_quote_normalization.cu | 4 +++- cpp/tests/io/json_quote_normalization_test.cpp | 12 ++++++++---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/json/json_quote_normalization.cu b/cpp/src/io/json/json_quote_normalization.cu index 2ddd27b9ec3..f9ec148b044 100644 --- a/cpp/src/io/json/json_quote_normalization.cu +++ b/cpp/src/io/json/json_quote_normalization.cu @@ -191,14 +191,16 @@ std::unique_ptr> normalize_quotes( std::unique_ptr> outbuf_ptr = std::make_unique>(inbuf.size() * 2, stream, mr); + rmm::device_scalar outbuf_size(stream, mr); parser.Transduce(reinterpret_cast(inbuf.data()), static_cast(inbuf.size()), outbuf_ptr->data(), thrust::make_discard_iterator(), - thrust::make_discard_iterator(), + outbuf_size.data(), cudf::io::json::normalize_quotes::start_state, stream); + outbuf_ptr->resize(outbuf_size.value(stream), stream); return outbuf_ptr; } diff --git a/cpp/tests/io/json_quote_normalization_test.cpp b/cpp/tests/io/json_quote_normalization_test.cpp index 99ca623663e..6a451dc8e7b 100644 --- a/cpp/tests/io/json_quote_normalization_test.cpp +++ b/cpp/tests/io/json_quote_normalization_test.cpp @@ -57,21 +57,25 @@ TEST_F(JsonNormalizationTest, ValidOutput) /* for(size_t i = 0; i < device_fst_output_ptr->size(); i++) std::printf("%c", device_fst_output_ptr->element(i, cudf::get_default_stream())); + std::printf("\n"); */ // Initialize parsing options (reading json lines) auto device_fst_output_span = cudf::device_span( reinterpret_cast(device_fst_output_ptr->data()), device_fst_output_ptr->size()); cudf::io::json_reader_options input_options = - cudf::io::json_reader_options::builder(cudf::io::source_info{device_fst_output_span}); + cudf::io::json_reader_options::builder(cudf::io::source_info{device_fst_output_span}) + .lines(true); cudf::io::table_with_metadata processed_table = cudf::io::read_json(input_options, cudf::get_default_stream(), rsc.get()); // Expected table - std::string const expected_input = R"({"A":"TEST\""})"; - cudf::io::json_reader_options expected_input_options = cudf::io::json_reader_options::builder( - cudf::io::source_info{expected_input.data(), expected_input.size()}); + std::string const expected_input = R"({"A":"TEST\""})"; + cudf::io::json_reader_options expected_input_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{expected_input.data(), expected_input.size()}) + .lines(true); cudf::io::table_with_metadata expected_table = cudf::io::read_json(expected_input_options, cudf::get_default_stream(), rsc.get()); CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table.tbl->view(), processed_table.tbl->view()); From 04e9d828a781e980bdfa23f36a784c8e48415128 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Thu, 11 Jan 2024 20:36:56 +0000 Subject: [PATCH 07/26] try again with test stream --- cpp/tests/io/json_quote_normalization_test.cpp | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/cpp/tests/io/json_quote_normalization_test.cpp b/cpp/tests/io/json_quote_normalization_test.cpp index 6a451dc8e7b..bc9db6d3f68 100644 --- a/cpp/tests/io/json_quote_normalization_test.cpp +++ b/cpp/tests/io/json_quote_normalization_test.cpp @@ -26,6 +26,7 @@ #include #include +#include #include #include @@ -45,18 +46,19 @@ TEST_F(JsonNormalizationTest, ValidOutput) // Test input std::string const host_input = R"({"A":'TEST"'})"; - thrust::device_vector device_input(host_input.c_str(), - host_input.c_str() + host_input.size()); + rmm::device_uvector device_input( + host_input.size(), cudf::test::get_default_stream(), rsc.get()); + for (size_t i = 0; i < host_input.size(); i++) + device_input.set_element_async(i, host_input[i], cudf::test::get_default_stream()); auto device_input_span = cudf::device_span( - reinterpret_cast(thrust::raw_pointer_cast(device_input.data())), - device_input.size()); + reinterpret_cast(device_input.data()), device_input.size()); // Preprocessing FST auto device_fst_output_ptr = cudf::io::json::detail::normalize_quotes( - device_input_span, cudf::get_default_stream(), rsc.get()); + device_input_span, cudf::test::get_default_stream(), rsc.get()); /* for(size_t i = 0; i < device_fst_output_ptr->size(); i++) - std::printf("%c", device_fst_output_ptr->element(i, cudf::get_default_stream())); + std::printf("%c", device_fst_output_ptr->element(i, cudf::test::get_default_stream())); std::printf("\n"); */ @@ -68,7 +70,7 @@ TEST_F(JsonNormalizationTest, ValidOutput) .lines(true); cudf::io::table_with_metadata processed_table = - cudf::io::read_json(input_options, cudf::get_default_stream(), rsc.get()); + cudf::io::read_json(input_options, cudf::test::get_default_stream(), rsc.get()); // Expected table std::string const expected_input = R"({"A":"TEST\""})"; @@ -77,7 +79,7 @@ TEST_F(JsonNormalizationTest, ValidOutput) cudf::io::source_info{expected_input.data(), expected_input.size()}) .lines(true); cudf::io::table_with_metadata expected_table = - cudf::io::read_json(expected_input_options, cudf::get_default_stream(), rsc.get()); + cudf::io::read_json(expected_input_options, cudf::test::get_default_stream(), rsc.get()); CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table.tbl->view(), processed_table.tbl->view()); } From fa11424c9977fe9bf8b03ddd34c1ba0550fb6287 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Fri, 12 Jan 2024 23:46:20 +0000 Subject: [PATCH 08/26] added option to normalize single quotes in read_json --- cpp/include/cudf/io/detail/json.hpp | 8 ++--- cpp/include/cudf/io/json.hpp | 35 +++++++++++++++++-- cpp/src/io/json/json_quote_normalization.cu | 27 ++++++++++++++ cpp/src/io/json/read_json.cu | 23 ++++++++++-- .../io/json_quote_normalization_test.cpp | 33 +++++++++++++++-- 5 files changed, 115 insertions(+), 11 deletions(-) diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index b6106938c30..a3d8ebf57e0 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -52,8 +52,8 @@ void write_json(data_sink* sink, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); -std::unique_ptr> normalize_quotes( - const cudf::device_span& inbuf, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); +rmm::device_uvector normalize_single_quotes(const cudf::device_span& inbuf, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + } // namespace cudf::io::json::detail diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 472d42b1db5..7e0293db647 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -98,7 +98,7 @@ class json_reader_options { // Read the file as a json object per line bool _lines = false; - + // Bytes to skip from the start size_t _byte_range_offset = 0; // Bytes to read; always reads complete rows @@ -113,6 +113,9 @@ class json_reader_options { // Whether to keep the quote characters of string values bool _keep_quotes = false; + // Normalize single quotes + bool _normalize_single_quotes = false; + // Whether to recover after an invalid JSON line json_recovery_mode_t _recovery_mode = json_recovery_mode_t::FAIL; @@ -246,6 +249,13 @@ class json_reader_options { */ bool is_enabled_keep_quotes() const { return _keep_quotes; } + /** + * @brief Whether the reader should normalize single quotes around strings + * + * @returns true if the reader should normalize single quotes, false otherwise + */ + bool is_enabled_normalize_single_quotes() const { return _normalize_single_quotes; } + /** * @brief Queries the JSON reader's behavior on invalid JSON lines. * @@ -324,6 +334,14 @@ class json_reader_options { */ void enable_keep_quotes(bool val) { _keep_quotes = val; } + /** + * @brief Set whether the reader should enable normalization of single quotes around strings. + * + * @param val Boolean value to indicate whether the reader should normalize single quotes around + * string + */ + void enable_normalize_single_quotes(bool val) { _normalize_single_quotes = val; } + /** * @brief Specifies the JSON reader's behavior on invalid JSON lines. * @@ -474,6 +492,19 @@ class json_reader_options_builder { return *this; } + /** + * @brief Set whether the reader should normalize single quotes around string + * + * @param val Boolean value to indicate whether the reader should normalize single quotes + * of strings + * @return this for chaining + */ + json_reader_options_builder& normalize_single_quotes(bool val) + { + options._normalize_single_quotes = val; + return *this; + } + /** * @brief Specifies the JSON reader's behavior on invalid JSON lines. * diff --git a/cpp/src/io/json/json_quote_normalization.cu b/cpp/src/io/json/json_quote_normalization.cu index f9ec148b044..d2088bc34d3 100644 --- a/cpp/src/io/json/json_quote_normalization.cu +++ b/cpp/src/io/json/json_quote_normalization.cu @@ -177,6 +177,7 @@ struct TransduceToNormalizedQuotes { namespace detail { +/* std::unique_ptr> normalize_quotes( const cudf::device_span& inbuf, rmm::cuda_stream_view stream, @@ -203,6 +204,32 @@ std::unique_ptr> normalize_quotes( outbuf_ptr->resize(outbuf_size.value(stream), stream); return outbuf_ptr; } +*/ + +rmm::device_uvector normalize_single_quotes(const cudf::device_span& inbuf, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto parser = cudf::io::fst::detail::make_fst( + cudf::io::fst::detail::make_symbol_group_lut(cudf::io::json::normalize_quotes::qna_sgs), + cudf::io::fst::detail::make_transition_table(cudf::io::json::normalize_quotes::qna_state_tt), + cudf::io::fst::detail::make_translation_functor( + cudf::io::json::normalize_quotes::TransduceToNormalizedQuotes{}), + stream); + + rmm::device_uvector outbuf(inbuf.size() * 2, stream, mr); + rmm::device_scalar outbuf_size(stream, mr); + parser.Transduce(reinterpret_cast(inbuf.data()), + static_cast(inbuf.size()), + outbuf.data(), + thrust::make_discard_iterator(), + outbuf_size.data(), + cudf::io::json::normalize_quotes::start_state, + stream); + + outbuf.resize(outbuf_size.value(stream), stream); + return outbuf; +} } // namespace detail } // namespace cudf::io::json diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index 080da7800f4..0d7edb65ef7 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,6 +22,7 @@ #include #include +#include #include #include @@ -49,6 +50,7 @@ rmm::device_uvector ingest_raw_input(host_span compression_type compression, size_t range_offset, size_t range_size, + bool normalize_single_quotes, rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); @@ -103,7 +105,12 @@ rmm::device_uvector ingest_raw_input(host_span } stream.synchronize(); - return d_buffer; + if(normalize_single_quotes) { + auto d_buffer_span = cudf::device_span( + reinterpret_cast(d_buffer.data()), d_buffer.size()); + return cudf::io::json::detail::normalize_single_quotes(d_buffer_span, stream, rmm::mr::get_current_device_resource()); + } + else return d_buffer; } else { auto buffer = std::vector(total_source_size); @@ -111,10 +118,16 @@ rmm::device_uvector ingest_raw_input(host_span // Reading to host because decompression of a single block is much faster on the CPU sources[0]->host_read(range_offset, total_source_size, buffer.data()); auto const uncomp_data = decompress(compression, buffer); - return cudf::detail::make_device_uvector_sync( + auto d_buffer = cudf::detail::make_device_uvector_sync( host_span{reinterpret_cast(uncomp_data.data()), uncomp_data.size()}, stream, rmm::mr::get_current_device_resource()); + if(normalize_single_quotes) { + auto d_buffer_span = cudf::device_span( + reinterpret_cast(d_buffer.data()), d_buffer.size()); + return cudf::io::json::detail::normalize_single_quotes(d_buffer_span, stream, rmm::mr::get_current_device_resource()); + } + else return d_buffer; } } @@ -127,6 +140,7 @@ size_type find_first_delimiter_in_chunk(host_span> sources, reader_opts.get_compression(), reader_opts.get_byte_range_offset(), reader_opts.get_byte_range_size(), + reader_opts.is_enabled_normalize_single_quotes(), stream); if (should_load_whole_source(reader_opts)) return buffer; auto first_delim_pos = @@ -175,6 +190,7 @@ auto get_record_range_raw_input(host_span> sources, reader_opts.get_compression(), current_offset, reader_opts.get_byte_range_size(), + reader_opts.is_enabled_normalize_single_quotes(), stream); next_delim_pos = find_first_delimiter(buffer, '\n', stream); if (next_delim_pos == -1) { current_offset += reader_opts.get_byte_range_size(); } @@ -188,6 +204,7 @@ auto get_record_range_raw_input(host_span> sources, reader_opts.get_compression(), first_delim_pos, next_delim_pos - first_delim_pos, + reader_opts.is_enabled_normalize_single_quotes(), stream); } } diff --git a/cpp/tests/io/json_quote_normalization_test.cpp b/cpp/tests/io/json_quote_normalization_test.cpp index bc9db6d3f68..d0f4bcddda9 100644 --- a/cpp/tests/io/json_quote_normalization_test.cpp +++ b/cpp/tests/io/json_quote_normalization_test.cpp @@ -54,7 +54,7 @@ TEST_F(JsonNormalizationTest, ValidOutput) reinterpret_cast(device_input.data()), device_input.size()); // Preprocessing FST - auto device_fst_output_ptr = cudf::io::json::detail::normalize_quotes( + auto device_fst_output = cudf::io::json::detail::normalize_single_quotes( device_input_span, cudf::test::get_default_stream(), rsc.get()); /* for(size_t i = 0; i < device_fst_output_ptr->size(); i++) @@ -64,7 +64,7 @@ TEST_F(JsonNormalizationTest, ValidOutput) // Initialize parsing options (reading json lines) auto device_fst_output_span = cudf::device_span( - reinterpret_cast(device_fst_output_ptr->data()), device_fst_output_ptr->size()); + reinterpret_cast(device_fst_output.data()), device_fst_output.size()); cudf::io::json_reader_options input_options = cudf::io::json_reader_options::builder(cudf::io::source_info{device_fst_output_span}) .lines(true); @@ -83,4 +83,33 @@ TEST_F(JsonNormalizationTest, ValidOutput) CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table.tbl->view(), processed_table.tbl->view()); } +TEST_F(JsonNormalizationTest, ReadJsonOption) +{ + // RMM memory resource + std::shared_ptr rsc = + std::make_shared(); + + // Test input + std::string const host_input = R"({"A":'TEST"'})"; + cudf::io::json_reader_options input_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{host_input.data(), host_input.size()}) + .lines(true) + .normalize_single_quotes(true); + + cudf::io::table_with_metadata processed_table = + cudf::io::read_json(input_options, cudf::test::get_default_stream(), rsc.get()); + + // Expected table + std::string const expected_input = R"({"A":"TEST\""})"; + cudf::io::json_reader_options expected_input_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{expected_input.data(), expected_input.size()}) + .lines(true); + + cudf::io::table_with_metadata expected_table = + cudf::io::read_json(expected_input_options, cudf::test::get_default_stream(), rsc.get()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table.tbl->view(), processed_table.tbl->view()); +} + CUDF_TEST_PROGRAM_MAIN() From 2e86d89ed8eb8e37451f8b391ee02440e3046ebc Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Fri, 12 Jan 2024 23:54:23 +0000 Subject: [PATCH 09/26] formatting fixes --- cpp/include/cudf/io/json.hpp | 4 ++-- cpp/src/io/json/read_json.cu | 22 ++++++++++++---------- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 7e0293db647..a087e6b40fb 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -98,7 +98,7 @@ class json_reader_options { // Read the file as a json object per line bool _lines = false; - + // Bytes to skip from the start size_t _byte_range_offset = 0; // Bytes to read; always reads complete rows @@ -493,7 +493,7 @@ class json_reader_options_builder { } /** - * @brief Set whether the reader should normalize single quotes around string + * @brief Set whether the reader should normalize single quotes around string * * @param val Boolean value to indicate whether the reader should normalize single quotes * of strings diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index 0d7edb65ef7..c159fc2378e 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -50,7 +50,7 @@ rmm::device_uvector ingest_raw_input(host_span compression_type compression, size_t range_offset, size_t range_size, - bool normalize_single_quotes, + bool normalize_single_quotes, rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); @@ -105,12 +105,13 @@ rmm::device_uvector ingest_raw_input(host_span } stream.synchronize(); - if(normalize_single_quotes) { + if (normalize_single_quotes) { auto d_buffer_span = cudf::device_span( reinterpret_cast(d_buffer.data()), d_buffer.size()); - return cudf::io::json::detail::normalize_single_quotes(d_buffer_span, stream, rmm::mr::get_current_device_resource()); - } - else return d_buffer; + return cudf::io::json::detail::normalize_single_quotes( + d_buffer_span, stream, rmm::mr::get_current_device_resource()); + } else + return d_buffer; } else { auto buffer = std::vector(total_source_size); @@ -118,16 +119,17 @@ rmm::device_uvector ingest_raw_input(host_span // Reading to host because decompression of a single block is much faster on the CPU sources[0]->host_read(range_offset, total_source_size, buffer.data()); auto const uncomp_data = decompress(compression, buffer); - auto d_buffer = cudf::detail::make_device_uvector_sync( + auto d_buffer = cudf::detail::make_device_uvector_sync( host_span{reinterpret_cast(uncomp_data.data()), uncomp_data.size()}, stream, rmm::mr::get_current_device_resource()); - if(normalize_single_quotes) { + if (normalize_single_quotes) { auto d_buffer_span = cudf::device_span( reinterpret_cast(d_buffer.data()), d_buffer.size()); - return cudf::io::json::detail::normalize_single_quotes(d_buffer_span, stream, rmm::mr::get_current_device_resource()); - } - else return d_buffer; + return cudf::io::json::detail::normalize_single_quotes( + d_buffer_span, stream, rmm::mr::get_current_device_resource()); + } else + return d_buffer; } } From 9925c1094446ac773b5e623c9f1e3977f1a222b3 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Sat, 13 Jan 2024 03:38:46 +0000 Subject: [PATCH 10/26] adding testing_main --- cpp/tests/io/json_quote_normalization_test.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/tests/io/json_quote_normalization_test.cpp b/cpp/tests/io/json_quote_normalization_test.cpp index d0f4bcddda9..3dbb6468a65 100644 --- a/cpp/tests/io/json_quote_normalization_test.cpp +++ b/cpp/tests/io/json_quote_normalization_test.cpp @@ -28,6 +28,7 @@ #include #include #include +#include #include #include From 2838c74d2c0013e4b86face14b97fe3842a63039 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Sat, 13 Jan 2024 03:52:18 +0000 Subject: [PATCH 11/26] java bindings --- .../main/java/ai/rapids/cudf/JSONOptions.java | 15 +++++++++++++++ java/src/main/java/ai/rapids/cudf/Table.java | 16 ++++++++++------ java/src/main/native/src/TableJni.cpp | 18 +++++++++++------- .../test/java/ai/rapids/cudf/TableTest.java | 19 +++++++++++++++++++ java/src/test/resources/single_quotes.json | 2 ++ 5 files changed, 57 insertions(+), 13 deletions(-) create mode 100644 java/src/test/resources/single_quotes.json diff --git a/java/src/main/java/ai/rapids/cudf/JSONOptions.java b/java/src/main/java/ai/rapids/cudf/JSONOptions.java index f98687df5fa..d3f906d8af1 100644 --- a/java/src/main/java/ai/rapids/cudf/JSONOptions.java +++ b/java/src/main/java/ai/rapids/cudf/JSONOptions.java @@ -30,12 +30,14 @@ public final class JSONOptions extends ColumnFilterOptions { private final boolean dayFirst; private final boolean lines; private final boolean recoverWithNull; + private final boolean normalizeSingleQuotes; private JSONOptions(Builder builder) { super(builder); dayFirst = builder.dayFirst; lines = builder.lines; recoverWithNull = builder.recoverWithNull; + normalizeSingleQuotes = builder.normalizeSingleQuotes; } public boolean isDayFirst() { @@ -51,6 +53,10 @@ public boolean isRecoverWithNull() { return recoverWithNull; } + public boolean isNormalizeSingleQuotes() { + return normalizeSingleQuotes; + } + @Override String[] getIncludeColumnNames() { throw new UnsupportedOperationException("JSON reader didn't support column prune"); @@ -65,6 +71,7 @@ public static final class Builder extends ColumnFilterOptions.Builder= 0 && offset < buffer.length; return new TableWithMeta(readAndInferJSON(buffer.getAddress() + offset, len, - opts.isDayFirst(), opts.isLines(), opts.isRecoverWithNull())); + opts.isDayFirst(), opts.isLines(), opts.isRecoverWithNull()), + opts.isNormalizeSingleQuotes()); } /** @@ -1162,7 +1166,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b try (TableWithMeta twm = new TableWithMeta(readJSON(schema.getColumnNames(), schema.getTypeIds(), schema.getTypeScales(), null, buffer.getAddress() + offset, len, opts.isDayFirst(), opts.isLines(), - opts.isRecoverWithNull()))) { + opts.isRecoverWithNull(), opts.isNormalizeSingleQuotes()))) { return gatherJSONColumns(schema, twm); } } @@ -1178,7 +1182,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds) { long dsHandle = DataSourceHelper.createWrapperDataSource(ds); try (TableWithMeta twm = new TableWithMeta(readJSONFromDataSource(schema.getColumnNames(), schema.getTypeIds(), schema.getTypeScales(), opts.isDayFirst(), opts.isLines(), - opts.isRecoverWithNull(), dsHandle))) { + opts.isRecoverWithNull(), opts.isNormalizeSingleQuotes(), dsHandle))) { return gatherJSONColumns(schema, twm); } finally { DataSourceHelper.destroyWrapperDataSource(dsHandle); diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index d7d0279174d..ebbf619cc01 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -1392,7 +1392,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_endWriteCSVToBuffer(JNIEnv *env JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON( JNIEnv *env, jclass, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines, - jboolean recover_with_null) { + jboolean recover_with_null, jboolean normalize_single_quotes) { JNI_NULL_CHECK(env, buffer, "buffer cannot be null", 0); if (buffer_length <= 0) { @@ -1411,7 +1411,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON( cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source) .dayfirst(static_cast(day_first)) .lines(static_cast(lines)) - .recovery_mode(recovery_mode); + .recovery_mode(recovery_mode) + .normalize_single_quotes(static_cast(normalize_single_quotes)); auto result = std::make_unique(cudf::io::read_json(opts.build())); @@ -1469,7 +1470,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_TableWithMeta_releaseTable(JNIE JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource( JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales, - jboolean day_first, jboolean lines, jboolean recover_with_null, jlong ds_handle) { + jboolean day_first, jboolean lines, jboolean recover_with_null, jboolean normalize_single_quotes, + jlong ds_handle) { JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0); @@ -1504,7 +1506,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource( cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source) .dayfirst(static_cast(day_first)) .lines(static_cast(lines)) - .recovery_mode(recovery_mode); + .recovery_mode(recovery_mode) + .normalize_single_quotes(static_cast(normalize_single_quotes)); if (!n_col_names.is_null() && data_types.size() > 0) { if (n_col_names.size() != n_types.size()) { @@ -1536,7 +1539,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource( JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON( JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales, jstring inputfilepath, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines, - jboolean recover_with_null) { + jboolean recover_with_null, jboolean normalize_single_quotes) { bool read_buffer = true; if (buffer == 0) { @@ -1586,7 +1589,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON( cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source) .dayfirst(static_cast(day_first)) .lines(static_cast(lines)) - .recovery_mode(recovery_mode); + .recovery_mode(recovery_mode) + .normalize_single_quotes(static_cast(normalize_single_quotes)); if (!n_col_names.is_null() && data_types.size() > 0) { if (n_col_names.size() != n_types.size()) { diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 8df8ebea8a7..78aa11a074e 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -87,6 +87,7 @@ public class TableTest extends CudfTestBase { private static final File TEST_SIMPLE_CSV_FILE = TestUtils.getResourceAsFile("simple.csv"); private static final File TEST_SIMPLE_JSON_FILE = TestUtils.getResourceAsFile("people.json"); private static final File TEST_JSON_ERROR_FILE = TestUtils.getResourceAsFile("people_with_invalid_lines.json"); + private static final File TEST_JSON_SINGLE_QUOTES_FILE = TestUtils.getResourceAsFile("single_quotes.json"); private static final Schema CSV_DATA_BUFFER_SCHEMA = Schema.builder() .column(DType.INT32, "A") @@ -327,6 +328,24 @@ void testReadJSONFile() { } } + @Test + void testReadSingleQuotesJSONFile() throws IOException { + Schema schema = Schema.builder() + .column(DType.STRING, "A") + .build(); + JSONOptions opts = JSONOptions.builder() + .withLines(true) + .withNormalizeSingleQuotes(true) + .build(); + try (Table expected = new Table.TestBuilder() + .column("TEST\"", "TESTER'") + .build(); + MultiBufferDataSource source = sourceFrom(TEST_JSON_SINGLE_QUOTES_FILE); + Table table = Table.readJSON(schema, opts, source)) { + assertTablesAreEqual(expected, table); + } + } + @Test void testReadJSONFromDataSource() throws IOException { Schema schema = Schema.builder() diff --git a/java/src/test/resources/single_quotes.json b/java/src/test/resources/single_quotes.json new file mode 100644 index 00000000000..cb432fbc643 --- /dev/null +++ b/java/src/test/resources/single_quotes.json @@ -0,0 +1,2 @@ +{"A":'TEST"'} +{'A':"TESTER'"} From 23139553a166959dd4a8d9dc1d8185831f438b54 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Sat, 13 Jan 2024 03:57:12 +0000 Subject: [PATCH 12/26] formatting fixes --- java/src/main/native/src/TableJni.cpp | 37 +++++++++++++++------------ 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index ebbf619cc01..95a5904c24f 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1408,11 +1408,12 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON( auto const recovery_mode = recover_with_null ? cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL : cudf::io::json_recovery_mode_t::FAIL; - cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source) - .dayfirst(static_cast(day_first)) - .lines(static_cast(lines)) - .recovery_mode(recovery_mode) - .normalize_single_quotes(static_cast(normalize_single_quotes)); + cudf::io::json_reader_options_builder opts = + cudf::io::json_reader_options::builder(source) + .dayfirst(static_cast(day_first)) + .lines(static_cast(lines)) + .recovery_mode(recovery_mode) + .normalize_single_quotes(static_cast(normalize_single_quotes)); auto result = std::make_unique(cudf::io::read_json(opts.build())); @@ -1470,8 +1471,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_TableWithMeta_releaseTable(JNIE JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource( JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales, - jboolean day_first, jboolean lines, jboolean recover_with_null, jboolean normalize_single_quotes, - jlong ds_handle) { + jboolean day_first, jboolean lines, jboolean recover_with_null, + jboolean normalize_single_quotes, jlong ds_handle) { JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0); @@ -1503,11 +1504,12 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource( cudf::io::json_recovery_mode_t recovery_mode = recover_with_null ? cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL : cudf::io::json_recovery_mode_t::FAIL; - cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source) - .dayfirst(static_cast(day_first)) - .lines(static_cast(lines)) - .recovery_mode(recovery_mode) - .normalize_single_quotes(static_cast(normalize_single_quotes)); + cudf::io::json_reader_options_builder opts = + cudf::io::json_reader_options::builder(source) + .dayfirst(static_cast(day_first)) + .lines(static_cast(lines)) + .recovery_mode(recovery_mode) + .normalize_single_quotes(static_cast(normalize_single_quotes)); if (!n_col_names.is_null() && data_types.size() > 0) { if (n_col_names.size() != n_types.size()) { @@ -1586,11 +1588,12 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON( cudf::io::json_recovery_mode_t recovery_mode = recover_with_null ? cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL : cudf::io::json_recovery_mode_t::FAIL; - cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source) - .dayfirst(static_cast(day_first)) - .lines(static_cast(lines)) - .recovery_mode(recovery_mode) - .normalize_single_quotes(static_cast(normalize_single_quotes)); + cudf::io::json_reader_options_builder opts = + cudf::io::json_reader_options::builder(source) + .dayfirst(static_cast(day_first)) + .lines(static_cast(lines)) + .recovery_mode(recovery_mode) + .normalize_single_quotes(static_cast(normalize_single_quotes)); if (!n_col_names.is_null() && data_types.size() > 0) { if (n_col_names.size() != n_types.size()) { From a5bb42e9f47309a6c36f51080c820f5637b44260 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Sat, 13 Jan 2024 07:25:46 +0000 Subject: [PATCH 13/26] compile fix --- java/src/main/java/ai/rapids/cudf/Table.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index ea5c452aba6..b734800b4bc 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -1142,8 +1142,8 @@ public static TableWithMeta readJSON(JSONOptions opts, HostMemoryBuffer buffer, assert len <= buffer.length - offset; assert offset >= 0 && offset < buffer.length; return new TableWithMeta(readAndInferJSON(buffer.getAddress() + offset, len, - opts.isDayFirst(), opts.isLines(), opts.isRecoverWithNull()), - opts.isNormalizeSingleQuotes()); + opts.isDayFirst(), opts.isLines(), opts.isRecoverWithNull(), + opts.isNormalizeSingleQuotes())); } /** From e63bca0868ca11a8e3cabdc891f2ba66d36045dd Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Tue, 16 Jan 2024 15:33:55 -0800 Subject: [PATCH 14/26] Update java/src/test/java/ai/rapids/cudf/TableTest.java Co-authored-by: Andy Grove --- java/src/test/java/ai/rapids/cudf/TableTest.java | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 78aa11a074e..126bab9a9d8 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -346,6 +346,21 @@ void testReadSingleQuotesJSONFile() throws IOException { } } + @Test + void testReadSingleQuotesJSONFileFeatureDisabled() throws IOException { + Schema schema = Schema.builder() + .column(DType.STRING, "A") + .build(); + JSONOptions opts = JSONOptions.builder() + .withLines(true) + .withNormalizeSingleQuotes(false) + .build(); + try (MultiBufferDataSource source = sourceFrom(TEST_JSON_SINGLE_QUOTES_FILE)) { + assertThrows(CudfException.class, () -> + Table.readJSON(schema, opts, source)); + } + } + @Test void testReadJSONFromDataSource() throws IOException { Schema schema = Schema.builder() From 005b5c280264f608007560860ff9dbaf84ce6814 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Tue, 16 Jan 2024 15:35:07 -0800 Subject: [PATCH 15/26] Update java/src/test/java/ai/rapids/cudf/TableTest.java Co-authored-by: Andy Grove --- java/src/test/java/ai/rapids/cudf/TableTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 126bab9a9d8..f3441d5b0cb 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -342,8 +342,8 @@ void testReadSingleQuotesJSONFile() throws IOException { .build(); MultiBufferDataSource source = sourceFrom(TEST_JSON_SINGLE_QUOTES_FILE); Table table = Table.readJSON(schema, opts, source)) { - assertTablesAreEqual(expected, table); - } + assertTablesAreEqual(expected, table); + } } @Test From 1a8f5f31d1e4ade94c3407f10e18139d77d797cc Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Tue, 16 Jan 2024 23:35:40 +0000 Subject: [PATCH 16/26] added an error test for when normalize quotes is not enabled --- cpp/tests/io/json_quote_normalization_test.cpp | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/cpp/tests/io/json_quote_normalization_test.cpp b/cpp/tests/io/json_quote_normalization_test.cpp index 3dbb6468a65..2d6e35cc3ad 100644 --- a/cpp/tests/io/json_quote_normalization_test.cpp +++ b/cpp/tests/io/json_quote_normalization_test.cpp @@ -113,4 +113,21 @@ TEST_F(JsonNormalizationTest, ReadJsonOption) CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table.tbl->view(), processed_table.tbl->view()); } +TEST_F(JsonNormalizationTest, ErrorCheck) +{ + // RMM memory resource + std::shared_ptr rsc = + std::make_shared(); + + // Test input + std::string const host_input = R"({"A":'TEST"'})"; + cudf::io::json_reader_options input_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{host_input.data(), host_input.size()}) + .lines(true); + + EXPECT_THROW(cudf::io::read_json(input_options, cudf::test::get_default_stream(), rsc.get()), + cudf::logic_error); +} + CUDF_TEST_PROGRAM_MAIN() From 2001866a80602d0bcc2f3ec9ec18c846ff8a6191 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Thu, 18 Jan 2024 00:11:38 +0000 Subject: [PATCH 17/26] addressing PR reviews; adding comments --- cpp/include/cudf/io/detail/json.hpp | 9 +++++- cpp/src/io/json/json_quote_normalization.cu | 33 ++------------------- cpp/src/io/json/read_json.cu | 13 ++++++++ cpp/src/io/json/read_json.hpp | 2 +- 4 files changed, 24 insertions(+), 33 deletions(-) diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index a3d8ebf57e0..f67daf3b4da 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -52,7 +52,14 @@ void write_json(data_sink* sink, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); -rmm::device_uvector normalize_single_quotes(const cudf::device_span& inbuf, +/** + * @brief Normalize single quotes to double quotes using FST + * + * @param inbuf Input device span buffer + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource to use for device memory allocation + */ +rmm::device_uvector normalize_single_quotes(cudf::device_span inbuf, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); diff --git a/cpp/src/io/json/json_quote_normalization.cu b/cpp/src/io/json/json_quote_normalization.cu index d2088bc34d3..3d80bb93a15 100644 --- a/cpp/src/io/json/json_quote_normalization.cu +++ b/cpp/src/io/json/json_quote_normalization.cu @@ -177,36 +177,7 @@ struct TransduceToNormalizedQuotes { namespace detail { -/* -std::unique_ptr> normalize_quotes( - const cudf::device_span& inbuf, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - auto parser = cudf::io::fst::detail::make_fst( - cudf::io::fst::detail::make_symbol_group_lut(cudf::io::json::normalize_quotes::qna_sgs), - cudf::io::fst::detail::make_transition_table(cudf::io::json::normalize_quotes::qna_state_tt), - cudf::io::fst::detail::make_translation_functor( - cudf::io::json::normalize_quotes::TransduceToNormalizedQuotes{}), - stream); - - std::unique_ptr> outbuf_ptr = - std::make_unique>(inbuf.size() * 2, stream, mr); - rmm::device_scalar outbuf_size(stream, mr); - parser.Transduce(reinterpret_cast(inbuf.data()), - static_cast(inbuf.size()), - outbuf_ptr->data(), - thrust::make_discard_iterator(), - outbuf_size.data(), - cudf::io::json::normalize_quotes::start_state, - stream); - - outbuf_ptr->resize(outbuf_size.value(stream), stream); - return outbuf_ptr; -} -*/ - -rmm::device_uvector normalize_single_quotes(const cudf::device_span& inbuf, +rmm::device_uvector normalize_single_quotes(cudf::device_span inbuf, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -219,7 +190,7 @@ rmm::device_uvector normalize_single_quotes(const cudf::device_span outbuf(inbuf.size() * 2, stream, mr); rmm::device_scalar outbuf_size(stream, mr); - parser.Transduce(reinterpret_cast(inbuf.data()), + parser.Transduce(reinterpret_cast(inbuf.data()), static_cast(inbuf.size()), outbuf.data(), thrust::make_discard_iterator(), diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index c159fc2378e..af4c78475c0 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -46,6 +46,16 @@ size_t sources_size(host_span> const sources, }); } +/** + * @brief Read from array of data sources into RMM buffer + * + * @param sources Array of data sources + * @param compression Compression format of source + * @param range_offset Number of bytes to skip from source start + * @param range_size Number of bytes to read from source + * @param normalize_single_quotes Boolean to indicate whether pre-processing FST should be called + * @param stream CUDA stream used for device memory operations and kernel launches + */ rmm::device_uvector ingest_raw_input(host_span> sources, compression_type compression, size_t range_offset, @@ -105,6 +115,8 @@ rmm::device_uvector ingest_raw_input(host_span } stream.synchronize(); + // If input JSON buffer has single quotes and option to normalize single quotes is enabled, + // invoke pre-processing FST if (normalize_single_quotes) { auto d_buffer_span = cudf::device_span( reinterpret_cast(d_buffer.data()), d_buffer.size()); @@ -123,6 +135,7 @@ rmm::device_uvector ingest_raw_input(host_span host_span{reinterpret_cast(uncomp_data.data()), uncomp_data.size()}, stream, rmm::mr::get_current_device_resource()); + // Quote normalization FST if (normalize_single_quotes) { auto d_buffer_span = cudf::device_span( reinterpret_cast(d_buffer.data()), d_buffer.size()); diff --git a/cpp/src/io/json/read_json.hpp b/cpp/src/io/json/read_json.hpp index d3acfa7ebc2..d05134fa837 100644 --- a/cpp/src/io/json/read_json.hpp +++ b/cpp/src/io/json/read_json.hpp @@ -22,7 +22,6 @@ #include #include -#include #include #include @@ -42,4 +41,5 @@ size_type find_first_delimiter_in_chunk(host_span Date: Thu, 18 Jan 2024 07:17:31 +0000 Subject: [PATCH 18/26] moved tests; removed duplicated fst code --- cpp/tests/CMakeLists.txt | 1 - cpp/tests/io/fst/quote_normalization_test.cu | 332 ------------------ .../io/json_quote_normalization_test.cpp | 147 ++++++-- 3 files changed, 119 insertions(+), 361 deletions(-) delete mode 100644 cpp/tests/io/fst/quote_normalization_test.cu diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 3a7a0dd55e9..60324e525ff 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -315,7 +315,6 @@ ConfigureTest( PERCENT 30 ) target_link_libraries(DATA_CHUNK_SOURCE_TEST PRIVATE ZLIB::ZLIB) -ConfigureTest(QUOTE_NORMALIZATION_TEST io/fst/quote_normalization_test.cu) ConfigureTest(LOGICAL_STACK_TEST io/fst/logical_stack_test.cu) ConfigureTest(FST_TEST io/fst/fst_test.cu) ConfigureTest(TYPE_INFERENCE_TEST io/type_inference_test.cu) diff --git a/cpp/tests/io/fst/quote_normalization_test.cu b/cpp/tests/io/fst/quote_normalization_test.cu deleted file mode 100644 index d0794b8f17e..00000000000 --- a/cpp/tests/io/fst/quote_normalization_test.cu +++ /dev/null @@ -1,332 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include - -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include - -#include - -#include -#include -#include - -namespace { - -// Type used to represent the atomic symbol type used within the finite-state machine -// TODO: type aliasing to be declared in a common header for better maintainability and -// pre-empt future bugs -using SymbolT = char; -using StateT = char; - -// Type sufficiently large to index symbols within the input and output (may be unsigned) -using SymbolOffsetT = uint32_t; -enum class dfa_states : char { TT_OOS = 0U, TT_DQS, TT_SQS, TT_DEC, TT_SEC, TT_NUM_STATES }; -enum class dfa_symbol_group_id : uint32_t { - DOUBLE_QUOTE_CHAR, ///< Quote character SG: " - SINGLE_QUOTE_CHAR, ///< Quote character SG: ' - ESCAPE_CHAR, ///< Escape character SG: '\' - NEWLINE_CHAR, ///< Newline character SG: '\n' - OTHER_SYMBOLS, ///< SG implicitly matching all other characters - NUM_SYMBOL_GROUPS ///< Total number of symbol groups -}; - -// Aliases for readability of the transition table -constexpr auto TT_OOS = dfa_states::TT_OOS; -constexpr auto TT_DQS = dfa_states::TT_DQS; -constexpr auto TT_SQS = dfa_states::TT_SQS; -constexpr auto TT_DEC = dfa_states::TT_DEC; -constexpr auto TT_SEC = dfa_states::TT_SEC; -constexpr auto TT_NUM_STATES = static_cast(dfa_states::TT_NUM_STATES); -constexpr auto NUM_SYMBOL_GROUPS = static_cast(dfa_symbol_group_id::NUM_SYMBOL_GROUPS); - -// The i-th string representing all the characters of a symbol group -std::array, NUM_SYMBOL_GROUPS - 1> const qna_sgs{ - {{'\"'}, {'\''}, {'\\'}, {'\n'}}}; - -// Transition table -std::array, TT_NUM_STATES> const qna_state_tt{{ - /* IN_STATE " ' \ \n OTHER */ - /* TT_OOS */ {{TT_DQS, TT_SQS, TT_OOS, TT_OOS, TT_OOS}}, - /* TT_DQS */ {{TT_OOS, TT_DQS, TT_DEC, TT_OOS, TT_DQS}}, - /* TT_SQS */ {{TT_SQS, TT_OOS, TT_SEC, TT_OOS, TT_SQS}}, - /* TT_DEC */ {{TT_DQS, TT_DQS, TT_DQS, TT_OOS, TT_DQS}}, - /* TT_SEC */ {{TT_SQS, TT_SQS, TT_SQS, TT_OOS, TT_SQS}}, -}}; - -// The DFA's starting state -constexpr char start_state = static_cast(TT_OOS); - -struct TransduceToNormalizedQuotes { - /** - * @brief Returns the -th output symbol on the transition (state_id, match_id). - */ - template - constexpr CUDF_HOST_DEVICE SymbolT operator()(StateT const state_id, - SymbolGroupT const match_id, - RelativeOffsetT const relative_offset, - SymbolT const read_symbol) const - { - // -------- TRANSLATION TABLE ------------ - // Let the alphabet set be Sigma - // --------------------------------------- - // ---------- NON-SPECIAL CASES: ---------- - // Output symbol same as input symbol - // state | read_symbol -> output_symbol - // DQS | Sigma -> Sigma - // DEC | Sigma -> Sigma - // OOS | Sigma\{'} -> Sigma\{'} - // SQS | Sigma\{', "} -> Sigma\{', "} - // ---------- SPECIAL CASES: -------------- - // Input symbol translates to output symbol - // OOS | {'} -> {"} - // SQS | {'} -> {"} - // SQS | {"} -> {\"} - // SQS | {\} -> - // SEC | {'} -> {'} - // SEC | Sigma\{'} -> {\*} - - // Whether this transition translates to the escape sequence: \" - const bool outputs_escape_sequence = - (state_id == static_cast(dfa_states::TT_SQS)) && - (match_id == static_cast(dfa_symbol_group_id::DOUBLE_QUOTE_CHAR)); - // Case when a double quote needs to be replaced by the escape sequence: \" - if (outputs_escape_sequence) { return (relative_offset == 0) ? '\\' : '"'; } - // Case when a single quote needs to be replaced by a double quote - if ((match_id == static_cast(dfa_symbol_group_id::SINGLE_QUOTE_CHAR)) && - ((state_id == static_cast(dfa_states::TT_SQS)) || - (state_id == static_cast(dfa_states::TT_OOS)))) { - return '"'; - } - // Case when the read symbol is an escape character - the actual translation for \ for some - // symbol is handled by transitions from SEC. For now, there is no output for this - // transition - if ((match_id == static_cast(dfa_symbol_group_id::ESCAPE_CHAR)) && - ((state_id == static_cast(dfa_states::TT_SQS)))) { - return 0; - } - // Case when an escaped single quote in an input single-quoted string needs to be replaced by an - // unescaped single quote - if ((match_id == static_cast(dfa_symbol_group_id::SINGLE_QUOTE_CHAR)) && - ((state_id == static_cast(dfa_states::TT_SEC)))) { - return '\''; - } - // Case when an escaped symbol that is not a single-quote needs to be replaced with \ - if (state_id == static_cast(dfa_states::TT_SEC)) { - return (relative_offset == 0) ? '\\' : read_symbol; - } - // In all other cases we simply output the input symbol - return read_symbol; - } - - /** - * @brief Returns the number of output characters for a given transition. During quote - * normalization, we always emit one output character (i.e., either the input character or the - * single quote-input replaced by a double quote), except when we need to escape a double quote - * that was previously inside a single-quoted string. - */ - template - constexpr CUDF_HOST_DEVICE int32_t operator()(StateT const state_id, - SymbolGroupT const match_id, - SymbolT const read_symbol) const - { - // Whether this transition translates to the escape sequence: \" - const bool sqs_outputs_escape_sequence = - (state_id == static_cast(dfa_states::TT_SQS)) && - (match_id == static_cast(dfa_symbol_group_id::DOUBLE_QUOTE_CHAR)); - // Number of characters to output on this transition - if (sqs_outputs_escape_sequence) { return 2; } - // Whether this transition translates to the escape sequence \ or unescaped ' - const bool sec_outputs_escape_sequence = - (state_id == static_cast(dfa_states::TT_SEC)) && - (match_id != static_cast(dfa_symbol_group_id::SINGLE_QUOTE_CHAR)); - // Number of characters to output on this transition - if (sec_outputs_escape_sequence) { return 2; } - // Whether this transition translates to no output - const bool sqs_outputs_nop = - (state_id == static_cast(dfa_states::TT_SQS)) && - (match_id == static_cast(dfa_symbol_group_id::ESCAPE_CHAR)); - // Number of characters to output on this transition - if (sqs_outputs_nop) { return 0; } - return 1; - } -}; - -} // namespace - -// Base test fixture for tests -struct FstTest : public cudf::test::BaseFixture {}; - -void run_test(std::string& input, std::string& output) -{ - // Prepare cuda stream for data transfers & kernels - rmm::cuda_stream stream{}; - rmm::cuda_stream_view stream_view(stream); - - auto parser = cudf::io::fst::detail::make_fst( - cudf::io::fst::detail::make_symbol_group_lut(qna_sgs), - cudf::io::fst::detail::make_transition_table(qna_state_tt), - cudf::io::fst::detail::make_translation_functor(TransduceToNormalizedQuotes{}), - stream); - - auto d_input_scalar = cudf::make_string_scalar(input, stream_view); - auto& d_input = static_cast&>(*d_input_scalar); - - // Prepare input & output buffers - constexpr std::size_t single_item = 1; - cudf::detail::hostdevice_vector output_gpu(input.size() * 2, stream_view); - cudf::detail::hostdevice_vector output_gpu_size(single_item, stream_view); - - // Allocate device-side temporary storage & run algorithm - parser.Transduce(d_input.data(), - static_cast(d_input.size()), - output_gpu.device_ptr(), - thrust::make_discard_iterator(), - output_gpu_size.device_ptr(), - start_state, - stream_view); - - // Async copy results from device to host - output_gpu.device_to_host_async(stream_view); - output_gpu_size.device_to_host_async(stream_view); - - // Make sure results have been copied back to host - stream.synchronize(); - - // Verify results - ASSERT_EQ(output_gpu_size[0], output.size()); - CUDF_TEST_EXPECT_VECTOR_EQUAL(output_gpu, output, output.size()); -} - -TEST_F(FstTest, GroundTruth_QuoteNormalization1) -{ - std::string input = R"({"A":'TEST"'})"; - std::string output = R"({"A":"TEST\""})"; - run_test(input, output); -} - -TEST_F(FstTest, GroundTruth_QuoteNormalization2) -{ - std::string input = R"({'A':"TEST'"} ['OTHER STUFF'])"; - std::string output = R"({"A":"TEST'"} ["OTHER STUFF"])"; - run_test(input, output); -} - -TEST_F(FstTest, GroundTruth_QuoteNormalization3) -{ - std::string input = R"(['{"A": "B"}',"{'A': 'B'}"])"; - std::string output = R"(["{\"A\": \"B\"}","{'A': 'B'}"])"; - run_test(input, output); -} - -TEST_F(FstTest, GroundTruth_QuoteNormalization4) -{ - std::string input = R"({"ain't ain't a word and you ain't supposed to say it":'"""""""""""'})"; - std::string output = - R"({"ain't ain't a word and you ain't supposed to say it":"\"\"\"\"\"\"\"\"\"\"\""})"; - run_test(input, output); -} - -TEST_F(FstTest, GroundTruth_QuoteNormalization5) -{ - std::string input = R"({"\"'\"'\"'\"'":'"\'"\'"\'"\'"'})"; - std::string output = R"({"\"'\"'\"'\"'":"\"'\"'\"'\"'\""})"; - run_test(input, output); -} - -TEST_F(FstTest, GroundTruth_QuoteNormalization6) -{ - std::string input = R"([{"ABC':'CBA":'XYZ":"ZXY'}])"; - std::string output = R"([{"ABC':'CBA":"XYZ\":\"ZXY"}])"; - run_test(input, output); -} - -TEST_F(FstTest, GroundTruth_QuoteNormalization7) -{ - std::string input = R"(["\t","\\t","\\","\\\'\"\\\\","\n","\b"])"; - std::string output = R"(["\t","\\t","\\","\\\'\"\\\\","\n","\b"])"; - run_test(input, output); -} - -TEST_F(FstTest, GroundTruth_QuoteNormalization8) -{ - std::string input = R"(['\t','\\t','\\','\\\"\'\\\\','\n','\b','\u0012'])"; - std::string output = R"(["\t","\\t","\\","\\\"'\\\\","\n","\b","\u0012"])"; - run_test(input, output); -} - -TEST_F(FstTest, GroundTruth_QuoteNormalization_Invalid1) -{ - std::string input = R"(["THIS IS A TEST'])"; - std::string output = R"(["THIS IS A TEST'])"; - run_test(input, output); -} - -TEST_F(FstTest, GroundTruth_QuoteNormalization_Invalid2) -{ - std::string input = R"(['THIS IS A TEST"])"; - std::string output = R"(["THIS IS A TEST\"])"; - run_test(input, output); -} - -TEST_F(FstTest, GroundTruth_QuoteNormalization_Invalid3) -{ - std::string input = R"({"MORE TEST'N":'RESUL})"; - std::string output = R"({"MORE TEST'N":"RESUL})"; - run_test(input, output); -} - -TEST_F(FstTest, GroundTruth_QuoteNormalization_Invalid4) -{ - std::string input = R"({"NUMBER":100'0,'STRING':'SOMETHING'})"; - std::string output = R"({"NUMBER":100"0,"STRING":"SOMETHING"})"; - run_test(input, output); -} - -TEST_F(FstTest, GroundTruth_QuoteNormalization_Invalid5) -{ - std::string input = R"({'NUMBER':100"0,"STRING":"SOMETHING"})"; - std::string output = R"({"NUMBER":100"0,"STRING":"SOMETHING"})"; - run_test(input, output); -} - -TEST_F(FstTest, GroundTruth_QuoteNormalization_Invalid6) -{ - std::string input = R"({'a':'\\''})"; - std::string output = R"({"a":"\\""})"; - run_test(input, output); -} - -TEST_F(FstTest, GroundTruth_QuoteNormalization_Invalid7) -{ - std::string input = R"(}'a': 'b'{)"; - std::string output = R"(}"a": "b"{)"; - run_test(input, output); -} - -CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/json_quote_normalization_test.cpp b/cpp/tests/io/json_quote_normalization_test.cpp index 2d6e35cc3ad..92450d07403 100644 --- a/cpp/tests/io/json_quote_normalization_test.cpp +++ b/cpp/tests/io/json_quote_normalization_test.cpp @@ -39,49 +39,140 @@ // Base test fixture for tests struct JsonNormalizationTest : public cudf::test::BaseFixture {}; -TEST_F(JsonNormalizationTest, ValidOutput) +void run_test(const std::string& host_input, const std::string& expected_host_output) { // RMM memory resource std::shared_ptr rsc = std::make_shared(); - // Test input - std::string const host_input = R"({"A":'TEST"'})"; rmm::device_uvector device_input( host_input.size(), cudf::test::get_default_stream(), rsc.get()); - for (size_t i = 0; i < host_input.size(); i++) - device_input.set_element_async(i, host_input[i], cudf::test::get_default_stream()); + CUDF_CUDA_TRY(cudaMemcpyAsync(device_input.data(), + host_input.data(), + host_input.size(), + cudaMemcpyHostToDevice, + cudf::test::get_default_stream().value())); auto device_input_span = cudf::device_span( reinterpret_cast(device_input.data()), device_input.size()); // Preprocessing FST auto device_fst_output = cudf::io::json::detail::normalize_single_quotes( device_input_span, cudf::test::get_default_stream(), rsc.get()); - /* - for(size_t i = 0; i < device_fst_output_ptr->size(); i++) - std::printf("%c", device_fst_output_ptr->element(i, cudf::test::get_default_stream())); - std::printf("\n"); - */ - - // Initialize parsing options (reading json lines) - auto device_fst_output_span = cudf::device_span( - reinterpret_cast(device_fst_output.data()), device_fst_output.size()); - cudf::io::json_reader_options input_options = - cudf::io::json_reader_options::builder(cudf::io::source_info{device_fst_output_span}) - .lines(true); - cudf::io::table_with_metadata processed_table = - cudf::io::read_json(input_options, cudf::test::get_default_stream(), rsc.get()); + std::string preprocessed_host_output(device_fst_output.size(), 0); + CUDF_CUDA_TRY(cudaMemcpyAsync(preprocessed_host_output.data(), + device_fst_output.data(), + preprocessed_host_output.size(), + cudaMemcpyDeviceToHost, + cudf::test::get_default_stream().value())); + CUDF_TEST_EXPECT_VECTOR_EQUAL( + preprocessed_host_output, expected_host_output, preprocessed_host_output.size()); +} - // Expected table - std::string const expected_input = R"({"A":"TEST\""})"; - cudf::io::json_reader_options expected_input_options = - cudf::io::json_reader_options::builder( - cudf::io::source_info{expected_input.data(), expected_input.size()}) - .lines(true); - cudf::io::table_with_metadata expected_table = - cudf::io::read_json(expected_input_options, cudf::test::get_default_stream(), rsc.get()); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table.tbl->view(), processed_table.tbl->view()); +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization1) +{ + std::string input = R"({"A":'TEST"'})"; + std::string output = R"({"A":"TEST\""})"; + run_test(input, output); +} + +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization2) +{ + std::string input = R"({'A':"TEST'"} ['OTHER STUFF'])"; + std::string output = R"({"A":"TEST'"} ["OTHER STUFF"])"; + run_test(input, output); +} + +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization3) +{ + std::string input = R"(['{"A": "B"}',"{'A': 'B'}"])"; + std::string output = R"(["{\"A\": \"B\"}","{'A': 'B'}"])"; + run_test(input, output); +} + +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization4) +{ + std::string input = R"({"ain't ain't a word and you ain't supposed to say it":'"""""""""""'})"; + std::string output = + R"({"ain't ain't a word and you ain't supposed to say it":"\"\"\"\"\"\"\"\"\"\"\""})"; + run_test(input, output); +} + +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization5) +{ + std::string input = R"({"\"'\"'\"'\"'":'"\'"\'"\'"\'"'})"; + std::string output = R"({"\"'\"'\"'\"'":"\"'\"'\"'\"'\""})"; + run_test(input, output); +} + +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization6) +{ + std::string input = R"([{"ABC':'CBA":'XYZ":"ZXY'}])"; + std::string output = R"([{"ABC':'CBA":"XYZ\":\"ZXY"}])"; + run_test(input, output); +} + +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization7) +{ + std::string input = R"(["\t","\\t","\\","\\\'\"\\\\","\n","\b"])"; + std::string output = R"(["\t","\\t","\\","\\\'\"\\\\","\n","\b"])"; + run_test(input, output); +} + +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization8) +{ + std::string input = R"(['\t','\\t','\\','\\\"\'\\\\','\n','\b','\u0012'])"; + std::string output = R"(["\t","\\t","\\","\\\"'\\\\","\n","\b","\u0012"])"; + run_test(input, output); +} + +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid1) +{ + std::string input = R"(["THIS IS A TEST'])"; + std::string output = R"(["THIS IS A TEST'])"; + run_test(input, output); +} + +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid2) +{ + std::string input = R"(['THIS IS A TEST"])"; + std::string output = R"(["THIS IS A TEST\"])"; + run_test(input, output); +} + +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid3) +{ + std::string input = R"({"MORE TEST'N":'RESUL})"; + std::string output = R"({"MORE TEST'N":"RESUL})"; + run_test(input, output); +} + +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid4) +{ + std::string input = R"({"NUMBER":100'0,'STRING':'SOMETHING'})"; + std::string output = R"({"NUMBER":100"0,"STRING":"SOMETHING"})"; + run_test(input, output); +} + +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid5) +{ + std::string input = R"({'NUMBER':100"0,"STRING":"SOMETHING"})"; + std::string output = R"({"NUMBER":100"0,"STRING":"SOMETHING"})"; + run_test(input, output); +} + +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid6) +{ + std::string input = R"({'a':'\\''})"; + std::string output = R"({"a":"\\""})"; + run_test(input, output); +} + +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid7) +{ + std::string input = R"(}'a': 'b'{)"; + std::string output = R"(}"a": "b"{)"; + run_test(input, output); } TEST_F(JsonNormalizationTest, ReadJsonOption) From 55503e374feec5765dd3abd0580056191bffe5f9 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Thu, 18 Jan 2024 07:42:09 +0000 Subject: [PATCH 19/26] moved preprocess step to read_json --- cpp/src/io/json/read_json.cu | 38 +++++++++++++----------------------- 1 file changed, 14 insertions(+), 24 deletions(-) diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index af4c78475c0..bf50edce5fa 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -53,14 +53,12 @@ size_t sources_size(host_span> const sources, * @param compression Compression format of source * @param range_offset Number of bytes to skip from source start * @param range_size Number of bytes to read from source - * @param normalize_single_quotes Boolean to indicate whether pre-processing FST should be called * @param stream CUDA stream used for device memory operations and kernel launches */ rmm::device_uvector ingest_raw_input(host_span> sources, compression_type compression, size_t range_offset, size_t range_size, - bool normalize_single_quotes, rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); @@ -115,15 +113,7 @@ rmm::device_uvector ingest_raw_input(host_span } stream.synchronize(); - // If input JSON buffer has single quotes and option to normalize single quotes is enabled, - // invoke pre-processing FST - if (normalize_single_quotes) { - auto d_buffer_span = cudf::device_span( - reinterpret_cast(d_buffer.data()), d_buffer.size()); - return cudf::io::json::detail::normalize_single_quotes( - d_buffer_span, stream, rmm::mr::get_current_device_resource()); - } else - return d_buffer; + return d_buffer; } else { auto buffer = std::vector(total_source_size); @@ -131,18 +121,10 @@ rmm::device_uvector ingest_raw_input(host_span // Reading to host because decompression of a single block is much faster on the CPU sources[0]->host_read(range_offset, total_source_size, buffer.data()); auto const uncomp_data = decompress(compression, buffer); - auto d_buffer = cudf::detail::make_device_uvector_sync( + return cudf::detail::make_device_uvector_sync( host_span{reinterpret_cast(uncomp_data.data()), uncomp_data.size()}, stream, rmm::mr::get_current_device_resource()); - // Quote normalization FST - if (normalize_single_quotes) { - auto d_buffer_span = cudf::device_span( - reinterpret_cast(d_buffer.data()), d_buffer.size()); - return cudf::io::json::detail::normalize_single_quotes( - d_buffer_span, stream, rmm::mr::get_current_device_resource()); - } else - return d_buffer; } } @@ -155,7 +137,6 @@ size_type find_first_delimiter_in_chunk(host_span> sources, reader_opts.get_compression(), reader_opts.get_byte_range_offset(), reader_opts.get_byte_range_size(), - reader_opts.is_enabled_normalize_single_quotes(), stream); if (should_load_whole_source(reader_opts)) return buffer; auto first_delim_pos = @@ -205,7 +185,6 @@ auto get_record_range_raw_input(host_span> sources, reader_opts.get_compression(), current_offset, reader_opts.get_byte_range_size(), - reader_opts.is_enabled_normalize_single_quotes(), stream); next_delim_pos = find_first_delimiter(buffer, '\n', stream); if (next_delim_pos == -1) { current_offset += reader_opts.get_byte_range_size(); } @@ -219,7 +198,6 @@ auto get_record_range_raw_input(host_span> sources, reader_opts.get_compression(), first_delim_pos, next_delim_pos - first_delim_pos, - reader_opts.is_enabled_normalize_single_quotes(), stream); } } @@ -251,6 +229,18 @@ table_with_metadata read_json(host_span> sources, auto const buffer = get_record_range_raw_input(sources, reader_opts, stream); + // If input JSON buffer has single quotes and option to normalize single quotes is enabled, + // invoke pre-processing FST + if (reader_opts.is_enabled_normalize_single_quotes()) { + auto buffer_span = cudf::device_span( + reinterpret_cast(buffer.data()), buffer.size()); + return device_parse_nested_json(cudf::io::json::detail::normalize_single_quotes( + buffer_span, stream, rmm::mr::get_current_device_resource()), + reader_opts, + stream, + mr); + } + return device_parse_nested_json(buffer, reader_opts, stream, mr); // For debug purposes, use host_parse_nested_json() } From a8852771a252e0064f55d268cd1479eaa5b06916 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Fri, 19 Jan 2024 22:05:01 +0000 Subject: [PATCH 20/26] PR reviews - modifiable input buffer in normalize quotes parameter --- cpp/include/cudf/io/detail/json.hpp | 4 ++-- cpp/src/io/json/json_quote_normalization.cu | 4 ++-- cpp/src/io/json/read_json.cu | 11 +++-------- cpp/tests/io/json_quote_normalization_test.cpp | 6 +----- 4 files changed, 8 insertions(+), 17 deletions(-) diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index f67daf3b4da..0eb0e17ea10 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -55,11 +55,11 @@ void write_json(data_sink* sink, /** * @brief Normalize single quotes to double quotes using FST * - * @param inbuf Input device span buffer + * @param inbuf Input device buffer * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ -rmm::device_uvector normalize_single_quotes(cudf::device_span inbuf, +rmm::device_uvector normalize_single_quotes(rmm::device_uvector&& inbuf, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); diff --git a/cpp/src/io/json/json_quote_normalization.cu b/cpp/src/io/json/json_quote_normalization.cu index 3d80bb93a15..61b78bb008f 100644 --- a/cpp/src/io/json/json_quote_normalization.cu +++ b/cpp/src/io/json/json_quote_normalization.cu @@ -177,7 +177,7 @@ struct TransduceToNormalizedQuotes { namespace detail { -rmm::device_uvector normalize_single_quotes(cudf::device_span inbuf, +rmm::device_uvector normalize_single_quotes(rmm::device_uvector&& inbuf, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -190,7 +190,7 @@ rmm::device_uvector normalize_single_quotes(cudf::device_span outbuf(inbuf.size() * 2, stream, mr); rmm::device_scalar outbuf_size(stream, mr); - parser.Transduce(reinterpret_cast(inbuf.data()), + parser.Transduce(inbuf.data(), static_cast(inbuf.size()), outbuf.data(), thrust::make_discard_iterator(), diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index bf50edce5fa..7edd5c6b75e 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -227,18 +227,13 @@ table_with_metadata read_json(host_span> sources, "Multiple inputs are supported only for JSON Lines format"); } - auto const buffer = get_record_range_raw_input(sources, reader_opts, stream); + auto buffer = get_record_range_raw_input(sources, reader_opts, stream); // If input JSON buffer has single quotes and option to normalize single quotes is enabled, // invoke pre-processing FST if (reader_opts.is_enabled_normalize_single_quotes()) { - auto buffer_span = cudf::device_span( - reinterpret_cast(buffer.data()), buffer.size()); - return device_parse_nested_json(cudf::io::json::detail::normalize_single_quotes( - buffer_span, stream, rmm::mr::get_current_device_resource()), - reader_opts, - stream, - mr); + buffer = cudf::io::json::detail::normalize_single_quotes( + std::move(buffer), stream, rmm::mr::get_current_device_resource()); } return device_parse_nested_json(buffer, reader_opts, stream, mr); diff --git a/cpp/tests/io/json_quote_normalization_test.cpp b/cpp/tests/io/json_quote_normalization_test.cpp index 92450d07403..5c512999a7b 100644 --- a/cpp/tests/io/json_quote_normalization_test.cpp +++ b/cpp/tests/io/json_quote_normalization_test.cpp @@ -21,7 +21,6 @@ #include #include #include -#include #include #include @@ -52,12 +51,9 @@ void run_test(const std::string& host_input, const std::string& expected_host_ou host_input.size(), cudaMemcpyHostToDevice, cudf::test::get_default_stream().value())); - auto device_input_span = cudf::device_span( - reinterpret_cast(device_input.data()), device_input.size()); - // Preprocessing FST auto device_fst_output = cudf::io::json::detail::normalize_single_quotes( - device_input_span, cudf::test::get_default_stream(), rsc.get()); + std::move(device_input), cudf::test::get_default_stream(), rsc.get()); std::string preprocessed_host_output(device_fst_output.size(), 0); CUDF_CUDA_TRY(cudaMemcpyAsync(preprocessed_host_output.data(), From de1f1b3beee7c15b0e03639004fcc98a89d08670 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Sat, 20 Jan 2024 01:46:30 +0000 Subject: [PATCH 21/26] don't need fully qualified name in enclosing namespace --- cpp/src/io/json/read_json.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index 7edd5c6b75e..2cfb5fa03c9 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -232,8 +232,8 @@ table_with_metadata read_json(host_span> sources, // If input JSON buffer has single quotes and option to normalize single quotes is enabled, // invoke pre-processing FST if (reader_opts.is_enabled_normalize_single_quotes()) { - buffer = cudf::io::json::detail::normalize_single_quotes( - std::move(buffer), stream, rmm::mr::get_current_device_resource()); + buffer = + normalize_single_quotes(std::move(buffer), stream, rmm::mr::get_current_device_resource()); } return device_parse_nested_json(buffer, reader_opts, stream, mr); From 8441b3914404815df20ff1170eef865945b44107 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Mon, 22 Jan 2024 21:40:59 +0000 Subject: [PATCH 22/26] header files cleanup; more fully-qualified names cleanup --- cpp/include/cudf/io/json.hpp | 6 ++--- cpp/src/io/json/json_quote_normalization.cu | 22 ++++++++----------- .../io/json_quote_normalization_test.cpp | 4 ---- 3 files changed, 12 insertions(+), 20 deletions(-) diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index a087e6b40fb..e8f461d808f 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -335,10 +335,10 @@ class json_reader_options { void enable_keep_quotes(bool val) { _keep_quotes = val; } /** - * @brief Set whether the reader should enable normalization of single quotes around strings. + * @brief Set whether the reader should enable normalization of single quotes around strings. * * @param val Boolean value to indicate whether the reader should normalize single quotes around - * string + * strings */ void enable_normalize_single_quotes(bool val) { _normalize_single_quotes = val; } @@ -493,7 +493,7 @@ class json_reader_options_builder { } /** - * @brief Set whether the reader should normalize single quotes around string + * @brief Set whether the reader should normalize single quotes around strings * * @param val Boolean value to indicate whether the reader should normalize single quotes * of strings diff --git a/cpp/src/io/json/json_quote_normalization.cu b/cpp/src/io/json/json_quote_normalization.cu index 61b78bb008f..f0e21115e27 100644 --- a/cpp/src/io/json/json_quote_normalization.cu +++ b/cpp/src/io/json/json_quote_normalization.cu @@ -15,16 +15,13 @@ */ #include -#include #include -#include -#include #include #include #include -#include +#include #include #include @@ -42,7 +39,7 @@ using SymbolOffsetT = uint32_t; namespace normalize_quotes { // Type sufficiently large to index symbols within the input and output (may be unsigned) -enum class dfa_states : char { TT_OOS = 0U, TT_DQS, TT_SQS, TT_DEC, TT_SEC, TT_NUM_STATES }; +enum class dfa_states : StateT { TT_OOS = 0U, TT_DQS, TT_SQS, TT_DEC, TT_SEC, TT_NUM_STATES }; enum class dfa_symbol_group_id : uint32_t { DOUBLE_QUOTE_CHAR, ///< Quote character SG: " SINGLE_QUOTE_CHAR, ///< Quote character SG: ' @@ -58,7 +55,7 @@ constexpr auto TT_DQS = dfa_states::TT_DQS; constexpr auto TT_SQS = dfa_states::TT_SQS; constexpr auto TT_DEC = dfa_states::TT_DEC; constexpr auto TT_SEC = dfa_states::TT_SEC; -constexpr auto TT_NUM_STATES = static_cast(dfa_states::TT_NUM_STATES); +constexpr auto TT_NUM_STATES = static_cast(dfa_states::TT_NUM_STATES); constexpr auto NUM_SYMBOL_GROUPS = static_cast(dfa_symbol_group_id::NUM_SYMBOL_GROUPS); // The i-th string representing all the characters of a symbol group @@ -76,7 +73,7 @@ std::array, TT_NUM_STATES> const qna_s }}; // The DFA's starting state -constexpr char start_state = static_cast(TT_OOS); +constexpr char start_state = static_cast(TT_OOS); struct TransduceToNormalizedQuotes { /** @@ -181,11 +178,10 @@ rmm::device_uvector normalize_single_quotes(rmm::device_uvector outbuf(inbuf.size() * 2, stream, mr); @@ -195,7 +191,7 @@ rmm::device_uvector normalize_single_quotes(rmm::device_uvector #include -#include -#include -#include -#include #include #include From d5b9707ea81ff50f8aaae71716e17756a908b399 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Tue, 23 Jan 2024 01:59:18 +0000 Subject: [PATCH 23/26] alphabetizing the new file in add_library --- cpp/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 23256029c62..5bfe9ebadf6 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -370,13 +370,13 @@ add_library( src/io/functions.cpp src/io/json/byte_range_info.cu src/io/json/json_column.cu + src/io/json/json_quote_normalization.cu src/io/json/json_tree.cu src/io/json/nested_json_gpu.cu src/io/json/read_json.cu src/io/json/legacy/json_gpu.cu src/io/json/legacy/reader_impl.cu src/io/json/write_json.cu - src/io/json/json_quote_normalization.cu src/io/orc/aggregate_orc_metadata.cpp src/io/orc/dict_enc.cu src/io/orc/orc.cpp From 4e358fdf911c134614c8b3516063ee0d255c0e8a Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Tue, 23 Jan 2024 02:00:18 +0000 Subject: [PATCH 24/26] more header file cleanup --- cpp/tests/io/json_quote_normalization_test.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/tests/io/json_quote_normalization_test.cpp b/cpp/tests/io/json_quote_normalization_test.cpp index 8443ea20d88..50faea5e4d8 100644 --- a/cpp/tests/io/json_quote_normalization_test.cpp +++ b/cpp/tests/io/json_quote_normalization_test.cpp @@ -16,8 +16,7 @@ #include #include -#include -#include +#include #include #include From a79683daad4bfe3025f375443de98a1783255d87 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Tue, 23 Jan 2024 02:14:53 +0000 Subject: [PATCH 25/26] guiding the consts eastwards --- cpp/src/io/json/json_quote_normalization.cu | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/json/json_quote_normalization.cu b/cpp/src/io/json/json_quote_normalization.cu index f0e21115e27..7c9466748cd 100644 --- a/cpp/src/io/json/json_quote_normalization.cu +++ b/cpp/src/io/json/json_quote_normalization.cu @@ -73,7 +73,7 @@ std::array, TT_NUM_STATES> const qna_s }}; // The DFA's starting state -constexpr char start_state = static_cast(TT_OOS); +constexpr auto start_state = static_cast(TT_OOS); struct TransduceToNormalizedQuotes { /** @@ -105,7 +105,7 @@ struct TransduceToNormalizedQuotes { // SEC | Sigma\{'} -> {\*} // Whether this transition translates to the escape sequence: \" - const bool outputs_escape_sequence = + bool const outputs_escape_sequence = (state_id == static_cast(dfa_states::TT_SQS)) && (match_id == static_cast(dfa_symbol_group_id::DOUBLE_QUOTE_CHAR)); // Case when a double quote needs to be replaced by the escape sequence: \" @@ -149,19 +149,19 @@ struct TransduceToNormalizedQuotes { SymbolT const read_symbol) const { // Whether this transition translates to the escape sequence: \" - const bool sqs_outputs_escape_sequence = + bool const sqs_outputs_escape_sequence = (state_id == static_cast(dfa_states::TT_SQS)) && (match_id == static_cast(dfa_symbol_group_id::DOUBLE_QUOTE_CHAR)); // Number of characters to output on this transition if (sqs_outputs_escape_sequence) { return 2; } // Whether this transition translates to the escape sequence \ or unescaped ' - const bool sec_outputs_escape_sequence = + bool const sec_outputs_escape_sequence = (state_id == static_cast(dfa_states::TT_SEC)) && (match_id != static_cast(dfa_symbol_group_id::SINGLE_QUOTE_CHAR)); // Number of characters to output on this transition if (sec_outputs_escape_sequence) { return 2; } // Whether this transition translates to no output - const bool sqs_outputs_nop = + bool const sqs_outputs_nop = (state_id == static_cast(dfa_states::TT_SQS)) && (match_id == static_cast(dfa_symbol_group_id::ESCAPE_CHAR)); // Number of characters to output on this transition From 890d09b30bb2b9a127e52e2a87c31db30b573163 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Tue, 23 Jan 2024 02:36:20 +0000 Subject: [PATCH 26/26] formatting fix --- java/src/main/native/src/TableJni.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 61efe2a4edb..cef18b245e7 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1472,8 +1472,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_TableWithMeta_releaseTable(JNIE JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource( JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales, - jboolean day_first, jboolean lines, jboolean recover_with_null, jboolean normalize_single_quotes, jboolean mixed_types_as_string, - jlong ds_handle) { + jboolean day_first, jboolean lines, jboolean recover_with_null, + jboolean normalize_single_quotes, jboolean mixed_types_as_string, jlong ds_handle) { JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0);