From a1120f6f1dc8447f3ff295fe0e6bd88166bec92f Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar <shruti.shivakumar@gmail.com>
Date: Tue, 9 Jan 2024 18:20:35 +0000
Subject: [PATCH 01/26] single quote normalization api

---
 cpp/src/io/json/json_quote_normalization.cu | 200 ++++++++++++++++++++
 cpp/src/io/json/read_json.hpp               |   1 +
 2 files changed, 201 insertions(+)
 create mode 100644 cpp/src/io/json/json_quote_normalization.cu
diff --git a/cpp/src/io/json/json_quote_normalization.cu b/cpp/src/io/json/json_quote_normalization.cu
new file mode 100644
index 00000000000..321495a4bce
--- /dev/null
+++ b/cpp/src/io/json/json_quote_normalization.cu
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <io/fst/lookup_tables.cuh>
+#include <io/utilities/hostdevice_vector.hpp>
+
+#include <cudf/scalar/scalar_factories.hpp>
+#include <cudf/strings/repeat_strings.hpp>
+#include <cudf/types.hpp>
+
+#include <rmm/cuda_stream.hpp>
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_buffer.hpp>
+#include <rmm/device_uvector.hpp>
+
+#include <thrust/iterator/discard_iterator.h>
+
+#include <cstdlib>
+#include <string>
+#include <vector>
+
+namespace cudf::io::json {
+
+  using SymbolT = char;
+  using StateT  = char;
+  using SymbolOffsetT = uint32_t;
+
+namespace normalize_quotes {
+
+  // Type sufficiently large to index symbols within the input and output (may be unsigned)
+  enum class dfa_states : char { TT_OOS = 0U, TT_DQS, TT_SQS, TT_DEC, TT_SEC, TT_NUM_STATES };
+  enum class dfa_symbol_group_id : uint32_t {
+    DOUBLE_QUOTE_CHAR,  ///< Quote character SG: "
+    SINGLE_QUOTE_CHAR,  ///< Quote character SG: '
+    ESCAPE_CHAR,        ///< Escape character SG: '\'
+    NEWLINE_CHAR,       ///< Newline character SG: '\n'
+    OTHER_SYMBOLS,      ///< SG implicitly matching all other characters
+    NUM_SYMBOL_GROUPS   ///< Total number of symbol groups
+  };
+
+  // Aliases for readability of the transition table
+  constexpr auto TT_OOS            = dfa_states::TT_OOS;
+  constexpr auto TT_DQS            = dfa_states::TT_DQS;
+  constexpr auto TT_SQS            = dfa_states::TT_SQS;
+  constexpr auto TT_DEC            = dfa_states::TT_DEC;
+  constexpr auto TT_SEC            = dfa_states::TT_SEC;
+  constexpr auto TT_NUM_STATES     = static_cast<char>(dfa_states::TT_NUM_STATES);
+  constexpr auto NUM_SYMBOL_GROUPS = static_cast<uint32_t>(dfa_symbol_group_id::NUM_SYMBOL_GROUPS);
+
+  // The i-th string representing all the characters of a symbol group
+  std::array<std::vector<SymbolT>, NUM_SYMBOL_GROUPS - 1> const qna_sgs{
+    {{'\"'}, {'\''}, {'\\'}, {'\n'}}};
+
+  // Transition table
+  std::array<std::array<dfa_states, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const qna_state_tt{{
+    /* IN_STATE      "       '       \       \n    OTHER  */
+    /* TT_OOS */ {{TT_DQS, TT_SQS, TT_OOS, TT_OOS, TT_OOS}},
+    /* TT_DQS */ {{TT_OOS, TT_DQS, TT_DEC, TT_OOS, TT_DQS}},
+    /* TT_SQS */ {{TT_SQS, TT_OOS, TT_SEC, TT_OOS, TT_SQS}},
+    /* TT_DEC */ {{TT_DQS, TT_DQS, TT_DQS, TT_OOS, TT_DQS}},
+    /* TT_SEC */ {{TT_SQS, TT_SQS, TT_SQS, TT_OOS, TT_SQS}},
+  }};
+
+  // The DFA's starting state
+  constexpr char start_state = static_cast<char>(TT_OOS);
+
+  struct TransduceToNormalizedQuotes {
+    /**
+     * @brief Returns the <relative_offset>-th output symbol on the transition (state_id, match_id).
+     */
+    template <typename StateT, typename SymbolGroupT, typename RelativeOffsetT, typename SymbolT>
+    constexpr CUDF_HOST_DEVICE SymbolT operator()(StateT const state_id,
+                                                  SymbolGroupT const match_id,
+                                                  RelativeOffsetT const relative_offset,
+                                                  SymbolT const read_symbol) const
+    {
+      // -------- TRANSLATION TABLE ------------
+      //      Let the alphabet set be Sigma
+      // ---------------------------------------
+      // ---------- NON-SPECIAL CASES: ----------
+      //      Output symbol same as input symbol <s>
+      // state | read_symbol <s> -> output_symbol <s>
+      // DQS   | Sigma           -> Sigma
+      // DEC   | Sigma           -> Sigma
+      // OOS   | Sigma\{'}       -> Sigma\{'}
+      // SQS   | Sigma\{', "}    -> Sigma\{', "}
+      // ---------- SPECIAL CASES: --------------
+      //    Input symbol translates to output symbol
+      // OOS   | {'}             -> {"}
+      // SQS   | {'}             -> {"}
+      // SQS   | {"}             -> {\"}
+      // SQS   | {\}             -> <nop>
+      // SEC   | {'}             -> {'}
+      // SEC   | Sigma\{'}       -> {\*}
+
+      // Whether this transition translates to the escape sequence: \"
+      const bool outputs_escape_sequence =
+        (state_id == static_cast<StateT>(dfa_states::TT_SQS)) &&
+        (match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::DOUBLE_QUOTE_CHAR));
+      // Case when a double quote needs to be replaced by the escape sequence: \"
+      if (outputs_escape_sequence) { return (relative_offset == 0) ? '\\' : '"'; }
+      // Case when a single quote needs to be replaced by a double quote
+      if ((match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::SINGLE_QUOTE_CHAR)) &&
+          ((state_id == static_cast<StateT>(dfa_states::TT_SQS)) ||
+           (state_id == static_cast<StateT>(dfa_states::TT_OOS)))) {
+        return '"';
+      }
+      // Case when the read symbol is an escape character - the actual translation for \<s> for some
+      // symbol <s> is handled by transitions from SEC. For now, there is no output for this
+      // transition
+      if ((match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::ESCAPE_CHAR)) &&
+          ((state_id == static_cast<StateT>(dfa_states::TT_SQS)))) {
+        return 0;
+      }
+      // Case when an escaped single quote in an input single-quoted string needs to be replaced by an
+      // unescaped single quote
+      if ((match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::SINGLE_QUOTE_CHAR)) &&
+          ((state_id == static_cast<StateT>(dfa_states::TT_SEC)))) {
+        return '\'';
+      }
+      // Case when an escaped symbol <s> that is not a single-quote needs to be replaced with \<s>
+      if (state_id == static_cast<StateT>(dfa_states::TT_SEC)) {
+        return (relative_offset == 0) ? '\\' : read_symbol;
+      }
+      // In all other cases we simply output the input symbol
+      return read_symbol;
+    }
+
+    /**
+     * @brief Returns the number of output characters for a given transition. During quote
+     * normalization, we always emit one output character (i.e., either the input character or the
+     * single quote-input replaced by a double quote), except when we need to escape a double quote
+     * that was previously inside a single-quoted string.
+     */
+    template <typename StateT, typename SymbolGroupT, typename SymbolT>
+    constexpr CUDF_HOST_DEVICE int32_t operator()(StateT const state_id,
+                                                  SymbolGroupT const match_id,
+                                                  SymbolT const read_symbol) const
+    {
+      // Whether this transition translates to the escape sequence: \"
+      const bool sqs_outputs_escape_sequence =
+        (state_id == static_cast<StateT>(dfa_states::TT_SQS)) &&
+        (match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::DOUBLE_QUOTE_CHAR));
+      // Number of characters to output on this transition
+      if (sqs_outputs_escape_sequence) { return 2; }
+      // Whether this transition translates to the escape sequence \<s> or unescaped '
+      const bool sec_outputs_escape_sequence =
+        (state_id == static_cast<StateT>(dfa_states::TT_SEC)) &&
+        (match_id != static_cast<SymbolGroupT>(dfa_symbol_group_id::SINGLE_QUOTE_CHAR));
+      // Number of characters to output on this transition
+      if (sec_outputs_escape_sequence) { return 2; }
+      // Whether this transition translates to no output <nop>
+      const bool sqs_outputs_nop =
+        (state_id == static_cast<StateT>(dfa_states::TT_SQS)) &&
+        (match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::ESCAPE_CHAR));
+      // Number of characters to output on this transition
+      if (sqs_outputs_nop) { return 0; }
+      return 1;
+    }
+  };
+
+} // namespace normalize_quotes
+
+namespace detail {
+
+std::unique_ptr<rmm::device_buffer> normalize_quotes(const rmm::device_buffer &inbuf, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) {
+
+  auto parser = cudf::io::fst::detail::make_fst(
+    cudf::io::fst::detail::make_symbol_group_lut(qna_sgs),
+    cudf::io::fst::detail::make_transition_table(qna_state_tt),
+    cudf::io::fst::detail::make_translation_functor(TransduceToNormalizedQuotes{}),
+    stream);
+
+  std::unique_ptr<rmm::device_buffer> outbuf_ptr(inbuf.size() * 2, stream, mr);
+  parser.Transduce(inbuf.data(),
+                   static_cast<SymbolOffsetT>(inbuf.size()),
+                   outbuf_ptr.data(),
+                   thrust::make_discard_iterator(),
+                   thrust::make_discard_iterator(),
+                   normalize_quotes::start_state,
+                   stream);
+
+  return outbuf_ptr;
+}
+
+} // namespace detail
+} // namespace cudf::io::json
diff --git a/cpp/src/io/json/read_json.hpp b/cpp/src/io/json/read_json.hpp
index db37e7abcdb..f890529da98 100644
--- a/cpp/src/io/json/read_json.hpp
+++ b/cpp/src/io/json/read_json.hpp
@@ -42,4 +42,5 @@ size_type find_first_delimiter_in_chunk(host_span<std::unique_ptr<cudf::io::data
                                         char const delimiter,
                                         rmm::cuda_stream_view stream);
 
+void normalize_quotes(const rmm::device_buffer &inbuf, std::unique_ptr<rmm::device_buffer> outbuf, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr);
 }  // namespace cudf::io::json::detail

From c6b0ba331d434a032d2e68b8ab3b987cfa251aab Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar <shruti.shivakumar@gmail.com>
Date: Wed, 10 Jan 2024 00:58:44 +0000
Subject: [PATCH 02/26] test for normalization api

---
 cpp/src/io/json/read_json.hpp                 |  4 +-
 .../io/json_quote_normalization_test.cpp      | 66 +++++++++++++++++++
 2 files changed, 68 insertions(+), 2 deletions(-)
 create mode 100644 cpp/tests/io/json_quote_normalization_test.cpp

diff --git a/cpp/src/io/json/read_json.hpp b/cpp/src/io/json/read_json.hpp
index f890529da98..8d2d0c466a1 100644
--- a/cpp/src/io/json/read_json.hpp
+++ b/cpp/src/io/json/read_json.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -42,5 +42,5 @@ size_type find_first_delimiter_in_chunk(host_span<std::unique_ptr<cudf::io::data
                                         char const delimiter,
                                         rmm::cuda_stream_view stream);
 
-void normalize_quotes(const rmm::device_buffer &inbuf, std::unique_ptr<rmm::device_buffer> outbuf, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr);
+std::unique_ptr<rmm::device_buffer> normalize_quotes(const rmm::device_buffer &inbuf, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr);
 }  // namespace cudf::io::json::detail
diff --git a/cpp/tests/io/json_quote_normalization_test.cpp b/cpp/tests/io/json_quote_normalization_test.cpp
new file mode 100644
index 00000000000..68c2b13233c
--- /dev/null
+++ b/cpp/tests/io/json_quote_normalization_test.cpp
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <io/json/read_json.hpp>
+#include <io/utilities/hostdevice_vector.hpp>
+
+#include <cudf/io/datasource.hpp>
+#include <cudf/io/json.hpp>
+#include <cudf/io/parquet.hpp>
+#include <cudf/lists/lists_column_view.hpp>
+#include <cudf/scalar/scalar.hpp>
+#include <cudf/utilities/span.hpp>
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_utilities.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/cudf_gtest.hpp>
+#include <cudf_test/io_metadata_utilities.hpp>
+#include <cudf_test/table_utilities.hpp>
+
+#include <rmm/mr/device/cuda_memory_resource.hpp>
+#include <rmm/mr/device/device_memory_resource.hpp>
+
+#include <thrust/copy.h>
+#include <thrust/iterator/zip_iterator.h>
+
+#include <string>
+
+// Base test fixture for tests
+struct JsonNormalizationTest : public cudf::test::BaseFixture {};
+
+TEST_F(JsonNormalizationTest, Valid)
+{
+  // Test input
+  std::string const input = R"({"A":'TEST"'})";
+  auto device_input_ptr   = cudf::make_string_scalar(input, cudf::test::get_default_stream());
+  auto& device_input      = static_cast<scalar_type_t<std::string>&>(*device_input_ptr);
+
+  // RMM memory resource
+  std::shared_ptr<rmm::mr::device_memory_resource> rsc =
+    std::make_shared<rmm::mr::cuda_memory_resource>();
+
+  auto device_fst_output_ptr =
+    normalize_quotes(device_input.data(), cudf::test::get_default_stream(), rsc.get());
+  // Initialize parsing options (reading json lines)
+  cudf::io::json_reader_options input_options = cudf::io::json_reader_options::builder(
+    cudf::io::source_info{device_span(*device_fst_output_ptr)});
+
+  cudf::io::table_with_metadata processed_table =
+    cudf::io::read_json(input_options, cudf::test::get_default_stream(), rsc);
+}
+
+CUDF_TEST_PROGRAM_MAIN()

From d9a8acf677ae8a6ad404c3e6f77674000e3e0af5 Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar <shruti.shivakumar@gmail.com>
Date: Thu, 11 Jan 2024 00:24:22 +0000
Subject: [PATCH 03/26] fixes to test

---
 cpp/CMakeLists.txt                            |   3 +-
 cpp/include/cudf/io/detail/json.hpp           |   6 +-
 cpp/src/io/json/json_quote_normalization.cu   | 296 +++++++++---------
 cpp/src/io/json/read_json.hpp                 |   3 +-
 cpp/tests/CMakeLists.txt                      |   1 +
 .../io/json_quote_normalization_test.cpp      |  47 ++-
 6 files changed, 183 insertions(+), 173 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index a7c34ca489c..c0c721a830f 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2018-2023, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -376,6 +376,7 @@ add_library(
   src/io/json/legacy/json_gpu.cu
   src/io/json/legacy/reader_impl.cu
   src/io/json/write_json.cu
+  src/io/json/json_quote_normalization.cu
   src/io/orc/aggregate_orc_metadata.cpp
   src/io/orc/dict_enc.cu
   src/io/orc/orc.cpp
diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp
index d0a9543397d..ea72304807d 100644
--- a/cpp/include/cudf/io/detail/json.hpp
+++ b/cpp/include/cudf/io/detail/json.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -51,4 +51,8 @@ void write_json(data_sink* sink,
                 json_writer_options const& options,
                 rmm::cuda_stream_view stream,
                 rmm::mr::device_memory_resource* mr);
+
+std::unique_ptr<rmm::device_uvector<char>> normalize_quotes(const cudf::device_span<std::byte>& inbuf,
+                                                     rmm::cuda_stream_view stream,
+                                                     rmm::mr::device_memory_resource* mr);
 }  // namespace cudf::io::json::detail
diff --git a/cpp/src/io/json/json_quote_normalization.cu b/cpp/src/io/json/json_quote_normalization.cu
index 321495a4bce..2ddd27b9ec3 100644
--- a/cpp/src/io/json/json_quote_normalization.cu
+++ b/cpp/src/io/json/json_quote_normalization.cu
@@ -17,6 +17,7 @@
 #include <io/fst/lookup_tables.cuh>
 #include <io/utilities/hostdevice_vector.hpp>
 
+#include <cudf/io/detail/json.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/strings/repeat_strings.hpp>
 #include <cudf/types.hpp>
@@ -34,167 +35,172 @@
 
 namespace cudf::io::json {
 
-  using SymbolT = char;
-  using StateT  = char;
-  using SymbolOffsetT = uint32_t;
+using SymbolT       = char;
+using StateT        = char;
+using SymbolOffsetT = uint32_t;
 
 namespace normalize_quotes {
 
-  // Type sufficiently large to index symbols within the input and output (may be unsigned)
-  enum class dfa_states : char { TT_OOS = 0U, TT_DQS, TT_SQS, TT_DEC, TT_SEC, TT_NUM_STATES };
-  enum class dfa_symbol_group_id : uint32_t {
-    DOUBLE_QUOTE_CHAR,  ///< Quote character SG: "
-    SINGLE_QUOTE_CHAR,  ///< Quote character SG: '
-    ESCAPE_CHAR,        ///< Escape character SG: '\'
-    NEWLINE_CHAR,       ///< Newline character SG: '\n'
-    OTHER_SYMBOLS,      ///< SG implicitly matching all other characters
-    NUM_SYMBOL_GROUPS   ///< Total number of symbol groups
-  };
-
-  // Aliases for readability of the transition table
-  constexpr auto TT_OOS            = dfa_states::TT_OOS;
-  constexpr auto TT_DQS            = dfa_states::TT_DQS;
-  constexpr auto TT_SQS            = dfa_states::TT_SQS;
-  constexpr auto TT_DEC            = dfa_states::TT_DEC;
-  constexpr auto TT_SEC            = dfa_states::TT_SEC;
-  constexpr auto TT_NUM_STATES     = static_cast<char>(dfa_states::TT_NUM_STATES);
-  constexpr auto NUM_SYMBOL_GROUPS = static_cast<uint32_t>(dfa_symbol_group_id::NUM_SYMBOL_GROUPS);
-
-  // The i-th string representing all the characters of a symbol group
-  std::array<std::vector<SymbolT>, NUM_SYMBOL_GROUPS - 1> const qna_sgs{
-    {{'\"'}, {'\''}, {'\\'}, {'\n'}}};
-
-  // Transition table
-  std::array<std::array<dfa_states, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const qna_state_tt{{
-    /* IN_STATE      "       '       \       \n    OTHER  */
-    /* TT_OOS */ {{TT_DQS, TT_SQS, TT_OOS, TT_OOS, TT_OOS}},
-    /* TT_DQS */ {{TT_OOS, TT_DQS, TT_DEC, TT_OOS, TT_DQS}},
-    /* TT_SQS */ {{TT_SQS, TT_OOS, TT_SEC, TT_OOS, TT_SQS}},
-    /* TT_DEC */ {{TT_DQS, TT_DQS, TT_DQS, TT_OOS, TT_DQS}},
-    /* TT_SEC */ {{TT_SQS, TT_SQS, TT_SQS, TT_OOS, TT_SQS}},
-  }};
-
-  // The DFA's starting state
-  constexpr char start_state = static_cast<char>(TT_OOS);
-
-  struct TransduceToNormalizedQuotes {
-    /**
-     * @brief Returns the <relative_offset>-th output symbol on the transition (state_id, match_id).
-     */
-    template <typename StateT, typename SymbolGroupT, typename RelativeOffsetT, typename SymbolT>
-    constexpr CUDF_HOST_DEVICE SymbolT operator()(StateT const state_id,
-                                                  SymbolGroupT const match_id,
-                                                  RelativeOffsetT const relative_offset,
-                                                  SymbolT const read_symbol) const
-    {
-      // -------- TRANSLATION TABLE ------------
-      //      Let the alphabet set be Sigma
-      // ---------------------------------------
-      // ---------- NON-SPECIAL CASES: ----------
-      //      Output symbol same as input symbol <s>
-      // state | read_symbol <s> -> output_symbol <s>
-      // DQS   | Sigma           -> Sigma
-      // DEC   | Sigma           -> Sigma
-      // OOS   | Sigma\{'}       -> Sigma\{'}
-      // SQS   | Sigma\{', "}    -> Sigma\{', "}
-      // ---------- SPECIAL CASES: --------------
-      //    Input symbol translates to output symbol
-      // OOS   | {'}             -> {"}
-      // SQS   | {'}             -> {"}
-      // SQS   | {"}             -> {\"}
-      // SQS   | {\}             -> <nop>
-      // SEC   | {'}             -> {'}
-      // SEC   | Sigma\{'}       -> {\*}
-
-      // Whether this transition translates to the escape sequence: \"
-      const bool outputs_escape_sequence =
-        (state_id == static_cast<StateT>(dfa_states::TT_SQS)) &&
-        (match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::DOUBLE_QUOTE_CHAR));
-      // Case when a double quote needs to be replaced by the escape sequence: \"
-      if (outputs_escape_sequence) { return (relative_offset == 0) ? '\\' : '"'; }
-      // Case when a single quote needs to be replaced by a double quote
-      if ((match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::SINGLE_QUOTE_CHAR)) &&
-          ((state_id == static_cast<StateT>(dfa_states::TT_SQS)) ||
-           (state_id == static_cast<StateT>(dfa_states::TT_OOS)))) {
-        return '"';
-      }
-      // Case when the read symbol is an escape character - the actual translation for \<s> for some
-      // symbol <s> is handled by transitions from SEC. For now, there is no output for this
-      // transition
-      if ((match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::ESCAPE_CHAR)) &&
-          ((state_id == static_cast<StateT>(dfa_states::TT_SQS)))) {
-        return 0;
-      }
-      // Case when an escaped single quote in an input single-quoted string needs to be replaced by an
-      // unescaped single quote
-      if ((match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::SINGLE_QUOTE_CHAR)) &&
-          ((state_id == static_cast<StateT>(dfa_states::TT_SEC)))) {
-        return '\'';
-      }
-      // Case when an escaped symbol <s> that is not a single-quote needs to be replaced with \<s>
-      if (state_id == static_cast<StateT>(dfa_states::TT_SEC)) {
-        return (relative_offset == 0) ? '\\' : read_symbol;
-      }
-      // In all other cases we simply output the input symbol
-      return read_symbol;
+// Type sufficiently large to index symbols within the input and output (may be unsigned)
+enum class dfa_states : char { TT_OOS = 0U, TT_DQS, TT_SQS, TT_DEC, TT_SEC, TT_NUM_STATES };
+enum class dfa_symbol_group_id : uint32_t {
+  DOUBLE_QUOTE_CHAR,  ///< Quote character SG: "
+  SINGLE_QUOTE_CHAR,  ///< Quote character SG: '
+  ESCAPE_CHAR,        ///< Escape character SG: '\'
+  NEWLINE_CHAR,       ///< Newline character SG: '\n'
+  OTHER_SYMBOLS,      ///< SG implicitly matching all other characters
+  NUM_SYMBOL_GROUPS   ///< Total number of symbol groups
+};
+
+// Aliases for readability of the transition table
+constexpr auto TT_OOS            = dfa_states::TT_OOS;
+constexpr auto TT_DQS            = dfa_states::TT_DQS;
+constexpr auto TT_SQS            = dfa_states::TT_SQS;
+constexpr auto TT_DEC            = dfa_states::TT_DEC;
+constexpr auto TT_SEC            = dfa_states::TT_SEC;
+constexpr auto TT_NUM_STATES     = static_cast<char>(dfa_states::TT_NUM_STATES);
+constexpr auto NUM_SYMBOL_GROUPS = static_cast<uint32_t>(dfa_symbol_group_id::NUM_SYMBOL_GROUPS);
+
+// The i-th entry lists all characters of symbol group i; OTHER_SYMBOLS is implicit (no entry)
+std::array<std::vector<SymbolT>, NUM_SYMBOL_GROUPS - 1> const qna_sgs{
+  {{'\"'}, {'\''}, {'\\'}, {'\n'}}};
+
+// Transition table
+std::array<std::array<dfa_states, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const qna_state_tt{{
+  /* IN_STATE      "       '       \       \n    OTHER  */
+  /* TT_OOS */ {{TT_DQS, TT_SQS, TT_OOS, TT_OOS, TT_OOS}},
+  /* TT_DQS */ {{TT_OOS, TT_DQS, TT_DEC, TT_OOS, TT_DQS}},
+  /* TT_SQS */ {{TT_SQS, TT_OOS, TT_SEC, TT_OOS, TT_SQS}},
+  /* TT_DEC */ {{TT_DQS, TT_DQS, TT_DQS, TT_OOS, TT_DQS}},
+  /* TT_SEC */ {{TT_SQS, TT_SQS, TT_SQS, TT_OOS, TT_SQS}},
+}};
+
+// The DFA's starting state
+constexpr char start_state = static_cast<char>(TT_OOS);
+
+struct TransduceToNormalizedQuotes {
+  /**
+   * @brief Returns the <relative_offset>-th output symbol on the transition (state_id, match_id).
+   */
+  template <typename StateT, typename SymbolGroupT, typename RelativeOffsetT, typename SymbolT>
+  constexpr CUDF_HOST_DEVICE SymbolT operator()(StateT const state_id,
+                                                SymbolGroupT const match_id,
+                                                RelativeOffsetT const relative_offset,
+                                                SymbolT const read_symbol) const
+  {
+    // -------- TRANSLATION TABLE ------------
+    //      Let the alphabet set be Sigma
+    // ---------------------------------------
+    // ---------- NON-SPECIAL CASES: ----------
+    //      Output symbol same as input symbol <s>
+    // state | read_symbol <s> -> output_symbol <s>
+    // DQS   | Sigma           -> Sigma
+    // DEC   | Sigma           -> Sigma
+    // OOS   | Sigma\{'}       -> Sigma\{'}
+    // SQS   | Sigma\{', ", \} -> Sigma\{', ", \}
+    // ---------- SPECIAL CASES: --------------
+    //    Input symbol translates to output symbol
+    // OOS   | {'}             -> {"}
+    // SQS   | {'}             -> {"}
+    // SQS   | {"}             -> {\"}
+    // SQS   | {\}             -> <nop>
+    // SEC   | {'}             -> {'}
+    // SEC   | Sigma\{'}       -> {\<s>}
+
+    // Whether this transition translates to the escape sequence: \"
+    const bool outputs_escape_sequence =
+      (state_id == static_cast<StateT>(dfa_states::TT_SQS)) &&
+      (match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::DOUBLE_QUOTE_CHAR));
+    // Case when a double quote needs to be replaced by the escape sequence: \"
+    if (outputs_escape_sequence) { return (relative_offset == 0) ? '\\' : '"'; }
+    // Case when a single quote needs to be replaced by a double quote
+    if ((match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::SINGLE_QUOTE_CHAR)) &&
+        ((state_id == static_cast<StateT>(dfa_states::TT_SQS)) ||
+         (state_id == static_cast<StateT>(dfa_states::TT_OOS)))) {
+      return '"';
     }
-
-    /**
-     * @brief Returns the number of output characters for a given transition. During quote
-     * normalization, we always emit one output character (i.e., either the input character or the
-     * single quote-input replaced by a double quote), except when we need to escape a double quote
-     * that was previously inside a single-quoted string.
-     */
-    template <typename StateT, typename SymbolGroupT, typename SymbolT>
-    constexpr CUDF_HOST_DEVICE int32_t operator()(StateT const state_id,
-                                                  SymbolGroupT const match_id,
-                                                  SymbolT const read_symbol) const
-    {
-      // Whether this transition translates to the escape sequence: \"
-      const bool sqs_outputs_escape_sequence =
-        (state_id == static_cast<StateT>(dfa_states::TT_SQS)) &&
-        (match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::DOUBLE_QUOTE_CHAR));
-      // Number of characters to output on this transition
-      if (sqs_outputs_escape_sequence) { return 2; }
-      // Whether this transition translates to the escape sequence \<s> or unescaped '
-      const bool sec_outputs_escape_sequence =
-        (state_id == static_cast<StateT>(dfa_states::TT_SEC)) &&
-        (match_id != static_cast<SymbolGroupT>(dfa_symbol_group_id::SINGLE_QUOTE_CHAR));
-      // Number of characters to output on this transition
-      if (sec_outputs_escape_sequence) { return 2; }
-      // Whether this transition translates to no output <nop>
-      const bool sqs_outputs_nop =
-        (state_id == static_cast<StateT>(dfa_states::TT_SQS)) &&
-        (match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::ESCAPE_CHAR));
-      // Number of characters to output on this transition
-      if (sqs_outputs_nop) { return 0; }
-      return 1;
+    // Case when the read symbol is an escape character - the actual translation for \<s> for some
+    // symbol <s> is handled by transitions from SEC. For now, there is no output for this
+    // transition
+    if ((match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::ESCAPE_CHAR)) &&
+        ((state_id == static_cast<StateT>(dfa_states::TT_SQS)))) {
+      return 0;
     }
-  };
-
-} // namespace normalize_quotes
+    // Case when an escaped single quote in an input single-quoted string needs to be replaced by an
+    // unescaped single quote
+    if ((match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::SINGLE_QUOTE_CHAR)) &&
+        ((state_id == static_cast<StateT>(dfa_states::TT_SEC)))) {
+      return '\'';
+    }
+    // Case when an escaped symbol <s> that is not a single-quote needs to be replaced with \<s>
+    if (state_id == static_cast<StateT>(dfa_states::TT_SEC)) {
+      return (relative_offset == 0) ? '\\' : read_symbol;
+    }
+    // In all other cases we simply output the input symbol
+    return read_symbol;
+  }
+
+  /**
+   * @brief Returns the number of output characters for a given transition. Most transitions emit
+   * exactly one character. Inside a single-quoted string, two are emitted for an unescaped double
+   * quote (\") and for an escaped symbol <s> other than a single quote (\<s>), and none for the
+   * backslash itself, whose output is deferred to the transition on the symbol that follows it.
+   */
+  template <typename StateT, typename SymbolGroupT, typename SymbolT>
+  constexpr CUDF_HOST_DEVICE int32_t operator()(StateT const state_id,
+                                                SymbolGroupT const match_id,
+                                                SymbolT const read_symbol) const
+  {
+    // Whether this transition translates to the escape sequence: \"
+    const bool sqs_outputs_escape_sequence =
+      (state_id == static_cast<StateT>(dfa_states::TT_SQS)) &&
+      (match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::DOUBLE_QUOTE_CHAR));
+    // Number of characters to output on this transition
+    if (sqs_outputs_escape_sequence) { return 2; }
+    // Whether an escaped symbol <s> other than ' is re-emitted as the escape sequence \<s>
+    const bool sec_outputs_escape_sequence =
+      (state_id == static_cast<StateT>(dfa_states::TT_SEC)) &&
+      (match_id != static_cast<SymbolGroupT>(dfa_symbol_group_id::SINGLE_QUOTE_CHAR));
+    // Number of characters to output on this transition
+    if (sec_outputs_escape_sequence) { return 2; }
+    // Whether this transition translates to no output <nop>
+    const bool sqs_outputs_nop =
+      (state_id == static_cast<StateT>(dfa_states::TT_SQS)) &&
+      (match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::ESCAPE_CHAR));
+    // Number of characters to output on this transition
+    if (sqs_outputs_nop) { return 0; }
+    return 1;
+  }
+};
+
+}  // namespace normalize_quotes
 
 namespace detail {
 
-std::unique_ptr<rmm::device_buffer> normalize_quotes(const rmm::device_buffer &inbuf, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) {
-
+std::unique_ptr<rmm::device_uvector<char>> normalize_quotes(
+  const cudf::device_span<std::byte>& inbuf,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr)
+{
   auto parser = cudf::io::fst::detail::make_fst(
-    cudf::io::fst::detail::make_symbol_group_lut(qna_sgs),
-    cudf::io::fst::detail::make_transition_table(qna_state_tt),
-    cudf::io::fst::detail::make_translation_functor(TransduceToNormalizedQuotes{}),
+    cudf::io::fst::detail::make_symbol_group_lut(cudf::io::json::normalize_quotes::qna_sgs),
+    cudf::io::fst::detail::make_transition_table(cudf::io::json::normalize_quotes::qna_state_tt),
+    cudf::io::fst::detail::make_translation_functor(
+      cudf::io::json::normalize_quotes::TransduceToNormalizedQuotes{}),
     stream);
 
-  std::unique_ptr<rmm::device_buffer> outbuf_ptr(inbuf.size() * 2, stream, mr);
-  parser.Transduce(inbuf.data(),
+  std::unique_ptr<rmm::device_uvector<char>> outbuf_ptr =
+    std::make_unique<rmm::device_uvector<char>>(inbuf.size() * 2, stream, mr);
+  parser.Transduce(reinterpret_cast<char*>(inbuf.data()),
                    static_cast<SymbolOffsetT>(inbuf.size()),
-                   outbuf_ptr.data(),
+                   outbuf_ptr->data(),
                    thrust::make_discard_iterator(),
                    thrust::make_discard_iterator(),
-                   normalize_quotes::start_state,
+                   cudf::io::json::normalize_quotes::start_state,
                    stream);
 
   return outbuf_ptr;
 }
 
-} // namespace detail
-} // namespace cudf::io::json
+}  // namespace detail
+}  // namespace cudf::io::json
diff --git a/cpp/src/io/json/read_json.hpp b/cpp/src/io/json/read_json.hpp
index 8d2d0c466a1..d3acfa7ebc2 100644
--- a/cpp/src/io/json/read_json.hpp
+++ b/cpp/src/io/json/read_json.hpp
@@ -22,6 +22,7 @@
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
 
 #include <memory>
@@ -41,6 +42,4 @@ size_type find_first_delimiter_in_chunk(host_span<std::unique_ptr<cudf::io::data
                                         json_reader_options const& reader_opts,
                                         char const delimiter,
                                         rmm::cuda_stream_view stream);
-
-std::unique_ptr<rmm::device_buffer> normalize_quotes(const rmm::device_buffer &inbuf, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr);
 }  // namespace cudf::io::json::detail
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index d0abcc225d1..a1503e5b297 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -300,6 +300,7 @@ ConfigureTest(JSON_TYPE_CAST_TEST io/json_type_cast_test.cu)
 ConfigureTest(NESTED_JSON_TEST io/nested_json_test.cpp io/json_tree.cpp)
 ConfigureTest(ARROW_IO_SOURCE_TEST io/arrow_io_source_test.cpp)
 ConfigureTest(MULTIBYTE_SPLIT_TEST io/text/multibyte_split_test.cpp)
+ConfigureTest(JSON_QUOTE_NORMALIZATION io/json_quote_normalization_test.cpp STREAM_MODE testing)
 ConfigureTest(
   DATA_CHUNK_SOURCE_TEST io/text/data_chunk_source_test.cpp
   GPUS 1
diff --git a/cpp/tests/io/json_quote_normalization_test.cpp b/cpp/tests/io/json_quote_normalization_test.cpp
index 68c2b13233c..2a96c93071b 100644
--- a/cpp/tests/io/json_quote_normalization_test.cpp
+++ b/cpp/tests/io/json_quote_normalization_test.cpp
@@ -14,53 +14,52 @@
  * limitations under the License.
  */
 
-#include <io/json/read_json.hpp>
-#include <io/utilities/hostdevice_vector.hpp>
-
-#include <cudf/io/datasource.hpp>
+#include <cudf/io/detail/json.hpp>
 #include <cudf/io/json.hpp>
-#include <cudf/io/parquet.hpp>
-#include <cudf/lists/lists_column_view.hpp>
 #include <cudf/scalar/scalar.hpp>
+#include <cudf/scalar/scalar_factories.hpp>
+#include <cudf/types.hpp>
 #include <cudf/utilities/span.hpp>
+#include <cudf/utilities/type_dispatcher.hpp>
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_utilities.hpp>
-#include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/cudf_gtest.hpp>
-#include <cudf_test/io_metadata_utilities.hpp>
-#include <cudf_test/table_utilities.hpp>
+#include <cudf_test/default_stream.hpp>
 
+#include <rmm/device_uvector.hpp>
 #include <rmm/mr/device/cuda_memory_resource.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
 
-#include <thrust/copy.h>
-#include <thrust/iterator/zip_iterator.h>
-
 #include <string>
 
 // Base test fixture for tests
 struct JsonNormalizationTest : public cudf::test::BaseFixture {};
 
-TEST_F(JsonNormalizationTest, Valid)
+TEST_F(JsonNormalizationTest, ValidOutput)
 {
-  // Test input
-  std::string const input = R"({"A":'TEST"'})";
-  auto device_input_ptr   = cudf::make_string_scalar(input, cudf::test::get_default_stream());
-  auto& device_input      = static_cast<scalar_type_t<std::string>&>(*device_input_ptr);
-
   // RMM memory resource
   std::shared_ptr<rmm::mr::device_memory_resource> rsc =
     std::make_shared<rmm::mr::cuda_memory_resource>();
 
-  auto device_fst_output_ptr =
-    normalize_quotes(device_input.data(), cudf::test::get_default_stream(), rsc.get());
+  // Test input
+  std::string const input = R"({"A":'TEST"'})";
+  rmm::device_uvector<char> device_input(input.size(), cudf::test::get_default_stream(), rsc.get());
+  thrust::copy(input.begin(), input.end(), device_input.begin());
+  auto device_input_span = cudf::device_span<std::byte>(
+    reinterpret_cast<std::byte*>(device_input.data()), device_input.size());
+
+  // Preprocessing FST
+  auto device_fst_output_ptr = cudf::io::json::detail::normalize_quotes(
+    device_input_span, cudf::test::get_default_stream(), rsc.get());
+
   // Initialize parsing options (reading json lines)
-  cudf::io::json_reader_options input_options = cudf::io::json_reader_options::builder(
-    cudf::io::source_info{device_span(*device_fst_output_ptr)});
+  auto device_fst_output_span = cudf::device_span<std::byte>(
+    reinterpret_cast<std::byte*>(device_fst_output_ptr->data()), device_fst_output_ptr->size());
+  cudf::io::json_reader_options input_options =
+    cudf::io::json_reader_options::builder(cudf::io::source_info{device_fst_output_span});
 
   cudf::io::table_with_metadata processed_table =
-    cudf::io::read_json(input_options, cudf::test::get_default_stream(), rsc);
+    cudf::io::read_json(input_options, cudf::test::get_default_stream(), rsc.get());
 }
 
 CUDF_TEST_PROGRAM_MAIN()

From cfe89e69b546c2246b6d6320a3adf6e45b3931f8 Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar <shruti.shivakumar@gmail.com>
Date: Thu, 11 Jan 2024 03:11:46 +0000
Subject: [PATCH 04/26] fix to tests

---
 cpp/tests/CMakeLists.txt                      |  2 +-
 .../io/json_quote_normalization_test.cpp      | 29 ++++++++++++++-----
 2 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index a1503e5b297..3d926cdb968 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -300,7 +300,7 @@ ConfigureTest(JSON_TYPE_CAST_TEST io/json_type_cast_test.cu)
 ConfigureTest(NESTED_JSON_TEST io/nested_json_test.cpp io/json_tree.cpp)
 ConfigureTest(ARROW_IO_SOURCE_TEST io/arrow_io_source_test.cpp)
 ConfigureTest(MULTIBYTE_SPLIT_TEST io/text/multibyte_split_test.cpp)
-ConfigureTest(JSON_QUOTE_NORMALIZATION io/json_quote_normalization_test.cpp STREAM_MODE testing)
+ConfigureTest(JSON_QUOTE_NORMALIZATION io/json_quote_normalization_test.cpp)
 ConfigureTest(
   DATA_CHUNK_SOURCE_TEST io/text/data_chunk_source_test.cpp
   GPUS 1
diff --git a/cpp/tests/io/json_quote_normalization_test.cpp b/cpp/tests/io/json_quote_normalization_test.cpp
index 2a96c93071b..99ca623663e 100644
--- a/cpp/tests/io/json_quote_normalization_test.cpp
+++ b/cpp/tests/io/json_quote_normalization_test.cpp
@@ -18,13 +18,15 @@
 #include <cudf/io/json.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
+#include <cudf/table/table.hpp>
+#include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/span.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/cudf_gtest.hpp>
-#include <cudf_test/default_stream.hpp>
+#include <cudf_test/table_utilities.hpp>
 
 #include <rmm/device_uvector.hpp>
 #include <rmm/mr/device/cuda_memory_resource.hpp>
@@ -42,15 +44,20 @@ TEST_F(JsonNormalizationTest, ValidOutput)
     std::make_shared<rmm::mr::cuda_memory_resource>();
 
   // Test input
-  std::string const input = R"({"A":'TEST"'})";
-  rmm::device_uvector<char> device_input(input.size(), cudf::test::get_default_stream(), rsc.get());
-  thrust::copy(input.begin(), input.end(), device_input.begin());
+  std::string const host_input = R"({"A":'TEST"'})";
+  thrust::device_vector<char> device_input(host_input.c_str(),
+                                           host_input.c_str() + host_input.size());
   auto device_input_span = cudf::device_span<std::byte>(
-    reinterpret_cast<std::byte*>(device_input.data()), device_input.size());
+    reinterpret_cast<std::byte*>(thrust::raw_pointer_cast(device_input.data())),
+    device_input.size());
 
   // Preprocessing FST
   auto device_fst_output_ptr = cudf::io::json::detail::normalize_quotes(
-    device_input_span, cudf::test::get_default_stream(), rsc.get());
+    device_input_span, cudf::get_default_stream(), rsc.get());
+  /*
+  for(size_t i = 0; i < device_fst_output_ptr->size(); i++)
+    std::printf("%c", device_fst_output_ptr->element(i, cudf::get_default_stream()));
+  */
 
   // Initialize parsing options (reading json lines)
   auto device_fst_output_span = cudf::device_span<std::byte>(
@@ -59,7 +66,15 @@ TEST_F(JsonNormalizationTest, ValidOutput)
     cudf::io::json_reader_options::builder(cudf::io::source_info{device_fst_output_span});
 
   cudf::io::table_with_metadata processed_table =
-    cudf::io::read_json(input_options, cudf::test::get_default_stream(), rsc.get());
+    cudf::io::read_json(input_options, cudf::get_default_stream(), rsc.get());
+
+  // Expected table
+  std::string const expected_input                     = R"({"A":"TEST\""})";
+  cudf::io::json_reader_options expected_input_options = cudf::io::json_reader_options::builder(
+    cudf::io::source_info{expected_input.data(), expected_input.size()});
+  cudf::io::table_with_metadata expected_table =
+    cudf::io::read_json(expected_input_options, cudf::get_default_stream(), rsc.get());
+  CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table.tbl->view(), processed_table.tbl->view());
 }
 
 CUDF_TEST_PROGRAM_MAIN()

From b2ce13b1781e47df1a31fea20d2b6f9e739a77a1 Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar <shruti.shivakumar@gmail.com>
Date: Thu, 11 Jan 2024 03:19:54 +0000
Subject: [PATCH 05/26] pre-commit formatting fixes

---
 cpp/include/cudf/io/detail/json.hpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp
index ea72304807d..b6106938c30 100644
--- a/cpp/include/cudf/io/detail/json.hpp
+++ b/cpp/include/cudf/io/detail/json.hpp
@@ -52,7 +52,8 @@ void write_json(data_sink* sink,
                 rmm::cuda_stream_view stream,
                 rmm::mr::device_memory_resource* mr);
 
-std::unique_ptr<rmm::device_uvector<char>> normalize_quotes(const cudf::device_span<std::byte>& inbuf,
-                                                     rmm::cuda_stream_view stream,
-                                                     rmm::mr::device_memory_resource* mr);
+std::unique_ptr<rmm::device_uvector<char>> normalize_quotes(
+  const cudf::device_span<std::byte>& inbuf,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr);
 }  // namespace cudf::io::json::detail

From 2134cf8ca0a9a2c43dfba77495d6b7f876fe7239 Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar <shruti.shivakumar@gmail.com>
Date: Thu, 11 Jan 2024 18:54:26 +0000
Subject: [PATCH 06/26] finally, the test passes

---
 cpp/src/io/json/json_quote_normalization.cu    |  4 +++-
 cpp/tests/io/json_quote_normalization_test.cpp | 12 ++++++++----
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/cpp/src/io/json/json_quote_normalization.cu b/cpp/src/io/json/json_quote_normalization.cu
index 2ddd27b9ec3..f9ec148b044 100644
--- a/cpp/src/io/json/json_quote_normalization.cu
+++ b/cpp/src/io/json/json_quote_normalization.cu
@@ -191,14 +191,16 @@ std::unique_ptr<rmm::device_uvector<char>> normalize_quotes(
 
   std::unique_ptr<rmm::device_uvector<char>> outbuf_ptr =
     std::make_unique<rmm::device_uvector<char>>(inbuf.size() * 2, stream, mr);
+  rmm::device_scalar<SymbolOffsetT> outbuf_size(stream, mr);
   parser.Transduce(reinterpret_cast<char*>(inbuf.data()),
                    static_cast<SymbolOffsetT>(inbuf.size()),
                    outbuf_ptr->data(),
                    thrust::make_discard_iterator(),
-                   thrust::make_discard_iterator(),
+                   outbuf_size.data(),
                    cudf::io::json::normalize_quotes::start_state,
                    stream);
 
+  outbuf_ptr->resize(outbuf_size.value(stream), stream);
   return outbuf_ptr;
 }
 
diff --git a/cpp/tests/io/json_quote_normalization_test.cpp b/cpp/tests/io/json_quote_normalization_test.cpp
index 99ca623663e..6a451dc8e7b 100644
--- a/cpp/tests/io/json_quote_normalization_test.cpp
+++ b/cpp/tests/io/json_quote_normalization_test.cpp
@@ -57,21 +57,25 @@ TEST_F(JsonNormalizationTest, ValidOutput)
   /*
   for(size_t i = 0; i < device_fst_output_ptr->size(); i++)
     std::printf("%c", device_fst_output_ptr->element(i, cudf::get_default_stream()));
+  std::printf("\n");
   */
 
   // Initialize parsing options (reading json lines)
   auto device_fst_output_span = cudf::device_span<std::byte>(
     reinterpret_cast<std::byte*>(device_fst_output_ptr->data()), device_fst_output_ptr->size());
   cudf::io::json_reader_options input_options =
-    cudf::io::json_reader_options::builder(cudf::io::source_info{device_fst_output_span});
+    cudf::io::json_reader_options::builder(cudf::io::source_info{device_fst_output_span})
+      .lines(true);
 
   cudf::io::table_with_metadata processed_table =
     cudf::io::read_json(input_options, cudf::get_default_stream(), rsc.get());
 
   // Expected table
-  std::string const expected_input                     = R"({"A":"TEST\""})";
-  cudf::io::json_reader_options expected_input_options = cudf::io::json_reader_options::builder(
-    cudf::io::source_info{expected_input.data(), expected_input.size()});
+  std::string const expected_input = R"({"A":"TEST\""})";
+  cudf::io::json_reader_options expected_input_options =
+    cudf::io::json_reader_options::builder(
+      cudf::io::source_info{expected_input.data(), expected_input.size()})
+      .lines(true);
   cudf::io::table_with_metadata expected_table =
     cudf::io::read_json(expected_input_options, cudf::get_default_stream(), rsc.get());
   CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table.tbl->view(), processed_table.tbl->view());

From 04e9d828a781e980bdfa23f36a784c8e48415128 Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar <shruti.shivakumar@gmail.com>
Date: Thu, 11 Jan 2024 20:36:56 +0000
Subject: [PATCH 07/26] try again with test stream

---
 cpp/tests/io/json_quote_normalization_test.cpp | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/cpp/tests/io/json_quote_normalization_test.cpp b/cpp/tests/io/json_quote_normalization_test.cpp
index 6a451dc8e7b..bc9db6d3f68 100644
--- a/cpp/tests/io/json_quote_normalization_test.cpp
+++ b/cpp/tests/io/json_quote_normalization_test.cpp
@@ -26,6 +26,7 @@
 
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/cudf_gtest.hpp>
+#include <cudf_test/default_stream.hpp>
 #include <cudf_test/table_utilities.hpp>
 
 #include <rmm/device_uvector.hpp>
@@ -45,18 +46,19 @@ TEST_F(JsonNormalizationTest, ValidOutput)
 
   // Test input
   std::string const host_input = R"({"A":'TEST"'})";
-  thrust::device_vector<char> device_input(host_input.c_str(),
-                                           host_input.c_str() + host_input.size());
+  rmm::device_uvector<char> device_input(
+    host_input.size(), cudf::test::get_default_stream(), rsc.get());
+  for (size_t i = 0; i < host_input.size(); i++)
+    device_input.set_element_async(i, host_input[i], cudf::test::get_default_stream());
   auto device_input_span = cudf::device_span<std::byte>(
-    reinterpret_cast<std::byte*>(thrust::raw_pointer_cast(device_input.data())),
-    device_input.size());
+    reinterpret_cast<std::byte*>(device_input.data()), device_input.size());
 
   // Preprocessing FST
   auto device_fst_output_ptr = cudf::io::json::detail::normalize_quotes(
-    device_input_span, cudf::get_default_stream(), rsc.get());
+    device_input_span, cudf::test::get_default_stream(), rsc.get());
   /*
   for(size_t i = 0; i < device_fst_output_ptr->size(); i++)
-    std::printf("%c", device_fst_output_ptr->element(i, cudf::get_default_stream()));
+    std::printf("%c", device_fst_output_ptr->element(i, cudf::test::get_default_stream()));
   std::printf("\n");
   */
 
@@ -68,7 +70,7 @@ TEST_F(JsonNormalizationTest, ValidOutput)
       .lines(true);
 
   cudf::io::table_with_metadata processed_table =
-    cudf::io::read_json(input_options, cudf::get_default_stream(), rsc.get());
+    cudf::io::read_json(input_options, cudf::test::get_default_stream(), rsc.get());
 
   // Expected table
   std::string const expected_input = R"({"A":"TEST\""})";
@@ -77,7 +79,7 @@ TEST_F(JsonNormalizationTest, ValidOutput)
       cudf::io::source_info{expected_input.data(), expected_input.size()})
       .lines(true);
   cudf::io::table_with_metadata expected_table =
-    cudf::io::read_json(expected_input_options, cudf::get_default_stream(), rsc.get());
+    cudf::io::read_json(expected_input_options, cudf::test::get_default_stream(), rsc.get());
   CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table.tbl->view(), processed_table.tbl->view());
 }
 

From fa11424c9977fe9bf8b03ddd34c1ba0550fb6287 Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar <shruti.shivakumar@gmail.com>
Date: Fri, 12 Jan 2024 23:46:20 +0000
Subject: [PATCH 08/26] added option to normalize single quotes in read_json

---
 cpp/include/cudf/io/detail/json.hpp           |  8 ++---
 cpp/include/cudf/io/json.hpp                  | 35 +++++++++++++++++--
 cpp/src/io/json/json_quote_normalization.cu   | 27 ++++++++++++++
 cpp/src/io/json/read_json.cu                  | 23 ++++++++++--
 .../io/json_quote_normalization_test.cpp      | 33 +++++++++++++++--
 5 files changed, 115 insertions(+), 11 deletions(-)

diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp
index b6106938c30..a3d8ebf57e0 100644
--- a/cpp/include/cudf/io/detail/json.hpp
+++ b/cpp/include/cudf/io/detail/json.hpp
@@ -52,8 +52,8 @@ void write_json(data_sink* sink,
                 rmm::cuda_stream_view stream,
                 rmm::mr::device_memory_resource* mr);
 
-std::unique_ptr<rmm::device_uvector<char>> normalize_quotes(
-  const cudf::device_span<std::byte>& inbuf,
-  rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr);
+rmm::device_uvector<char> normalize_single_quotes(const cudf::device_span<std::byte>& inbuf,
+                                                  rmm::cuda_stream_view stream,
+                                                  rmm::mr::device_memory_resource* mr);
+
 }  // namespace cudf::io::json::detail
diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
index 472d42b1db5..7e0293db647 100644
--- a/cpp/include/cudf/io/json.hpp
+++ b/cpp/include/cudf/io/json.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -98,7 +98,7 @@ class json_reader_options {
 
   // Read the file as a json object per line
   bool _lines = false;
-
+  
   // Bytes to skip from the start
   size_t _byte_range_offset = 0;
   // Bytes to read; always reads complete rows
@@ -113,6 +113,9 @@ class json_reader_options {
   // Whether to keep the quote characters of string values
   bool _keep_quotes = false;
 
+  // Normalize single quotes
+  bool _normalize_single_quotes = false;
+
   // Whether to recover after an invalid JSON line
   json_recovery_mode_t _recovery_mode = json_recovery_mode_t::FAIL;
 
@@ -246,6 +249,13 @@ class json_reader_options {
    */
   bool is_enabled_keep_quotes() const { return _keep_quotes; }
 
+  /**
+   * @brief Whether the reader should normalize single quotes around strings
+   *
+   * @returns true if the reader should normalize single quotes, false otherwise
+   */
+  bool is_enabled_normalize_single_quotes() const { return _normalize_single_quotes; }
+
   /**
    * @brief Queries the JSON reader's behavior on invalid JSON lines.
    *
@@ -324,6 +334,14 @@ class json_reader_options {
    */
   void enable_keep_quotes(bool val) { _keep_quotes = val; }
 
+  /**
+   * @brief Set whether the reader should enable normalization of single quotes around strings.
+   *
+   * @param val Boolean value to indicate whether the reader should normalize single quotes around
+   * strings
+   */
+  void enable_normalize_single_quotes(bool val) { _normalize_single_quotes = val; }
+
   /**
    * @brief Specifies the JSON reader's behavior on invalid JSON lines.
    *
@@ -474,6 +492,19 @@ class json_reader_options_builder {
     return *this;
   }
 
+  /**
+   * @brief Set whether the reader should normalize single quotes around string 
+   *
+   * @param val Boolean value to indicate whether the reader should normalize single quotes
+   * of strings
+   * @return this for chaining
+   */
+  json_reader_options_builder& normalize_single_quotes(bool val)
+  {
+    options._normalize_single_quotes = val;
+    return *this;
+  }
+
   /**
    * @brief Specifies the JSON reader's behavior on invalid JSON lines.
    *
diff --git a/cpp/src/io/json/json_quote_normalization.cu b/cpp/src/io/json/json_quote_normalization.cu
index f9ec148b044..d2088bc34d3 100644
--- a/cpp/src/io/json/json_quote_normalization.cu
+++ b/cpp/src/io/json/json_quote_normalization.cu
@@ -177,6 +177,7 @@ struct TransduceToNormalizedQuotes {
 
 namespace detail {
 
+/*
 std::unique_ptr<rmm::device_uvector<char>> normalize_quotes(
   const cudf::device_span<std::byte>& inbuf,
   rmm::cuda_stream_view stream,
@@ -203,6 +204,32 @@ std::unique_ptr<rmm::device_uvector<char>> normalize_quotes(
   outbuf_ptr->resize(outbuf_size.value(stream), stream);
   return outbuf_ptr;
 }
+*/
+
+rmm::device_uvector<SymbolT> normalize_single_quotes(const cudf::device_span<std::byte>& inbuf,
+                                                     rmm::cuda_stream_view stream,
+                                                     rmm::mr::device_memory_resource* mr)
+{
+  auto parser = cudf::io::fst::detail::make_fst(
+    cudf::io::fst::detail::make_symbol_group_lut(cudf::io::json::normalize_quotes::qna_sgs),
+    cudf::io::fst::detail::make_transition_table(cudf::io::json::normalize_quotes::qna_state_tt),
+    cudf::io::fst::detail::make_translation_functor(
+      cudf::io::json::normalize_quotes::TransduceToNormalizedQuotes{}),
+    stream);
+
+  rmm::device_uvector<SymbolT> outbuf(inbuf.size() * 2, stream, mr);
+  rmm::device_scalar<SymbolOffsetT> outbuf_size(stream, mr);
+  parser.Transduce(reinterpret_cast<SymbolT*>(inbuf.data()),
+                   static_cast<SymbolOffsetT>(inbuf.size()),
+                   outbuf.data(),
+                   thrust::make_discard_iterator(),
+                   outbuf_size.data(),
+                   cudf::io::json::normalize_quotes::start_state,
+                   stream);
+
+  outbuf.resize(outbuf_size.value(stream), stream);
+  return outbuf;
+}
 
 }  // namespace detail
 }  // namespace cudf::io::json
diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu
index 080da7800f4..0d7edb65ef7 100644
--- a/cpp/src/io/json/read_json.cu
+++ b/cpp/src/io/json/read_json.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@
 
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
+#include <cudf/io/detail/json.hpp>
 #include <cudf/utilities/error.hpp>
 
 #include <rmm/exec_policy.hpp>
@@ -49,6 +50,7 @@ rmm::device_uvector<char> ingest_raw_input(host_span<std::unique_ptr<datasource>
                                            compression_type compression,
                                            size_t range_offset,
                                            size_t range_size,
+                                           bool normalize_single_quotes, 
                                            rmm::cuda_stream_view stream)
 {
   CUDF_FUNC_RANGE();
@@ -103,7 +105,12 @@ rmm::device_uvector<char> ingest_raw_input(host_span<std::unique_ptr<datasource>
     }
 
     stream.synchronize();
-    return d_buffer;
+    if(normalize_single_quotes) {
+      auto d_buffer_span = cudf::device_span<std::byte>(
+        reinterpret_cast<std::byte*>(d_buffer.data()), d_buffer.size());
+      return cudf::io::json::detail::normalize_single_quotes(d_buffer_span, stream, rmm::mr::get_current_device_resource());
+    }
+    else return d_buffer;
 
   } else {
     auto buffer = std::vector<uint8_t>(total_source_size);
@@ -111,10 +118,16 @@ rmm::device_uvector<char> ingest_raw_input(host_span<std::unique_ptr<datasource>
     // Reading to host because decompression of a single block is much faster on the CPU
     sources[0]->host_read(range_offset, total_source_size, buffer.data());
     auto const uncomp_data = decompress(compression, buffer);
-    return cudf::detail::make_device_uvector_sync(
+    auto d_buffer = cudf::detail::make_device_uvector_sync(
       host_span<char const>{reinterpret_cast<char const*>(uncomp_data.data()), uncomp_data.size()},
       stream,
       rmm::mr::get_current_device_resource());
+    if(normalize_single_quotes) {
+      auto d_buffer_span = cudf::device_span<std::byte>(
+        reinterpret_cast<std::byte*>(d_buffer.data()), d_buffer.size());
+      return cudf::io::json::detail::normalize_single_quotes(d_buffer_span, stream, rmm::mr::get_current_device_resource());
+    }
+    else return d_buffer;
   }
 }
 
@@ -127,6 +140,7 @@ size_type find_first_delimiter_in_chunk(host_span<std::unique_ptr<cudf::io::data
                                        reader_opts.get_compression(),
                                        reader_opts.get_byte_range_offset(),
                                        reader_opts.get_byte_range_size(),
+                                       reader_opts.is_enabled_normalize_single_quotes(),
                                        stream);
   return find_first_delimiter(buffer, delimiter, stream);
 }
@@ -158,6 +172,7 @@ auto get_record_range_raw_input(host_span<std::unique_ptr<datasource>> sources,
                                  reader_opts.get_compression(),
                                  reader_opts.get_byte_range_offset(),
                                  reader_opts.get_byte_range_size(),
+                                 reader_opts.is_enabled_normalize_single_quotes(),
                                  stream);
   if (should_load_whole_source(reader_opts)) return buffer;
   auto first_delim_pos =
@@ -175,6 +190,7 @@ auto get_record_range_raw_input(host_span<std::unique_ptr<datasource>> sources,
                                 reader_opts.get_compression(),
                                 current_offset,
                                 reader_opts.get_byte_range_size(),
+                                reader_opts.is_enabled_normalize_single_quotes(),
                                 stream);
       next_delim_pos = find_first_delimiter(buffer, '\n', stream);
       if (next_delim_pos == -1) { current_offset += reader_opts.get_byte_range_size(); }
@@ -188,6 +204,7 @@ auto get_record_range_raw_input(host_span<std::unique_ptr<datasource>> sources,
                             reader_opts.get_compression(),
                             first_delim_pos,
                             next_delim_pos - first_delim_pos,
+                            reader_opts.is_enabled_normalize_single_quotes(),
                             stream);
   }
 }
diff --git a/cpp/tests/io/json_quote_normalization_test.cpp b/cpp/tests/io/json_quote_normalization_test.cpp
index bc9db6d3f68..d0f4bcddda9 100644
--- a/cpp/tests/io/json_quote_normalization_test.cpp
+++ b/cpp/tests/io/json_quote_normalization_test.cpp
@@ -54,7 +54,7 @@ TEST_F(JsonNormalizationTest, ValidOutput)
     reinterpret_cast<std::byte*>(device_input.data()), device_input.size());
 
   // Preprocessing FST
-  auto device_fst_output_ptr = cudf::io::json::detail::normalize_quotes(
+  auto device_fst_output = cudf::io::json::detail::normalize_single_quotes(
     device_input_span, cudf::test::get_default_stream(), rsc.get());
   /*
   for(size_t i = 0; i < device_fst_output_ptr->size(); i++)
@@ -64,7 +64,7 @@ TEST_F(JsonNormalizationTest, ValidOutput)
 
   // Initialize parsing options (reading json lines)
   auto device_fst_output_span = cudf::device_span<std::byte>(
-    reinterpret_cast<std::byte*>(device_fst_output_ptr->data()), device_fst_output_ptr->size());
+    reinterpret_cast<std::byte*>(device_fst_output.data()), device_fst_output.size());
   cudf::io::json_reader_options input_options =
     cudf::io::json_reader_options::builder(cudf::io::source_info{device_fst_output_span})
       .lines(true);
@@ -83,4 +83,33 @@ TEST_F(JsonNormalizationTest, ValidOutput)
   CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table.tbl->view(), processed_table.tbl->view());
 }
 
+TEST_F(JsonNormalizationTest, ReadJsonOption)
+{
+  // RMM memory resource
+  std::shared_ptr<rmm::mr::device_memory_resource> rsc =
+    std::make_shared<rmm::mr::cuda_memory_resource>();
+
+  // Test input
+  std::string const host_input = R"({"A":'TEST"'})";
+  cudf::io::json_reader_options input_options =
+    cudf::io::json_reader_options::builder(
+      cudf::io::source_info{host_input.data(), host_input.size()})
+      .lines(true)
+      .normalize_single_quotes(true);
+
+  cudf::io::table_with_metadata processed_table =
+    cudf::io::read_json(input_options, cudf::test::get_default_stream(), rsc.get());
+
+  // Expected table
+  std::string const expected_input = R"({"A":"TEST\""})";
+  cudf::io::json_reader_options expected_input_options =
+    cudf::io::json_reader_options::builder(
+      cudf::io::source_info{expected_input.data(), expected_input.size()})
+      .lines(true);
+
+  cudf::io::table_with_metadata expected_table =
+    cudf::io::read_json(expected_input_options, cudf::test::get_default_stream(), rsc.get());
+  CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table.tbl->view(), processed_table.tbl->view());
+}
+
 CUDF_TEST_PROGRAM_MAIN()

From 2e86d89ed8eb8e37451f8b391ee02440e3046ebc Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar <shruti.shivakumar@gmail.com>
Date: Fri, 12 Jan 2024 23:54:23 +0000
Subject: [PATCH 09/26] formatting fixes

---
 cpp/include/cudf/io/json.hpp |  4 ++--
 cpp/src/io/json/read_json.cu | 22 ++++++++++++----------
 2 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
index 7e0293db647..a087e6b40fb 100644
--- a/cpp/include/cudf/io/json.hpp
+++ b/cpp/include/cudf/io/json.hpp
@@ -98,7 +98,7 @@ class json_reader_options {
 
   // Read the file as a json object per line
   bool _lines = false;
-  
+
   // Bytes to skip from the start
   size_t _byte_range_offset = 0;
   // Bytes to read; always reads complete rows
@@ -493,7 +493,7 @@ class json_reader_options_builder {
   }
 
   /**
-   * @brief Set whether the reader should normalize single quotes around string 
+   * @brief Set whether the reader should normalize single quotes around strings
    *
    * @param val Boolean value to indicate whether the reader should normalize single quotes
    * of strings
diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu
index 0d7edb65ef7..c159fc2378e 100644
--- a/cpp/src/io/json/read_json.cu
+++ b/cpp/src/io/json/read_json.cu
@@ -50,7 +50,7 @@ rmm::device_uvector<char> ingest_raw_input(host_span<std::unique_ptr<datasource>
                                            compression_type compression,
                                            size_t range_offset,
                                            size_t range_size,
-                                           bool normalize_single_quotes, 
+                                           bool normalize_single_quotes,
                                            rmm::cuda_stream_view stream)
 {
   CUDF_FUNC_RANGE();
@@ -105,12 +105,13 @@ rmm::device_uvector<char> ingest_raw_input(host_span<std::unique_ptr<datasource>
     }
 
     stream.synchronize();
-    if(normalize_single_quotes) {
+    if (normalize_single_quotes) {
       auto d_buffer_span = cudf::device_span<std::byte>(
         reinterpret_cast<std::byte*>(d_buffer.data()), d_buffer.size());
-      return cudf::io::json::detail::normalize_single_quotes(d_buffer_span, stream, rmm::mr::get_current_device_resource());
-    }
-    else return d_buffer;
+      return cudf::io::json::detail::normalize_single_quotes(
+        d_buffer_span, stream, rmm::mr::get_current_device_resource());
+    } else
+      return d_buffer;
 
   } else {
     auto buffer = std::vector<uint8_t>(total_source_size);
@@ -118,16 +119,17 @@ rmm::device_uvector<char> ingest_raw_input(host_span<std::unique_ptr<datasource>
     // Reading to host because decompression of a single block is much faster on the CPU
     sources[0]->host_read(range_offset, total_source_size, buffer.data());
     auto const uncomp_data = decompress(compression, buffer);
-    auto d_buffer = cudf::detail::make_device_uvector_sync(
+    auto d_buffer          = cudf::detail::make_device_uvector_sync(
       host_span<char const>{reinterpret_cast<char const*>(uncomp_data.data()), uncomp_data.size()},
       stream,
       rmm::mr::get_current_device_resource());
-    if(normalize_single_quotes) {
+    if (normalize_single_quotes) {
       auto d_buffer_span = cudf::device_span<std::byte>(
         reinterpret_cast<std::byte*>(d_buffer.data()), d_buffer.size());
-      return cudf::io::json::detail::normalize_single_quotes(d_buffer_span, stream, rmm::mr::get_current_device_resource());
-    }
-    else return d_buffer;
+      return cudf::io::json::detail::normalize_single_quotes(
+        d_buffer_span, stream, rmm::mr::get_current_device_resource());
+    } else
+      return d_buffer;
   }
 }
 

From 9925c1094446ac773b5e623c9f1e3977f1a222b3 Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar <shruti.shivakumar@gmail.com>
Date: Sat, 13 Jan 2024 03:38:46 +0000
Subject: [PATCH 10/26] adding testing_main

---
 cpp/tests/io/json_quote_normalization_test.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cpp/tests/io/json_quote_normalization_test.cpp b/cpp/tests/io/json_quote_normalization_test.cpp
index d0f4bcddda9..3dbb6468a65 100644
--- a/cpp/tests/io/json_quote_normalization_test.cpp
+++ b/cpp/tests/io/json_quote_normalization_test.cpp
@@ -28,6 +28,7 @@
 #include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/default_stream.hpp>
 #include <cudf_test/table_utilities.hpp>
+#include <cudf_test/testing_main.hpp>
 
 #include <rmm/device_uvector.hpp>
 #include <rmm/mr/device/cuda_memory_resource.hpp>

From 2838c74d2c0013e4b86face14b97fe3842a63039 Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar <shruti.shivakumar@gmail.com>
Date: Sat, 13 Jan 2024 03:52:18 +0000
Subject: [PATCH 11/26] java bindings

---
 .../main/java/ai/rapids/cudf/JSONOptions.java | 15 +++++++++++++++
 java/src/main/java/ai/rapids/cudf/Table.java  | 16 ++++++++++------
 java/src/main/native/src/TableJni.cpp         | 18 +++++++++++-------
 .../test/java/ai/rapids/cudf/TableTest.java   | 19 +++++++++++++++++++
 java/src/test/resources/single_quotes.json    |  2 ++
 5 files changed, 57 insertions(+), 13 deletions(-)
 create mode 100644 java/src/test/resources/single_quotes.json

diff --git a/java/src/main/java/ai/rapids/cudf/JSONOptions.java b/java/src/main/java/ai/rapids/cudf/JSONOptions.java
index f98687df5fa..d3f906d8af1 100644
--- a/java/src/main/java/ai/rapids/cudf/JSONOptions.java
+++ b/java/src/main/java/ai/rapids/cudf/JSONOptions.java
@@ -30,12 +30,14 @@ public final class JSONOptions extends ColumnFilterOptions {
   private final boolean dayFirst;
   private final boolean lines;
   private final boolean recoverWithNull;
+  private final boolean normalizeSingleQuotes;
 
   private JSONOptions(Builder builder) {
     super(builder);
     dayFirst = builder.dayFirst;
     lines = builder.lines;
     recoverWithNull = builder.recoverWithNull;
+    normalizeSingleQuotes = builder.normalizeSingleQuotes;
   }
 
   public boolean isDayFirst() {
@@ -51,6 +53,10 @@ public boolean isRecoverWithNull() {
     return recoverWithNull;
   }
 
+  public boolean isNormalizeSingleQuotes() {
+    return normalizeSingleQuotes;
+  }
+
   @Override
   String[] getIncludeColumnNames() {
     throw new UnsupportedOperationException("JSON reader didn't support column prune");
@@ -65,6 +71,7 @@ public static final class Builder  extends ColumnFilterOptions.Builder<JSONOptio
     private boolean lines = true;
 
     private boolean recoverWithNull = false;
+    private boolean normalizeSingleQuotes = false;
 
     /**
      * Whether to parse dates as DD/MM versus MM/DD
@@ -101,6 +108,14 @@ public Builder withRecoverWithNull(boolean recoverWithNull) {
       return this;
     }
 
+    /**
+     * Whether single quotes around strings should be normalized to double quotes.
+     */
+    public Builder withNormalizeSingleQuotes(boolean normalizeSingleQuotes) {
+      this.normalizeSingleQuotes = normalizeSingleQuotes;
+      return this;
+    }
+
     @Override
     public Builder includeColumn(String... names) {
       throw new UnsupportedOperationException("JSON reader didn't support column prune");
diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java
index 50ea54ddaab..ea5c452aba6 100644
--- a/java/src/main/java/ai/rapids/cudf/Table.java
+++ b/java/src/main/java/ai/rapids/cudf/Table.java
@@ -250,16 +250,18 @@ private static native long readJSON(String[] columnNames,
                                         int[] dTypeIds, int[] dTypeScales,
                                         String filePath, long address, long length,
                                         boolean dayFirst, boolean lines,
-                                        boolean recoverWithNulls) throws CudfException;
+                                        boolean recoverWithNulls,
+                                        boolean normalizeSingleQuotes) throws CudfException;
 
   private static native long readJSONFromDataSource(String[] columnNames,
                                       int[] dTypeIds, int[] dTypeScales,
                                       boolean dayFirst, boolean lines,
                                       boolean recoverWithNulls,
+                                      boolean normalizeSingleQuotes,
                                       long dsHandle) throws CudfException;
 
   private static native long readAndInferJSON(long address, long length,
-      boolean dayFirst, boolean lines, boolean recoverWithNulls) throws CudfException;
+      boolean dayFirst, boolean lines, boolean recoverWithNulls, boolean normalizeSingleQuotes) throws CudfException;
 
   /**
    * Read in Parquet formatted data.
@@ -1087,7 +1089,8 @@ public static Table readJSON(Schema schema, JSONOptions opts, File path) {
             readJSON(schema.getColumnNames(), schema.getTypeIds(), schema.getTypeScales(),
                     path.getAbsolutePath(),
                     0, 0,
-                    opts.isDayFirst(), opts.isLines(), opts.isRecoverWithNull()))) {
+                    opts.isDayFirst(), opts.isLines(), opts.isRecoverWithNull(),
+                    opts.isNormalizeSingleQuotes()))) {
 
       return gatherJSONColumns(schema, twm);
     }
@@ -1139,7 +1142,8 @@ public static TableWithMeta readJSON(JSONOptions opts, HostMemoryBuffer buffer,
     assert len <= buffer.length - offset;
     assert offset >= 0 && offset < buffer.length;
     return new TableWithMeta(readAndInferJSON(buffer.getAddress() + offset, len,
-        opts.isDayFirst(), opts.isLines(), opts.isRecoverWithNull()));
+        opts.isDayFirst(), opts.isLines(), opts.isRecoverWithNull()),
+        opts.isNormalizeSingleQuotes());
   }
 
   /**
@@ -1162,7 +1166,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b
     try (TableWithMeta twm = new TableWithMeta(readJSON(schema.getColumnNames(),
             schema.getTypeIds(), schema.getTypeScales(), null,
             buffer.getAddress() + offset, len, opts.isDayFirst(), opts.isLines(),
-            opts.isRecoverWithNull()))) {
+            opts.isRecoverWithNull(), opts.isNormalizeSingleQuotes()))) {
       return gatherJSONColumns(schema, twm);
     }
   }
@@ -1178,7 +1182,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds) {
     long dsHandle = DataSourceHelper.createWrapperDataSource(ds);
     try (TableWithMeta twm = new TableWithMeta(readJSONFromDataSource(schema.getColumnNames(),
             schema.getTypeIds(), schema.getTypeScales(), opts.isDayFirst(), opts.isLines(),
-            opts.isRecoverWithNull(), dsHandle))) {
+            opts.isRecoverWithNull(), opts.isNormalizeSingleQuotes(), dsHandle))) {
       return gatherJSONColumns(schema, twm);
     } finally {
       DataSourceHelper.destroyWrapperDataSource(dsHandle);
diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index d7d0279174d..ebbf619cc01 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -1392,7 +1392,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_endWriteCSVToBuffer(JNIEnv *env
 
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON(
     JNIEnv *env, jclass, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines,
-    jboolean recover_with_null) {
+    jboolean recover_with_null, jboolean normalize_single_quotes) {
 
   JNI_NULL_CHECK(env, buffer, "buffer cannot be null", 0);
   if (buffer_length <= 0) {
@@ -1411,7 +1411,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON(
     cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source)
                                                      .dayfirst(static_cast<bool>(day_first))
                                                      .lines(static_cast<bool>(lines))
-                                                     .recovery_mode(recovery_mode);
+                                                     .recovery_mode(recovery_mode)
+                                                     .normalize_single_quotes(static_cast<bool>(normalize_single_quotes));
 
     auto result =
         std::make_unique<cudf::io::table_with_metadata>(cudf::io::read_json(opts.build()));
@@ -1469,7 +1470,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_TableWithMeta_releaseTable(JNIE
 
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource(
     JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales,
-    jboolean day_first, jboolean lines, jboolean recover_with_null, jlong ds_handle) {
+    jboolean day_first, jboolean lines, jboolean recover_with_null, jboolean normalize_single_quotes,
+    jlong ds_handle) {
 
   JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0);
 
@@ -1504,7 +1506,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource(
     cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source)
                                                      .dayfirst(static_cast<bool>(day_first))
                                                      .lines(static_cast<bool>(lines))
-                                                     .recovery_mode(recovery_mode);
+                                                     .recovery_mode(recovery_mode)
+                                                     .normalize_single_quotes(static_cast<bool>(normalize_single_quotes));
 
     if (!n_col_names.is_null() && data_types.size() > 0) {
       if (n_col_names.size() != n_types.size()) {
@@ -1536,7 +1539,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource(
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(
     JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales,
     jstring inputfilepath, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines,
-    jboolean recover_with_null) {
+    jboolean recover_with_null, jboolean normalize_single_quotes) {
 
   bool read_buffer = true;
   if (buffer == 0) {
@@ -1586,7 +1589,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(
     cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source)
                                                      .dayfirst(static_cast<bool>(day_first))
                                                      .lines(static_cast<bool>(lines))
-                                                     .recovery_mode(recovery_mode);
+                                                     .recovery_mode(recovery_mode)
+                                                     .normalize_single_quotes(static_cast<bool>(normalize_single_quotes));
 
     if (!n_col_names.is_null() && data_types.size() > 0) {
       if (n_col_names.size() != n_types.size()) {
diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java
index 8df8ebea8a7..78aa11a074e 100644
--- a/java/src/test/java/ai/rapids/cudf/TableTest.java
+++ b/java/src/test/java/ai/rapids/cudf/TableTest.java
@@ -87,6 +87,7 @@ public class TableTest extends CudfTestBase {
   private static final File TEST_SIMPLE_CSV_FILE = TestUtils.getResourceAsFile("simple.csv");
   private static final File TEST_SIMPLE_JSON_FILE = TestUtils.getResourceAsFile("people.json");
   private static final File TEST_JSON_ERROR_FILE = TestUtils.getResourceAsFile("people_with_invalid_lines.json");
+  private static final File TEST_JSON_SINGLE_QUOTES_FILE = TestUtils.getResourceAsFile("single_quotes.json");
 
   private static final Schema CSV_DATA_BUFFER_SCHEMA = Schema.builder()
       .column(DType.INT32, "A")
@@ -327,6 +328,24 @@ void testReadJSONFile() {
     }
   }
 
+  @Test
+  void testReadSingleQuotesJSONFile() throws IOException {
+    Schema schema = Schema.builder()
+            .column(DType.STRING, "A")
+            .build();
+    JSONOptions opts = JSONOptions.builder()
+            .withLines(true)
+            .withNormalizeSingleQuotes(true)
+            .build();
+    try (Table expected = new Table.TestBuilder()
+            .column("TEST\"", "TESTER'")
+            .build();
+         MultiBufferDataSource source = sourceFrom(TEST_JSON_SINGLE_QUOTES_FILE);
+         Table table = Table.readJSON(schema, opts, source)) {
+            assertTablesAreEqual(expected, table);
+         }
+  }
+
   @Test
   void testReadJSONFromDataSource() throws IOException {
     Schema schema = Schema.builder()
diff --git a/java/src/test/resources/single_quotes.json b/java/src/test/resources/single_quotes.json
new file mode 100644
index 00000000000..cb432fbc643
--- /dev/null
+++ b/java/src/test/resources/single_quotes.json
@@ -0,0 +1,2 @@
+{"A":'TEST"'}
+{'A':"TESTER'"}

From 23139553a166959dd4a8d9dc1d8185831f438b54 Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar <shruti.shivakumar@gmail.com>
Date: Sat, 13 Jan 2024 03:57:12 +0000
Subject: [PATCH 12/26] formatting fixes

---
 java/src/main/native/src/TableJni.cpp | 37 +++++++++++++++------------
 1 file changed, 20 insertions(+), 17 deletions(-)

diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index ebbf619cc01..95a5904c24f 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -1408,11 +1408,12 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON(
     auto const recovery_mode = recover_with_null ?
                                    cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL :
                                    cudf::io::json_recovery_mode_t::FAIL;
-    cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source)
-                                                     .dayfirst(static_cast<bool>(day_first))
-                                                     .lines(static_cast<bool>(lines))
-                                                     .recovery_mode(recovery_mode)
-                                                     .normalize_single_quotes(static_cast<bool>(normalize_single_quotes));
+    cudf::io::json_reader_options_builder opts =
+        cudf::io::json_reader_options::builder(source)
+            .dayfirst(static_cast<bool>(day_first))
+            .lines(static_cast<bool>(lines))
+            .recovery_mode(recovery_mode)
+            .normalize_single_quotes(static_cast<bool>(normalize_single_quotes));
 
     auto result =
         std::make_unique<cudf::io::table_with_metadata>(cudf::io::read_json(opts.build()));
@@ -1470,8 +1471,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_TableWithMeta_releaseTable(JNIE
 
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource(
     JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales,
-    jboolean day_first, jboolean lines, jboolean recover_with_null, jboolean normalize_single_quotes,
-    jlong ds_handle) {
+    jboolean day_first, jboolean lines, jboolean recover_with_null,
+    jboolean normalize_single_quotes, jlong ds_handle) {
 
   JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0);
 
@@ -1503,11 +1504,12 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource(
     cudf::io::json_recovery_mode_t recovery_mode =
         recover_with_null ? cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL :
                             cudf::io::json_recovery_mode_t::FAIL;
-    cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source)
-                                                     .dayfirst(static_cast<bool>(day_first))
-                                                     .lines(static_cast<bool>(lines))
-                                                     .recovery_mode(recovery_mode)
-                                                     .normalize_single_quotes(static_cast<bool>(normalize_single_quotes));
+    cudf::io::json_reader_options_builder opts =
+        cudf::io::json_reader_options::builder(source)
+            .dayfirst(static_cast<bool>(day_first))
+            .lines(static_cast<bool>(lines))
+            .recovery_mode(recovery_mode)
+            .normalize_single_quotes(static_cast<bool>(normalize_single_quotes));
 
     if (!n_col_names.is_null() && data_types.size() > 0) {
       if (n_col_names.size() != n_types.size()) {
@@ -1586,11 +1588,12 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(
     cudf::io::json_recovery_mode_t recovery_mode =
         recover_with_null ? cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL :
                             cudf::io::json_recovery_mode_t::FAIL;
-    cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source)
-                                                     .dayfirst(static_cast<bool>(day_first))
-                                                     .lines(static_cast<bool>(lines))
-                                                     .recovery_mode(recovery_mode)
-                                                     .normalize_single_quotes(static_cast<bool>(normalize_single_quotes));
+    cudf::io::json_reader_options_builder opts =
+        cudf::io::json_reader_options::builder(source)
+            .dayfirst(static_cast<bool>(day_first))
+            .lines(static_cast<bool>(lines))
+            .recovery_mode(recovery_mode)
+            .normalize_single_quotes(static_cast<bool>(normalize_single_quotes));
 
     if (!n_col_names.is_null() && data_types.size() > 0) {
       if (n_col_names.size() != n_types.size()) {

From a5bb42e9f47309a6c36f51080c820f5637b44260 Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar <shruti.shivakumar@gmail.com>
Date: Sat, 13 Jan 2024 07:25:46 +0000
Subject: [PATCH 13/26] compile fix

---
 java/src/main/java/ai/rapids/cudf/Table.java | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java
index ea5c452aba6..b734800b4bc 100644
--- a/java/src/main/java/ai/rapids/cudf/Table.java
+++ b/java/src/main/java/ai/rapids/cudf/Table.java
@@ -1142,8 +1142,8 @@ public static TableWithMeta readJSON(JSONOptions opts, HostMemoryBuffer buffer,
     assert len <= buffer.length - offset;
     assert offset >= 0 && offset < buffer.length;
     return new TableWithMeta(readAndInferJSON(buffer.getAddress() + offset, len,
-        opts.isDayFirst(), opts.isLines(), opts.isRecoverWithNull()),
-        opts.isNormalizeSingleQuotes());
+        opts.isDayFirst(), opts.isLines(), opts.isRecoverWithNull(),
+        opts.isNormalizeSingleQuotes()));
   }
 
   /**

From e63bca0868ca11a8e3cabdc891f2ba66d36045dd Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar <shruti.shivakumar@gmail.com>
Date: Tue, 16 Jan 2024 15:33:55 -0800
Subject: [PATCH 14/26] Update java/src/test/java/ai/rapids/cudf/TableTest.java

Co-authored-by: Andy Grove <andygrove73@gmail.com>
---
 java/src/test/java/ai/rapids/cudf/TableTest.java | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java
index 78aa11a074e..126bab9a9d8 100644
--- a/java/src/test/java/ai/rapids/cudf/TableTest.java
+++ b/java/src/test/java/ai/rapids/cudf/TableTest.java
@@ -346,6 +346,21 @@ void testReadSingleQuotesJSONFile() throws IOException {
          }
   }
 
+  @Test
+  void testReadSingleQuotesJSONFileFeatureDisabled() throws IOException {
+    Schema schema = Schema.builder()
+      .column(DType.STRING, "A")
+      .build();
+    JSONOptions opts = JSONOptions.builder()
+      .withLines(true)
+      .withNormalizeSingleQuotes(false)
+      .build();
+    try (MultiBufferDataSource source = sourceFrom(TEST_JSON_SINGLE_QUOTES_FILE)) {
+      assertThrows(CudfException.class, () ->
+        Table.readJSON(schema, opts, source));
+    }
+  }
+
   @Test
   void testReadJSONFromDataSource() throws IOException {
     Schema schema = Schema.builder()

From 005b5c280264f608007560860ff9dbaf84ce6814 Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar <shruti.shivakumar@gmail.com>
Date: Tue, 16 Jan 2024 15:35:07 -0800
Subject: [PATCH 15/26] Update java/src/test/java/ai/rapids/cudf/TableTest.java

Co-authored-by: Andy Grove <andygrove73@gmail.com>
---
 java/src/test/java/ai/rapids/cudf/TableTest.java | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java
index 126bab9a9d8..f3441d5b0cb 100644
--- a/java/src/test/java/ai/rapids/cudf/TableTest.java
+++ b/java/src/test/java/ai/rapids/cudf/TableTest.java
@@ -342,8 +342,8 @@ void testReadSingleQuotesJSONFile() throws IOException {
             .build();
          MultiBufferDataSource source = sourceFrom(TEST_JSON_SINGLE_QUOTES_FILE);
          Table table = Table.readJSON(schema, opts, source)) {
-            assertTablesAreEqual(expected, table);
-         }
+      assertTablesAreEqual(expected, table);
+    }
   }
 
   @Test

From 1a8f5f31d1e4ade94c3407f10e18139d77d797cc Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar <shruti.shivakumar@gmail.com>
Date: Tue, 16 Jan 2024 23:35:40 +0000
Subject: [PATCH 16/26] added an error test for when normalize quotes is not
 enabled

---
 cpp/tests/io/json_quote_normalization_test.cpp | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/cpp/tests/io/json_quote_normalization_test.cpp b/cpp/tests/io/json_quote_normalization_test.cpp
index 3dbb6468a65..2d6e35cc3ad 100644
--- a/cpp/tests/io/json_quote_normalization_test.cpp
+++ b/cpp/tests/io/json_quote_normalization_test.cpp
@@ -113,4 +113,21 @@ TEST_F(JsonNormalizationTest, ReadJsonOption)
   CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table.tbl->view(), processed_table.tbl->view());
 }
 
+TEST_F(JsonNormalizationTest, ErrorCheck)
+{
+  // RMM memory resource
+  std::shared_ptr<rmm::mr::device_memory_resource> rsc =
+    std::make_shared<rmm::mr::cuda_memory_resource>();
+
+  // Test input
+  std::string const host_input = R"({"A":'TEST"'})";
+  cudf::io::json_reader_options input_options =
+    cudf::io::json_reader_options::builder(
+      cudf::io::source_info{host_input.data(), host_input.size()})
+      .lines(true);
+
+  EXPECT_THROW(cudf::io::read_json(input_options, cudf::test::get_default_stream(), rsc.get()),
+               cudf::logic_error);
+}
+
 CUDF_TEST_PROGRAM_MAIN()

From 2001866a80602d0bcc2f3ec9ec18c846ff8a6191 Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar <shruti.shivakumar@gmail.com>
Date: Thu, 18 Jan 2024 00:11:38 +0000
Subject: [PATCH 17/26] addressing PR reviews; adding comments

---
 cpp/include/cudf/io/detail/json.hpp         |  9 +++++-
 cpp/src/io/json/json_quote_normalization.cu | 33 ++-------------------
 cpp/src/io/json/read_json.cu                | 13 ++++++++
 cpp/src/io/json/read_json.hpp               |  2 +-
 4 files changed, 24 insertions(+), 33 deletions(-)

diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp
index a3d8ebf57e0..f67daf3b4da 100644
--- a/cpp/include/cudf/io/detail/json.hpp
+++ b/cpp/include/cudf/io/detail/json.hpp
@@ -52,7 +52,14 @@ void write_json(data_sink* sink,
                 rmm::cuda_stream_view stream,
                 rmm::mr::device_memory_resource* mr);
 
-rmm::device_uvector<char> normalize_single_quotes(const cudf::device_span<std::byte>& inbuf,
+/**
+ * @brief Normalize single quotes to double quotes using FST
+ *
+ * @param inbuf Input device span buffer
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource to use for device memory allocation
+ */
+rmm::device_uvector<char> normalize_single_quotes(cudf::device_span<std::byte const> inbuf,
                                                   rmm::cuda_stream_view stream,
                                                   rmm::mr::device_memory_resource* mr);
 
diff --git a/cpp/src/io/json/json_quote_normalization.cu b/cpp/src/io/json/json_quote_normalization.cu
index d2088bc34d3..3d80bb93a15 100644
--- a/cpp/src/io/json/json_quote_normalization.cu
+++ b/cpp/src/io/json/json_quote_normalization.cu
@@ -177,36 +177,7 @@ struct TransduceToNormalizedQuotes {
 
 namespace detail {
 
-/*
-std::unique_ptr<rmm::device_uvector<char>> normalize_quotes(
-  const cudf::device_span<std::byte>& inbuf,
-  rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
-{
-  auto parser = cudf::io::fst::detail::make_fst(
-    cudf::io::fst::detail::make_symbol_group_lut(cudf::io::json::normalize_quotes::qna_sgs),
-    cudf::io::fst::detail::make_transition_table(cudf::io::json::normalize_quotes::qna_state_tt),
-    cudf::io::fst::detail::make_translation_functor(
-      cudf::io::json::normalize_quotes::TransduceToNormalizedQuotes{}),
-    stream);
-
-  std::unique_ptr<rmm::device_uvector<char>> outbuf_ptr =
-    std::make_unique<rmm::device_uvector<char>>(inbuf.size() * 2, stream, mr);
-  rmm::device_scalar<SymbolOffsetT> outbuf_size(stream, mr);
-  parser.Transduce(reinterpret_cast<char*>(inbuf.data()),
-                   static_cast<SymbolOffsetT>(inbuf.size()),
-                   outbuf_ptr->data(),
-                   thrust::make_discard_iterator(),
-                   outbuf_size.data(),
-                   cudf::io::json::normalize_quotes::start_state,
-                   stream);
-
-  outbuf_ptr->resize(outbuf_size.value(stream), stream);
-  return outbuf_ptr;
-}
-*/
-
-rmm::device_uvector<SymbolT> normalize_single_quotes(const cudf::device_span<std::byte>& inbuf,
+rmm::device_uvector<SymbolT> normalize_single_quotes(cudf::device_span<std::byte const> inbuf,
                                                      rmm::cuda_stream_view stream,
                                                      rmm::mr::device_memory_resource* mr)
 {
@@ -219,7 +190,7 @@ rmm::device_uvector<SymbolT> normalize_single_quotes(const cudf::device_span<std
 
   rmm::device_uvector<SymbolT> outbuf(inbuf.size() * 2, stream, mr);
   rmm::device_scalar<SymbolOffsetT> outbuf_size(stream, mr);
-  parser.Transduce(reinterpret_cast<SymbolT*>(inbuf.data()),
+  parser.Transduce(reinterpret_cast<const SymbolT*>(inbuf.data()),
                    static_cast<SymbolOffsetT>(inbuf.size()),
                    outbuf.data(),
                    thrust::make_discard_iterator(),
diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu
index c159fc2378e..af4c78475c0 100644
--- a/cpp/src/io/json/read_json.cu
+++ b/cpp/src/io/json/read_json.cu
@@ -46,6 +46,16 @@ size_t sources_size(host_span<std::unique_ptr<datasource>> const sources,
   });
 }
 
+/**
+ * @brief Read from array of data sources into RMM buffer
+ *
+ * @param sources Array of data sources
+ * @param compression Compression format of source
+ * @param range_offset Number of bytes to skip from source start
+ * @param range_size Number of bytes to read from source
+ * @param normalize_single_quotes Boolean to indicate whether pre-processing FST should be called
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ */
 rmm::device_uvector<char> ingest_raw_input(host_span<std::unique_ptr<datasource>> sources,
                                            compression_type compression,
                                            size_t range_offset,
@@ -105,6 +115,8 @@ rmm::device_uvector<char> ingest_raw_input(host_span<std::unique_ptr<datasource>
     }
 
     stream.synchronize();
+    // If the option to normalize single quotes is enabled, invoke the pre-processing FST on the
+    // whole buffer; the buffer contents are not inspected beforehand
     if (normalize_single_quotes) {
       auto d_buffer_span = cudf::device_span<std::byte>(
         reinterpret_cast<std::byte*>(d_buffer.data()), d_buffer.size());
@@ -123,6 +135,7 @@ rmm::device_uvector<char> ingest_raw_input(host_span<std::unique_ptr<datasource>
       host_span<char const>{reinterpret_cast<char const*>(uncomp_data.data()), uncomp_data.size()},
       stream,
       rmm::mr::get_current_device_resource());
+    // Quote normalization FST
     if (normalize_single_quotes) {
       auto d_buffer_span = cudf::device_span<std::byte>(
         reinterpret_cast<std::byte*>(d_buffer.data()), d_buffer.size());
diff --git a/cpp/src/io/json/read_json.hpp b/cpp/src/io/json/read_json.hpp
index d3acfa7ebc2..d05134fa837 100644
--- a/cpp/src/io/json/read_json.hpp
+++ b/cpp/src/io/json/read_json.hpp
@@ -22,7 +22,6 @@
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/device_uvector.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
 
 #include <memory>
@@ -42,4 +41,5 @@ size_type find_first_delimiter_in_chunk(host_span<std::unique_ptr<cudf::io::data
                                         json_reader_options const& reader_opts,
                                         char const delimiter,
                                         rmm::cuda_stream_view stream);
+
 }  // namespace cudf::io::json::detail

From d0fefbd512b8551f96f422b74181d1c530800b23 Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar <shruti.shivakumar@gmail.com>
Date: Thu, 18 Jan 2024 07:17:31 +0000
Subject: [PATCH 18/26] moved tests; removed duplicated fst code

---
 cpp/tests/CMakeLists.txt                      |   1 -
 cpp/tests/io/fst/quote_normalization_test.cu  | 332 ------------------
 .../io/json_quote_normalization_test.cpp      | 147 ++++++--
 3 files changed, 119 insertions(+), 361 deletions(-)
 delete mode 100644 cpp/tests/io/fst/quote_normalization_test.cu

diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 3a7a0dd55e9..60324e525ff 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -315,7 +315,6 @@ ConfigureTest(
   PERCENT 30
 )
 target_link_libraries(DATA_CHUNK_SOURCE_TEST PRIVATE ZLIB::ZLIB)
-ConfigureTest(QUOTE_NORMALIZATION_TEST io/fst/quote_normalization_test.cu)
 ConfigureTest(LOGICAL_STACK_TEST io/fst/logical_stack_test.cu)
 ConfigureTest(FST_TEST io/fst/fst_test.cu)
 ConfigureTest(TYPE_INFERENCE_TEST io/type_inference_test.cu)
diff --git a/cpp/tests/io/fst/quote_normalization_test.cu b/cpp/tests/io/fst/quote_normalization_test.cu
deleted file mode 100644
index d0794b8f17e..00000000000
--- a/cpp/tests/io/fst/quote_normalization_test.cu
+++ /dev/null
@@ -1,332 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <io/fst/lookup_tables.cuh>
-#include <io/utilities/hostdevice_vector.hpp>
-
-#include <cudf_test/base_fixture.hpp>
-#include <cudf_test/cudf_gtest.hpp>
-#include <cudf_test/testing_main.hpp>
-
-#include <cudf/scalar/scalar_factories.hpp>
-#include <cudf/strings/repeat_strings.hpp>
-#include <cudf/types.hpp>
-
-#include <rmm/cuda_stream.hpp>
-#include <rmm/cuda_stream_view.hpp>
-#include <rmm/device_buffer.hpp>
-#include <rmm/device_uvector.hpp>
-
-#include <thrust/iterator/discard_iterator.h>
-
-#include <cstdlib>
-#include <string>
-#include <vector>
-
-namespace {
-
-// Type used to represent the atomic symbol type used within the finite-state machine
-// TODO: type aliasing to be declared in a common header for better maintainability and
-//        pre-empt future bugs
-using SymbolT = char;
-using StateT  = char;
-
-// Type sufficiently large to index symbols within the input and output (may be unsigned)
-using SymbolOffsetT = uint32_t;
-enum class dfa_states : char { TT_OOS = 0U, TT_DQS, TT_SQS, TT_DEC, TT_SEC, TT_NUM_STATES };
-enum class dfa_symbol_group_id : uint32_t {
-  DOUBLE_QUOTE_CHAR,  ///< Quote character SG: "
-  SINGLE_QUOTE_CHAR,  ///< Quote character SG: '
-  ESCAPE_CHAR,        ///< Escape character SG: '\'
-  NEWLINE_CHAR,       ///< Newline character SG: '\n'
-  OTHER_SYMBOLS,      ///< SG implicitly matching all other characters
-  NUM_SYMBOL_GROUPS   ///< Total number of symbol groups
-};
-
-// Aliases for readability of the transition table
-constexpr auto TT_OOS            = dfa_states::TT_OOS;
-constexpr auto TT_DQS            = dfa_states::TT_DQS;
-constexpr auto TT_SQS            = dfa_states::TT_SQS;
-constexpr auto TT_DEC            = dfa_states::TT_DEC;
-constexpr auto TT_SEC            = dfa_states::TT_SEC;
-constexpr auto TT_NUM_STATES     = static_cast<char>(dfa_states::TT_NUM_STATES);
-constexpr auto NUM_SYMBOL_GROUPS = static_cast<uint32_t>(dfa_symbol_group_id::NUM_SYMBOL_GROUPS);
-
-// The i-th string representing all the characters of a symbol group
-std::array<std::vector<SymbolT>, NUM_SYMBOL_GROUPS - 1> const qna_sgs{
-  {{'\"'}, {'\''}, {'\\'}, {'\n'}}};
-
-// Transition table
-std::array<std::array<dfa_states, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const qna_state_tt{{
-  /* IN_STATE      "       '       \       \n    OTHER  */
-  /* TT_OOS */ {{TT_DQS, TT_SQS, TT_OOS, TT_OOS, TT_OOS}},
-  /* TT_DQS */ {{TT_OOS, TT_DQS, TT_DEC, TT_OOS, TT_DQS}},
-  /* TT_SQS */ {{TT_SQS, TT_OOS, TT_SEC, TT_OOS, TT_SQS}},
-  /* TT_DEC */ {{TT_DQS, TT_DQS, TT_DQS, TT_OOS, TT_DQS}},
-  /* TT_SEC */ {{TT_SQS, TT_SQS, TT_SQS, TT_OOS, TT_SQS}},
-}};
-
-// The DFA's starting state
-constexpr char start_state = static_cast<char>(TT_OOS);
-
-struct TransduceToNormalizedQuotes {
-  /**
-   * @brief Returns the <relative_offset>-th output symbol on the transition (state_id, match_id).
-   */
-  template <typename StateT, typename SymbolGroupT, typename RelativeOffsetT, typename SymbolT>
-  constexpr CUDF_HOST_DEVICE SymbolT operator()(StateT const state_id,
-                                                SymbolGroupT const match_id,
-                                                RelativeOffsetT const relative_offset,
-                                                SymbolT const read_symbol) const
-  {
-    // -------- TRANSLATION TABLE ------------
-    //      Let the alphabet set be Sigma
-    // ---------------------------------------
-    // ---------- NON-SPECIAL CASES: ----------
-    //      Output symbol same as input symbol <s>
-    // state | read_symbol <s> -> output_symbol <s>
-    // DQS   | Sigma           -> Sigma
-    // DEC   | Sigma           -> Sigma
-    // OOS   | Sigma\{'}       -> Sigma\{'}
-    // SQS   | Sigma\{', "}    -> Sigma\{', "}
-    // ---------- SPECIAL CASES: --------------
-    //    Input symbol translates to output symbol
-    // OOS   | {'}             -> {"}
-    // SQS   | {'}             -> {"}
-    // SQS   | {"}             -> {\"}
-    // SQS   | {\}             -> <nop>
-    // SEC   | {'}             -> {'}
-    // SEC   | Sigma\{'}       -> {\*}
-
-    // Whether this transition translates to the escape sequence: \"
-    const bool outputs_escape_sequence =
-      (state_id == static_cast<StateT>(dfa_states::TT_SQS)) &&
-      (match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::DOUBLE_QUOTE_CHAR));
-    // Case when a double quote needs to be replaced by the escape sequence: \"
-    if (outputs_escape_sequence) { return (relative_offset == 0) ? '\\' : '"'; }
-    // Case when a single quote needs to be replaced by a double quote
-    if ((match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::SINGLE_QUOTE_CHAR)) &&
-        ((state_id == static_cast<StateT>(dfa_states::TT_SQS)) ||
-         (state_id == static_cast<StateT>(dfa_states::TT_OOS)))) {
-      return '"';
-    }
-    // Case when the read symbol is an escape character - the actual translation for \<s> for some
-    // symbol <s> is handled by transitions from SEC. For now, there is no output for this
-    // transition
-    if ((match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::ESCAPE_CHAR)) &&
-        ((state_id == static_cast<StateT>(dfa_states::TT_SQS)))) {
-      return 0;
-    }
-    // Case when an escaped single quote in an input single-quoted string needs to be replaced by an
-    // unescaped single quote
-    if ((match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::SINGLE_QUOTE_CHAR)) &&
-        ((state_id == static_cast<StateT>(dfa_states::TT_SEC)))) {
-      return '\'';
-    }
-    // Case when an escaped symbol <s> that is not a single-quote needs to be replaced with \<s>
-    if (state_id == static_cast<StateT>(dfa_states::TT_SEC)) {
-      return (relative_offset == 0) ? '\\' : read_symbol;
-    }
-    // In all other cases we simply output the input symbol
-    return read_symbol;
-  }
-
-  /**
-   * @brief Returns the number of output characters for a given transition. During quote
-   * normalization, we always emit one output character (i.e., either the input character or the
-   * single quote-input replaced by a double quote), except when we need to escape a double quote
-   * that was previously inside a single-quoted string.
-   */
-  template <typename StateT, typename SymbolGroupT, typename SymbolT>
-  constexpr CUDF_HOST_DEVICE int32_t operator()(StateT const state_id,
-                                                SymbolGroupT const match_id,
-                                                SymbolT const read_symbol) const
-  {
-    // Whether this transition translates to the escape sequence: \"
-    const bool sqs_outputs_escape_sequence =
-      (state_id == static_cast<StateT>(dfa_states::TT_SQS)) &&
-      (match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::DOUBLE_QUOTE_CHAR));
-    // Number of characters to output on this transition
-    if (sqs_outputs_escape_sequence) { return 2; }
-    // Whether this transition translates to the escape sequence \<s> or unescaped '
-    const bool sec_outputs_escape_sequence =
-      (state_id == static_cast<StateT>(dfa_states::TT_SEC)) &&
-      (match_id != static_cast<SymbolGroupT>(dfa_symbol_group_id::SINGLE_QUOTE_CHAR));
-    // Number of characters to output on this transition
-    if (sec_outputs_escape_sequence) { return 2; }
-    // Whether this transition translates to no output <nop>
-    const bool sqs_outputs_nop =
-      (state_id == static_cast<StateT>(dfa_states::TT_SQS)) &&
-      (match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::ESCAPE_CHAR));
-    // Number of characters to output on this transition
-    if (sqs_outputs_nop) { return 0; }
-    return 1;
-  }
-};
-
-}  // namespace
-
-// Base test fixture for tests
-struct FstTest : public cudf::test::BaseFixture {};
-
-void run_test(std::string& input, std::string& output)
-{
-  // Prepare cuda stream for data transfers & kernels
-  rmm::cuda_stream stream{};
-  rmm::cuda_stream_view stream_view(stream);
-
-  auto parser = cudf::io::fst::detail::make_fst(
-    cudf::io::fst::detail::make_symbol_group_lut(qna_sgs),
-    cudf::io::fst::detail::make_transition_table(qna_state_tt),
-    cudf::io::fst::detail::make_translation_functor(TransduceToNormalizedQuotes{}),
-    stream);
-
-  auto d_input_scalar = cudf::make_string_scalar(input, stream_view);
-  auto& d_input       = static_cast<cudf::scalar_type_t<std::string>&>(*d_input_scalar);
-
-  // Prepare input & output buffers
-  constexpr std::size_t single_item = 1;
-  cudf::detail::hostdevice_vector<SymbolT> output_gpu(input.size() * 2, stream_view);
-  cudf::detail::hostdevice_vector<SymbolOffsetT> output_gpu_size(single_item, stream_view);
-
-  // Allocate device-side temporary storage & run algorithm
-  parser.Transduce(d_input.data(),
-                   static_cast<SymbolOffsetT>(d_input.size()),
-                   output_gpu.device_ptr(),
-                   thrust::make_discard_iterator(),
-                   output_gpu_size.device_ptr(),
-                   start_state,
-                   stream_view);
-
-  // Async copy results from device to host
-  output_gpu.device_to_host_async(stream_view);
-  output_gpu_size.device_to_host_async(stream_view);
-
-  // Make sure results have been copied back to host
-  stream.synchronize();
-
-  // Verify results
-  ASSERT_EQ(output_gpu_size[0], output.size());
-  CUDF_TEST_EXPECT_VECTOR_EQUAL(output_gpu, output, output.size());
-}
-
-TEST_F(FstTest, GroundTruth_QuoteNormalization1)
-{
-  std::string input  = R"({"A":'TEST"'})";
-  std::string output = R"({"A":"TEST\""})";
-  run_test(input, output);
-}
-
-TEST_F(FstTest, GroundTruth_QuoteNormalization2)
-{
-  std::string input  = R"({'A':"TEST'"} ['OTHER STUFF'])";
-  std::string output = R"({"A":"TEST'"} ["OTHER STUFF"])";
-  run_test(input, output);
-}
-
-TEST_F(FstTest, GroundTruth_QuoteNormalization3)
-{
-  std::string input  = R"(['{"A": "B"}',"{'A': 'B'}"])";
-  std::string output = R"(["{\"A\": \"B\"}","{'A': 'B'}"])";
-  run_test(input, output);
-}
-
-TEST_F(FstTest, GroundTruth_QuoteNormalization4)
-{
-  std::string input = R"({"ain't ain't a word and you ain't supposed to say it":'"""""""""""'})";
-  std::string output =
-    R"({"ain't ain't a word and you ain't supposed to say it":"\"\"\"\"\"\"\"\"\"\"\""})";
-  run_test(input, output);
-}
-
-TEST_F(FstTest, GroundTruth_QuoteNormalization5)
-{
-  std::string input  = R"({"\"'\"'\"'\"'":'"\'"\'"\'"\'"'})";
-  std::string output = R"({"\"'\"'\"'\"'":"\"'\"'\"'\"'\""})";
-  run_test(input, output);
-}
-
-TEST_F(FstTest, GroundTruth_QuoteNormalization6)
-{
-  std::string input  = R"([{"ABC':'CBA":'XYZ":"ZXY'}])";
-  std::string output = R"([{"ABC':'CBA":"XYZ\":\"ZXY"}])";
-  run_test(input, output);
-}
-
-TEST_F(FstTest, GroundTruth_QuoteNormalization7)
-{
-  std::string input  = R"(["\t","\\t","\\","\\\'\"\\\\","\n","\b"])";
-  std::string output = R"(["\t","\\t","\\","\\\'\"\\\\","\n","\b"])";
-  run_test(input, output);
-}
-
-TEST_F(FstTest, GroundTruth_QuoteNormalization8)
-{
-  std::string input  = R"(['\t','\\t','\\','\\\"\'\\\\','\n','\b','\u0012'])";
-  std::string output = R"(["\t","\\t","\\","\\\"'\\\\","\n","\b","\u0012"])";
-  run_test(input, output);
-}
-
-TEST_F(FstTest, GroundTruth_QuoteNormalization_Invalid1)
-{
-  std::string input  = R"(["THIS IS A TEST'])";
-  std::string output = R"(["THIS IS A TEST'])";
-  run_test(input, output);
-}
-
-TEST_F(FstTest, GroundTruth_QuoteNormalization_Invalid2)
-{
-  std::string input  = R"(['THIS IS A TEST"])";
-  std::string output = R"(["THIS IS A TEST\"])";
-  run_test(input, output);
-}
-
-TEST_F(FstTest, GroundTruth_QuoteNormalization_Invalid3)
-{
-  std::string input  = R"({"MORE TEST'N":'RESUL})";
-  std::string output = R"({"MORE TEST'N":"RESUL})";
-  run_test(input, output);
-}
-
-TEST_F(FstTest, GroundTruth_QuoteNormalization_Invalid4)
-{
-  std::string input  = R"({"NUMBER":100'0,'STRING':'SOMETHING'})";
-  std::string output = R"({"NUMBER":100"0,"STRING":"SOMETHING"})";
-  run_test(input, output);
-}
-
-TEST_F(FstTest, GroundTruth_QuoteNormalization_Invalid5)
-{
-  std::string input  = R"({'NUMBER':100"0,"STRING":"SOMETHING"})";
-  std::string output = R"({"NUMBER":100"0,"STRING":"SOMETHING"})";
-  run_test(input, output);
-}
-
-TEST_F(FstTest, GroundTruth_QuoteNormalization_Invalid6)
-{
-  std::string input  = R"({'a':'\\''})";
-  std::string output = R"({"a":"\\""})";
-  run_test(input, output);
-}
-
-TEST_F(FstTest, GroundTruth_QuoteNormalization_Invalid7)
-{
-  std::string input  = R"(}'a': 'b'{)";
-  std::string output = R"(}"a": "b"{)";
-  run_test(input, output);
-}
-
-CUDF_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/io/json_quote_normalization_test.cpp b/cpp/tests/io/json_quote_normalization_test.cpp
index 2d6e35cc3ad..92450d07403 100644
--- a/cpp/tests/io/json_quote_normalization_test.cpp
+++ b/cpp/tests/io/json_quote_normalization_test.cpp
@@ -39,49 +39,140 @@
 // Base test fixture for tests
 struct JsonNormalizationTest : public cudf::test::BaseFixture {};
 
-TEST_F(JsonNormalizationTest, ValidOutput)
+void run_test(const std::string& host_input, const std::string& expected_host_output)
 {
   // RMM memory resource
   std::shared_ptr<rmm::mr::device_memory_resource> rsc =
     std::make_shared<rmm::mr::cuda_memory_resource>();
 
-  // Test input
-  std::string const host_input = R"({"A":'TEST"'})";
   rmm::device_uvector<char> device_input(
     host_input.size(), cudf::test::get_default_stream(), rsc.get());
-  for (size_t i = 0; i < host_input.size(); i++)
-    device_input.set_element_async(i, host_input[i], cudf::test::get_default_stream());
+  CUDF_CUDA_TRY(cudaMemcpyAsync(device_input.data(),
+                                host_input.data(),
+                                host_input.size(),
+                                cudaMemcpyHostToDevice,
+                                cudf::test::get_default_stream().value()));
   auto device_input_span = cudf::device_span<std::byte>(
     reinterpret_cast<std::byte*>(device_input.data()), device_input.size());
 
   // Preprocessing FST
   auto device_fst_output = cudf::io::json::detail::normalize_single_quotes(
     device_input_span, cudf::test::get_default_stream(), rsc.get());
-  /*
-  for(size_t i = 0; i < device_fst_output_ptr->size(); i++)
-    std::printf("%c", device_fst_output_ptr->element(i, cudf::test::get_default_stream()));
-  std::printf("\n");
-  */
-
-  // Initialize parsing options (reading json lines)
-  auto device_fst_output_span = cudf::device_span<std::byte>(
-    reinterpret_cast<std::byte*>(device_fst_output.data()), device_fst_output.size());
-  cudf::io::json_reader_options input_options =
-    cudf::io::json_reader_options::builder(cudf::io::source_info{device_fst_output_span})
-      .lines(true);
 
-  cudf::io::table_with_metadata processed_table =
-    cudf::io::read_json(input_options, cudf::test::get_default_stream(), rsc.get());
+  std::string preprocessed_host_output(device_fst_output.size(), 0);
+  CUDF_CUDA_TRY(cudaMemcpyAsync(preprocessed_host_output.data(),
+                                device_fst_output.data(),
+                                preprocessed_host_output.size(),
+                                cudaMemcpyDeviceToHost,
+                                cudf::test::get_default_stream().value()));
+  cudf::test::get_default_stream().synchronize();  // D2H copy must land before host compare
+  EXPECT_EQ(preprocessed_host_output, expected_host_output);  // checks length and content
+}
 
-  // Expected table
-  std::string const expected_input = R"({"A":"TEST\""})";
-  cudf::io::json_reader_options expected_input_options =
-    cudf::io::json_reader_options::builder(
-      cudf::io::source_info{expected_input.data(), expected_input.size()})
-      .lines(true);
-  cudf::io::table_with_metadata expected_table =
-    cudf::io::read_json(expected_input_options, cudf::test::get_default_stream(), rsc.get());
-  CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table.tbl->view(), processed_table.tbl->view());
+TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization1)
+{
+  std::string input  = R"({"A":'TEST"'})";
+  std::string output = R"({"A":"TEST\""})";
+  run_test(input, output);
+}
+
+TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization2)
+{
+  std::string input  = R"({'A':"TEST'"} ['OTHER STUFF'])";
+  std::string output = R"({"A":"TEST'"} ["OTHER STUFF"])";
+  run_test(input, output);
+}
+
+TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization3)
+{
+  std::string input  = R"(['{"A": "B"}',"{'A': 'B'}"])";
+  std::string output = R"(["{\"A\": \"B\"}","{'A': 'B'}"])";
+  run_test(input, output);
+}
+
+TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization4)
+{
+  std::string input = R"({"ain't ain't a word and you ain't supposed to say it":'"""""""""""'})";
+  std::string output =
+    R"({"ain't ain't a word and you ain't supposed to say it":"\"\"\"\"\"\"\"\"\"\"\""})";
+  run_test(input, output);
+}
+
+TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization5)
+{
+  std::string input  = R"({"\"'\"'\"'\"'":'"\'"\'"\'"\'"'})";
+  std::string output = R"({"\"'\"'\"'\"'":"\"'\"'\"'\"'\""})";
+  run_test(input, output);
+}
+
+TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization6)
+{
+  std::string input  = R"([{"ABC':'CBA":'XYZ":"ZXY'}])";
+  std::string output = R"([{"ABC':'CBA":"XYZ\":\"ZXY"}])";
+  run_test(input, output);
+}
+
+TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization7)
+{
+  std::string input  = R"(["\t","\\t","\\","\\\'\"\\\\","\n","\b"])";
+  std::string output = R"(["\t","\\t","\\","\\\'\"\\\\","\n","\b"])";
+  run_test(input, output);
+}
+
+TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization8)
+{
+  std::string input  = R"(['\t','\\t','\\','\\\"\'\\\\','\n','\b','\u0012'])";
+  std::string output = R"(["\t","\\t","\\","\\\"'\\\\","\n","\b","\u0012"])";
+  run_test(input, output);
+}
+
+TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid1)
+{
+  std::string input  = R"(["THIS IS A TEST'])";
+  std::string output = R"(["THIS IS A TEST'])";
+  run_test(input, output);
+}
+
+TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid2)
+{
+  std::string input  = R"(['THIS IS A TEST"])";
+  std::string output = R"(["THIS IS A TEST\"])";
+  run_test(input, output);
+}
+
+TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid3)
+{
+  std::string input  = R"({"MORE TEST'N":'RESUL})";
+  std::string output = R"({"MORE TEST'N":"RESUL})";
+  run_test(input, output);
+}
+
+TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid4)
+{
+  std::string input  = R"({"NUMBER":100'0,'STRING':'SOMETHING'})";
+  std::string output = R"({"NUMBER":100"0,"STRING":"SOMETHING"})";
+  run_test(input, output);
+}
+
+TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid5)
+{
+  std::string input  = R"({'NUMBER':100"0,"STRING":"SOMETHING"})";
+  std::string output = R"({"NUMBER":100"0,"STRING":"SOMETHING"})";
+  run_test(input, output);
+}
+
+TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid6)
+{
+  std::string input  = R"({'a':'\\''})";
+  std::string output = R"({"a":"\\""})";
+  run_test(input, output);
+}
+
+TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid7)
+{
+  std::string input  = R"(}'a': 'b'{)";
+  std::string output = R"(}"a": "b"{)";
+  run_test(input, output);
 }
 
 TEST_F(JsonNormalizationTest, ReadJsonOption)

From 55503e374feec5765dd3abd0580056191bffe5f9 Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar <shruti.shivakumar@gmail.com>
Date: Thu, 18 Jan 2024 07:42:09 +0000
Subject: [PATCH 19/26] moved preprocess step to read_json

---
 cpp/src/io/json/read_json.cu | 38 +++++++++++++-----------------------
 1 file changed, 14 insertions(+), 24 deletions(-)

diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu
index af4c78475c0..bf50edce5fa 100644
--- a/cpp/src/io/json/read_json.cu
+++ b/cpp/src/io/json/read_json.cu
@@ -53,14 +53,12 @@ size_t sources_size(host_span<std::unique_ptr<datasource>> const sources,
  * @param compression Compression format of source
  * @param range_offset Number of bytes to skip from source start
  * @param range_size Number of bytes to read from source
- * @param normalize_single_quotes Boolean to indicate whether pre-processing FST should be called
  * @param stream CUDA stream used for device memory operations and kernel launches
  */
 rmm::device_uvector<char> ingest_raw_input(host_span<std::unique_ptr<datasource>> sources,
                                            compression_type compression,
                                            size_t range_offset,
                                            size_t range_size,
-                                           bool normalize_single_quotes,
                                            rmm::cuda_stream_view stream)
 {
   CUDF_FUNC_RANGE();
@@ -115,15 +113,7 @@ rmm::device_uvector<char> ingest_raw_input(host_span<std::unique_ptr<datasource>
     }
 
     stream.synchronize();
-    // If input JSON buffer has single quotes and option to normalize single quotes is enabled,
-    // invoke pre-processing FST
-    if (normalize_single_quotes) {
-      auto d_buffer_span = cudf::device_span<std::byte>(
-        reinterpret_cast<std::byte*>(d_buffer.data()), d_buffer.size());
-      return cudf::io::json::detail::normalize_single_quotes(
-        d_buffer_span, stream, rmm::mr::get_current_device_resource());
-    } else
-      return d_buffer;
+    return d_buffer;
 
   } else {
     auto buffer = std::vector<uint8_t>(total_source_size);
@@ -131,18 +121,10 @@ rmm::device_uvector<char> ingest_raw_input(host_span<std::unique_ptr<datasource>
     // Reading to host because decompression of a single block is much faster on the CPU
     sources[0]->host_read(range_offset, total_source_size, buffer.data());
     auto const uncomp_data = decompress(compression, buffer);
-    auto d_buffer          = cudf::detail::make_device_uvector_sync(
+    return cudf::detail::make_device_uvector_sync(
       host_span<char const>{reinterpret_cast<char const*>(uncomp_data.data()), uncomp_data.size()},
       stream,
       rmm::mr::get_current_device_resource());
-    // Quote normalization FST
-    if (normalize_single_quotes) {
-      auto d_buffer_span = cudf::device_span<std::byte>(
-        reinterpret_cast<std::byte*>(d_buffer.data()), d_buffer.size());
-      return cudf::io::json::detail::normalize_single_quotes(
-        d_buffer_span, stream, rmm::mr::get_current_device_resource());
-    } else
-      return d_buffer;
   }
 }
 
@@ -155,7 +137,6 @@ size_type find_first_delimiter_in_chunk(host_span<std::unique_ptr<cudf::io::data
                                        reader_opts.get_compression(),
                                        reader_opts.get_byte_range_offset(),
                                        reader_opts.get_byte_range_size(),
-                                       reader_opts.is_enabled_normalize_single_quotes(),
                                        stream);
   return find_first_delimiter(buffer, delimiter, stream);
 }
@@ -187,7 +168,6 @@ auto get_record_range_raw_input(host_span<std::unique_ptr<datasource>> sources,
                                  reader_opts.get_compression(),
                                  reader_opts.get_byte_range_offset(),
                                  reader_opts.get_byte_range_size(),
-                                 reader_opts.is_enabled_normalize_single_quotes(),
                                  stream);
   if (should_load_whole_source(reader_opts)) return buffer;
   auto first_delim_pos =
@@ -205,7 +185,6 @@ auto get_record_range_raw_input(host_span<std::unique_ptr<datasource>> sources,
                                 reader_opts.get_compression(),
                                 current_offset,
                                 reader_opts.get_byte_range_size(),
-                                reader_opts.is_enabled_normalize_single_quotes(),
                                 stream);
       next_delim_pos = find_first_delimiter(buffer, '\n', stream);
       if (next_delim_pos == -1) { current_offset += reader_opts.get_byte_range_size(); }
@@ -219,7 +198,6 @@ auto get_record_range_raw_input(host_span<std::unique_ptr<datasource>> sources,
                             reader_opts.get_compression(),
                             first_delim_pos,
                             next_delim_pos - first_delim_pos,
-                            reader_opts.is_enabled_normalize_single_quotes(),
                             stream);
   }
 }
@@ -251,6 +229,18 @@ table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources,
 
   auto const buffer = get_record_range_raw_input(sources, reader_opts, stream);
 
+  // If input JSON buffer has single quotes and option to normalize single quotes is enabled,
+  // invoke pre-processing FST
+  if (reader_opts.is_enabled_normalize_single_quotes()) {
+    auto buffer_span = cudf::device_span<std::byte const>(
+      reinterpret_cast<const std::byte*>(buffer.data()), buffer.size());
+    return device_parse_nested_json(cudf::io::json::detail::normalize_single_quotes(
+                                      buffer_span, stream, rmm::mr::get_current_device_resource()),
+                                    reader_opts,
+                                    stream,
+                                    mr);
+  }
+
   return device_parse_nested_json(buffer, reader_opts, stream, mr);
   // For debug purposes, use host_parse_nested_json()
 }

From a8852771a252e0064f55d268cd1479eaa5b06916 Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar <shruti.shivakumar@gmail.com>
Date: Fri, 19 Jan 2024 22:05:01 +0000
Subject: [PATCH 20/26] PR reviews - modifiable input buffer in normalize
 quotes parameter

---
 cpp/include/cudf/io/detail/json.hpp            |  4 ++--
 cpp/src/io/json/json_quote_normalization.cu    |  4 ++--
 cpp/src/io/json/read_json.cu                   | 11 +++--------
 cpp/tests/io/json_quote_normalization_test.cpp |  6 +-----
 4 files changed, 8 insertions(+), 17 deletions(-)

diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp
index f67daf3b4da..0eb0e17ea10 100644
--- a/cpp/include/cudf/io/detail/json.hpp
+++ b/cpp/include/cudf/io/detail/json.hpp
@@ -55,11 +55,11 @@ void write_json(data_sink* sink,
 /**
  * @brief Normalize single quotes to double quotes using FST
  *
- * @param inbuf Input device span buffer
+ * @param inbuf Input device buffer
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource to use for device memory allocation
  */
-rmm::device_uvector<char> normalize_single_quotes(cudf::device_span<std::byte const> inbuf,
+rmm::device_uvector<char> normalize_single_quotes(rmm::device_uvector<char>&& inbuf,
                                                   rmm::cuda_stream_view stream,
                                                   rmm::mr::device_memory_resource* mr);
 
diff --git a/cpp/src/io/json/json_quote_normalization.cu b/cpp/src/io/json/json_quote_normalization.cu
index 3d80bb93a15..61b78bb008f 100644
--- a/cpp/src/io/json/json_quote_normalization.cu
+++ b/cpp/src/io/json/json_quote_normalization.cu
@@ -177,7 +177,7 @@ struct TransduceToNormalizedQuotes {
 
 namespace detail {
 
-rmm::device_uvector<SymbolT> normalize_single_quotes(cudf::device_span<std::byte const> inbuf,
+rmm::device_uvector<SymbolT> normalize_single_quotes(rmm::device_uvector<SymbolT>&& inbuf,
                                                      rmm::cuda_stream_view stream,
                                                      rmm::mr::device_memory_resource* mr)
 {
@@ -190,7 +190,7 @@ rmm::device_uvector<SymbolT> normalize_single_quotes(cudf::device_span<std::byte
 
   rmm::device_uvector<SymbolT> outbuf(inbuf.size() * 2, stream, mr);
   rmm::device_scalar<SymbolOffsetT> outbuf_size(stream, mr);
-  parser.Transduce(reinterpret_cast<const SymbolT*>(inbuf.data()),
+  parser.Transduce(inbuf.data(),
                    static_cast<SymbolOffsetT>(inbuf.size()),
                    outbuf.data(),
                    thrust::make_discard_iterator(),
diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu
index bf50edce5fa..7edd5c6b75e 100644
--- a/cpp/src/io/json/read_json.cu
+++ b/cpp/src/io/json/read_json.cu
@@ -227,18 +227,13 @@ table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources,
                  "Multiple inputs are supported only for JSON Lines format");
   }
 
-  auto const buffer = get_record_range_raw_input(sources, reader_opts, stream);
+  auto buffer = get_record_range_raw_input(sources, reader_opts, stream);
 
   // If input JSON buffer has single quotes and option to normalize single quotes is enabled,
   // invoke pre-processing FST
   if (reader_opts.is_enabled_normalize_single_quotes()) {
-    auto buffer_span = cudf::device_span<std::byte const>(
-      reinterpret_cast<const std::byte*>(buffer.data()), buffer.size());
-    return device_parse_nested_json(cudf::io::json::detail::normalize_single_quotes(
-                                      buffer_span, stream, rmm::mr::get_current_device_resource()),
-                                    reader_opts,
-                                    stream,
-                                    mr);
+    buffer = cudf::io::json::detail::normalize_single_quotes(
+      std::move(buffer), stream, rmm::mr::get_current_device_resource());
   }
 
   return device_parse_nested_json(buffer, reader_opts, stream, mr);
diff --git a/cpp/tests/io/json_quote_normalization_test.cpp b/cpp/tests/io/json_quote_normalization_test.cpp
index 92450d07403..5c512999a7b 100644
--- a/cpp/tests/io/json_quote_normalization_test.cpp
+++ b/cpp/tests/io/json_quote_normalization_test.cpp
@@ -21,7 +21,6 @@
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
-#include <cudf/utilities/span.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <cudf_test/base_fixture.hpp>
@@ -52,12 +51,9 @@ void run_test(const std::string& host_input, const std::string& expected_host_ou
                                 host_input.size(),
                                 cudaMemcpyHostToDevice,
                                 cudf::test::get_default_stream().value()));
-  auto device_input_span = cudf::device_span<std::byte>(
-    reinterpret_cast<std::byte*>(device_input.data()), device_input.size());
-
   // Preprocessing FST
   auto device_fst_output = cudf::io::json::detail::normalize_single_quotes(
-    device_input_span, cudf::test::get_default_stream(), rsc.get());
+    std::move(device_input), cudf::test::get_default_stream(), rsc.get());
 
   std::string preprocessed_host_output(device_fst_output.size(), 0);
   CUDF_CUDA_TRY(cudaMemcpyAsync(preprocessed_host_output.data(),

From de1f1b3beee7c15b0e03639004fcc98a89d08670 Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar <shruti.shivakumar@gmail.com>
Date: Sat, 20 Jan 2024 01:46:30 +0000
Subject: [PATCH 21/26] don't need fully qualified name in enclosing namespace

---
 cpp/src/io/json/read_json.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu
index 7edd5c6b75e..2cfb5fa03c9 100644
--- a/cpp/src/io/json/read_json.cu
+++ b/cpp/src/io/json/read_json.cu
@@ -232,8 +232,8 @@ table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources,
   // If input JSON buffer has single quotes and option to normalize single quotes is enabled,
   // invoke pre-processing FST
   if (reader_opts.is_enabled_normalize_single_quotes()) {
-    buffer = cudf::io::json::detail::normalize_single_quotes(
-      std::move(buffer), stream, rmm::mr::get_current_device_resource());
+    buffer =
+      normalize_single_quotes(std::move(buffer), stream, rmm::mr::get_current_device_resource());
   }
 
   return device_parse_nested_json(buffer, reader_opts, stream, mr);

From 8441b3914404815df20ff1170eef865945b44107 Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar <shruti.shivakumar@gmail.com>
Date: Mon, 22 Jan 2024 21:40:59 +0000
Subject: [PATCH 22/26] header files cleanup; more fully-qualified names
 cleanup

---
 cpp/include/cudf/io/json.hpp                  |  6 ++---
 cpp/src/io/json/json_quote_normalization.cu   | 22 ++++++++-----------
 .../io/json_quote_normalization_test.cpp      |  4 ----
 3 files changed, 12 insertions(+), 20 deletions(-)

diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
index a087e6b40fb..e8f461d808f 100644
--- a/cpp/include/cudf/io/json.hpp
+++ b/cpp/include/cudf/io/json.hpp
@@ -335,10 +335,10 @@ class json_reader_options {
   void enable_keep_quotes(bool val) { _keep_quotes = val; }
 
   /**
-   * @brief Set whether the reader should enable normalization of single  quotes around strings.
+   * @brief Set whether the reader should enable normalization of single quotes around strings.
    *
    * @param val Boolean value to indicate whether the reader should normalize single quotes around
-   * string
+   * strings
    */
   void enable_normalize_single_quotes(bool val) { _normalize_single_quotes = val; }
 
@@ -493,7 +493,7 @@ class json_reader_options_builder {
   }
 
   /**
-   * @brief Set whether the reader should normalize single quotes around string
+   * @brief Set whether the reader should normalize single quotes around strings
    *
    * @param val Boolean value to indicate whether the reader should normalize single quotes
    * of strings
diff --git a/cpp/src/io/json/json_quote_normalization.cu b/cpp/src/io/json/json_quote_normalization.cu
index 61b78bb008f..f0e21115e27 100644
--- a/cpp/src/io/json/json_quote_normalization.cu
+++ b/cpp/src/io/json/json_quote_normalization.cu
@@ -15,16 +15,13 @@
  */
 
 #include <io/fst/lookup_tables.cuh>
-#include <io/utilities/hostdevice_vector.hpp>
 
 #include <cudf/io/detail/json.hpp>
-#include <cudf/scalar/scalar_factories.hpp>
-#include <cudf/strings/repeat_strings.hpp>
 #include <cudf/types.hpp>
 
 #include <rmm/cuda_stream.hpp>
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/device_buffer.hpp>
+#include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
 
 #include <thrust/iterator/discard_iterator.h>
@@ -42,7 +39,7 @@ using SymbolOffsetT = uint32_t;
 namespace normalize_quotes {
 
 // Type sufficiently large to index symbols within the input and output (may be unsigned)
-enum class dfa_states : char { TT_OOS = 0U, TT_DQS, TT_SQS, TT_DEC, TT_SEC, TT_NUM_STATES };
+enum class dfa_states : StateT { TT_OOS = 0U, TT_DQS, TT_SQS, TT_DEC, TT_SEC, TT_NUM_STATES };
 enum class dfa_symbol_group_id : uint32_t {
   DOUBLE_QUOTE_CHAR,  ///< Quote character SG: "
   SINGLE_QUOTE_CHAR,  ///< Quote character SG: '
@@ -58,7 +55,7 @@ constexpr auto TT_DQS            = dfa_states::TT_DQS;
 constexpr auto TT_SQS            = dfa_states::TT_SQS;
 constexpr auto TT_DEC            = dfa_states::TT_DEC;
 constexpr auto TT_SEC            = dfa_states::TT_SEC;
-constexpr auto TT_NUM_STATES     = static_cast<char>(dfa_states::TT_NUM_STATES);
+constexpr auto TT_NUM_STATES     = static_cast<StateT>(dfa_states::TT_NUM_STATES);
 constexpr auto NUM_SYMBOL_GROUPS = static_cast<uint32_t>(dfa_symbol_group_id::NUM_SYMBOL_GROUPS);
 
 // The i-th string representing all the characters of a symbol group
@@ -76,7 +73,7 @@ std::array<std::array<dfa_states, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const qna_s
 }};
 
 // The DFA's starting state
-constexpr char start_state = static_cast<char>(TT_OOS);
+constexpr char start_state = static_cast<StateT>(TT_OOS);
 
 struct TransduceToNormalizedQuotes {
   /**
@@ -181,11 +178,10 @@ rmm::device_uvector<SymbolT> normalize_single_quotes(rmm::device_uvector<SymbolT
                                                      rmm::cuda_stream_view stream,
                                                      rmm::mr::device_memory_resource* mr)
 {
-  auto parser = cudf::io::fst::detail::make_fst(
-    cudf::io::fst::detail::make_symbol_group_lut(cudf::io::json::normalize_quotes::qna_sgs),
-    cudf::io::fst::detail::make_transition_table(cudf::io::json::normalize_quotes::qna_state_tt),
-    cudf::io::fst::detail::make_translation_functor(
-      cudf::io::json::normalize_quotes::TransduceToNormalizedQuotes{}),
+  auto parser = fst::detail::make_fst(
+    fst::detail::make_symbol_group_lut(normalize_quotes::qna_sgs),
+    fst::detail::make_transition_table(normalize_quotes::qna_state_tt),
+    fst::detail::make_translation_functor(normalize_quotes::TransduceToNormalizedQuotes{}),
     stream);
 
   rmm::device_uvector<SymbolT> outbuf(inbuf.size() * 2, stream, mr);
@@ -195,7 +191,7 @@ rmm::device_uvector<SymbolT> normalize_single_quotes(rmm::device_uvector<SymbolT
                    outbuf.data(),
                    thrust::make_discard_iterator(),
                    outbuf_size.data(),
-                   cudf::io::json::normalize_quotes::start_state,
+                   normalize_quotes::start_state,
                    stream);
 
   outbuf.resize(outbuf_size.value(stream), stream);
diff --git a/cpp/tests/io/json_quote_normalization_test.cpp b/cpp/tests/io/json_quote_normalization_test.cpp
index 5c512999a7b..8443ea20d88 100644
--- a/cpp/tests/io/json_quote_normalization_test.cpp
+++ b/cpp/tests/io/json_quote_normalization_test.cpp
@@ -16,10 +16,6 @@
 
 #include <cudf/io/detail/json.hpp>
 #include <cudf/io/json.hpp>
-#include <cudf/scalar/scalar.hpp>
-#include <cudf/scalar/scalar_factories.hpp>
-#include <cudf/table/table.hpp>
-#include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 

From d5b9707ea81ff50f8aaae71716e17756a908b399 Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar <shruti.shivakumar@gmail.com>
Date: Tue, 23 Jan 2024 01:59:18 +0000
Subject: [PATCH 23/26] alphabetizing the new file in add_library

---
 cpp/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 23256029c62..5bfe9ebadf6 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -370,13 +370,13 @@ add_library(
   src/io/functions.cpp
   src/io/json/byte_range_info.cu
   src/io/json/json_column.cu
+  src/io/json/json_quote_normalization.cu
   src/io/json/json_tree.cu
   src/io/json/nested_json_gpu.cu
   src/io/json/read_json.cu
   src/io/json/legacy/json_gpu.cu
   src/io/json/legacy/reader_impl.cu
   src/io/json/write_json.cu
-  src/io/json/json_quote_normalization.cu
   src/io/orc/aggregate_orc_metadata.cpp
   src/io/orc/dict_enc.cu
   src/io/orc/orc.cpp

From 4e358fdf911c134614c8b3516063ee0d255c0e8a Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar <shruti.shivakumar@gmail.com>
Date: Tue, 23 Jan 2024 02:00:18 +0000
Subject: [PATCH 24/26] more header file cleanup

---
 cpp/tests/io/json_quote_normalization_test.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/cpp/tests/io/json_quote_normalization_test.cpp b/cpp/tests/io/json_quote_normalization_test.cpp
index 8443ea20d88..50faea5e4d8 100644
--- a/cpp/tests/io/json_quote_normalization_test.cpp
+++ b/cpp/tests/io/json_quote_normalization_test.cpp
@@ -16,8 +16,7 @@
 
 #include <cudf/io/detail/json.hpp>
 #include <cudf/io/json.hpp>
-#include <cudf/types.hpp>
-#include <cudf/utilities/type_dispatcher.hpp>
+#include <cudf/io/types.hpp>
 
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/cudf_gtest.hpp>

From a79683daad4bfe3025f375443de98a1783255d87 Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar <shruti.shivakumar@gmail.com>
Date: Tue, 23 Jan 2024 02:14:53 +0000
Subject: [PATCH 25/26] guiding the consts eastwards

---
 cpp/src/io/json/json_quote_normalization.cu | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/cpp/src/io/json/json_quote_normalization.cu b/cpp/src/io/json/json_quote_normalization.cu
index f0e21115e27..7c9466748cd 100644
--- a/cpp/src/io/json/json_quote_normalization.cu
+++ b/cpp/src/io/json/json_quote_normalization.cu
@@ -73,7 +73,7 @@ std::array<std::array<dfa_states, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const qna_s
 }};
 
 // The DFA's starting state
-constexpr char start_state = static_cast<StateT>(TT_OOS);
+constexpr auto start_state = static_cast<StateT>(TT_OOS);
 
 struct TransduceToNormalizedQuotes {
   /**
@@ -105,7 +105,7 @@ struct TransduceToNormalizedQuotes {
     // SEC   | Sigma\{'}       -> {\*}
 
     // Whether this transition translates to the escape sequence: \"
-    const bool outputs_escape_sequence =
+    bool const outputs_escape_sequence =
       (state_id == static_cast<StateT>(dfa_states::TT_SQS)) &&
       (match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::DOUBLE_QUOTE_CHAR));
     // Case when a double quote needs to be replaced by the escape sequence: \"
@@ -149,19 +149,19 @@ struct TransduceToNormalizedQuotes {
                                                 SymbolT const read_symbol) const
   {
     // Whether this transition translates to the escape sequence: \"
-    const bool sqs_outputs_escape_sequence =
+    bool const sqs_outputs_escape_sequence =
       (state_id == static_cast<StateT>(dfa_states::TT_SQS)) &&
       (match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::DOUBLE_QUOTE_CHAR));
     // Number of characters to output on this transition
     if (sqs_outputs_escape_sequence) { return 2; }
     // Whether this transition translates to the escape sequence \<s> or unescaped '
-    const bool sec_outputs_escape_sequence =
+    bool const sec_outputs_escape_sequence =
       (state_id == static_cast<StateT>(dfa_states::TT_SEC)) &&
       (match_id != static_cast<SymbolGroupT>(dfa_symbol_group_id::SINGLE_QUOTE_CHAR));
     // Number of characters to output on this transition
     if (sec_outputs_escape_sequence) { return 2; }
     // Whether this transition translates to no output <nop>
-    const bool sqs_outputs_nop =
+    bool const sqs_outputs_nop =
       (state_id == static_cast<StateT>(dfa_states::TT_SQS)) &&
       (match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::ESCAPE_CHAR));
     // Number of characters to output on this transition

From 890d09b30bb2b9a127e52e2a87c31db30b573163 Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar <shruti.shivakumar@gmail.com>
Date: Tue, 23 Jan 2024 02:36:20 +0000
Subject: [PATCH 26/26] formatting fix

---
 java/src/main/native/src/TableJni.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index 61efe2a4edb..cef18b245e7 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -1472,8 +1472,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_TableWithMeta_releaseTable(JNIE
 
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource(
     JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales,
-    jboolean day_first, jboolean lines, jboolean recover_with_null, jboolean normalize_single_quotes, jboolean mixed_types_as_string,
-    jlong ds_handle) {
+    jboolean day_first, jboolean lines, jboolean recover_with_null,
+    jboolean normalize_single_quotes, jboolean mixed_types_as_string, jlong ds_handle) {
 
   JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0);