From ca1a4d65f5fc1b4da767ac9f286fe017c9b56388 Mon Sep 17 00:00:00 2001
From: David <45795991+davidwendt@users.noreply.github.com>
Date: Mon, 4 Jan 2021 10:07:26 -0500
Subject: [PATCH] Fix to_csv delimiter handling of timestamp format(#7023)

Closes #6699
The timestamp format(s) used by the CSV writer have the form `%Y-%m-%dT%H:%M:%SZ`. This means if the column delimiter `','` or the line delimiter `\n` is either `':'` or `'-'` then the timestamp string output could conflict with these delimiters. The current logic simply removed these delimiters from the format if they detected a conflicting column or line delimiter. For example, specifying a dash `'-'` as column delimiter caused the timestamp format to change to `%Y%m%d...` (the dash is removed). I admit this was kind of hacky and also made the output inconsistent with Pandas `to_csv()`.

It is easy enough to simply add double-quotes around the timestamp format to prevent these conflicts as well as make the output consistent. This PR fixes that logic.

Exception logic to check for a dash as column separator was also found in [csv.py](https://github.com/rapidsai/cudf/blob/8c1f01e1fd713d873cf3d943ab409f3e9efc48f8/python/cudf/cudf/io/csv.py#L139-L149), specifically citing issue 6699 in the exception message. Also, there was a pytest specifically created to check for this exception. The exception is removed and the pytest function updated in this PR as well.

Authors:
  - davidwendt <dwendt@nvidia.com>

Approvers:
  - GALI PREM SAGAR
  - Karthikeyan
  - null

URL: https://github.com/rapidsai/cudf/pull/7023
---
 cpp/src/io/csv/writer_impl.cu      | 10 +++-------
 python/cudf/cudf/io/csv.py         | 12 ------------
 python/cudf/cudf/tests/test_csv.py | 17 +++++------------
 3 files changed, 8 insertions(+), 31 deletions(-)

diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu
index 6bbe83ef83f..dda2e0704f6 100644
--- a/cpp/src/io/csv/writer_impl.cu
+++ b/cpp/src/io/csv/writer_impl.cu
@@ -311,19 +311,15 @@ struct column_to_strings_fn {
     }();
 
     // handle the cases where delimiter / line-terminator can be
-    // "-" or ":", in which case they are to be dropped from the format:
+    // "-" or ":", in which case we need to add quotes to the format
     //
     std::string delimiter{options_.get_inter_column_delimiter()};
     std::string newline{options_.get_line_terminator()};
 
     constexpr char const* dash{"-"};
     constexpr char const* colon{":"};
-    if (delimiter == dash || newline == dash) {
-      format.erase(std::remove(format.begin(), format.end(), dash[0]), format.end());
-    }
-
-    if (delimiter == colon || newline == colon) {
-      format.erase(std::remove(format.begin(), format.end(), colon[0]), format.end());
+    if (delimiter == dash || newline == dash || delimiter == colon || newline == colon) {
+      format = "\"" + format + "\"";
     }
 
     auto conv_col_ptr = cudf::strings::from_timestamps(column, format, mr_);
diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py
index 7b83002d0fa..2766e0d79d0 100644
--- a/python/cudf/cudf/io/csv.py
+++ b/python/cudf/cudf/io/csv.py
@@ -136,18 +136,6 @@ def to_csv(
                 "Dataframe doesn't have the labels provided in columns"
             )
 
-    if sep == "-":
-        # TODO: Remove this error once following issue is fixed:
-        # https://github.com/rapidsai/cudf/issues/6699
-        if any(
-            isinstance(col, cudf.core.column.DatetimeColumn)
-            for col in df._data.columns
-        ):
-            raise ValueError(
-                "sep cannot be '-' when writing a datetime64 dtype to csv, "
-                "refer to: https://github.com/rapidsai/cudf/issues/6699"
-            )
-
     # TODO: Need to typecast categorical columns to the underlying
     # categories dtype to write the actual data to csv. Remove this
     # workaround once following issue is fixed:
diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
index d4ab13c3b93..94a5f061383 100644
--- a/python/cudf/cudf/tests/test_csv.py
+++ b/python/cudf/cudf/tests/test_csv.py
@@ -1905,21 +1905,14 @@ def test_csv_reader_category_error():
         cudf.read_csv(StringIO(csv_buf), dtype="category")
 
 
-def test_csv_writer_datetime_sep_error():
-    # TODO: Remove this test once following
-    # issues is fixed: https://github.com/rapidsai/cudf/issues/6699
+def test_csv_writer_datetime_sep():
     df = cudf.DataFrame(
         {"a": cudf.Series([22343, 2323423, 234324234], dtype="datetime64[ns]")}
     )
-
-    with pytest.raises(
-        ValueError,
-        match=re.escape(
-            "sep cannot be '-' when writing a datetime64 dtype to csv, "
-            "refer to: https://github.com/rapidsai/cudf/issues/6699"
-        ),
-    ):
-        df.to_csv(sep="-")
+    df["a"] = df["a"].astype("datetime64[s]")
+    expected = df.to_pandas().to_csv(date_format="%Y-%m-%dT%H:%M:%SZ", sep="-")
+    actual = df.to_csv(sep="-")
+    assert expected == actual
 
 
 def test_na_filter_empty_fields():