From ca1a4d65f5fc1b4da767ac9f286fe017c9b56388 Mon Sep 17 00:00:00 2001 From: David <45795991+davidwendt@users.noreply.github.com> Date: Mon, 4 Jan 2021 10:07:26 -0500 Subject: [PATCH] Fix to_csv delimiter handling of timestamp format(#7023) Closes #6699 The timestamp format(s) used by the CSV writer have the form `%Y-%m-%dT%H:%M:%SZ`. This means if the column delimiter `','` or the line delimiter `\n` is either `':'` or `'-'` then the timestamp string output could conflict with these delimiters. The current logic simply removed these delimiters from the format if they detected a conflicting column or line delimiter. For example, specifying a dash `'-'` as column delimiter caused the timestamp format to change to `%Y%m%d...` (the dash is removed). I admit this was kind of hacky and also made the output inconsistent with Pandas `to_csv()`. It is easy enough to simply add double-quotes around the timestamp format to prevent these conflicts as well as make the output consistent. This PR fixes that logic. Exception logic to check for a dash as column separator was also found in [csv.py](https://github.com/rapidsai/cudf/blob/8c1f01e1fd713d873cf3d943ab409f3e9efc48f8/python/cudf/cudf/io/csv.py#L139-L149), specifically citing issue 6699 in the exception message. Also, there was a pytest specifically created to check for this exception. The exception is removed and the pytest function updated in this PR as well. Authors: - davidwendt Approvers: - GALI PREM SAGAR - Karthikeyan - null URL: https://github.com/rapidsai/cudf/pull/7023 --- cpp/src/io/csv/writer_impl.cu | 10 +++------- python/cudf/cudf/io/csv.py | 12 ------------ python/cudf/cudf/tests/test_csv.py | 17 +++++------------ 3 files changed, 8 insertions(+), 31 deletions(-) diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index 6bbe83ef83f..dda2e0704f6 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -311,19 +311,15 @@ struct column_to_strings_fn { }(); // handle the cases where delimiter / line-terminator can be - // "-" or ":", in which case they are to be dropped from the format: + // "-" or ":", in which case we need to add quotes to the format // std::string delimiter{options_.get_inter_column_delimiter()}; std::string newline{options_.get_line_terminator()}; constexpr char const* dash{"-"}; constexpr char const* colon{":"}; - if (delimiter == dash || newline == dash) { - format.erase(std::remove(format.begin(), format.end(), dash[0]), format.end()); - } - - if (delimiter == colon || newline == colon) { - format.erase(std::remove(format.begin(), format.end(), colon[0]), format.end()); + if (delimiter == dash || newline == dash || delimiter == colon || newline == colon) { + format = "\"" + format + "\""; } auto conv_col_ptr = cudf::strings::from_timestamps(column, format, mr_); diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index 7b83002d0fa..2766e0d79d0 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -136,18 +136,6 @@ def to_csv( "Dataframe doesn't have the labels provided in columns" ) - if sep == "-": - # TODO: Remove this error once following issue is fixed: - # https://github.com/rapidsai/cudf/issues/6699 - if any( - isinstance(col, cudf.core.column.DatetimeColumn) - for col in df._data.columns - ): - raise ValueError( - "sep cannot be '-' when writing a datetime64 dtype to csv, " - "refer to: https://github.com/rapidsai/cudf/issues/6699" - ) - # TODO: Need to typecast categorical columns to the underlying # categories dtype to write the actual data to csv. Remove this # workaround once following issue is fixed: diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index d4ab13c3b93..94a5f061383 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -1905,21 +1905,14 @@ def test_csv_reader_category_error(): cudf.read_csv(StringIO(csv_buf), dtype="category") -def test_csv_writer_datetime_sep_error(): - # TODO: Remove this test once following - # issues is fixed: https://github.com/rapidsai/cudf/issues/6699 +def test_csv_writer_datetime_sep(): df = cudf.DataFrame( {"a": cudf.Series([22343, 2323423, 234324234], dtype="datetime64[ns]")} ) - - with pytest.raises( - ValueError, - match=re.escape( - "sep cannot be '-' when writing a datetime64 dtype to csv, " - "refer to: https://github.com/rapidsai/cudf/issues/6699" - ), - ): - df.to_csv(sep="-") + df["a"] = df["a"].astype("datetime64[s]") + expected = df.to_pandas().to_csv(date_format="%Y-%m-%dT%H:%M:%SZ", sep="-") + actual = df.to_csv(sep="-") + assert expected == actual def test_na_filter_empty_fields():