Add column names validation in parquet writer (#7786)

Fixes: #7738. The Parquet writer requires all column names to be strings; this change adds a validation similar to the one in pandas. Authors: - GALI PREM SAGAR (@galipremsagar). Approvers: - Michael Wang (@isVoid) - Keith Kraus (@kkraus14). URL: #7786
rapidsai · Mar 31, 2021 · c05dbed · c05dbed
1 parent b937112
commit c05dbed
Show file tree

Hide file tree

Showing 2 changed files with 16 additions and 1 deletion.
diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
@@ -312,6 +312,9 @@ cpdef write_parquet(
         num_index_cols_meta = 0
 
     for i, name in enumerate(table._column_names, num_index_cols_meta):
+        if not isinstance(name, str):
+            raise ValueError("parquet must have string column names")
+
         tbl_meta.get().column_metadata[i].set_name(name.encode())
         _set_col_metadata(
             table[name]._column, tbl_meta.get().column_metadata[i]

diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
@@ -19,7 +19,7 @@
 import cudf
 from cudf.io.parquet import ParquetWriter, merge_parquet_filemetadata
 from cudf.tests import dataset_generator as dg
-from cudf.tests.utils import assert_eq
+from cudf.tests.utils import assert_eq, assert_exceptions_equal
 
 
 @pytest.fixture(scope="module")
@@ -1937,3 +1937,15 @@ def test_parquet_writer_decimal(tmpdir):
 
     got = pd.read_parquet(fname)
     assert_eq(gdf, got)
+
+
+def test_parquet_writer_column_validation():
+    df = cudf.DataFrame({1: [1, 2, 3], "1": ["a", "b", "c"]})
+    pdf = df.to_pandas()
+
+    assert_exceptions_equal(
+        lfunc=df.to_parquet,
+        rfunc=pdf.to_parquet,
+        lfunc_args_and_kwargs=(["cudf.parquet"],),
+        rfunc_args_and_kwargs=(["pandas.parquet"],),
+    )