Raise temporary error for decimal128 types in parquet reader (#9804)
This PR adds `decimal128` type validation to the parquet reader. It is put in place to unblock the libcudf changes in #9765; the validation will be removed once the Python side of the `decimal128` changes is merged (currently blocked by a libcudf `from_arrow` bug).
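For context, a minimal sketch of the user-facing behaviour this validation introduces, assuming a hypothetical file large_decimal.parquet that contains a decimal128 column whose precision exceeds what Decimal64 can represent:

import cudf

try:
    # Hypothetical file with a decimal128 column of precision > 18.
    cudf.read_parquet("large_decimal.parquet")
except NotImplementedError as err:
    print(err)  # "Decimal type greater than Decimal64 is not yet supported"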

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #9804
galipremsagar authored Dec 7, 2021
1 parent a5633c2 commit ba3aedb
Showing 3 changed files with 55 additions and 4 deletions.
37 changes: 37 additions & 0 deletions python/cudf/cudf/io/parquet.py
@@ -7,6 +7,7 @@
from uuid import uuid4

import fsspec
import pyarrow as pa
from pyarrow import dataset as ds, parquet as pq

import cudf
@@ -614,6 +615,34 @@ def _read_parquet(
    # Simple helper function to dispatch between
    # cudf and pyarrow to read parquet data
    if engine == "cudf":
        # Temporary error to probe a parquet file
        # and raise decimal128 support error.
        if len(filepaths_or_buffers) > 0:
            try:
                metadata = pq.read_metadata(filepaths_or_buffers[0])
            except TypeError:
                # pq.read_metadata only supports reading metadata from
                # certain types of file inputs, like str-filepath or file-like
                # objects, and errors for the rest of inputs. Hence this is
                # to avoid failing on other types of file inputs.
                pass
            else:
                arrow_schema = metadata.schema.to_arrow_schema()
                check_cols = arrow_schema.names if columns is None else columns
                for col_name, arrow_type in zip(
                    arrow_schema.names, arrow_schema.types
                ):
                    if col_name not in check_cols:
                        continue
                    if isinstance(arrow_type, pa.ListType):
                        val_field_types = arrow_type.value_field.flatten()
                        for val_field_type in val_field_types:
                            _check_decimal128_type(val_field_type.type)
                    elif isinstance(arrow_type, pa.StructType):
                        _ = cudf.StructDtype.from_arrow(arrow_type)
                    else:
                        _check_decimal128_type(arrow_type)

        return libparquet.read_parquet(
            filepaths_or_buffers,
            columns=columns,
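The probe above relies only on pyarrow's public metadata APIs and never loads column data. A minimal standalone sketch of the same pattern, assuming a hypothetical local file example.parquet, might look like this:

import pyarrow as pa
import pyarrow.parquet as pq

# Read only the parquet footer; no column data is materialized.
metadata = pq.read_metadata("example.parquet")  # hypothetical path
arrow_schema = metadata.schema.to_arrow_schema()

for col_name, arrow_type in zip(arrow_schema.names, arrow_schema.types):
    if isinstance(arrow_type, pa.ListType):
        # Nested list columns are flattened to reach the leaf value types.
        for field in arrow_type.value_field.flatten():
            print(col_name, field.type)
    elif isinstance(arrow_type, pa.Decimal128Type):
        print(col_name, arrow_type.precision, arrow_type.scale)

Inspecting only the footer keeps the check cheap even for large files.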
@@ -731,3 +760,11 @@ def merge_parquet_filemetadata(filemetadata_list):


ParquetWriter = libparquet.ParquetWriter


def _check_decimal128_type(arrow_type):
    if isinstance(arrow_type, pa.Decimal128Type):
        if arrow_type.precision > cudf.Decimal64Dtype.MAX_PRECISION:
            raise NotImplementedError(
                "Decimal type greater than Decimal64 is not yet supported"
            )
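To illustrate how the helper behaves, a rough interactive sketch using types constructed directly with pyarrow (this assumes cudf.Decimal64Dtype.MAX_PRECISION is 18, its value for Decimal64 at the time of writing):

import pyarrow as pa

# Precision 18 fits within Decimal64, so this returns without raising.
_check_decimal128_type(pa.decimal128(18, 2))

# Precision 38 exceeds cudf.Decimal64Dtype.MAX_PRECISION and raises
# NotImplementedError.
_check_decimal128_type(pa.decimal128(38, 2))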
Binary file not shown.
22 changes: 18 additions & 4 deletions python/cudf/cudf/tests/test_parquet.py
@@ -629,15 +629,29 @@ def test_parquet_reader_spark_timestamps(datadir):
def test_parquet_reader_spark_decimals(datadir):
    fname = datadir / "spark_decimal.parquet"

    expect = pd.read_parquet(fname)
    got = cudf.read_parquet(fname)
    # expect = pd.read_parquet(fname)
    with pytest.raises(
        NotImplementedError,
        match="Decimal type greater than Decimal64 is not yet supported",
    ):
        cudf.read_parquet(fname)

    # Convert the decimal dtype from PyArrow to float64 for comparison to cuDF
    # This is because cuDF returns as float64 as it lacks an equivalent dtype
    expect = expect.apply(pd.to_numeric)
    # expect = expect.apply(pd.to_numeric)

    # np.testing.assert_allclose(expect, got)
    assert_eq(expect, got)
    # assert_eq(expect, got)


@pytest.mark.parametrize("columns", [["a"], ["b", "a"], None])
def test_parquet_reader_decimal128_error_validation(datadir, columns):
    fname = datadir / "nested_decimal128_file.parquet"
    with pytest.raises(
        NotImplementedError,
        match="Decimal type greater than Decimal64 is not yet supported",
    ):
        cudf.read_parquet(fname, columns=columns)
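For reference, a fixture resembling nested_decimal128_file.parquet could be generated with pyarrow along these lines. This is only a sketch using assumed column names "a" and "b" (matching the parametrization above); the file actually checked into the test data directory may differ:

import decimal

import pyarrow as pa
import pyarrow.parquet as pq

# Build a table with a plain decimal128 column and a nested (list) one,
# both using precision 38, which exceeds what Decimal64 can hold.
table = pa.table(
    {
        "a": pa.array([decimal.Decimal("1.23")], type=pa.decimal128(38, 2)),
        "b": pa.array(
            [[decimal.Decimal("4.56")]], type=pa.list_(pa.decimal128(38, 2))
        ),
    }
)
pq.write_table(table, "nested_decimal128_file.parquet")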


def test_parquet_reader_microsecond_timestamps(datadir):
