From ba3aedbb869ef95cae517f56ffb2bab305bffa40 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Tue, 7 Dec 2021 15:23:43 -0600
Subject: [PATCH] Raise temporary error for `decimal128` types in parquet
 reader (#9804)

This PR adds a `decimal128` type validation in parquet reader. This is put in-place to unblock libcudf changes: https://github.com/rapidsai/cudf/pull/9765 and this validation will soon be removed once python side of `decimal128` changes are merged(blocked by libcudf `from_arrow` bug).

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/9804
---
 python/cudf/cudf/io/parquet.py                |  37 ++++++++++++++++++
 .../parquet/nested_decimal128_file.parquet    | Bin 0 -> 1692 bytes
 python/cudf/cudf/tests/test_parquet.py        |  22 +++++++++--
 3 files changed, 55 insertions(+), 4 deletions(-)
 create mode 100644 python/cudf/cudf/tests/data/parquet/nested_decimal128_file.parquet

diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
index 04d64969a16..f9b39bf2cfa 100644
--- a/python/cudf/cudf/io/parquet.py
+++ b/python/cudf/cudf/io/parquet.py
@@ -7,6 +7,7 @@
 from uuid import uuid4
 
 import fsspec
+import pyarrow as pa
 from pyarrow import dataset as ds, parquet as pq
 
 import cudf
@@ -614,6 +615,34 @@ def _read_parquet(
     # Simple helper function to dispatch between
     # cudf and pyarrow to read parquet data
     if engine == "cudf":
+        # Temporary error to probe a parquet file
+        # and raise decimal128 support error.
+        if len(filepaths_or_buffers) > 0:
+            try:
+                metadata = pq.read_metadata(filepaths_or_buffers[0])
+            except TypeError:
+                # pq.read_metadata only supports reading metadata from
+                # certain types of file inputs, like str-filepath or file-like
+                # objects, and errors for the rest of inputs. Hence this is
+                # to avoid failing on other types of file inputs.
+                pass
+            else:
+                arrow_schema = metadata.schema.to_arrow_schema()
+                check_cols = arrow_schema.names if columns is None else columns
+                for col_name, arrow_type in zip(
+                    arrow_schema.names, arrow_schema.types
+                ):
+                    if col_name not in check_cols:
+                        continue
+                    if isinstance(arrow_type, pa.ListType):
+                        val_field_types = arrow_type.value_field.flatten()
+                        for val_field_type in val_field_types:
+                            _check_decimal128_type(val_field_type.type)
+                    elif isinstance(arrow_type, pa.StructType):
+                        _ = cudf.StructDtype.from_arrow(arrow_type)
+                    else:
+                        _check_decimal128_type(arrow_type)
+
         return libparquet.read_parquet(
             filepaths_or_buffers,
             columns=columns,
@@ -731,3 +760,11 @@ def merge_parquet_filemetadata(filemetadata_list):
 
 
 ParquetWriter = libparquet.ParquetWriter
+
+
+def _check_decimal128_type(arrow_type):
+    if isinstance(arrow_type, pa.Decimal128Type):
+        if arrow_type.precision > cudf.Decimal64Dtype.MAX_PRECISION:
+            raise NotImplementedError(
+                "Decimal type greater than Decimal64 is not yet supported"
+            )
diff --git a/python/cudf/cudf/tests/data/parquet/nested_decimal128_file.parquet b/python/cudf/cudf/tests/data/parquet/nested_decimal128_file.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..7440d357a1263ce9c28b6681e38c35540d5e2af8
GIT binary patch
literal 1692
zcmcgtPj3=I6n{HA(1qBhHl4|CvQZ9nLn5uU2}xt{Fbfr$f&!*0iI;^C#I_az#rOgI
z6nqfRo;(>39zA;O(W8mJH&f^Z4jN?wZ{DBZ`#;InRO61|qAd0*!V!D}APul;L2}jM
zuz5X^z5$mfBS54?I!iDS7jfTB%uH8oQbJ~~9<l3rhSW|Hb!=vThv28>i#0FDA@1#4
z5kKt>N4z)coCzi!`JTxiR`57*{xp@B#S*z-fF;ufbS=L{u3P*Vo3&WBS__(-@Zhuk
zPuj#4Rj7O1<a*x1*6Gj)=-@mygf!F9D|Dt7*O_8*fpfIzoJbLvGs|Ydh2slu_Ljk>
zTDUJRKUwmN6?|I>CM&)>C&q%vbGF62Ne<lju>HJq76}eH=30?pB3{X32|4cSmqH6(
z!-klv-^^AL5t-~WqJ%=9)?fMxzD3?3x#xTNrM=Q%nQ!Gk?@)e?Nsg0~A7YyTp6^~$
zjmE<#cg;@S3;Q;my>6(Ks^g!|ky2w7cGZ2Qx~kCCx)1?X;tEO~(v9-;Mv2%WsY#FI
z9R}S;=HNj|%VVMkL@*?BI;hD<v_$@4l}dS3LqhQSp|+Qiy(;aJ)p$xerg{)!ibQK@
zoM!o}*aT{|-Q*{LF4E8@$wx0R>8%sWPuB1)Ql*+i43s!ifAq9*+R{l1$As7MwW+9O
zaw(cB+T>*Xb?Ot6^^{9mMp8F+1YsQPr>VlRnX@)(hgm(kiN*Dpd7vHRXz1}QpNVr1
v+?)Mq@alDEwB7Fax1+(}e4KjI84P>pFH^al-JM-8?*o`0{IUUR_+9!9wrcs_

literal 0
HcmV?d00001

diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index 516ee0d17d3..597ae6c05c0 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -629,15 +629,29 @@ def test_parquet_reader_spark_timestamps(datadir):
 def test_parquet_reader_spark_decimals(datadir):
     fname = datadir / "spark_decimal.parquet"
 
-    expect = pd.read_parquet(fname)
-    got = cudf.read_parquet(fname)
+    # expect = pd.read_parquet(fname)
+    with pytest.raises(
+        NotImplementedError,
+        match="Decimal type greater than Decimal64 is not yet supported",
+    ):
+        cudf.read_parquet(fname)
 
     # Convert the decimal dtype from PyArrow to float64 for comparison to cuDF
     # This is because cuDF returns as float64 as it lacks an equivalent dtype
-    expect = expect.apply(pd.to_numeric)
+    # expect = expect.apply(pd.to_numeric)
 
     # np.testing.assert_allclose(expect, got)
-    assert_eq(expect, got)
+    # assert_eq(expect, got)
+
+
+@pytest.mark.parametrize("columns", [["a"], ["b", "a"], None])
+def test_parquet_reader_decimal128_error_validation(datadir, columns):
+    fname = datadir / "nested_decimal128_file.parquet"
+    with pytest.raises(
+        NotImplementedError,
+        match="Decimal type greater than Decimal64 is not yet supported",
+    ):
+        cudf.read_parquet(fname, columns=columns)
 
 
 def test_parquet_reader_microsecond_timestamps(datadir):