From ba3aedbb869ef95cae517f56ffb2bab305bffa40 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 7 Dec 2021 15:23:43 -0600 Subject: [PATCH] Raise temporary error for `decimal128` types in parquet reader (#9804) This PR adds a `decimal128` type validation in parquet reader. This is put in-place to unblock libcudf changes: https://github.com/rapidsai/cudf/pull/9765 and this validation will soon be removed once python side of `decimal128` changes are merged(blocked by libcudf `from_arrow` bug). Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/9804 --- python/cudf/cudf/io/parquet.py | 37 ++++++++++++++++++ .../parquet/nested_decimal128_file.parquet | Bin 0 -> 1692 bytes python/cudf/cudf/tests/test_parquet.py | 22 +++++++++-- 3 files changed, 55 insertions(+), 4 deletions(-) create mode 100644 python/cudf/cudf/tests/data/parquet/nested_decimal128_file.parquet diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 04d64969a16..f9b39bf2cfa 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -7,6 +7,7 @@ from uuid import uuid4 import fsspec +import pyarrow as pa from pyarrow import dataset as ds, parquet as pq import cudf @@ -614,6 +615,34 @@ def _read_parquet( # Simple helper function to dispatch between # cudf and pyarrow to read parquet data if engine == "cudf": + # Temporary error to probe a parquet file + # and raise decimal128 support error. + if len(filepaths_or_buffers) > 0: + try: + metadata = pq.read_metadata(filepaths_or_buffers[0]) + except TypeError: + # pq.read_metadata only supports reading metadata from + # certain types of file inputs, like str-filepath or file-like + # objects, and errors for the rest of inputs. Hence this is + # to avoid failing on other types of file inputs. + pass + else: + arrow_schema = metadata.schema.to_arrow_schema() + check_cols = arrow_schema.names if columns is None else columns + for col_name, arrow_type in zip( + arrow_schema.names, arrow_schema.types + ): + if col_name not in check_cols: + continue + if isinstance(arrow_type, pa.ListType): + val_field_types = arrow_type.value_field.flatten() + for val_field_type in val_field_types: + _check_decimal128_type(val_field_type.type) + elif isinstance(arrow_type, pa.StructType): + _ = cudf.StructDtype.from_arrow(arrow_type) + else: + _check_decimal128_type(arrow_type) + return libparquet.read_parquet( filepaths_or_buffers, columns=columns, @@ -731,3 +760,11 @@ def merge_parquet_filemetadata(filemetadata_list): ParquetWriter = libparquet.ParquetWriter + + +def _check_decimal128_type(arrow_type): + if isinstance(arrow_type, pa.Decimal128Type): + if arrow_type.precision > cudf.Decimal64Dtype.MAX_PRECISION: + raise NotImplementedError( + "Decimal type greater than Decimal64 is not yet supported" + ) diff --git a/python/cudf/cudf/tests/data/parquet/nested_decimal128_file.parquet b/python/cudf/cudf/tests/data/parquet/nested_decimal128_file.parquet new file mode 100644 index 0000000000000000000000000000000000000000..7440d357a1263ce9c28b6681e38c35540d5e2af8 GIT binary patch literal 1692 zcmcgtPj3=I6n{HA(1qBhHl4|CvQZ9nLn5uU2}xt{Fbfr$f&!*0iI;^C#I_az#rOgI z6nqfRo;(>39zA;O(W8mJH&f^Z4jN?wZ{DBZ`#;InRO61|qAd0*!V!D}APul;L2}jM zuz5X^z5$mfBS54?I!iDS7jfTB%uH8oQbJ~~9i#0FDA@1#4 z5kKt>N4z)coCzi!`JTxiR`57*{xp@B#S*z-fF;ufbS=L{u3P*Vo3&WBS__(-@Zhuk zPuj#4Rj7O1 zTDUJRKUwmN6?|I>CM&)>C&q%vbGF62NeHJq76}eH=30?pB3{X32|4cSmqH6( z!-klv-^^AL5t-~WqJ%=9)?fMxzD3?3x#xTNrM=Q%nQ!Gk?@)e?Nsg0~A7YyTp6^~$ zjmE<#cg;@S3;Q;my>6(Ks^g!|ky2w7cGZ2Qx~kCCx)1?X;tEO~(v9-;Mv2%WsY#FI z9R}S;=HNj|%VVMkL@*?BI;hD8%sWPuB1)Ql*+i43s!ifAq9*+R{l1$As7MwW+9O zaw(cB+T>*Xb?Ot6^^{9mMp8F+1YsQPr>VlRnX@)(hgm(kiN*Dpd7vHRXz1}QpNVr1 v+?)Mq@alDEwB7Fax1+(}e4KjI84P>pFH^al-JM-8?*o`0{IUUR_+9!9wrcs_ literal 0 HcmV?d00001 diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 516ee0d17d3..597ae6c05c0 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -629,15 +629,29 @@ def test_parquet_reader_spark_timestamps(datadir): def test_parquet_reader_spark_decimals(datadir): fname = datadir / "spark_decimal.parquet" - expect = pd.read_parquet(fname) - got = cudf.read_parquet(fname) + # expect = pd.read_parquet(fname) + with pytest.raises( + NotImplementedError, + match="Decimal type greater than Decimal64 is not yet supported", + ): + cudf.read_parquet(fname) # Convert the decimal dtype from PyArrow to float64 for comparison to cuDF # This is because cuDF returns as float64 as it lacks an equivalent dtype - expect = expect.apply(pd.to_numeric) + # expect = expect.apply(pd.to_numeric) # np.testing.assert_allclose(expect, got) - assert_eq(expect, got) + # assert_eq(expect, got) + + +@pytest.mark.parametrize("columns", [["a"], ["b", "a"], None]) +def test_parquet_reader_decimal128_error_validation(datadir, columns): + fname = datadir / "nested_decimal128_file.parquet" + with pytest.raises( + NotImplementedError, + match="Decimal type greater than Decimal64 is not yet supported", + ): + cudf.read_parquet(fname, columns=columns) def test_parquet_reader_microsecond_timestamps(datadir):