Raise temporary error for decimal128 types in parquet reader (#9804)
This PR adds `decimal128` type validation to the parquet reader. It is put in place to unblock the libcudf changes in #9765; the validation will be removed once the Python side of the `decimal128` changes is merged (currently blocked by a libcudf `from_arrow` bug).
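For context, a minimal sketch of the user-facing behaviour this validation introduces, assuming a hypothetical file large_decimal.parquet that contains a decimal128 column whose precision exceeds what Decimal64 can represent:

import cudf

try:
    # Hypothetical file with a decimal128 column of precision > 18.
    cudf.read_parquet("large_decimal.parquet")
except NotImplementedError as err:
    print(err)  # "Decimal type greater than Decimal64 is not yet supported"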

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #9804
galipremsagar authored Dec 7, 2021
1 parent a5633c2 commit ba3aedb
Showing 3 changed files with 55 additions and 4 deletions.
37 changes: 37 additions & 0 deletions python/cudf/cudf/io/parquet.py
@@ -7,6 +7,7 @@
from uuid import uuid4

import fsspec
import pyarrow as pa
from pyarrow import dataset as ds, parquet as pq

import cudf
@@ -614,6 +615,34 @@ def _read_parquet(
    # Simple helper function to dispatch between
    # cudf and pyarrow to read parquet data
    if engine == "cudf":
        # Temporary error to probe a parquet file
        # and raise decimal128 support error.
        if len(filepaths_or_buffers) > 0:
            try:
                metadata = pq.read_metadata(filepaths_or_buffers[0])
            except TypeError:
                # pq.read_metadata only supports reading metadata from
                # certain types of file inputs, like str-filepath or file-like
                # objects, and errors for the rest of inputs. Hence this is
                # to avoid failing on other types of file inputs.
                pass
            else:
                arrow_schema = metadata.schema.to_arrow_schema()
                check_cols = arrow_schema.names if columns is None else columns
                for col_name, arrow_type in zip(
                    arrow_schema.names, arrow_schema.types
                ):
                    if col_name not in check_cols:
                        continue
                    if isinstance(arrow_type, pa.ListType):
                        val_field_types = arrow_type.value_field.flatten()
                        for val_field_type in val_field_types:
                            _check_decimal128_type(val_field_type.type)
                    elif isinstance(arrow_type, pa.StructType):
                        _ = cudf.StructDtype.from_arrow(arrow_type)
                    else:
                        _check_decimal128_type(arrow_type)

        return libparquet.read_parquet(
            filepaths_or_buffers,
            columns=columns,
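The probe above relies only on pyarrow's public metadata APIs and never loads column data. A minimal standalone sketch of the same pattern, assuming a hypothetical local file example.parquet, might look like this:

import pyarrow as pa
import pyarrow.parquet as pq

# Read only the parquet footer; no column data is materialized.
metadata = pq.read_metadata("example.parquet")  # hypothetical path
arrow_schema = metadata.schema.to_arrow_schema()

for col_name, arrow_type in zip(arrow_schema.names, arrow_schema.types):
    if isinstance(arrow_type, pa.ListType):
        # Nested list columns are flattened to reach the leaf value types.
        for field in arrow_type.value_field.flatten():
            print(col_name, field.type)
    elif isinstance(arrow_type, pa.Decimal128Type):
        print(col_name, arrow_type.precision, arrow_type.scale)

Inspecting only the footer keeps the check cheap even for large files.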
@@ -731,3 +760,11 @@ def merge_parquet_filemetadata(filemetadata_list):


ParquetWriter = libparquet.ParquetWriter


def _check_decimal128_type(arrow_type):
    if isinstance(arrow_type, pa.Decimal128Type):
        if arrow_type.precision > cudf.Decimal64Dtype.MAX_PRECISION:
            raise NotImplementedError(
                "Decimal type greater than Decimal64 is not yet supported"
            )
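To illustrate how the helper behaves, a rough interactive sketch using types constructed directly with pyarrow (this assumes cudf.Decimal64Dtype.MAX_PRECISION is 18, its value for Decimal64 at the time of writing):

import pyarrow as pa

# Precision 18 fits within Decimal64, so this returns without raising.
_check_decimal128_type(pa.decimal128(18, 2))

# Precision 38 exceeds cudf.Decimal64Dtype.MAX_PRECISION and raises
# NotImplementedError.
_check_decimal128_type(pa.decimal128(38, 2))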
Binary file not shown.
22 changes: 18 additions & 4 deletions python/cudf/cudf/tests/test_parquet.py
@@ -629,15 +629,29 @@ def test_parquet_reader_spark_timestamps(datadir):
def test_parquet_reader_spark_decimals(datadir):
    fname = datadir / "spark_decimal.parquet"

    expect = pd.read_parquet(fname)
    got = cudf.read_parquet(fname)
    # expect = pd.read_parquet(fname)
    with pytest.raises(
        NotImplementedError,
        match="Decimal type greater than Decimal64 is not yet supported",
    ):
        cudf.read_parquet(fname)

    # Convert the decimal dtype from PyArrow to float64 for comparison to cuDF
    # This is because cuDF returns as float64 as it lacks an equivalent dtype
    expect = expect.apply(pd.to_numeric)
    # expect = expect.apply(pd.to_numeric)

    # np.testing.assert_allclose(expect, got)
    assert_eq(expect, got)
    # assert_eq(expect, got)


@pytest.mark.parametrize("columns", [["a"], ["b", "a"], None])
def test_parquet_reader_decimal128_error_validation(datadir, columns):
    fname = datadir / "nested_decimal128_file.parquet"
    with pytest.raises(
        NotImplementedError,
        match="Decimal type greater than Decimal64 is not yet supported",
    ):
        cudf.read_parquet(fname, columns=columns)
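For reference, a fixture resembling nested_decimal128_file.parquet could be generated with pyarrow along these lines. This is only a sketch using assumed column names "a" and "b" (matching the parametrization above); the file actually checked into the test data directory may differ:

import decimal

import pyarrow as pa
import pyarrow.parquet as pq

# Build a table with a plain decimal128 column and a nested (list) one,
# both using precision 38, which exceeds what Decimal64 can hold.
table = pa.table(
    {
        "a": pa.array([decimal.Decimal("1.23")], type=pa.decimal128(38, 2)),
        "b": pa.array(
            [[decimal.Decimal("4.56")]], type=pa.list_(pa.decimal128(38, 2))
        ),
    }
)
pq.write_table(table, "nested_decimal128_file.parquet")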


def test_parquet_reader_microsecond_timestamps(datadir):
