Skip to content

Commit

Permalink
Enable decimal support in parquet writer (#7673)
Browse files: browse the repository at this point in the history
Resolves #7669

Authors:
  - Devavret Makkar (@devavret)

Approvers:
  - Keith Kraus (@kkraus14)

URL: #7673
  • Loading branch information
devavret authored Mar 23, 2021
1 parent d500142 commit 5cd90a0
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 7 deletions.
1 change: 1 addition & 0 deletions python/cudf/cudf/_lib/cpp/io/parquet.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
column_in_metadata& set_nullability(bool nullable)
column_in_metadata& set_list_column_as_map()
column_in_metadata& set_int96_timestamps(bool req)
column_in_metadata& set_decimal_precision(uint8_t precision)
column_in_metadata& child(size_type i)

cdef cppclass table_input_metadata:
Expand Down
15 changes: 9 additions & 6 deletions python/cudf/cudf/_lib/parquet.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@ from cudf.utils.dtypes import (
np_to_pa_dtype,
is_categorical_dtype,
is_list_dtype,
is_struct_dtype
is_struct_dtype,
is_decimal_dtype,
)

from cudf._lib.utils cimport get_column_names
Expand Down Expand Up @@ -310,7 +311,7 @@ cpdef write_parquet(

for i, name in enumerate(table._column_names, num_index_cols_meta):
tbl_meta.get().column_metadata[i].set_name(name.encode())
_set_col_children_names(
_set_col_metadata(
table[name]._column, tbl_meta.get().column_metadata[i]
)

Expand Down Expand Up @@ -448,7 +449,7 @@ cdef class ParquetWriter:

for i, name in enumerate(table._column_names, num_index_cols_meta):
self.tbl_meta.get().column_metadata[i].set_name(name.encode())
_set_col_children_names(
_set_col_metadata(
table[name]._column, self.tbl_meta.get().column_metadata[i]
)

Expand Down Expand Up @@ -546,14 +547,16 @@ cdef Column _update_column_struct_field_names(
col.set_base_children(tuple(children))
return col

cdef _set_col_children_names(Column col, column_in_metadata& col_meta):
cdef _set_col_metadata(Column col, column_in_metadata& col_meta):
if is_struct_dtype(col):
for i, (child_col, name) in enumerate(
zip(col.children, list(col.dtype.fields))
):
col_meta.child(i).set_name(name.encode())
_set_col_children_names(child_col, col_meta.child(i))
_set_col_metadata(child_col, col_meta.child(i))
elif is_list_dtype(col):
_set_col_children_names(col.children[1], col_meta.child(1))
_set_col_metadata(col.children[1], col_meta.child(1))
else:
if is_decimal_dtype(col):
col_meta.set_decimal_precision(col.dtype.precision)
return
7 changes: 6 additions & 1 deletion python/cudf/cudf/_lib/utils.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ from cudf.utils.dtypes import (
is_categorical_dtype,
is_list_dtype,
is_struct_dtype,
is_decimal_dtype,
)


Expand Down Expand Up @@ -80,7 +81,11 @@ cpdef generate_pandas_metadata(Table table, index):
"'category' column dtypes are currently not "
+ "supported by the gpu accelerated parquet writer"
)
elif is_list_dtype(col) or is_struct_dtype(col):
elif (
is_list_dtype(col)
or is_struct_dtype(col)
or is_decimal_dtype(col)
):
types.append(col.dtype.to_arrow())
else:
types.append(np_to_pa_dtype(col.dtype))
Expand Down
15 changes: 15 additions & 0 deletions python/cudf/cudf/tests/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -1920,3 +1920,18 @@ def test_parquet_writer_nested(tmpdir, data):

got = pd.read_parquet(fname)
assert_eq(expect, got)


def test_parquet_writer_decimal(tmpdir):
    """Write a frame holding a Decimal64 column to parquet and read it back.

    A float column is cast to ``Decimal64Dtype(7, 2)``, the frame is written
    with ``to_parquet``, and the file is read back through pandas and
    compared against the original cudf frame.
    """
    from cudf.core.dtypes import Decimal64Dtype

    out_path = tmpdir.join("test_parquet_writer_decimal.parquet")

    frame = cudf.DataFrame({"val": [0.00, 0.01, 0.02]})
    # Derive the decimal column from the float one (precision 7, scale 2).
    frame["dec_val"] = frame["val"].astype(Decimal64Dtype(7, 2))

    frame.to_parquet(out_path)
    assert os.path.exists(out_path)

    # Read back with pandas so the check goes through an independent reader.
    read_back = pd.read_parquet(out_path)
    assert_eq(frame, read_back)

0 comments on commit 5cd90a0

Please sign in to comment.