Skip to content

Commit

Permalink
Add serialization methods for List and StructDtype (#8441)
Browse files Browse the repository at this point in the history
Adds `serialize`/`deserialize` methods for `List` and `StructDtypes`, which I intend to use as part of #8153 when these dtypes are included in the `PackedColumns` object there.

Authors:
  - Charles Blackmon-Luca (https://github.com/charlesbluca)

Approvers:
  - Marlene  (https://github.com/marlenezw)
  - https://github.com/brandon-b-miller

URL: #8441
  • Loading branch information
charlesbluca authored Jun 21, 2021
1 parent 1de662f commit f71c6fe
Show file tree
Hide file tree
Showing 7 changed files with 190 additions and 67 deletions.
15 changes: 11 additions & 4 deletions python/cudf/cudf/core/column/lists.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,12 +181,14 @@ def set_base_data(self, value):

def serialize(self):
header = {}
frames = []
header["type-serialized"] = pickle.dumps(type(self))
header["dtype"] = pickle.dumps(self.dtype)
header["null_count"] = self.null_count
header["size"] = self.size
header["dtype"], dtype_frames = self.dtype.serialize()
header["dtype_frames_count"] = len(dtype_frames)
frames.extend(dtype_frames)

frames = []
sub_headers = []

for item in self.children:
Expand All @@ -211,9 +213,14 @@ def deserialize(cls, header, frames):
else:
mask = None

# Deserialize dtype
dtype = pickle.loads(header["dtype"]["type-serialized"]).deserialize(
header["dtype"], frames[: header["dtype_frames_count"]]
)

# Deserialize child columns
children = []
f = 0
f = header["dtype_frames_count"]
for h in header["subheaders"]:
fcount = h["frame_count"]
child_frames = frames[f : f + fcount]
Expand All @@ -224,7 +231,7 @@ def deserialize(cls, header, frames):
# Materialize list column
return column.build_column(
data=None,
dtype=pickle.loads(header["dtype"]),
dtype=dtype,
mask=mask,
children=tuple(children),
size=header["size"],
Expand Down
80 changes: 76 additions & 4 deletions python/cudf/cudf/core/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import decimal
import pickle
from typing import Any, Optional, Tuple
from typing import Any, Dict, List, Optional, Tuple

import numpy as np
import pandas as pd
Expand All @@ -17,9 +17,11 @@

import cudf
from cudf._typing import Dtype
from cudf.core.abc import Serializable
from cudf.core.buffer import Buffer


class _BaseDtype(ExtensionDtype):
class _BaseDtype(ExtensionDtype, Serializable):
# Base type for all cudf-specific dtypes
pass

Expand Down Expand Up @@ -111,12 +113,16 @@ def construct_from_string(self):

def serialize(self):
header = {}
frames = []
header["type-serialized"] = pickle.dumps(type(self))
header["ordered"] = self.ordered

frames = []

if self.categories is not None:
categories_header, categories_frames = self.categories.serialize()
header["categories"] = categories_header
frames.extend(categories_frames)

return header, frames

@classmethod
Expand Down Expand Up @@ -191,6 +197,30 @@ def __repr__(self):
def __hash__(self):
return hash(self._typ)

def serialize(self) -> Tuple[dict, list]:
header: Dict[str, Dtype] = {}
header["type-serialized"] = pickle.dumps(type(self))

frames = []

if isinstance(self.element_type, _BaseDtype):
header["element-type"], frames = self.element_type.serialize()
else:
header["element-type"] = self.element_type

return header, frames

@classmethod
def deserialize(cls, header: dict, frames: list):
if isinstance(header["element-type"], dict):
element_type = pickle.loads(
header["element-type"]["type-serialized"]
).deserialize(header["element-type"], frames)
else:
element_type = header["element-type"]

return cls(element_type=element_type)


class StructDtype(_BaseDtype):

Expand Down Expand Up @@ -242,6 +272,41 @@ def __repr__(self):
def __hash__(self):
return hash(self._typ)

def serialize(self) -> Tuple[dict, list]:
header: Dict[str, Any] = {}
header["type-serialized"] = pickle.dumps(type(self))

frames: List[Buffer] = []

fields = {}

for k, dtype in self.fields.items():
if isinstance(dtype, _BaseDtype):
dtype_header, dtype_frames = dtype.serialize()
fields[k] = (
dtype_header,
(len(frames), len(frames) + len(dtype_frames)),
)
frames.extend(dtype_frames)
else:
fields[k] = dtype
header["fields"] = fields

return header, frames

@classmethod
def deserialize(cls, header: dict, frames: list):
fields = {}
for k, dtype in header["fields"].items():
if isinstance(dtype, tuple):
dtype_header, (start, stop) = dtype
fields[k] = pickle.loads(
dtype_header["type-serialized"]
).deserialize(dtype_header, frames[start:stop],)
else:
fields[k] = dtype
return cls(fields)


class Decimal64Dtype(_BaseDtype):

Expand Down Expand Up @@ -337,7 +402,14 @@ def _from_decimal(cls, decimal):
return cls(precision, -metadata.exponent)

def serialize(self) -> Tuple[dict, list]:
return {"precision": self.precision, "scale": self.scale}, []
return (
{
"type-serialized": pickle.dumps(type(self)),
"precision": self.precision,
"scale": self.scale,
},
[],
)

@classmethod
def deserialize(cls, header: dict, frames: list):
Expand Down
22 changes: 22 additions & 0 deletions python/cudf/cudf/tests/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -804,3 +804,25 @@ def test_series_construction_with_nulls(input_obj, dtype):
got = cudf.Series(input_obj, dtype="category").to_pandas()

assert_eq(expect, got)


@pytest.mark.parametrize(
"data",
[
{"a": cudf.Series(["a", "b", "c", "a", "c", "b"]).astype("category")},
{
"a": cudf.Series(["a", "a", "b", "b"]).astype("category"),
"b": cudf.Series(["b", "b", "c", "c"]).astype("category"),
"c": cudf.Series(["c", "c", "a", "a"]).astype("category"),
},
{
"a": cudf.Series(["a", None, "b", "b"]).astype("category"),
"b": cudf.Series(["b", "b", None, "c"]).astype("category"),
"c": cudf.Series(["c", "c", "a", None]).astype("category"),
},
],
)
def test_serialize_categorical_columns(data):
df = cudf.DataFrame(data)
recreated = df.__class__.deserialize(*df.serialize())
assert_eq(recreated, df)
38 changes: 38 additions & 0 deletions python/cudf/cudf/tests/test_decimal.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,3 +297,41 @@ def test_series_construction_with_nulls(input_obj):
got = cudf.Series(input_obj).to_arrow()

assert expect == got


@pytest.mark.parametrize(
"data",
[
{
"a": _decimal_series(
["1", "2", "3"], dtype=cudf.Decimal64Dtype(1, 0)
)
},
{
"a": _decimal_series(
["1", "2", "3"], dtype=cudf.Decimal64Dtype(1, 0)
),
"b": _decimal_series(
["1.0", "2.0", "3.0"], dtype=cudf.Decimal64Dtype(2, 1)
),
"c": _decimal_series(
["10.1", "20.2", "30.3"], dtype=cudf.Decimal64Dtype(3, 1)
),
},
{
"a": _decimal_series(
["1", None, "3"], dtype=cudf.Decimal64Dtype(1, 0)
),
"b": _decimal_series(
["1.0", "2.0", None], dtype=cudf.Decimal64Dtype(2, 1)
),
"c": _decimal_series(
[None, "20.2", "30.3"], dtype=cudf.Decimal64Dtype(3, 1)
),
},
],
)
def test_serialize_decimal_columns(data):
df = cudf.DataFrame(data)
recreated = df.__class__.deserialize(*df.serialize())
assert_eq(recreated, df)
20 changes: 20 additions & 0 deletions python/cudf/cudf/tests/test_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,3 +360,23 @@ def test_construction_series_with_nulls(input_obj):
got = cudf.Series(input_obj).to_arrow()

assert expect == got


@pytest.mark.parametrize(
"data",
[
{"a": [[]]},
{"a": [[1, 2, None, 4]]},
{"a": [["cat", None, "dog"]]},
{
"a": [[1, 2, 3, None], [4, None, 5]],
"b": [None, ["fish", "bird"]],
"c": [[], []],
},
{"a": [[1, 2, 3, None], [4, None, 5], None, [6, 7]]},
],
)
def test_serialize_list_columns(data):
df = cudf.DataFrame(data)
recreated = df.__class__.deserialize(*df.serialize())
assert_eq(recreated, df)
60 changes: 1 addition & 59 deletions python/cudf/cudf/tests/test_serialize.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

import cudf
from cudf.tests import utils
from cudf.tests.utils import _decimal_series, assert_eq
from cudf.tests.utils import assert_eq


@pytest.mark.parametrize(
Expand Down Expand Up @@ -269,64 +269,6 @@ def test_serialize_string_check_buffer_sizes():
assert expect == got


@pytest.mark.parametrize(
"data",
[
{"a": [[]]},
{"a": [[1, 2, None, 4]]},
{"a": [["cat", None, "dog"]]},
{
"a": [[1, 2, 3, None], [4, None, 5]],
"b": [None, ["fish", "bird"]],
"c": [[], []],
},
{"a": [[1, 2, 3, None], [4, None, 5], None, [6, 7]]},
],
)
def test_serialize_list_columns(data):
df = cudf.DataFrame(data)
recreated = df.__class__.deserialize(*df.serialize())
assert_eq(recreated, df)


@pytest.mark.parametrize(
"data",
[
{
"a": _decimal_series(
["1", "2", "3"], dtype=cudf.Decimal64Dtype(1, 0)
)
},
{
"a": _decimal_series(
["1", "2", "3"], dtype=cudf.Decimal64Dtype(1, 0)
),
"b": _decimal_series(
["1.0", "2.0", "3.0"], dtype=cudf.Decimal64Dtype(2, 1)
),
"c": _decimal_series(
["10.1", "20.2", "30.3"], dtype=cudf.Decimal64Dtype(3, 1)
),
},
{
"a": _decimal_series(
["1", None, "3"], dtype=cudf.Decimal64Dtype(1, 0)
),
"b": _decimal_series(
["1.0", "2.0", None], dtype=cudf.Decimal64Dtype(2, 1)
),
"c": _decimal_series(
[None, "20.2", "30.3"], dtype=cudf.Decimal64Dtype(3, 1)
),
},
],
)
def test_serialize_decimal_columns(data):
df = cudf.DataFrame(data)
recreated = df.__class__.deserialize(*df.serialize())
assert_eq(recreated, df)


def test_deserialize_cudf_0_16(datadir):
fname = datadir / "pkl" / "stringColumnWithRangeIndex_cudf_0.16.pkl"

Expand Down
22 changes: 22 additions & 0 deletions python/cudf/cudf/tests/test_struct.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Copyright (c) 2020, NVIDIA CORPORATION.

import numpy as np
import pandas as pd
import pyarrow as pa
import pytest
Expand Down Expand Up @@ -53,3 +54,24 @@ def test_series_construction_with_nulls(input_obj):
got = cudf.Series(input_obj).to_arrow()

assert expect == got


@pytest.mark.parametrize(
"fields",
[
{"a": np.dtype(np.int64)},
{"a": np.dtype(np.int64), "b": None},
{
"a": cudf.ListDtype(np.dtype(np.int64)),
"b": cudf.Decimal64Dtype(1, 0),
},
{
"a": cudf.ListDtype(cudf.StructDtype({"b": np.dtype(np.int64)})),
"b": cudf.ListDtype(cudf.ListDtype(np.dtype(np.int64))),
},
],
)
def test_serialize_struct_dtype(fields):
dtype = cudf.StructDtype(fields)
recreated = dtype.__class__.deserialize(*dtype.serialize())
assert recreated == dtype

0 comments on commit f71c6fe

Please sign in to comment.