From 8c50454d677e3fe0def130cf362da87c2495ebfb Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 6 Oct 2021 08:44:53 -0700 Subject: [PATCH 1/5] fix StructColumn to_pandas --- python/cudf/cudf/core/column/struct.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index 7167918d14d..0b452e1c754 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -1,6 +1,7 @@ # Copyright (c) 2020, NVIDIA CORPORATION. from __future__ import annotations +import pandas as pd import pyarrow as pa import cudf @@ -80,6 +81,16 @@ def to_arrow(self): pa_type, len(self), buffers, children=children ) + def to_pandas(self, index: pd.Index = None, **kwargs) -> "pd.Series": + # We cannot go via Arrow's `to_pandas` because of the following issue: + # https://issues.apache.org/jira/browse/ARROW-12680 + + pd_series = pd.Series(self.to_arrow().tolist()) + + if index is not None: + pd_series.index = index + return pd_series + def __getitem__(self, args): result = super().__getitem__(args) if isinstance(result, dict): From be768b917d735ba05d883a801d0326d3b5289fff Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 6 Oct 2021 09:05:18 -0700 Subject: [PATCH 2/5] add tests --- python/cudf/cudf/tests/test_struct.py | 43 ++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_struct.py b/python/cudf/cudf/tests/test_struct.py index 4e5e9c96146..a31cc677c65 100644 --- a/python/cudf/cudf/tests/test_struct.py +++ b/python/cudf/cudf/tests/test_struct.py @@ -7,7 +7,7 @@ import cudf from cudf.core.dtypes import StructDtype -from cudf.testing._utils import assert_eq +from cudf.testing._utils import DATETIME_TYPES, TIMEDELTA_TYPES, assert_eq @pytest.mark.parametrize( @@ -292,3 +292,44 @@ def test_struct_field_errors(data): with pytest.raises(IndexError): got.struct.field(100) + + +@pytest.mark.parametrize("dtype", DATETIME_TYPES) +def test_struct_with_datetime(dtype): + df = cudf.DataFrame( + { + "a": [12, 232, 2334], + "datetime": cudf.Series([23432, 3432423, 324324], dtype=dtype), + } + ) + series = df.to_struct() + + actual = series.to_pandas() + expected = pd.Series(series.to_arrow().tolist()) + assert_eq(expected, actual) + + +@pytest.mark.parametrize("dtype", TIMEDELTA_TYPES) +def test_struct_with_timedelta(dtype): + df = cudf.DataFrame( + { + "a": [12, 232, 2334], + "datetime": cudf.Series([23432, 3432423, 324324], dtype=dtype), + } + ) + series = df.to_struct() + + actual = series.to_pandas() + expected = pd.Series(series.to_arrow().tolist()) + assert_eq(expected, actual) + + +def test_struct_int_values(): + series = cudf.Series( + [{"a": 1, "b": 2}, {"a": 10, "b": None}, {"a": 5, "b": 6}] + ) + actual_series = series.to_pandas() + + assert isinstance(actual_series[0]["b"], int) + assert isinstance(actual_series[1]["b"], type(None)) + assert isinstance(actual_series[2]["b"], int) From cd03a50928cf793c8f220eff2445186693c74000 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 6 Oct 2021 09:46:23 -0700 Subject: [PATCH 3/5] dtype --- python/cudf/cudf/core/column/struct.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index 0b452e1c754..f0d02a706e2 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -85,7 +85,7 @@ def to_pandas(self, index: pd.Index = None, **kwargs) -> "pd.Series": # We cannot go via Arrow's `to_pandas` because of the following issue: # https://issues.apache.org/jira/browse/ARROW-12680 - pd_series = pd.Series(self.to_arrow().tolist()) + pd_series = pd.Series(self.to_arrow().tolist(), dtype="object") if index is not None: pd_series.index = index From 73d838f2f46708bc005c6b2b3fa6c6ca00c289dd Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 6 Oct 2021 10:34:16 -0700 Subject: [PATCH 4/5] merge tests --- python/cudf/cudf/tests/test_struct.py | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/python/cudf/cudf/tests/test_struct.py b/python/cudf/cudf/tests/test_struct.py index a31cc677c65..3e367241661 100644 --- a/python/cudf/cudf/tests/test_struct.py +++ b/python/cudf/cudf/tests/test_struct.py @@ -294,23 +294,8 @@ def test_struct_field_errors(data): got.struct.field(100) -@pytest.mark.parametrize("dtype", DATETIME_TYPES) -def test_struct_with_datetime(dtype): - df = cudf.DataFrame( - { - "a": [12, 232, 2334], - "datetime": cudf.Series([23432, 3432423, 324324], dtype=dtype), - } - ) - series = df.to_struct() - - actual = series.to_pandas() - expected = pd.Series(series.to_arrow().tolist()) - assert_eq(expected, actual) - - -@pytest.mark.parametrize("dtype", TIMEDELTA_TYPES) -def test_struct_with_timedelta(dtype): +@pytest.mark.parametrize("dtype", DATETIME_TYPES + TIMEDELTA_TYPES) +def test_struct_with_datetime_and_timedelta(dtype): df = cudf.DataFrame( { "a": [12, 232, 2334], From cb299b4adc9c9bfe8ca7f1d99a59d91603070453 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 6 Oct 2021 10:51:14 -0700 Subject: [PATCH 5/5] change test --- python/cudf/cudf/tests/test_struct.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_struct.py b/python/cudf/cudf/tests/test_struct.py index 3e367241661..d9558cb5041 100644 --- a/python/cudf/cudf/tests/test_struct.py +++ b/python/cudf/cudf/tests/test_struct.py @@ -303,9 +303,15 @@ def test_struct_with_datetime_and_timedelta(dtype): } ) series = df.to_struct() + a_array = np.array([12, 232, 2334]) + datetime_array = np.array([23432, 3432423, 324324]).astype(dtype) actual = series.to_pandas() - expected = pd.Series(series.to_arrow().tolist()) + values_list = [] + for i, val in enumerate(a_array): + values_list.append({"a": val, "datetime": datetime_array[i]}) + + expected = pd.Series(values_list) assert_eq(expected, actual)