From fd6625d2e8c389c5a8c8617f7a3ad50abbc09aa0 Mon Sep 17 00:00:00 2001 From: Shane Ding Date: Wed, 7 Jul 2021 18:54:15 +0000 Subject: [PATCH 01/21] allocate correct buffer for children --- python/cudf/cudf/core/column/column.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index b03465bf8d0..901be2002d1 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1339,7 +1339,16 @@ def column_empty( dtype = pandas_dtype(dtype) children = () # type: Tuple[ColumnBase, ...] - if is_categorical_dtype(dtype): + if is_struct_dtype(dtype): + data = None + children = tuple( + build_column( + data=Buffer.empty(row_count * dtype.fields[f].itemsize), + dtype=dtype.fields[f].name, + ) + for f in dtype.fields.keys() + ) + elif is_categorical_dtype(dtype): data = None children = ( build_column( From f50b129efdc218fbea48023b8df4b5be4a3f2bf0 Mon Sep 17 00:00:00 2001 From: Shane Ding Date: Wed, 7 Jul 2021 19:09:03 +0000 Subject: [PATCH 02/21] update child buffer creation --- python/cudf/cudf/core/column/column.py | 5 +---- python/dask_cudf/dask_cudf/tests/test_struct.py | 0 2 files changed, 1 insertion(+), 4 deletions(-) create mode 100644 python/dask_cudf/dask_cudf/tests/test_struct.py diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 901be2002d1..1413a641637 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1342,10 +1342,7 @@ def column_empty( if is_struct_dtype(dtype): data = None children = tuple( - build_column( - data=Buffer.empty(row_count * dtype.fields[f].itemsize), - dtype=dtype.fields[f].name, - ) + column_empty(row_count, dtype.fields[f]) for f in dtype.fields.keys() ) elif is_categorical_dtype(dtype): diff --git a/python/dask_cudf/dask_cudf/tests/test_struct.py b/python/dask_cudf/dask_cudf/tests/test_struct.py new file
mode 100644 index 00000000000..e69de29bb2d From 9175e5e82a6d31d1257b2f9f0738b443183044e2 Mon Sep 17 00:00:00 2001 From: Shane Ding Date: Wed, 7 Jul 2021 21:43:35 +0000 Subject: [PATCH 03/21] added test cases --- .../dask_cudf/dask_cudf/tests/test_struct.py | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/python/dask_cudf/dask_cudf/tests/test_struct.py b/python/dask_cudf/dask_cudf/tests/test_struct.py index e69de29bb2d..16ee24de256 100644 --- a/python/dask_cudf/dask_cudf/tests/test_struct.py +++ b/python/dask_cudf/dask_cudf/tests/test_struct.py @@ -0,0 +1,37 @@ +import pytest + +import cudf + +import dask_cudf + + +@pytest.mark.parametrize( + "data, column", + [ + ( + { + "a": [{"a": [1, 2, 3, 4], "b": "Hello world"}, {}, {"a": []}], + "b": [1, 2, 3], + "c": ["rapids", "cudf", "hi"], + }, + "a", + ), + ( + {"a": [{}, {}, {}], "b": [1, 2, 3], "c": ["rapids", "cudf", "hi"]}, + "a", + ), + ( + { + "a": [{}, {}, {}], + "b": [{"a": 1}, {"b": 5}, {"c": "Hello"}], + "c": ["rapids", "cudf", "hi"], + }, + "b", + ), + ], +) +def test_select_struct(data, column): + # df = pd.DataFrame(data) + df = cudf.DataFrame(data) + ddf = dask_cudf.from_cudf(df, 2) + assert df[column].to_arrow() == ddf[column].compute().to_arrow() From be44b91c00e8964ca48f12958f5cd1211c81c793 Mon Sep 17 00:00:00 2001 From: Shane Ding Date: Thu, 8 Jul 2021 14:52:41 +0000 Subject: [PATCH 04/21] remove comment --- python/dask_cudf/dask_cudf/tests/test_struct.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/dask_cudf/dask_cudf/tests/test_struct.py b/python/dask_cudf/dask_cudf/tests/test_struct.py index 16ee24de256..1259a198080 100644 --- a/python/dask_cudf/dask_cudf/tests/test_struct.py +++ b/python/dask_cudf/dask_cudf/tests/test_struct.py @@ -31,7 +31,6 @@ ], ) def test_select_struct(data, column): - # df = pd.DataFrame(data) df = cudf.DataFrame(data) ddf = dask_cudf.from_cudf(df, 2) assert df[column].to_arrow() == ddf[column].compute().to_arrow() From 
b6c2410312f0504cc75e5ecf243bc18dbe5ec722 Mon Sep 17 00:00:00 2001 From: sft-managed Date: Mon, 19 Jul 2021 17:57:34 +0000 Subject: [PATCH 05/21] Implemented StructMethods struct accessor --- python/dask_cudf/dask_cudf/accessors.py | 30 +++++++++++++++++++++++++ python/dask_cudf/dask_cudf/core.py | 6 ++++- 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/python/dask_cudf/dask_cudf/accessors.py b/python/dask_cudf/dask_cudf/accessors.py index 04d3e20b844..2a812f13b7d 100644 --- a/python/dask_cudf/dask_cudf/accessors.py +++ b/python/dask_cudf/dask_cudf/accessors.py @@ -1,5 +1,35 @@ # Copyright (c) 2021, NVIDIA CORPORATION. +class StructMethods: + def __init__(self, d_series): + self.d_series = d_series + def field(self, key): + """ + Extract children of the specified struct column + in the Series + Parameters + ---------- + key: int or str + index/position or field name of the respective + struct column + Returns + ------- + Series + Examples + -------- + >>> s = cudf.Series([{'a': 1, 'b': 2}, {'a': 3, 'b': 4}]) + >>> s.struct.field(0) + 0 1 + 1 3 + dtype: int64 + >>> s.struct.field('a') + 0 1 + 1 3 + dtype: int64 + """ + return self.d_series.map_partitions( + lambda s: s.struct.field(key), meta=self.d_series._meta + ) class ListMethods: def __init__(self, d_series): diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index 1a632907047..f1fb408b0d1 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -27,7 +27,7 @@ from cudf import _lib as libcudf from dask_cudf import sorting -from dask_cudf.accessors import ListMethods +from dask_cudf.accessors import ListMethods, StructMethods DASK_VERSION = LooseVersion(dask.__version__) @@ -414,6 +414,10 @@ def groupby(self, *args, **kwargs): def list(self): return ListMethods(self) + @property + def struct(self): + return StructMethods(self) + class Index(Series, dd.core.Index): _partition_type = cudf.Index From 
c3912b68a59898ce840ebdc16bc80e021e25f944 Mon Sep 17 00:00:00 2001 From: sft-managed Date: Mon, 19 Jul 2021 18:41:10 +0000 Subject: [PATCH 06/21] Fixed example to use dask_cudf in field method comments --- python/dask_cudf/dask_cudf/accessors.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/dask_cudf/dask_cudf/accessors.py b/python/dask_cudf/dask_cudf/accessors.py index 2a812f13b7d..ee5e5a68293 100644 --- a/python/dask_cudf/dask_cudf/accessors.py +++ b/python/dask_cudf/dask_cudf/accessors.py @@ -18,11 +18,12 @@ def field(self, key): Examples -------- >>> s = cudf.Series([{'a': 1, 'b': 2}, {'a': 3, 'b': 4}]) - >>> s.struct.field(0) + >>> ds = dask_cudf.from_cudf(s, 2) + >>> ds.struct.field(0).compute() 0 1 1 3 dtype: int64 - >>> s.struct.field('a') + >>> ds.struct.field('a') 0 1 1 3 dtype: int64 From 394b8e7e3842a7931df463c5b4283635f9c396b2 Mon Sep 17 00:00:00 2001 From: sft-managed Date: Tue, 20 Jul 2021 20:14:16 +0000 Subject: [PATCH 07/21] Partial fix for incorrect metadata in struct accessor field method --- python/dask_cudf/dask_cudf/accessors.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/dask_cudf/dask_cudf/accessors.py b/python/dask_cudf/dask_cudf/accessors.py index ee5e5a68293..a9775df0c58 100644 --- a/python/dask_cudf/dask_cudf/accessors.py +++ b/python/dask_cudf/dask_cudf/accessors.py @@ -23,13 +23,15 @@ def field(self, key): 0 1 1 3 dtype: int64 - >>> ds.struct.field('a') + >>> ds.struct.field('a').compute() 0 1 1 3 dtype: int64 """ + typ = self.d_series._meta.dtype.fields[key] return self.d_series.map_partitions( - lambda s: s.struct.field(key), meta=self.d_series._meta + lambda s: s.struct.field(key), + meta=self.d_series._meta._constructor([], dtype=typ), ) class ListMethods: From 2e41f40b2bd11c1e08c8dd9fdae6a4da4f602caf Mon Sep 17 00:00:00 2001 From: sft-managed Date: Wed, 21 Jul 2021 19:11:01 +0000 Subject: [PATCH 08/21] Added better handling of key types in struct field() accessor 
method --- python/dask_cudf/dask_cudf/accessors.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/python/dask_cudf/dask_cudf/accessors.py b/python/dask_cudf/dask_cudf/accessors.py index a9775df0c58..91508cd7437 100644 --- a/python/dask_cudf/dask_cudf/accessors.py +++ b/python/dask_cudf/dask_cudf/accessors.py @@ -28,11 +28,21 @@ def field(self, key): 1 3 dtype: int64 """ - typ = self.d_series._meta.dtype.fields[key] - return self.d_series.map_partitions( - lambda s: s.struct.field(key), - meta=self.d_series._meta._constructor([], dtype=typ), + if key in self.d_series._meta.dtype.fields.keys(): + typ = self.d_series._meta.dtype.fields[key] + return self.d_series.map_partitions( + lambda s: s.struct.field(key), + meta=self.d_series._meta._constructor([], dtype=typ), + ) + elif isinstance(key, int): + key_list = [dict_key for dict_key in self.d_series._meta.dtype.fields.keys()] + typ_key = key_list[key] + typ = self.d_series._meta.dtype.fields[typ_key] + return self.d_series.map_partitions( + lambda s: s.struct.field(key), + meta=self.d_series._meta._constructor([], dtype=typ), ) + class ListMethods: def __init__(self, d_series): From 10f8a853d7b8f7f048cf9a40b51c8b321dcda067 Mon Sep 17 00:00:00 2001 From: sft-managed Date: Thu, 22 Jul 2021 14:04:58 +0000 Subject: [PATCH 09/21] Fixed KeyError handling for field() method in Struct Accessor --- python/dask_cudf/dask_cudf/accessors.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/python/dask_cudf/dask_cudf/accessors.py b/python/dask_cudf/dask_cudf/accessors.py index 91508cd7437..65fe5522a31 100644 --- a/python/dask_cudf/dask_cudf/accessors.py +++ b/python/dask_cudf/dask_cudf/accessors.py @@ -28,20 +28,25 @@ def field(self, key): 1 3 dtype: int64 """ - if key in self.d_series._meta.dtype.fields.keys(): + try: typ = self.d_series._meta.dtype.fields[key] return self.d_series.map_partitions( lambda s: s.struct.field(key), 
meta=self.d_series._meta._constructor([], dtype=typ), ) - elif isinstance(key, int): - key_list = [dict_key for dict_key in self.d_series._meta.dtype.fields.keys()] - typ_key = key_list[key] - typ = self.d_series._meta.dtype.fields[typ_key] - return self.d_series.map_partitions( - lambda s: s.struct.field(key), - meta=self.d_series._meta._constructor([], dtype=typ), - ) + except KeyError as e: + if isinstance(key, int): + key_list = [dict_key for dict_key in self.d_series._meta.dtype.fields.keys()] + typ_key = key_list[key] + typ = self.d_series._meta.dtype.fields[typ_key] + return self.d_series.map_partitions( + lambda s: s.struct.field(key), + meta=self.d_series._meta._constructor([], dtype=typ), + ) + else: + print('Field "' + str(key) + '" is not found in the set of existing keys.') + raise e + class ListMethods: From 5bff6cbccdba54f6693c72b7e248bfc7da06f26b Mon Sep 17 00:00:00 2001 From: sft-managed Date: Tue, 27 Jul 2021 20:55:22 +0000 Subject: [PATCH 10/21] Added testing of struct series creation and struct.field() method --- .../dask_cudf/tests/test_accessor.py | 68 +++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py index 342f2b60180..ad5397ba9e3 100644 --- a/python/dask_cudf/dask_cudf/tests/test_accessor.py +++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py @@ -438,3 +438,71 @@ def test_sorting(data, ascending, na_position, ignore_index): .reset_index(drop=True) ) assert_eq(expect, got) + +############################################################################# +# Struct Accessor # +############################################################################# + + + +@pytest.mark.parametrize( + "data", + [ + [{"a":5, "b":10}, + {"a":3, "b":7}, + {"a":-3, "b":11}], + + [{"a":None, "b":1}, + {"a":None, "b":0}, + {"a":-3, "b":None}], + + #[{}], + + [{'a':1, 'b':2}], + + [{'b':3, 'c':4}], + + #[{None:None}, + #{None:5}, + 
#{'string_key':'string_field'}, + #{None:'string_field'}] + + + ], +) +def test_create_struct_series(data): + expect = pd.Series(data) + ds_got = dgd.from_cudf(Series(data), 2) + assert_eq(expect, ds_got.compute()) + +@pytest.mark.parametrize( + "data", + [ + #[{}], + + [{"a":5, "b":10}, + {"a":3, "b":7}, + {"a":-3, "b":11}], + + [{"a":None, "b":1}, + {"a":None, "b":0}, + {"a":-3, "b":None}], + + [{}], + + [{'a':1, 'b':2}], + + [{'b':3, 'c':4}], + + [{None:None}, + {None:5}, + {'string_key':'string_field'}, + {None:'string_field'}] + + + ], +) +def test_struct_field(data): + expect = Series(data).struct.field('a') + ds_got = dgd.from_cudf(Series(data), 2).struct.field('a') + assert_eq(expect, ds_got.compute()) \ No newline at end of file From 3bc0214030737172d2cd274f1294eb0c316e69a5 Mon Sep 17 00:00:00 2001 From: sft-managed Date: Tue, 27 Jul 2021 21:30:10 +0000 Subject: [PATCH 11/21] Modified testing of accessor field method --- python/dask_cudf/dask_cudf/tests/test_accessor.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py index ad5397ba9e3..d3418a5c0df 100644 --- a/python/dask_cudf/dask_cudf/tests/test_accessor.py +++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py @@ -478,7 +478,6 @@ def test_create_struct_series(data): @pytest.mark.parametrize( "data", [ - #[{}], [{"a":5, "b":10}, {"a":3, "b":7}, @@ -488,16 +487,10 @@ def test_create_struct_series(data): {"a":None, "b":0}, {"a":-3, "b":None}], - [{}], - [{'a':1, 'b':2}], [{'b':3, 'c':4}], - [{None:None}, - {None:5}, - {'string_key':'string_field'}, - {None:'string_field'}] ], From 9f88244fe8740a859b17a4bdb9649385fa1aa7f0 Mon Sep 17 00:00:00 2001 From: sft-managed Date: Tue, 27 Jul 2021 21:32:46 +0000 Subject: [PATCH 12/21] Modified struct accessor testing further --- .../dask_cudf/tests/test_accessor.py | 35 +++++++++++++------ 1 file changed, 24 insertions(+), 11 deletions(-) diff --git 
a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py index d3418a5c0df..d956b8cbc4d 100644 --- a/python/dask_cudf/dask_cudf/tests/test_accessor.py +++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py @@ -456,18 +456,10 @@ def test_sorting(data, ascending, na_position, ignore_index): {"a":None, "b":0}, {"a":-3, "b":None}], - #[{}], - [{'a':1, 'b':2}], [{'b':3, 'c':4}], - #[{None:None}, - #{None:5}, - #{'string_key':'string_field'}, - #{None:'string_field'}] - - ], ) def test_create_struct_series(data): @@ -491,11 +483,32 @@ def test_create_struct_series(data): [{'b':3, 'c':4}], - - ], ) -def test_struct_field(data): +def test_struct_field_a(data): expect = Series(data).struct.field('a') ds_got = dgd.from_cudf(Series(data), 2).struct.field('a') + assert_eq(expect, ds_got.compute()) + +@pytest.mark.parametrize( + "data", + [ + + [{"a":5, "b":10}, + {"a":3, "b":7}, + {"a":-3, "b":11}], + + [{"a":None, "b":1}, + {"a":None, "b":0}, + {"a":-3, "b":None}], + + [{'a':1, 'b':2}], + + [{'b':3, 'c':4}], + + ], +) +def test_struct_field_zero(data): + expect = Series(data).struct.field(0) + ds_got = dgd.from_cudf(Series(data), 2).struct.field(0) assert_eq(expect, ds_got.compute()) \ No newline at end of file From b80097c0bbec47ea2d168bac23f8329bc3f2fb67 Mon Sep 17 00:00:00 2001 From: NV-jpt <86264103+NV-jpt@users.noreply.github.com> Date: Wed, 28 Jul 2021 10:30:21 -0400 Subject: [PATCH 13/21] Update python/dask_cudf/dask_cudf/tests/test_accessor.py Co-authored-by: Richard (Rick) Zamora --- python/dask_cudf/dask_cudf/tests/test_accessor.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py index d956b8cbc4d..89592d83392 100644 --- a/python/dask_cudf/dask_cudf/tests/test_accessor.py +++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py @@ -439,9 +439,6 @@ def test_sorting(data, ascending, na_position, 
ignore_index): ) assert_eq(expect, got) -############################################################################# -# Struct Accessor # -############################################################################# @@ -511,4 +508,4 @@ def test_struct_field_a(data): def test_struct_field_zero(data): expect = Series(data).struct.field(0) ds_got = dgd.from_cudf(Series(data), 2).struct.field(0) - assert_eq(expect, ds_got.compute()) \ No newline at end of file + assert_eq(expect, ds_got.compute()) From 7fcd16dc621bbec9c0a16a1b3112eb3d204d31f4 Mon Sep 17 00:00:00 2001 From: sft-managed Date: Wed, 28 Jul 2021 15:57:21 +0000 Subject: [PATCH 14/21] Moved return statement out of try/catch block --- python/dask_cudf/dask_cudf/accessors.py | 31 +++++++++++++++---------- 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/python/dask_cudf/dask_cudf/accessors.py b/python/dask_cudf/dask_cudf/accessors.py index 65fe5522a31..a8416789bce 100644 --- a/python/dask_cudf/dask_cudf/accessors.py +++ b/python/dask_cudf/dask_cudf/accessors.py @@ -1,8 +1,10 @@ # Copyright (c) 2021, NVIDIA CORPORATION. 
+ class StructMethods: def __init__(self, d_series): self.d_series = d_series + def field(self, key): """ Extract children of the specified struct column @@ -28,26 +30,31 @@ def field(self, key): 1 3 dtype: int64 """ - try: + try: typ = self.d_series._meta.dtype.fields[key] - return self.d_series.map_partitions( - lambda s: s.struct.field(key), - meta=self.d_series._meta._constructor([], dtype=typ), - ) + except KeyError as e: if isinstance(key, int): - key_list = [dict_key for dict_key in self.d_series._meta.dtype.fields.keys()] + key_list = [ + dict_key + for dict_key in self.d_series._meta.dtype.fields.keys() + ] typ_key = key_list[key] typ = self.d_series._meta.dtype.fields[typ_key] - return self.d_series.map_partitions( - lambda s: s.struct.field(key), - meta=self.d_series._meta._constructor([], dtype=typ), - ) + else: - print('Field "' + str(key) + '" is not found in the set of existing keys.') + print( + 'Field "' + + str(key) + + '" is not found in the set of existing keys.' + ) raise e - + return self.d_series.map_partitions( + lambda s: s.struct.field(key), + meta=self.d_series._meta._constructor([], dtype=typ), + ) + class ListMethods: def __init__(self, d_series): From cde780818cb6af212a8810e959e735082a5cdcec Mon Sep 17 00:00:00 2001 From: sft-managed Date: Thu, 29 Jul 2021 12:59:21 +0000 Subject: [PATCH 15/21] Removed test case with nonexistent field - need to design another test for this --- .../dask_cudf/tests/test_accessor.py | 57 +++++-------------- 1 file changed, 15 insertions(+), 42 deletions(-) diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py index 89592d83392..bec10ae9305 100644 --- a/python/dask_cudf/dask_cudf/tests/test_accessor.py +++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py @@ -440,23 +440,13 @@ def test_sorting(data, ascending, na_position, ignore_index): assert_eq(expect, got) - - @pytest.mark.parametrize( "data", [ - [{"a":5, "b":10}, - {"a":3, "b":7}, - 
{"a":-3, "b":11}], - - [{"a":None, "b":1}, - {"a":None, "b":0}, - {"a":-3, "b":None}], - - [{'a':1, 'b':2}], - - [{'b':3, 'c':4}], - + [{"a": 5, "b": 10}, {"a": 3, "b": 7}, {"a": -3, "b": 11}], + [{"a": None, "b": 1}, {"a": None, "b": 0}, {"a": -3, "b": None}], + [{"a": 1, "b": 2}], + [{"b": 3, "c": 4}], ], ) def test_create_struct_series(data): @@ -464,45 +454,28 @@ def test_create_struct_series(data): ds_got = dgd.from_cudf(Series(data), 2) assert_eq(expect, ds_got.compute()) + @pytest.mark.parametrize( "data", [ - - [{"a":5, "b":10}, - {"a":3, "b":7}, - {"a":-3, "b":11}], - - [{"a":None, "b":1}, - {"a":None, "b":0}, - {"a":-3, "b":None}], - - [{'a':1, 'b':2}], - - [{'b':3, 'c':4}], - + [{"a": 5, "b": 10}, {"a": 3, "b": 7}, {"a": -3, "b": 11}], + [{"a": None, "b": 1}, {"a": None, "b": 0}, {"a": -3, "b": None}], + [{"a": 1, "b": 2}], ], ) def test_struct_field_a(data): - expect = Series(data).struct.field('a') - ds_got = dgd.from_cudf(Series(data), 2).struct.field('a') + expect = Series(data).struct.field("a") + ds_got = dgd.from_cudf(Series(data), 2).struct.field("a") assert_eq(expect, ds_got.compute()) + @pytest.mark.parametrize( "data", [ - - [{"a":5, "b":10}, - {"a":3, "b":7}, - {"a":-3, "b":11}], - - [{"a":None, "b":1}, - {"a":None, "b":0}, - {"a":-3, "b":None}], - - [{'a':1, 'b':2}], - - [{'b':3, 'c':4}], - + [{"a": 5, "b": 10}, {"a": 3, "b": 7}, {"a": -3, "b": 11}], + [{"a": None, "b": 1}, {"a": None, "b": 0}, {"a": -3, "b": None}], + [{"a": 1, "b": 2}], + [{"b": 3, "c": 4}], ], ) def test_struct_field_zero(data): From d30afbd87709ab715c2fafa522c2cad7e96f65c3 Mon Sep 17 00:00:00 2001 From: sft-managed Date: Thu, 29 Jul 2021 16:35:37 +0000 Subject: [PATCH 16/21] Implemented some suggestions from Rick Zamora; thank you, Rickgit status --- python/dask_cudf/dask_cudf/accessors.py | 14 ++++------- .../dask_cudf/tests/test_accessor.py | 23 ++++++++++++------- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/python/dask_cudf/dask_cudf/accessors.py 
b/python/dask_cudf/dask_cudf/accessors.py index a8416789bce..828529aee4a 100644 --- a/python/dask_cudf/dask_cudf/accessors.py +++ b/python/dask_cudf/dask_cudf/accessors.py @@ -35,20 +35,14 @@ def field(self, key): except KeyError as e: if isinstance(key, int): - key_list = [ - dict_key - for dict_key in self.d_series._meta.dtype.fields.keys() + typ = self.d_series._meta.dtype.fields[ + list(self.d_series._meta.dtype.fields)[key] ] - typ_key = key_list[key] - typ = self.d_series._meta.dtype.fields[typ_key] else: - print( - 'Field "' - + str(key) - + '" is not found in the set of existing keys.' + raise e( + f"Field '{key}' is not found in the set of existing keys" ) - raise e return self.d_series.map_partitions( lambda s: s.struct.field(key), diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py index bec10ae9305..4c97ef5bba9 100644 --- a/python/dask_cudf/dask_cudf/tests/test_accessor.py +++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py @@ -440,6 +440,11 @@ def test_sorting(data, ascending, na_position, ignore_index): assert_eq(expect, got) +############################################################################# +# Struct Accessor # +############################################################################# + + @pytest.mark.parametrize( "data", [ @@ -463,10 +468,11 @@ def test_create_struct_series(data): [{"a": 1, "b": 2}], ], ) -def test_struct_field_a(data): - expect = Series(data).struct.field("a") - ds_got = dgd.from_cudf(Series(data), 2).struct.field("a") - assert_eq(expect, ds_got.compute()) +def test_struct_field_str(data): + for test_key in ["a", "b"]: + expect = Series(data).struct.field(test_key) + ds_got = dgd.from_cudf(Series(data), 2).struct.field(test_key) + assert_eq(expect, ds_got.compute()) @pytest.mark.parametrize( @@ -478,7 +484,8 @@ def test_struct_field_a(data): [{"b": 3, "c": 4}], ], ) -def test_struct_field_zero(data): - expect = Series(data).struct.field(0) - ds_got = 
dgd.from_cudf(Series(data), 2).struct.field(0) - assert_eq(expect, ds_got.compute()) +def test_struct_field_integer(data): + for test_key in [0, 1]: + expect = Series(data).struct.field(test_key) + ds_got = dgd.from_cudf(Series(data), 2).struct.field(test_key) + assert_eq(expect, ds_got.compute()) From 99a18d5233cf909ef5168a1dbd791b05361a5deb Mon Sep 17 00:00:00 2001 From: sft-managed Date: Mon, 9 Aug 2021 17:53:32 +0000 Subject: [PATCH 17/21] Added better error handling and tests --- python/dask_cudf/dask_cudf/accessors.py | 25 +++++++++----- .../dask_cudf/tests/test_accessor.py | 34 +++++++++++++++++++ 2 files changed, 50 insertions(+), 9 deletions(-) diff --git a/python/dask_cudf/dask_cudf/accessors.py b/python/dask_cudf/dask_cudf/accessors.py index 828529aee4a..f81b8140504 100644 --- a/python/dask_cudf/dask_cudf/accessors.py +++ b/python/dask_cudf/dask_cudf/accessors.py @@ -1,6 +1,9 @@ # Copyright (c) 2021, NVIDIA CORPORATION. +# from cudf.python.dask_cudf.dask_cudf.core import Index + + class StructMethods: def __init__(self, d_series): self.d_series = d_series @@ -33,15 +36,19 @@ def field(self, key): try: typ = self.d_series._meta.dtype.fields[key] - except KeyError as e: - if isinstance(key, int): - typ = self.d_series._meta.dtype.fields[ - list(self.d_series._meta.dtype.fields)[key] - ] - - else: - raise e( - f"Field '{key}' is not found in the set of existing keys" + except KeyError: + try: + if isinstance(key, int): + typ = self.d_series._meta.dtype.fields[ + list(self.d_series._meta.dtype.fields)[key] + ] + else: + raise KeyError( + f"Field '{key}' is not in the set of existing keys" + ) + except TypeError: + raise IndexError( + f"Index '{key}' is greater than the number of fields" ) return self.d_series.map_partitions( diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py index 4c97ef5bba9..b140dbe2724 100644 --- a/python/dask_cudf/dask_cudf/tests/test_accessor.py +++ 
b/python/dask_cudf/dask_cudf/tests/test_accessor.py @@ -489,3 +489,37 @@ def test_struct_field_integer(data): expect = Series(data).struct.field(test_key) ds_got = dgd.from_cudf(Series(data), 2).struct.field(test_key) assert_eq(expect, ds_got.compute()) + + +@pytest.mark.parametrize( + "data", + [ + [{"a": 5, "b": 10}, {"a": 3, "b": 7}, {"a": -3, "b": 11}], + [{"a": None, "b": 1}, {"a": None, "b": 0}, {"a": -3, "b": None}], + [{"a": 1, "b": 2}], + [{"b": 3, "c": 4}], + ], +) +def test_dask_struct_field_Key_Error(data): + got = dgd.from_cudf(Series(data), 2) + + # import pdb; pdb.set_trace() + with pytest.raises(KeyError): + got.struct.field("notakey").compute() + + +@pytest.mark.parametrize( + "data", + [ + [{"a": 5, "b": 10}, {"a": 3, "b": 7}, {"a": -3, "b": 11}], + [{"a": None, "b": 1}, {"a": None, "b": 0}, {"a": -3, "b": None}], + [{"a": 1, "b": 2}], + [{"b": 3, "c": 4}], + ], +) +def test_dask_struct_field_Int_Error(data): + # breakpoint() + got = dgd.from_cudf(Series(data), 2) + + with pytest.raises(IndexError): + got.struct.field(1000).compute() From 1a8270fb4e1fb5606d82495eda9c7cbd0777120a Mon Sep 17 00:00:00 2001 From: sft-managed Date: Fri, 13 Aug 2021 17:40:16 +0000 Subject: [PATCH 18/21] Consolidated try-except blocks into single block, and implemented appropriate error handling --- python/dask_cudf/dask_cudf/accessors.py | 26 ++++++++++++------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/python/dask_cudf/dask_cudf/accessors.py b/python/dask_cudf/dask_cudf/accessors.py index f81b8140504..7c93babe745 100644 --- a/python/dask_cudf/dask_cudf/accessors.py +++ b/python/dask_cudf/dask_cudf/accessors.py @@ -34,22 +34,20 @@ def field(self, key): dtype: int64 """ try: - typ = self.d_series._meta.dtype.fields[key] + + if isinstance(key, int): + typ = self.d_series._meta.dtype.fields[ + list(self.d_series._meta.dtype.fields)[key] + ] + else: + typ = self.d_series._meta.dtype.fields[key] except KeyError: - try: - if isinstance(key, int): - 
typ = self.d_series._meta.dtype.fields[ - list(self.d_series._meta.dtype.fields)[key] - ] - else: - raise KeyError( - f"Field '{key}' is not in the set of existing keys" - ) - except TypeError: - raise IndexError( - f"Index '{key}' is greater than the number of fields" - ) + raise KeyError(f"Field '{key}' is not in the set of existing keys") + except TypeError: + raise IndexError( + f"Index '{key}' is greater than the number of fields" + ) return self.d_series.map_partitions( lambda s: s.struct.field(key), From bb52b2ccf3f3ae57697701eed25f59570b872fc5 Mon Sep 17 00:00:00 2001 From: sft-managed Date: Wed, 18 Aug 2021 16:22:30 +0000 Subject: [PATCH 19/21] Allow errors from invalid field keys to be thrown from cudf, instead of raising them at the dask-cudf layer --- python/dask_cudf/dask_cudf/accessors.py | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/python/dask_cudf/dask_cudf/accessors.py b/python/dask_cudf/dask_cudf/accessors.py index 7c93babe745..c0e8afe5844 100644 --- a/python/dask_cudf/dask_cudf/accessors.py +++ b/python/dask_cudf/dask_cudf/accessors.py @@ -33,21 +33,7 @@ def field(self, key): 1 3 dtype: int64 """ - try: - - if isinstance(key, int): - typ = self.d_series._meta.dtype.fields[ - list(self.d_series._meta.dtype.fields)[key] - ] - else: - typ = self.d_series._meta.dtype.fields[key] - - except KeyError: - raise KeyError(f"Field '{key}' is not in the set of existing keys") - except TypeError: - raise IndexError( - f"Index '{key}' is greater than the number of fields" - ) + typ = self.d_series._meta.struct.field(key).dtype return self.d_series.map_partitions( lambda s: s.struct.field(key), From c9af377ebb781973ad7266cd24574be10c97f2ca Mon Sep 17 00:00:00 2001 From: sft-managed Date: Wed, 18 Aug 2021 18:11:30 +0000 Subject: [PATCH 20/21] Cleaned up tests through removal of repeat definitions of struct data used for testing --- .../dask_cudf/tests/test_accessor.py | 45 +++++-------------- 1 file changed, 11 
insertions(+), 34 deletions(-) diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py index b140dbe2724..8227023aa51 100644 --- a/python/dask_cudf/dask_cudf/tests/test_accessor.py +++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py @@ -443,16 +443,16 @@ def test_sorting(data, ascending, na_position, ignore_index): ############################################################################# # Struct Accessor # ############################################################################# +struct_accessor_data_params = [ + [{"a": 5, "b": 10}, {"a": 3, "b": 7}, {"a": -3, "b": 11}], + [{"a": None, "b": 1}, {"a": None, "b": 0}, {"a": -3, "b": None}], + [{"a": 1, "b": 2}], + [{"a": 1, "b": 3, "c": 4}], +] @pytest.mark.parametrize( - "data", - [ - [{"a": 5, "b": 10}, {"a": 3, "b": 7}, {"a": -3, "b": 11}], - [{"a": None, "b": 1}, {"a": None, "b": 0}, {"a": -3, "b": None}], - [{"a": 1, "b": 2}], - [{"b": 3, "c": 4}], - ], + "data", struct_accessor_data_params, ) def test_create_struct_series(data): expect = pd.Series(data) @@ -461,12 +461,7 @@ def test_create_struct_series(data): @pytest.mark.parametrize( - "data", - [ - [{"a": 5, "b": 10}, {"a": 3, "b": 7}, {"a": -3, "b": 11}], - [{"a": None, "b": 1}, {"a": None, "b": 0}, {"a": -3, "b": None}], - [{"a": 1, "b": 2}], - ], + "data", struct_accessor_data_params, ) def test_struct_field_str(data): for test_key in ["a", "b"]: @@ -476,13 +471,7 @@ def test_struct_field_str(data): @pytest.mark.parametrize( - "data", - [ - [{"a": 5, "b": 10}, {"a": 3, "b": 7}, {"a": -3, "b": 11}], - [{"a": None, "b": 1}, {"a": None, "b": 0}, {"a": -3, "b": None}], - [{"a": 1, "b": 2}], - [{"b": 3, "c": 4}], - ], + "data", struct_accessor_data_params, ) def test_struct_field_integer(data): for test_key in [0, 1]: @@ -492,13 +481,7 @@ def test_struct_field_integer(data): @pytest.mark.parametrize( - "data", - [ - [{"a": 5, "b": 10}, {"a": 3, "b": 7}, {"a": -3, "b": 11}], - [{"a": None, 
"b": 1}, {"a": None, "b": 0}, {"a": -3, "b": None}], - [{"a": 1, "b": 2}], - [{"b": 3, "c": 4}], - ], + "data", struct_accessor_data_params, ) def test_dask_struct_field_Key_Error(data): got = dgd.from_cudf(Series(data), 2) @@ -509,13 +492,7 @@ def test_dask_struct_field_Key_Error(data): @pytest.mark.parametrize( - "data", - [ - [{"a": 5, "b": 10}, {"a": 3, "b": 7}, {"a": -3, "b": 11}], - [{"a": None, "b": 1}, {"a": None, "b": 0}, {"a": -3, "b": None}], - [{"a": 1, "b": 2}], - [{"b": 3, "c": 4}], - ], + "data", struct_accessor_data_params, ) def test_dask_struct_field_Int_Error(data): # breakpoint() From b552833253556aa6213af41aa057b6a2e7f28107 Mon Sep 17 00:00:00 2001 From: sft-managed Date: Wed, 18 Aug 2021 18:18:44 +0000 Subject: [PATCH 21/21] Removal of commented-out import statement --- python/dask_cudf/dask_cudf/accessors.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/python/dask_cudf/dask_cudf/accessors.py b/python/dask_cudf/dask_cudf/accessors.py index c0e8afe5844..77973ee34ff 100644 --- a/python/dask_cudf/dask_cudf/accessors.py +++ b/python/dask_cudf/dask_cudf/accessors.py @@ -1,9 +1,6 @@ # Copyright (c) 2021, NVIDIA CORPORATION. -# from cudf.python.dask_cudf.dask_cudf.core import Index - - class StructMethods: def __init__(self, d_series): self.d_series = d_series