From e4d9132e2ae2e1b956f74a93bd56364f04e4eba4 Mon Sep 17 00:00:00 2001 From: Gabriel Moreira Date: Tue, 17 May 2022 15:40:40 -0300 Subject: [PATCH 1/2] Improved docstrings of GroupBy op to reinforce the required usage of dataset.shuffle_by_keys() before --- nvtabular/ops/groupby.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/nvtabular/ops/groupby.py b/nvtabular/ops/groupby.py index dc315b7c4b0..14614602a01 100644 --- a/nvtabular/ops/groupby.py +++ b/nvtabular/ops/groupby.py @@ -34,10 +34,13 @@ class Groupby(Operator): Example usage:: + groupby_cols = ['user_id', 'session_id'] + dataset = dataset.shuffle_by_keys(keys=groupby_cols) + groupby_features = [ 'user_id', 'session_id', 'month', 'prod_id', ] >> ops.Groupby( - groupby_cols=['user_id', 'session_id'], + groupby_cols=groupby_cols, sort_cols=['month'], aggs={ 'prod_id': 'list', @@ -46,10 +49,15 @@ class Groupby(Operator): ) processor = nvtabular.Workflow(groupby_features) + processor.fit(dataset) + dataset_transformed = processor.transform(dataset) + Parameters ----------- groupby_cols : str or list of str The column names to be used as groupby keys. + WARNING: Ensure the dataset was partitioned by those + groupby keys (see above an example). sort_cols : str or list of str Columns to be used to sort each partition before groupby aggregation is performed. If this argument From 3dbcf74005b9b20feba97db1ff97475afa807875 Mon Sep 17 00:00:00 2001 From: Karl Higley Date: Thu, 19 May 2022 21:27:50 -0400 Subject: [PATCH 2/2] Update nvtabular/ops/groupby.py Co-authored-by: Ben Frederickson --- nvtabular/ops/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nvtabular/ops/groupby.py b/nvtabular/ops/groupby.py index 14614602a01..b65a1800e98 100644 --- a/nvtabular/ops/groupby.py +++ b/nvtabular/ops/groupby.py @@ -57,7 +57,7 @@ class Groupby(Operator): groupby_cols : str or list of str The column names to be used as groupby keys. 
WARNING: Ensure the dataset was partitioned by those - groupby keys (see above an example). + groupby keys (see above for an example). sort_cols : str or list of str Columns to be used to sort each partition before groupby aggregation is performed. If this argument