From 458e76fcb1a60691a225f3f5e4a058a51490732d Mon Sep 17 00:00:00 2001
From: Kevin Tse
Date: Fri, 21 Jan 2022 08:00:23 -0800
Subject: [PATCH] [DataPipe] Make GroupBy serializable with lambda function (#71497)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/71497

Related to https://github.com/pytorch/data/issues/172

cc VitalyFedyunin ejguan NivekT

Test Plan: Imported from OSS

Reviewed By: ejguan

Differential Revision: D33668749

Pulled By: NivekT

fbshipit-source-id: 6506614e9d4389dc645d8985c00fdb3402122d9b
---
 test/test_datapipe.py                       |  2 +-
 torch/utils/data/datapipes/iter/grouping.py | 39 +++++++++++++++++++++
 2 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/test/test_datapipe.py b/test/test_datapipe.py
index 4153c1951ae..433e7383271 100644
--- a/test/test_datapipe.py
+++ b/test/test_datapipe.py
@@ -468,7 +468,7 @@ def test_serializable_with_dill(self):
             (dp.iter.Collator, (lambda x: x,), {}),
             (dp.iter.Demultiplexer, (2, lambda x: x % 2,), {}),
             (dp.iter.Filter, (lambda x: x >= 5,), {}),
-            # (dp.iter.Grouper, (lambda x: x >= 5,), {}),  # TODO: Need custom __getstate__ for Grouper
+            (dp.iter.Grouper, (lambda x: x >= 5,), {}),
             (dp.iter.Mapper, (lambda x: x, ), {}),
         ]
         if HAS_DILL:
diff --git a/torch/utils/data/datapipes/iter/grouping.py b/torch/utils/data/datapipes/iter/grouping.py
index 1707e3f2cc1..b9f6a9e77f0 100644
--- a/torch/utils/data/datapipes/iter/grouping.py
+++ b/torch/utils/data/datapipes/iter/grouping.py
@@ -1,8 +1,13 @@
 from collections import defaultdict
 
 from torch.utils.data import IterDataPipe, functional_datapipe, DataChunk
+from torch.utils.data.datapipes.utils.common import DILL_AVAILABLE, check_lambda_fn
 from typing import Any, Callable, DefaultDict, Iterator, List, Optional, Sized, TypeVar
 
+if DILL_AVAILABLE:
+    import dill
+    dill.extend(use_dill=False)
+
 T_co = TypeVar('T_co', covariant=True)
 
 
@@ -157,6 +162,7 @@ def __init__(self,
                  group_size: Optional[int] = None,
                  guaranteed_group_size: Optional[int] = None,
                  drop_remaining: bool = False):
+        check_lambda_fn(group_key_fn)
         self.datapipe = datapipe
         self.group_key_fn = group_key_fn
         self.buffer_size = buffer_size
@@ -214,3 +220,36 @@ def __iter__(self):
             res = buffer_elements.pop(key)
             buffer_size -= len(res)
             yield self.wrapper_class(res)
+
+    def __getstate__(self):
+        if IterDataPipe.getstate_hook is not None:
+            return IterDataPipe.getstate_hook(self)
+
+        if DILL_AVAILABLE:
+            dill_function = dill.dumps(self.group_key_fn)
+        else:
+            dill_function = self.group_key_fn
+        state = (
+            self.datapipe,
+            dill_function,
+            self.buffer_size,
+            self.group_size,
+            self.guaranteed_group_size,
+            self.drop_remaining,
+        )
+        return state
+
+    def __setstate__(self, state):
+        (
+            self.datapipe,
+            dill_function,
+            self.buffer_size,
+            self.group_size,
+            self.guaranteed_group_size,
+            self.drop_remaining,
+        ) = state
+        if DILL_AVAILABLE:
+            self.group_key_fn = dill.loads(dill_function)  # type: ignore[assignment]
+        else:
+            self.group_key_fn = dill_function  # type: ignore[assignment]
+        self.wrapper_class = DataChunk