Skip to content

Commit

Permalink
[DataPipe] Make GroupBy serializable with lambda function (#71497)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: pytorch/pytorch#71497

Related to pytorch/data#172

cc VitalyFedyunin ejguan NivekT

Test Plan: Imported from OSS

Reviewed By: ejguan

Differential Revision: D33668749

Pulled By: NivekT

fbshipit-source-id: 6506614e9d4389dc645d8985c00fdb3402122d9b
  • Loading branch information
NivekT authored and facebook-github-bot committed Jan 21, 2022
1 parent d5f6fde commit 458e76f
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 1 deletion.
2 changes: 1 addition & 1 deletion test/test_datapipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -468,7 +468,7 @@ def test_serializable_with_dill(self):
(dp.iter.Collator, (lambda x: x,), {}),
(dp.iter.Demultiplexer, (2, lambda x: x % 2,), {}),
(dp.iter.Filter, (lambda x: x >= 5,), {}),
# (dp.iter.Grouper, (lambda x: x >= 5,), {}), # TODO: Need custom __getstate__ for Grouper
(dp.iter.Grouper, (lambda x: x >= 5,), {}),
(dp.iter.Mapper, (lambda x: x, ), {}),
]
if HAS_DILL:
Expand Down
39 changes: 39 additions & 0 deletions torch/utils/data/datapipes/iter/grouping.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
from collections import defaultdict

from torch.utils.data import IterDataPipe, functional_datapipe, DataChunk
from torch.utils.data.datapipes.utils.common import DILL_AVAILABLE, check_lambda_fn
from typing import Any, Callable, DefaultDict, Iterator, List, Optional, Sized, TypeVar

if DILL_AVAILABLE:
import dill
dill.extend(use_dill=False)

T_co = TypeVar('T_co', covariant=True)


Expand Down Expand Up @@ -157,6 +162,7 @@ def __init__(self,
group_size: Optional[int] = None,
guaranteed_group_size: Optional[int] = None,
drop_remaining: bool = False):
check_lambda_fn(group_key_fn)
self.datapipe = datapipe
self.group_key_fn = group_key_fn
self.buffer_size = buffer_size
Expand Down Expand Up @@ -214,3 +220,36 @@ def __iter__(self):
res = buffer_elements.pop(key)
buffer_size -= len(res)
yield self.wrapper_class(res)

def __getstate__(self):
    """Build the state tuple used to pickle this datapipe.

    If ``IterDataPipe.getstate_hook`` is set, whatever it returns for this
    instance is used as the state. Otherwise the state is a 6-tuple holding
    the source datapipe, the grouping key function and the four grouping
    settings, in the same order ``__setstate__`` unpacks them.

    When dill is available the key function is stored as ``dill.dumps``
    bytes; otherwise the callable itself is placed in the tuple.
    """
    hook = IterDataPipe.getstate_hook
    if hook is not None:
        return hook(self)

    # Pre-serialize the key function with dill when possible; without dill
    # the callable is handed unchanged to whichever pickler is running.
    key_fn = self.group_key_fn
    if DILL_AVAILABLE:
        key_fn = dill.dumps(key_fn)

    return (
        self.datapipe,
        key_fn,
        self.buffer_size,
        self.group_size,
        self.guaranteed_group_size,
        self.drop_remaining,
    )

def __setstate__(self, state):
    """Restore this datapipe from the tuple produced by ``__getstate__``.

    Args:
        state: 6-tuple ``(datapipe, group_key_fn, buffer_size, group_size,
            guaranteed_group_size, drop_remaining)``. The ``group_key_fn``
            slot holds either ``dill.dumps`` bytes or the callable itself,
            depending on whether dill was available when the state was
            created.

    ``wrapper_class`` is not carried in the state; it is reset to
    ``DataChunk`` here.
    """
    (
        self.datapipe,
        dill_function,
        self.buffer_size,
        self.group_size,
        self.guaranteed_group_size,
        self.drop_remaining,
    ) = state
    # Branch on what the payload actually is, not only on the local
    # DILL_AVAILABLE flag: a state created where dill was missing carries the
    # plain callable, and feeding that to dill.loads would raise.
    if DILL_AVAILABLE and isinstance(dill_function, bytes):
        self.group_key_fn = dill.loads(dill_function)  # type: ignore[assignment]
    else:
        self.group_key_fn = dill_function  # type: ignore[assignment]
    self.wrapper_class = DataChunk

0 comments on commit 458e76f

Please sign in to comment.