[PrototypeRS] Adding 'pause' and 'resume' operations to halt DataPipes #879

Closed · wants to merge 45 commits

Commits (45, all by NivekT):
d22790b  Nov 3, 2022   [PrototypeRS] Adding 'full stop' to halt DataPipes
9efb6b3  Nov 7, 2022   Update on "[PrototypeRS] Adding 'full stop' to halt DataPipes"
50bea6c  Nov 10, 2022  Update on "[PrototypeRS] Adding 'full stop' to halt DataPipes"
a853244  Nov 11, 2022  Update on "[PrototypeRS] Adding 'full stop' to halt DataPipes"
c88fce0  Nov 14, 2022  Update on "[PrototypeRS] Adding 'pause' and 'resume' operations to ha…
856e8e7  Nov 16, 2022  Update on "[PrototypeRS] Adding 'pause' and 'resume' operations to ha…
261fb9d  Nov 16, 2022  Update on "[PrototypeRS] Adding 'pause' and 'resume' operations to ha…
a78d5d6  Nov 16, 2022  Update on "[PrototypeRS] Adding 'pause' and 'resume' operations to ha…
26c3055  Nov 17, 2022  Update on "[PrototypeRS] Adding 'pause' and 'resume' operations to ha…
e44c4a9  Dec 3, 2022   Update on "[PrototypeRS] Adding 'pause' and 'resume' operations to ha…
b7eab9d  Dec 3, 2022   Update on "[PrototypeRS] Adding 'pause' and 'resume' operations to ha…
2aae34f  Dec 5, 2022   Update on "[PrototypeRS] Adding 'pause' and 'resume' operations to ha…
9faa78d  Dec 5, 2022   Update on "[PrototypeRS] Adding 'pause' and 'resume' operations to ha…
bd7dc6e  Dec 5, 2022   Update on "[PrototypeRS] Adding 'pause' and 'resume' operations to ha…
250dec3  Dec 5, 2022   Update on "[PrototypeRS] Adding 'pause' and 'resume' operations to ha…
ba66c1c  Dec 5, 2022   Update on "[PrototypeRS] Adding 'pause' and 'resume' operations to ha…
d5a6fa9  Dec 5, 2022   Update on "[PrototypeRS] Adding 'pause' and 'resume' operations to ha…
3471a34  Dec 5, 2022   Update on "[PrototypeRS] Adding 'pause' and 'resume' operations to ha…
865ee90  Dec 5, 2022   Update on "[PrototypeRS] Adding 'pause' and 'resume' operations to ha…
ab24eb9  Dec 5, 2022   Update on "[PrototypeRS] Adding 'pause' and 'resume' operations to ha…
c0c7dec  Dec 5, 2022   Update on "[PrototypeRS] Adding 'pause' and 'resume' operations to ha…
c483e96  Dec 5, 2022   Update on "[PrototypeRS] Adding 'pause' and 'resume' operations to ha…
031af83  Dec 7, 2022   Update on "[PrototypeRS] Adding 'pause' and 'resume' operations to ha…
ac017f2  Dec 7, 2022   Update on "[PrototypeRS] Adding 'pause' and 'resume' operations to ha…
eb6f3fb  Dec 7, 2022   Update on "[PrototypeRS] Adding 'pause' and 'resume' operations to ha…
80c65e0  Dec 14, 2022  Update on "[PrototypeRS] Adding 'pause' and 'resume' operations to ha…
ea7952f  Dec 15, 2022  Update on "[PrototypeRS] Adding 'pause' and 'resume' operations to ha…
b960db3  Dec 17, 2022  Update on "[PrototypeRS] Adding 'pause' and 'resume' operations to ha…
08db7ea  Dec 18, 2022  Update on "[PrototypeRS] Adding 'pause' and 'resume' operations to ha…
cb6f017  Jan 3, 2023   Update on "[PrototypeRS] Adding 'pause' and 'resume' operations to ha…
b70b3a7  Jan 13, 2023  Update on "[PrototypeRS] Adding 'pause' and 'resume' operations to ha…
c57c4c4  Jan 18, 2023  Update on "[PrototypeRS] Adding 'pause' and 'resume' operations to ha…
314cc4c  Jan 18, 2023  Update on "[PrototypeRS] Adding 'pause' and 'resume' operations to ha…
d209b6a  Jan 18, 2023  Update on "[PrototypeRS] Adding 'pause' and 'resume' operations to ha…
84d5346  Jan 24, 2023  Update on "[PrototypeRS] Adding 'pause' and 'resume' operations to ha…
29a4fd0  Feb 2, 2023   Update on "[PrototypeRS] Adding 'pause' and 'resume' operations to ha…
a1ecb98  Feb 6, 2023   Update on "[PrototypeRS] Adding 'pause' and 'resume' operations to ha…
dc72f20  Feb 7, 2023   Update on "[PrototypeRS] Adding 'pause' and 'resume' operations to ha…
76219c7  Feb 8, 2023   Update on "[PrototypeRS] Adding 'pause' and 'resume' operations to ha…
e5b2544  Feb 8, 2023   Update on "[PrototypeRS] Adding 'pause' and 'resume' operations to ha…
caeb103  Feb 8, 2023   Update on "[PrototypeRS] Adding 'pause' and 'resume' operations to ha…
491cb41  Feb 8, 2023   Update on "[PrototypeRS] Adding 'pause' and 'resume' operations to ha…
00cbd0e  Feb 9, 2023   Update on "[PrototypeRS] Adding 'pause' and 'resume' operations to ha…
7506f01  Feb 9, 2023   Update on "[PrototypeRS] Adding 'pause' and 'resume' operations to ha…
01bb5c7  Feb 10, 2023  Update on "[PrototypeRS] Adding 'pause' and 'resume' operations to ha…
19 changes: 19 additions & 0 deletions torchdata/dataloader2/communication/iter.py
@@ -9,6 +9,7 @@

from torch.utils.data import IterDataPipe
from torchdata.dataloader2 import communication
from torchdata.dataloader2.graph import traverse_dps

DEFAULT_NON_BLOCKING_SLEEP = 0.001

@@ -137,12 +138,30 @@ def DataPipeBehindQueues(source_datapipe, protocol, blocking_request_get=False,
            source_datapipe.reset_iterator()
            protocol.response_reset_iterator()

        elif isinstance(request, communication.messages.FullStopRequest):
            graph = traverse_dps(source_datapipe)
            for dp, _ in graph.values():
                if hasattr(dp, "full_stop") and callable(dp.full_stop):
                    dp.full_stop()
            protocol.response_full_stop()
NivekT (Contributor, Author) commented:
Is there any potential issue with traversing through the graph and calling .full_stop() on all of them?


        elif isinstance(request, communication.messages.ResumeRequest):
            graph = traverse_dps(source_datapipe)
            for dp, _ in graph.values():
                if hasattr(dp, "resume") and callable(dp.resume):
                    dp.resume()
            protocol.response_resume()

        elif isinstance(request, communication.messages.TerminateRequest):
            forever = False
            protocol.response_terminate()

        elif isinstance(request, communication.messages.GetNextRequest):
            while forever:
                if protocol._full_stop:
                    raise RuntimeError(
                        "Cannot `GetNext` after `FullStop` has been called. "
                        "`Resume` must be called first."
                    )
                try:
                    value = source_datapipe.nonblocking_next()
                except NotAvailable:
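For reference, the handler above dispatches by duck typing: it walks every DataPipe that `traverse_dps` returns and invokes `full_stop`/`resume` only where the method exists. A minimal, self-contained sketch of that pattern (the `_Prefetcher`, `_Mapper`, and `_traverse` names below are illustrative stand-ins, not torchdata classes):

class _Prefetcher:
    def __init__(self, source=None):
        self.source = source
        self.running = True

    def full_stop(self):
        # A real prefetcher would also stop and join its worker thread.
        self.running = False

    def resume(self):
        self.running = True


class _Mapper:
    def __init__(self, source=None):
        self.source = source  # defines neither full_stop nor resume


def _traverse(pipe):
    # Stand-in for traverse_dps: yield every pipe reachable from `pipe`.
    while pipe is not None:
        yield pipe
        pipe = getattr(pipe, "source", None)


def _broadcast(pipe, op):
    # Same guard as the request handler: call `op` only where it is defined.
    for dp in _traverse(pipe):
        fn = getattr(dp, op, None)
        if callable(fn):
            fn()


chain = _Prefetcher(_Mapper(_Prefetcher()))
_broadcast(chain, "full_stop")   # both _Prefetchers stop; _Mapper is skipped
_broadcast(chain, "resume")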
16 changes: 16 additions & 0 deletions torchdata/dataloader2/communication/messages.py
@@ -36,6 +36,22 @@ class ResetEpochResponse(Response):
    pass


class FullStopRequest(Request):
    pass


class FullStopResponse(Response):
    pass


class ResumeRequest(Request):
    pass


class ResumeResponse(Response):
    pass


class TerminateRequest(Request):
    pass
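These message classes carry no payload; they exist so the two sides of a queue pair can dispatch on type, as the `isinstance` chain in `DataPipeBehindQueues` does. A self-contained sketch of one round trip, using a plain `queue.Queue` in a single process rather than the multiprocessing queues the real protocol runs on:

import queue

class Request: ...
class Response: ...
class FullStopRequest(Request): ...
class FullStopResponse(Response): ...
class ResumeRequest(Request): ...
class ResumeResponse(Response): ...

req_q: queue.Queue = queue.Queue()
res_q: queue.Queue = queue.Queue()

def serve_one():
    # Branch on the request's type and reply with the matching response.
    request = req_q.get()
    if isinstance(request, FullStopRequest):
        res_q.put(FullStopResponse())
    elif isinstance(request, ResumeRequest):
        res_q.put(ResumeResponse())

req_q.put(FullStopRequest())
serve_one()
assert isinstance(res_q.get(), FullStopResponse)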
32 changes: 31 additions & 1 deletion torchdata/dataloader2/communication/protocol.py
@@ -42,26 +42,42 @@ def request_sent(self, request=True):

    def request_served(self, result=None):
        if not self.waiting_for_response():
-            raise Exception("Expected no peding requests, but something got served", result)
+            raise Exception("Expected no pending requests, but something got served", result)
        self._req_sent = None

    def discard_existing_request(self):
        if self.waiting_for_response():
            response = self.response_queue.get(block=True)
            self.request_served(response)

    def request_full_stop(self):
        if not self.can_take_request():
            raise Exception("Can not full stop while we are still waiting for a response to the previous request")
        request = communication.messages.FullStopRequest()
        self.request_queue.put(request)
        self.request_sent(request)

    def request_resume(self):
        if not self.can_take_request():
            raise Exception("Can not resume while we are still waiting for a response to the previous request")
        request = communication.messages.ResumeRequest()
        self.request_queue.put(request)
        self.request_sent(request)


class ProtocolServer(Protocol):
    """
    ProtocolServer takes charge of getting requests from req_queue and fetching data from source datapipe.
    """

    _req_received = None
    _full_stop = False  # When `True`, prevents `GetNext` in `DataPipeBehindQueues`.

    def __init__(self, request_queue, response_queue):
        self.request_queue = request_queue
        self.response_queue = response_queue
        self._req_received = None
        self._full_stop = False

    def have_pending_request(self):
        return self._req_received is not None
@@ -93,6 +109,20 @@ def response_reset_epoch(self):
        self.response_queue.put(communication.messages.ResetEpochResponse())
        self._req_received = None

    def response_full_stop(self):
        if not self.have_pending_request():
            raise Exception("Attempting to reply without a pending request")
        self._full_stop = True
        self.response_queue.put(communication.messages.FullStopResponse())
        self._req_received = None

    def response_resume(self):
        if not self.have_pending_request():
            raise Exception("Attempting to reply without a pending request")
        self._full_stop = False
        self.response_queue.put(communication.messages.ResumeResponse())
        self._req_received = None


class MapDataPipeQueueProtocolServer(ProtocolServer):
    def response_item(self, key, value):
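Two invariants are worth calling out here: the client allows at most one outstanding request at a time (`can_take_request`), and the server's `_full_stop` flag is what later blocks `GetNext`. A minimal sketch of the client-side invariant, with `_Client` as an illustrative stand-in for the real protocol client:

class _Client:
    def __init__(self):
        self._req_sent = None

    def can_take_request(self):
        return self._req_sent is None

    def request(self, name):
        if not self.can_take_request():
            raise Exception("Cannot send a new request while awaiting a response")
        self._req_sent = name  # the real client also puts a message on request_queue

    def response_received(self):
        self._req_sent = None  # the real client drains response_queue first


c = _Client()
c.request("full_stop")
try:
    c.request("resume")   # rejected: full_stop has not been acknowledged yet
except Exception:
    pass
c.response_received()     # FullStopResponse arrives
c.request("resume")       # now allowed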
42 changes: 39 additions & 3 deletions torchdata/dataloader2/reading_service.py
@@ -120,11 +120,12 @@ class _IterateQueueDataPipes(IterDataPipe):

    def __init__(self, datapipes):
        # TODO(VitalyFedyunin): Consider combining _IterateQueueDataPipes and QueueWrapper
        # into one class, which supports any number of queues.
-        self.datapipes = datapipes
-        for dp in self.datapipes:
+        for dp in datapipes:
            if not isinstance(dp, communication.iter.QueueWrapper):
                raise Exception("Source datapipes should be an instance of iter.QueueWrapper")
+        self.datapipes = datapipes
+        self.res_buffers = [[] for _ in range(len(datapipes))]

    def __iter__(self):
        total_pipes = len(self.datapipes)
@@ -137,6 +138,9 @@ def __iter__(self):
        while cnt_disabled_pipes < total_pipes:
            for idx in range(total_pipes):
                if not disabled_pipe[idx]:
                    # Check if buffer of the DataPipe is empty before requesting next
                    while len(self.res_buffers[idx]):
                        yield self.res_buffers[idx].pop(0)  # pop(0) keeps FIFO order
                    response = self.datapipes[idx].protocol.get_response_next(block=True)
                    if isinstance(response, communication.messages.StopIterationResponse):
                        disabled_pipe[idx] = True
@@ -164,6 +168,18 @@ def reset_epoch(self, *args):
        for dp in self.datapipes:
            dp.protocol.request_reset_epoch(*args)

    def request_full_stop(self):
        # Store results of pending requests
        for idx, dp in enumerate(self.datapipes):
            res = dp.protocol.get_response_next(block=True)
            self.res_buffers[idx].append(res)
        for dp in self.datapipes:
            dp.protocol.request_full_stop()

    def request_resume(self):
        for dp in self.datapipes:
            dp.protocol.request_resume()


class PrototypeMultiProcessingReadingService(ReadingServiceInterface):
    r"""
@@ -347,6 +363,26 @@ def clean_me(process, req_queue, res_queue):
            dist.destroy_process_group(self._pg)
            self._pg = None

    def full_stop(self):
        """
        Fully stop the DataPipes' activities, such as prefetching, in order to collect state.
        """
        if self.prefetch_mainloop > 0:
            # Stop prefetching first
            self.end_datapipe.full_stop()  # type: ignore[union-attr]
            end_datapipe: DataPipe = self.end_datapipe.source_datapipe
        else:
            end_datapipe = self.end_datapipe
        end_datapipe.request_full_stop()

    def resume(self):
        if self.prefetch_mainloop > 0:
            self.end_datapipe.resume()  # type: ignore[union-attr]
            end_datapipe: DataPipe = self.end_datapipe.source_datapipe
        else:
            end_datapipe = self.end_datapipe
        end_datapipe.request_resume()


class MultiProcessingReadingService(ReadingServiceInterface):
    r"""
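A hedged usage sketch of the two new ReadingService methods, e.g. to collect state mid-epoch. The pipeline and constructor arguments below are assumptions based on the surrounding code and the torchdata API of this era, not an exact reference:

from torchdata.dataloader2 import DataLoader2, PrototypeMultiProcessingReadingService
from torchdata.datapipes.iter import IterableWrapper

dp = IterableWrapper(range(1000)).shuffle().sharding_filter()
rs = PrototypeMultiProcessingReadingService(num_workers=2, prefetch_mainloop=10)
dl = DataLoader2(dp, reading_service=rs)

it = iter(dl)
first_half = [next(it) for _ in range(100)]

rs.full_stop()   # halt main-loop prefetching first, then ask each worker to stop
# ... nothing is being fetched now; safe to collect or snapshot state ...
rs.resume()      # restart the main-loop prefetcher, then ask the workers to resume

second_half = [next(it) for _ in range(100)]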
8 changes: 8 additions & 0 deletions torchdata/datapipes/iter/util/prefetch.py
@@ -223,3 +223,11 @@ def __setstate__(self, state):
        self._error = None
        self._sync_counter = torch.tensor([0], dtype=torch.int32)
        self._done_callback = False

    def full_stop(self):
        if self._executor is not None:
            self._executor.shutdown()
            self._executor = None

    def resume(self):
        self._executor = _PrefetchExecutor(iter(self.datapipe), 1, self._callback_fn, self.timeout)
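The fullsync change follows a shutdown-and-recreate pattern: pausing tears the executor down, and resuming builds a fresh one (note that `resume` above rebuilds it from a fresh `iter(self.datapipe)`). A generic sketch of the pattern, with `ThreadPoolExecutor` standing in for `_PrefetchExecutor`:

from concurrent.futures import ThreadPoolExecutor

class _PausableWorker:
    def __init__(self):
        self._executor = ThreadPoolExecutor(max_workers=1)

    def full_stop(self):
        if self._executor is not None:
            self._executor.shutdown()  # waits for in-flight work, then frees the thread
            self._executor = None

    def resume(self):
        if self._executor is None:
            self._executor = ThreadPoolExecutor(max_workers=1)


w = _PausableWorker()
w._executor.submit(print, "prefetching")
w.full_stop()   # no executor, so no background work can be scheduled
w.resume()
w._executor.submit(print, "prefetching again")
w.full_stop()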
21 changes: 18 additions & 3 deletions torchdata/datapipes/iter/util/prefetcher.py
@@ -32,12 +32,12 @@ class PrefetcherIterDataPipe(IterDataPipe):
"""
Prefetches elements from the source DataPipe and puts them into a buffer (functional name: ``prefetch``).
Prefetching performs the operations (e.g. I/O, computations) of the DataPipes up to this one ahead of time
and stores the result in the buffer, ready to be consume by the subsequent DataPipe. It has no effect aside
and stores the result in the buffer, ready to be consumed by the subsequent DataPipe. It has no effect aside
from getting the sample ready ahead of time.

This is used by ``PrototypeMultiProcessingReadingService`` when the arguments
``prefetch_worker`` (for prefetching at each worker process) or
``prefetch_mainloop`` (for prefetching at the moain loop) are greater than 0.
``prefetch_mainloop`` (for prefetching at the main loop) are greater than 0.

Beyond the built-in use cases, this can be useful to put after I/O DataPipes that have
expensive I/O operations (e.g. takes a long time to request a file from a remote server).
Expand Down Expand Up @@ -104,7 +104,7 @@ def __iter__(self):

    def __getstate__(self):
        """
-        Getting state in threading enviroment requires next operations:
+        Getting state in a threading environment requires the following operations:
        1) Stopping of the producer thread.
        2) Saving buffer.
        3) Adding lazy restart of producer thread when __next__ is called again
@@ -123,3 +123,18 @@ def reset(self):
        if self.thread is not None:
            self.prefetch_data.run_prefetcher = False
            self.thread.join()
            self.thread = None

    def full_stop(self):
        if self.thread is not None:
            # Note: the content of the buffer still exists in `prefetch_data.prefetch_buffer`
            self.prefetch_data.run_prefetcher = False
            self.thread.join()
            self.thread = None

    def resume(self):
        self.thread = threading.Thread(
            target=PrefetcherIterDataPipe.thread_worker, args=(self.prefetch_data,), daemon=True
        )
        self.prefetch_data.run_prefetcher = True
        self.thread.start()
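The prefetcher pauses by clearing a flag that its worker loop checks and then joining the thread; the buffered items live in `prefetch_data`, so resuming simply starts a new thread over the same state. A standalone sketch of that idiom (names mirror the diff, but the worker body is illustrative):

import threading
import time
from collections import deque

class _PrefetchData:
    # Shared state that outlives the worker thread; the buffer is what
    # survives a full_stop/resume cycle.
    def __init__(self):
        self.run_prefetcher = True
        self.prefetch_buffer = deque()

def _thread_worker(data):
    i = 0
    while data.run_prefetcher:          # exits promptly once the flag is cleared
        data.prefetch_buffer.append(i)  # stand-in for pulling from a source pipe
        i += 1
        time.sleep(0.01)

data = _PrefetchData()
t = threading.Thread(target=_thread_worker, args=(data,), daemon=True)
t.start()
time.sleep(0.05)

data.run_prefetcher = False   # full_stop: clear the flag, then join
t.join()
buffered = len(data.prefetch_buffer)   # buffered items survive the pause

data.run_prefetcher = True    # resume: same shared state, brand-new thread
t = threading.Thread(target=_thread_worker, args=(data,), daemon=True)
t.start()
time.sleep(0.02)
data.run_prefetcher = False
t.join()
assert len(data.prefetch_buffer) >= buffered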