From 5cae73c5cf728cece0a6d42949a52557a2347248 Mon Sep 17 00:00:00 2001
From: Deependu Jha
Date: Thu, 26 Sep 2024 09:39:10 +0530
Subject: [PATCH] Fix: Chunks deletion issue (#375)

* not sure if it works. hehe

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* added test to check behaviour in CI

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* update

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Bhimraj Yadav
---
 src/litdata/processing/data_processor.py | 26 ++++++++---
 tests/processing/test_functions.py       | 55 ++++++++++++++++++++++++
 2 files changed, 75 insertions(+), 6 deletions(-)

diff --git a/src/litdata/processing/data_processor.py b/src/litdata/processing/data_processor.py
index f1af9afa..20427e23 100644
--- a/src/litdata/processing/data_processor.py
+++ b/src/litdata/processing/data_processor.py
@@ -456,6 +456,7 @@ def run(self) -> None:
         try:
             self._setup()
             self._loop()
+            self._terminate()
         except Exception:
             traceback_format = traceback.format_exc()
             self.error_queue.put(traceback_format)
@@ -469,6 +470,19 @@ def _setup(self) -> None:
         self._start_uploaders()
         self._start_remover()

+    def _terminate(self) -> None:
+        """Make sure all the uploaders, downloaders and removers are terminated."""
+        for uploader in self.uploaders:
+            if uploader.is_alive():
+                uploader.join()
+
+        for downloader in self.downloaders:
+            if downloader.is_alive():
+                downloader.join()
+
+        if self.remover and self.remover.is_alive():
+            self.remover.join()
+
     def _loop(self) -> None:
         num_downloader_finished = 0

@@ -795,7 +809,7 @@ def _done(self, size: int, delete_cached_files: bool, output_dir: Dir) -> _Resul
         chunks = [file for file in os.listdir(cache_dir) if file.endswith(".bin")]

         if chunks and delete_cached_files and output_dir.path is not None:
-            raise RuntimeError(f"All the chunks should have been deleted. Found {chunks}")
+            raise RuntimeError(f"All the chunks should have been deleted. Found {chunks} in cache: {cache_dir}")

         merge_cache = Cache(cache_dir, chunk_bytes=1)
         node_rank = _get_node_rank()
@@ -1110,6 +1124,10 @@ def run(self, data_recipe: DataRecipe) -> None:

                 current_total = new_total
                 if current_total == num_items:
+                    # make sure all processes are terminated
+                    for w in self.workers:
+                        if w.is_alive():
+                            w.join()
                     break

                 if _IS_IN_STUDIO and node_rank == 0 and _ENABLE_STATUS:
@@ -1118,17 +1136,13 @@ def run(self, data_recipe: DataRecipe) -> None:

             # Exit early if all the workers are done.
             # This means there were some kinda of errors.
+            # TODO: Check whether this is still required.
             if all(not w.is_alive() for w in self.workers):
                 raise RuntimeError("One of the worker has failed")

         if _TQDM_AVAILABLE:
             pbar.close()

-        # TODO: Check whether this is still required.
-        if num_nodes == 1:
-            for w in self.workers:
-                w.join()
-
         print("Workers are finished.")
         result = data_recipe._done(len(user_items), self.delete_cached_files, self.output_dir)

diff --git a/tests/processing/test_functions.py b/tests/processing/test_functions.py
index 80eec0ba..42482b7e 100644
--- a/tests/processing/test_functions.py
+++ b/tests/processing/test_functions.py
@@ -1,10 +1,15 @@
+import glob
 import os
+import random
+import shutil
 import sys
+from pathlib import Path
 from unittest import mock

 import cryptography
 import numpy as np
 import pytest
+import requests
 from litdata import StreamingDataset, merge_datasets, optimize, walk
 from litdata.processing.functions import _get_input_dir, _resolve_dir
 from litdata.streaming.cache import Cache
@@ -475,3 +480,53 @@ def test_optimize_with_rsa_encryption(tmpdir):
     #     encryption=rsa,
     #     mode="overwrite",
     # )
+
+
+def tokenize(filename: str):
+    with open(filename, encoding="utf-8") as file:
+        text = file.read()
+    text = text.strip().split(" ")
+    word_to_int = {word: random.randint(1, 1000) for word in set(text)}  # noqa: S311
+    tokenized = [word_to_int[word] for word in text]
+
+    yield tokenized
+
+
+@pytest.mark.skipif(sys.platform == "win32", reason="Not tested on windows")
+def test_optimize_race_condition(tmpdir):
+    # issue: https://github.com/Lightning-AI/litdata/issues/367
+    # run_commands = [
+    #     "mkdir -p tempdir/custom_texts",
+    #     "curl https://www.gutenberg.org/cache/epub/24440/pg24440.txt --output tempdir/custom_texts/book1.txt",
+    #     "curl https://www.gutenberg.org/cache/epub/26393/pg26393.txt --output tempdir/custom_texts/book2.txt",
+    # ]
+    shutil.rmtree(f"{tmpdir}/custom_texts", ignore_errors=True)
+    os.makedirs(f"{tmpdir}/custom_texts", exist_ok=True)
+
+    urls = [
+        "https://www.gutenberg.org/cache/epub/24440/pg24440.txt",
+        "https://www.gutenberg.org/cache/epub/26393/pg26393.txt",
+    ]
+
+    for i, url in enumerate(urls):
+        print(f"downloading {i+1} file")
+        with requests.get(url, stream=True, timeout=10) as r:
+            r.raise_for_status()  # Raise an exception for bad status codes
+
+            with open(f"{tmpdir}/custom_texts/book{i+1}.txt", "wb") as f:
+                for chunk in r.iter_content(chunk_size=8192):
+                    f.write(chunk)
+
+    print("=" * 100)
+
+    train_files = sorted(glob.glob(str(Path(f"{tmpdir}/custom_texts") / "*.txt")))
+    print("=" * 100)
+    print(train_files)
+    print("=" * 100)
+    optimize(
+        fn=tokenize,
+        inputs=train_files,
+        output_dir=f"{tmpdir}/temp",
+        num_workers=1,
+        chunk_bytes="50MB",
+    )
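
For context on why joining the remover before the final check matters: _done() raises if any *.bin chunks are still sitting in the cache, while the remover deletes already-uploaded chunks on a separate thread, so checking before that thread has finished can fail spuriously. Below is a minimal, self-contained sketch of the join-before-check pattern the patch introduces; the remover/finalize helpers are illustrative stand-ins, not litdata's actual classes.

# Illustrative sketch only -- simplified stand-ins, not litdata's real implementation.
import os
import tempfile
import threading


def remover(cache_dir: str, uploaded: list) -> None:
    # Stand-in for the remover thread: deletes chunks that have already been uploaded.
    for name in uploaded:
        os.remove(os.path.join(cache_dir, name))


def finalize(cache_dir: str, remover_thread: threading.Thread) -> None:
    # Mirrors the patch's _terminate() step: join the remover *before* verifying the
    # cache, so the leftover-chunk check cannot race with the deletions.
    if remover_thread.is_alive():
        remover_thread.join()
    leftover = [f for f in os.listdir(cache_dir) if f.endswith(".bin")]
    if leftover:
        raise RuntimeError(f"All the chunks should have been deleted. Found {leftover}")


if __name__ == "__main__":
    cache_dir = tempfile.mkdtemp()
    chunks = ["chunk-0-0.bin", "chunk-0-1.bin"]
    for name in chunks:
        open(os.path.join(cache_dir, name), "wb").close()

    t = threading.Thread(target=remover, args=(cache_dir, chunks))
    t.start()
    finalize(cache_dir, t)  # joins first, so the check never observes a half-deleted cache
    print("cache is clean")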