migrate Data subpackage #19523

Merged · 19 commits · Feb 26, 2024
4 changes: 1 addition & 3 deletions .github/CODEOWNERS
@@ -41,10 +41,8 @@
/src/lightning/pytorch/core/hooks.py @williamfalcon @tchaton @awaelchli @carmocca
/src/lightning/pytorch/core/module.py @williamfalcon @tchaton @awaelchli @carmocca

# Data Utilities
/examples/data/ @tchaton @nohalon @justusschock @lantiga
# Lightning Data
/src/lightning/data/ @tchaton
/tests/tests_data @tchaton

# Lightning Fabric
/src/lightning/fabric @awaelchli @carmocca @justusschock
23 changes: 0 additions & 23 deletions .github/checkgroup.yml
@@ -150,29 +150,6 @@ subprojects:
- "build-pl (3.10, 2.0, 11.8.0)"
- "build-pl (3.10, 2.1, 12.1.0)"

# SECTIONS: lightning_data

- id: "lightning_data: CPU workflow"
paths:
- ".actions/*"
- "requirements/data/**"
- "src/lightning/data/**"
- "src/lightning_data/*"
- "src/lightning/__init__.py"
- "src/lightning/__setup__.py"
- "src/lightning/__version__.py"
- "tests/tests_data/**"
- "examples/data/**"
- "pyproject.toml" # includes pytest config
- ".github/workflows/ci-tests-data.yml"
- "!requirements/*/docs.txt"
- "!*.md"
- "!**/*.md"
checks:
- "data-cpu (macOS-11, lightning, 3.10, 2.1)"
- "data-cpu (ubuntu-20.04, lightning, 3.10, 2.1)"
- "data-cpu (windows-2022, lightning, 3.10, 2.1)"

# SECTION: lightning_fabric

- id: "lightning_fabric: CPU workflow"
7 changes: 0 additions & 7 deletions .github/label-change.yml
@@ -10,13 +10,6 @@ app:
- "docs/source-app/**"
- "requirements/app/**"

data:
- changed-files:
- any-glob-to-any-file:
- "src/lightning/data/**"
- "tests/tests_data/**"
- "requirements/data/**"

store:
- changed-files:
- any-glob-to-any-file:
119 changes: 0 additions & 119 deletions .github/workflows/ci-tests-data.yml

This file was deleted.

1 change: 0 additions & 1 deletion .gitignore
@@ -193,7 +193,6 @@ our_model.tar
test.png
saved_models
data/
!src/lightning/data/
!examples/data/
!tests/tests_pytorch/utilities/data/
!requirements/data/
3 changes: 0 additions & 3 deletions pyproject.toml
@@ -102,9 +102,6 @@ lint.ignore-init-module-imports = true
"RET504", # todo:Unnecessary variable assignment before `return` statement
"RET503",
]
"src/lightning/data/**" = [
"S310", # todo: Audit URL open for permitted schemes. Allowing use of `file:` or custom schemes is often unexpected.
]
"tests/**" = [
"S101", # Use of `assert` detected
"S105", "S106", # todo: Possible hardcoded password: ...
5 changes: 0 additions & 5 deletions requirements/data/cloud.txt

This file was deleted.

9 changes: 1 addition & 8 deletions requirements/data/data.txt
@@ -1,8 +1 @@
# NOTE: the upper bound for the package version is only set for CI stability, and it is dropped while installing this package
# in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment

lightning-utilities >=0.8.0, <0.10.0
# to be able to include also PL 2.0 and preserve `>` needed for CI min version bypass
torch >0.14.0, <=2.2.0
lightning-cloud
filelock
litdata >= 0.2.0rc
3 changes: 0 additions & 3 deletions requirements/data/examples.txt

This file was deleted.

9 changes: 0 additions & 9 deletions requirements/data/test.txt

This file was deleted.

3 changes: 1 addition & 2 deletions src/lightning/__setup__.py
@@ -48,8 +48,7 @@ def _prepare_extras() -> Dict[str, Any]:
extras["app-extra"] = extras["app-app"] + extras["app-cloud"] + extras["app-ui"] + extras["app-components"]
extras["app-all"] = extras["app-extra"]
extras["app-dev"] = extras["app-all"] + extras["app-test"]
extras["data-all"] = extras["data-data"] + extras["data-cloud"] + extras["data-examples"]
extras["data-dev"] = extras["data-all"] + extras["data-test"]
extras["data-all"] = extras["data-data"]
extras["store-store"] = extras["app-app"] # todo: consider cutting/leaning this dependency

# merge per-project extras of the same category, e.g. `app-test` + `fabric-test`
52 changes: 28 additions & 24 deletions src/lightning/data/README.md
@@ -17,7 +17,11 @@ Specifically crafted for multi-gpu & multi-node (with [DDP](https://lightning.ai

The `StreamingDataset` is compatible with any data type, including **images, text, video, audio, geo-spatial, and multimodal data** and it is a drop-in replacement for your PyTorch [IterableDataset](https://pytorch.org/docs/stable/data.html#torch.utils.data.IterableDataset) class. For example, it is used by [Lit-GPT](https://github.com/Lightning-AI/lit-gpt/blob/main/pretrain/tinyllama.py) to pretrain LLMs.

# 🚀 Benchmarks
<br/>

### **Info**: The code behind Lightning Data lives under [https://github.com/Lightning-AI/litdata](https://github.com/Lightning-AI/litdata).

# Benchmarks

[Imagenet-1.2M](https://www.image-net.org/) is a commonly used dataset to compare computer vision models. Its training dataset contains `1,281,167 images`.

@@ -27,27 +31,27 @@ Find the reproducible [Studio Benchmark](https://lightning.ai/lightning-ai/studi

### Imagenet-1.2M Streaming from AWS S3

| Framework | Images / sec 1st Epoch (float32) | Images / sec 2nd Epoch (float32) | Images / sec 1st Epoch (torch16) | Images / sec 2nd Epoch (torch16) |
| ----------- | ------------------------------------- | ------------------------------------- | ------------------------------------- | ------------------------------------- |
| PL Data | ${\\textbf{\\color{Fuchsia}5800.34}}$ | ${\\textbf{\\color{Fuchsia}6589.98}}$ | ${\\textbf{\\color{Fuchsia}6282.17}}$ | ${\\textbf{\\color{Fuchsia}7221.88}}$ |
| Web Dataset | 3134.42 | 3924.95 | 3343.40 | 4424.62 |
| Mosaic ML | 2898.61 | 5099.93 | 2809.69 | 5158.98 |
| Framework | Images / sec 1st Epoch (float32) | Images / sec 2nd Epoch (float32) | Images / sec 1st Epoch (torch16) | Images / sec 2nd Epoch (torch16) |
| ----------- | --------------------------------- | ---------------------------------- | -------------------------------- | -------------------------------- |
| PL Data | **5800.34** | **6589.98** | **6282.17** | **7221.88** |
| Web Dataset | 3134.42 | 3924.95 | 3343.40 | 4424.62 |
| Mosaic ML | 2898.61 | 5099.93 | 2809.69 | 5158.98 |

Higher is better.

### Imagenet-1.2M Conversion

| Framework | Train Conversion Time | Val Conversion Time | Dataset Size | # Files |
| ----------- | --------------------------------------- | --------------------------------------- | -------------------------------------- | ------- |
| PL Data | ${\\textbf{\\color{Fuchsia}10:05 min}}$ | ${\\textbf{\\color{Fuchsia}00:30 min}}$ | ${\\textbf{\\color{Fuchsia}143.1 GB}}$ | 2.339 |
| Web Dataset | 32:36 min | 01:22 min | 147.8 GB | 1.144 |
| Mosaic ML | 49:49 min | 01:04 min | ${\\textbf{\\color{Fuchsia}143.1 GB}}$ | 2.298 |
| Framework | Train Conversion Time | Val Conversion Time | Dataset Size | # Files |
| ----------- | --------------------- | ------------------- | ------------ | ------- |
| PL Data | **10:05 min** | **00:30 min** | **143.1 GB** | 2.339 |
| Web Dataset | 32:36 min | 01:22 min | 147.8 GB | 1.144 |
| Mosaic ML | 49:49 min | 01:04 min | **143.1 GB** | 2.298 |

The dataset needs to be converted into an optimized format for cloud streaming. We measured how fast the 1.2 million images are converted.

Faster is better.

# 📚 Real World Examples
# Real World Examples

We have built end-to-end free [Studios](https://lightning.ai) showing all the steps to prepare the following datasets:

@@ -62,9 +66,9 @@ We have built end-to-end free [Studios](https://lightning.ai) showing all the st

[Lightning Studios](https://lightning.ai) are fully reproducible cloud IDEs with data, code, dependencies, and more.

# 🎬 Getting Started
# Getting Started

## 💾 Installation
## Installation

Lightning Data can be installed with `pip`:

@@ -74,7 +78,7 @@ Lightning Data can be installed with `pip`:
pip install --no-cache-dir git+https://github.com/Lightning-AI/lit-data.git@master
```

## 🏁 Quick Start
## Quick Start

### 1. Prepare Your Data

@@ -197,7 +201,7 @@ if __name__ == "__main__":
)
```

# 📈 Easily scale data processing
# Easily scale data processing

To scale data processing, create a free account on the [lightning.ai](https://lightning.ai/) platform. With the platform, `optimize` and `map` can start multiple machines to make data processing drastically faster, as follows:
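The fan-out behind `map` can be illustrated locally with a thread pool. This is a conceptual stand-in only: the platform runs the same apply-a-function-to-every-input pattern across machines, and `resize_stub` is a made-up placeholder for real per-item work.

```python
# Local stand-in for the fan-out pattern behind `map`: apply a function to
# every input in parallel. On the platform this fans out across machines;
# here a thread pool illustrates the same idea.
from concurrent.futures import ThreadPoolExecutor


def resize_stub(path: str) -> str:
    # Placeholder for real per-item work (e.g. resizing an image).
    return path.upper()


inputs = ["a.jpg", "b.jpg", "c.jpg"]
with ThreadPoolExecutor(max_workers=3) as pool:
    results = list(pool.map(resize_stub, inputs))
print(results)  # ['A.JPG', 'B.JPG', 'C.JPG']
```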

@@ -233,9 +237,9 @@ The Data Prep Job UI from the [LAION 400M Studio](https://lightning.ai/lightning

</div>

# 🔑 Key Features
# Key Features

## 🚀 Multi-GPU / Multi-Node
## Multi-GPU / Multi-Node

The `StreamingDataset` and `StreamingDataLoader` take care of everything for you. They automatically make sure each rank receives a different batch of data, so there is nothing for you to do if you use them.

@@ -276,7 +280,7 @@ for batch in tqdm(train_dataloader):
pass
```
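The automatic per-rank sharding described above can be sketched in plain Python. This is an illustration of the idea only, not the Lightning Data implementation: each rank reads an interleaved, disjoint slice of the sample indices.

```python
# Conceptual sketch of per-rank sharding in distributed data loading:
# each rank takes an interleaved slice of the indices, so ranks never
# overlap and every sample is seen exactly once per epoch.
# Illustration only; not the actual Lightning Data implementation.

def shard_indices(num_samples: int, rank: int, world_size: int) -> list:
    """Return the sample indices assigned to `rank` out of `world_size` ranks."""
    return list(range(rank, num_samples, world_size))


if __name__ == "__main__":
    world_size = 4
    shards = [shard_indices(10, rank, world_size) for rank in range(world_size)]
    print(shards)  # [[0, 4, 8], [1, 5, 9], [2, 6], [3, 7]]
    # Every sample is covered exactly once across all ranks.
    assert sorted(i for shard in shards for i in shard) == list(range(10))
```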

## 🔘 Stateful StreamingDataLoader
## Stateful StreamingDataLoader

Lightning Data provides a stateful `StreamingDataLoader`. This simplifies resuming training over large datasets.

Expand All @@ -303,7 +307,7 @@ for batch_idx, batch in enumerate(dataloader):
torch.save(dataloader.state_dict(), "dataloader_state.pt")
```

## 🎥 Profiling
## Profiling

The `StreamingDataLoader` supports profiling your data loading. Simply use the `profile_batches` argument as follows:

@@ -315,7 +319,7 @@ StreamingDataLoader(..., profile_batches=5)

This generates a Chrome trace called `result.json`. You can visualize it by opening `chrome://tracing` in the Chrome browser and loading the file there.
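For reference, `chrome://tracing` consumes the Chrome trace event format. A minimal hand-written trace looks like this; the event names and timings below are invented for illustration, and the exact events Lightning Data records may differ.

```python
# Minimal example of the Chrome trace event format accepted by
# chrome://tracing. "X" marks a complete event; ts and dur are in
# microseconds. Event names here are illustrative only.
import json

trace = {
    "traceEvents": [
        {"name": "load_batch_0", "ph": "X", "ts": 0, "dur": 1500, "pid": 0, "tid": 0},
        {"name": "load_batch_1", "ph": "X", "ts": 1500, "dur": 1400, "pid": 0, "tid": 0},
    ]
}

with open("result.json", "w") as f:
    json.dump(trace, f)
```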

## 🪇 Random access
## Random access

Access the data you need when you need it.

@@ -329,7 +333,7 @@ print(len(dataset)) # display the length of your data
print(dataset[42]) # show the 42nd element of the dataset
```

## Use data transforms
## Use data transforms

```python
from lightning.data import StreamingDataset, StreamingDataLoader
@@ -349,7 +353,7 @@ for batch in dataloader:
# Out: (4, 3, 224, 224)
```

## ⚙️ Disk usage limits
## Disk usage limits

Limit the size of the cache holding the chunks.

@@ -359,7 +363,7 @@ from lightning.data import StreamingDataset
dataset = StreamingDataset(..., max_cache_size="10GB")
```

## 💾 Support yield
## Support yield

When processing large files like compressed [parquet files](https://en.wikipedia.org/wiki/Apache_Parquet), you can use Python's `yield` to process and store one item at a time.
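The yield-based pattern can be sketched as follows. The function name and record layout are illustrative, not part of the Lightning Data API; the point is that a generator emits one processed item at a time instead of materializing the whole file in memory.

```python
# Sketch of per-item processing with a generator: yield one cleaned
# record at a time rather than building a full list in memory.
# Names and record format are illustrative only.

def process_rows(rows):
    """Yield one cleaned record at a time from an iterable of raw rows."""
    for row in rows:
        text = row.strip()
        if not text:
            continue  # skip blank rows without buffering anything
        yield {"text": text, "length": len(text)}


raw = ["hello ", "", "  world"]
for record in process_rows(raw):
    print(record)
# {'text': 'hello', 'length': 5}
# {'text': 'world', 'length': 5}
```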
