diff --git a/docs/source/tutorial.rst b/docs/source/tutorial.rst index ad9469b26..24c6cbc76 100644 --- a/docs/source/tutorial.rst +++ b/docs/source/tutorial.rst @@ -9,6 +9,7 @@ Suppose that we want to load data from CSV files with the following steps: - List all CSV files in a directory - Load CSV files - Parse CSV file and yield rows +- Split our dataset into training and validation sets There are a few `built-in DataPipes `_ that can help us with the above operations. @@ -19,6 +20,8 @@ There are a few `built-in DataPipes `_ that can h streams `_ - ``CSVParser`` - `consumes file streams, parses the CSV contents, and returns one parsed line at a time `_ +- ``RandomSplitter`` - `randomly split samples from a source DataPipe into + groups `_ As an example, the source code for ``CSVParser`` looks something like this: @@ -48,9 +51,14 @@ class constructors. A pipeline can be assembled as the following: datapipe = dp.iter.FileLister([FOLDER]).filter(filter_fn=lambda filename: filename.endswith('.csv')) datapipe = dp.iter.FileOpener(datapipe, mode='rt') datapipe = datapipe.parse_csv(delimiter=',') + N_ROWS = 10000 # total number of rows of data + train, valid = datapipe.random_split(total_length=N_ROWS, weights={"train": 0.5, "valid": 0.5}, seed=0) - for d in datapipe: # Iterating through the data - pass + for x in train: # Iterating through the training dataset + pass + + for y in valid: # Iterating through the validation dataset + pass You can find the full list of built-in `IterDataPipes here `_ and `MapDataPipes here `_. 
@@ -422,5 +430,6 @@ directory ``curated/covid-19/ecdc_cases/latest``, belonging to account ``pandemi # [['date_rep', 'day', ..., 'iso_country', 'daterep'], # ['2020-12-14', '14', ..., 'AF', '2020-12-14'], # ['2020-12-13', '13', ..., 'AF', '2020-12-13']] + If necessary, you can also access data in Azure Data Lake Storage Gen1 by using URIs starting with ``adl://`` and ``abfs://``, as described in `README of adlfs repo `_ diff --git a/torchdata/datapipes/iter/util/randomsplitter.py b/torchdata/datapipes/iter/util/randomsplitter.py index 0b8635828..27732314f 100644 --- a/torchdata/datapipes/iter/util/randomsplitter.py +++ b/torchdata/datapipes/iter/util/randomsplitter.py @@ -16,7 +16,7 @@ @functional_datapipe("random_split") class RandomSplitterIterDataPipe(IterDataPipe): r""" - Randomly split samples from a source DataPipe into groups(functional name: ``random_split``). + Randomly split samples from a source DataPipe into groups (functional name: ``random_split``). Since there is no buffer, only ONE group of samples (i.e. one child DataPipe) can be iterated through at any time. Attempts to iterate through multiple of them simultaneously will fail.