Commit eb44e6f

todo setup dataloaders wrt test train, validation and scalers

Saran-nns committed Jan 5, 2021
1 parent d055ef0 commit eb44e6f

Showing 3 changed files with 153 additions and 67 deletions.
194 changes: 138 additions & 56 deletions traja/datasets/dataset.py
@@ -22,6 +22,7 @@
import pandas as pd
from sklearn.utils import shuffle
from traja.datasets import utils
import random

logger = logging.getLogger(__name__)

@@ -277,7 +278,14 @@ class MultiModalDataLoader:
n_past (int): Input sequence length. Number of time steps from the past.
n_future (int): Target sequence length. Number of time steps to the future.
num_workers (int): Number of CPU subprocesses used during data loading
train_split_ratio (float): Should be between 0.0 and 1.0 and represent the proportion of the dataset
    (after the validation split is removed) to include in the train split.
validation_split_ratio (float): Should be between 0.0 and 1.0 and represent the proportion of the dataset
    to include in the validation split.
num_val_categories (int): If not None, the validation split is category based: this many
    categories (IDs) are sampled from the dataset and held out as the validation set.
    Mutually exclusive with validation_split_ratio.
scale (bool): If True, scale the input and target and return the corresponding scalers in a dict.
Usage:
------
dataloaders, scalers = MultiModalDataLoader(df = data_frame, batch_size=32, n_past = 20, n_future = 10, num_workers=4, train_split_ratio = 0.75)
@@ -292,64 +300,81 @@ def __init__(
        num_workers: int,
        train_split_ratio: float,
        validation_split_ratio: float = None,
        num_val_categories: int = None,
        scale: bool = True,
        test: bool = True,
    ):
        self.df = df
        self.batch_size = batch_size
        self.n_past = n_past
        self.n_future = n_future
        self.num_workers = num_workers
        self.test = test
        self.train_split_ratio = train_split_ratio
        self.validation_split_ratio = validation_split_ratio
        self.scale = scale
        self.num_val_categories = num_val_categories

        if self.num_val_categories is not None:
            assert (
                self.validation_split_ratio is None
            ), "Invalid validation argument, validation_split_ratio not supported for category based validation split"
            self.set_validation()
        if self.validation_split_ratio is not None:
            assert (
                self.num_val_categories is None
            ), "Invalid validation argument, num_val_categories not supported for sequence based validation split"
            self.set_validation()

        # Train and test data from the frame left after set_validation() carves out df_val
        train_data, target_data, target_category = utils.generate_dataset(
WolfByttner (Contributor) commented on Jan 5, 2021:

    We still cannot generate data with a single-step sliding window. In the current implementation, data can be both in the ground truth fed to the model during training and in the predicted data during validation. Also, the overlap between 'before' and 'after' means we train up to 100x as long, with little benefit!

            self.df, self.n_past, self.n_future
        )
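To make the reviewer's point concrete, here is a minimal sketch of window generation with a configurable stride; `strided_windows`, its `stride` parameter, and the choice `stride = n_past + n_future` are illustrative assumptions, not part of `utils.generate_dataset`:

```python
import numpy as np

def strided_windows(series: np.ndarray, n_past: int, n_future: int, stride: int):
    """Yield (past, future) windows that advance by `stride` time steps.

    With stride=1 this reproduces the single-step sliding window the
    reviewer describes; with stride = n_past + n_future the windows are
    disjoint, so a training target can never reappear inside a
    validation window.
    """
    window = n_past + n_future
    for start in range(0, len(series) - window + 1, stride):
        yield series[start : start + n_past], series[start + n_past : start + window]
```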

        if test:
            # Shuffle and split the data
            [train_x, train_y, train_z], [test_x, test_y, test_z] = utils.shuffle_split(
                train_data,
                target_data,
                target_category,
                train_ratio=self.train_split_ratio,
                split=True,
            )
        else:
            [train_x, train_y, train_z] = utils.shuffle_split(
                train_data, target_data, target_category, train_ratio=None, split=False
            )

        # Scale data
        if self.scale and self.test:
            (train_x, self.train_x_scaler), (train_y, self.train_y_scaler) = (
                utils.scale_data(train_x, sequence_length=self.n_past),
                utils.scale_data(train_y, sequence_length=self.n_future),
            )
            (test_x, self.test_x_scaler), (test_y, self.test_y_scaler) = (
                utils.scale_data(test_x, sequence_length=self.n_past),
                utils.scale_data(test_y, sequence_length=self.n_future),
            )
        if self.scale and not self.test:
            (train_x, self.train_x_scaler), (train_y, self.train_y_scaler) = (
                utils.scale_data(train_x, sequence_length=self.n_past),
                utils.scale_data(train_y, sequence_length=self.n_future),
            )

        # Weighted Random Sampler
        (
            train_weighted_sampler,
            test_weighted_sampler,
        ) = utils.weighted_random_samplers(train_z, test_z)
@@ -377,33 +402,90 @@ def __init__(
            drop_last=True,
            num_workers=num_workers,
        )
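The train and test loaders above are fed by the samplers from `utils.weighted_random_samplers`, whose internals this hunk does not show. As a hedged sketch, the standard PyTorch recipe weights each sample by the inverse frequency of its category; the helper below is an assumption about that recipe, not the library code:

```python
import numpy as np
import torch

def make_weighted_sampler(categories: np.ndarray) -> torch.utils.data.WeightedRandomSampler:
    """Weight each sample by 1 / frequency(category) so that rare
    categories are drawn about as often as common ones.

    `categories` must be non-negative integer labels.
    """
    counts = np.bincount(categories)
    sample_weights = 1.0 / counts[categories]  # one weight per sample
    return torch.utils.data.WeightedRandomSampler(
        weights=torch.as_tensor(sample_weights, dtype=torch.double),
        num_samples=len(sample_weights),
        replacement=True,
    )
```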
        if self.validation_split_ratio is not None and self.test:

            self.dataloaders = {
                "train_loader": self.train_loader,
                "test_loader": self.test_loader,
                "validation_loader": self.validation_loader,
            }
            if self.scale:
                self.scalers = {
                    "train_data_scaler": self.train_x_scaler,
                    "train_target_scaler": self.train_y_scaler,
                    "test_data_scaler": self.test_x_scaler,
                    "test_target_scaler": self.test_y_scaler,
                    "val_data_scaler": self.val_x_scaler,
                    "val_target_scaler": self.val_y_scaler,
                }
            else:
                self.scalers = None

        elif self.test and self.validation_split_ratio is None:

            self.dataloaders = {
                "train_loader": self.train_loader,
                "test_loader": self.test_loader,
            }
            if self.scale:
                self.scalers = {
                    "train_data_scaler": self.train_x_scaler,
                    "train_target_scaler": self.train_y_scaler,
                    "test_data_scaler": self.test_x_scaler,
                    "test_target_scaler": self.test_y_scaler,
                }
            else:
                self.scalers = None

        elif self.validation_split_ratio is not None and not self.test:

WolfByttner (Contributor) commented on Jan 5, 2021:

    Run pep8 on this file. Visual Studio Code and PyCharm both come with powerful tools to help you do this.




    def set_validation(self):
        """Carve the validation split out of self.df.

        The split is either category based (num_val_categories randomly
        sampled IDs) or sequence based (the tail validation_split_ratio
        fraction of the frame, grouped by ID). Builds the validation
        DataLoader and removes the validation rows from self.df.
        """

        if self.validation_split_ratio is None and self.num_val_categories is not None:
            max_ID = self.df["ID"].max()
            # range(1, max_ID + 1) so the highest ID can also be sampled
            val_categories = random.sample(range(1, max_ID + 1), self.num_val_categories)
            self.df_val = self.df.loc[self.df["ID"].isin(val_categories)]

WolfByttner (Contributor) commented on Jan 5, 2021:

    It seems like you removed self.df_val further up, so I'm not sure if this runs


        if self.validation_split_ratio is not None and self.num_val_categories is None:
            # Prepare validation data before train and test and their splits
            self.df_val = self.df.groupby("ID").tail(

WolfByttner (Contributor) commented on Jan 5, 2021:

    Train, test and validation all need to be grouped by category, if that is the mode set. So you will need to extract the unique IDs, then divide them into train, test and validation based on the ratios provided by the user.

                # tail() needs an integer row count
                int(self.validation_split_ratio * len(self.df))
            )

        # Generate validation dataset
        val_x, val_y, val_z = utils.generate_dataset(
            self.df_val, self.n_past, self.n_future
        )
        if self.scale:
            # Scale validation data:
            (val_x, self.val_x_scaler), (val_y, self.val_y_scaler) = (
                utils.scale_data(val_x, sequence_length=self.n_past),
                utils.scale_data(val_y, sequence_length=self.n_future),
            )
        # Generate Pytorch dataset
        val_dataset = TimeSeriesDataset(val_x, val_y, val_z)

        self.validation_loader = torch.utils.data.DataLoader(
            dataset=val_dataset,
            shuffle=False,
            batch_size=self.batch_size,
            sampler=None,
            drop_last=True,
            num_workers=self.num_workers,
        )

        # Create new df for train and test; Difference of df with df_val
        self.df = self.df.loc[self.df.index.difference(self.df_val.index)]
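Following the reviewer's suggestion above, a sketch of an ID-level split — extract the unique IDs, then divide them by the user-supplied ratios so whole trajectories stay together; `split_ids` is a hypothetical helper, not part of this commit:

```python
import random

def split_ids(ids, train_ratio: float, validation_ratio: float, seed: int = 0):
    """Partition unique category IDs into train/test/validation lists."""
    unique_ids = list(dict.fromkeys(ids))  # unique IDs, first-seen order
    random.Random(seed).shuffle(unique_ids)
    n_train = int(train_ratio * len(unique_ids))
    n_val = int(validation_ratio * len(unique_ids))
    train = unique_ids[:n_train]
    val = unique_ids[n_train : n_train + n_val]
    test = unique_ids[n_train + n_val :]
    return train, test, val

# e.g. df_val = df[df["ID"].isin(val)] keeps each trajectory in one split
```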

    def __new__(
        cls,
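Putting the new keyword arguments together, a hedged usage sketch (the 0.75/0.2 ratios and the three-tuple batch unpacking are assumptions read off the code above, not documented API):

```python
# Sketch: df must carry an integer "ID" column, as assumed throughout this diff
dataloaders, scalers = MultiModalDataLoader(
    df=df,
    batch_size=32,
    n_past=20,
    n_future=10,
    num_workers=4,
    train_split_ratio=0.75,
    validation_split_ratio=0.2,  # tail of the frame, grouped by ID -> validation
    scale=True,
    test=True,
)
for data, target, category in dataloaders["train_loader"]:
    pass  # training step goes here
```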
24 changes: 14 additions & 10 deletions traja/datasets/utils.py
@@ -75,6 +75,7 @@ def shuffle_split(
    target_data: np.array,
    target_category: np.array,
    train_ratio: float,
    split: bool = True,
):
"""[summary]
Expand All @@ -83,6 +84,7 @@ def shuffle_split(
target_data (np.array): [description]
target_category (np.array): [description]
train_ratio (float): [description]
split (bool): If True, split the data into train and test, else only shuffle the dataset and return it for training
Returns:
[type]: [description]
Expand All @@ -95,19 +97,21 @@ def shuffle_split(

    if split:
        # train_ratio only matters when an actual split is requested;
        # it may be None when split=False
        assert train_ratio > 0, "Train data ratio should be greater than zero"
        assert train_ratio <= 1.0, "Train data ratio should be less than or equal to 1"

        # Train test split
        split = int(train_ratio * len(train_data))

        train_x = train_data[:split]

WolfByttner (Contributor) commented on Jan 5, 2021:

    This approach still does not solve the problem that for some sequences (ODE-generated ones, for instance), you need to be able to recover the original sequence corresponding to a particular ID. To do this, you need to first generate one (unshuffled) dataset, then sample from it in the dataloader. The SubsetRandomSampler does this for you.

        train_y = target_data[:split]
        train_z = target_category[:split]

        test_x = train_data[split:]
        test_y = target_data[split:]
        test_z = target_category[split:]

        return [train_x, train_y, train_z], [test_x, test_y, test_z]
    else:
        return train_data, target_data, target_category
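For comparison, the `SubsetRandomSampler` approach the reviewer recommends: build one unshuffled dataset and let the sampler shuffle indices, so every sample stays traceable to its source sequence. A minimal sketch, assuming `dataset` is an unshuffled `TimeSeriesDataset`:

```python
import torch
from torch.utils.data import DataLoader, SubsetRandomSampler

# assumes `dataset` is an unshuffled TimeSeriesDataset (hypothetical setup)
indices = torch.randperm(len(dataset)).tolist()  # shuffle indices, not data
split = int(0.75 * len(dataset))

train_loader = DataLoader(
    dataset, batch_size=32, sampler=SubsetRandomSampler(indices[:split])
)
test_loader = DataLoader(
    dataset, batch_size=32, sampler=SubsetRandomSampler(indices[split:])
)
# dataset itself is untouched, so sample i still maps back to its original ID
```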


def scale_data(data, sequence_length):
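The body of `scale_data` is collapsed in this diff; as an assumption about the usual recipe (not the actual traja implementation), it likely fits a `MinMaxScaler` across all time steps and returns both the scaled array and the fitted scaler, roughly:

```python
import numpy as np
from sklearn.preprocessing import MinMaxScaler

def scale_data_sketch(data: np.ndarray, sequence_length: int):
    """Hypothetical stand-in for utils.scale_data."""
    n_features = data.shape[-1]
    flat = data.reshape(-1, n_features)  # (num_windows * seq_len, features)
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(flat)
    return scaled.reshape(-1, sequence_length, n_features), scaler
```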
2 changes: 1 addition & 1 deletion traja/models/visualizer.py
@@ -20,7 +20,7 @@


def DisplayLatentDynamics(latent):
    """Visualize the dynamics in latent space. Compatible only with the RNN latents
    Args:
        latent(tensor): Each point in the list is latent's state at the end of a sequence of each batch.
        Latent shape (batch_size, latent_dim)
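A hedged sketch of what such a visualization might do with a `(batch_size, latent_dim)` tensor — scatter the first two latent dimensions; this is illustrative, not the function's actual body:

```python
import matplotlib.pyplot as plt
import torch

def display_latent_sketch(latent: torch.Tensor):
    """Scatter the first two latent dimensions of a (batch_size, latent_dim) tensor."""
    z = latent.detach().cpu().numpy()
    plt.scatter(z[:, 0], z[:, 1], s=8)
    plt.xlabel("latent dim 0")
    plt.ylabel("latent dim 1")
    plt.title("RNN latent state at sequence end")
    plt.show()
```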
