diff --git a/.codecov.yml b/.codecov.yml
deleted file mode 100644
index fb8538cb..00000000
--- a/.codecov.yml
+++ /dev/null
@@ -1,3 +0,0 @@
-ignore:
-  - "test_*.py"
-  - "traja-gui.py*"
diff --git a/.coveragerc b/.coveragerc
index c0f74596..723ba835 100644
--- a/.coveragerc
+++ b/.coveragerc
@@ -7,6 +7,4 @@ exclude_lines =
     if __name__ == .__main__.:
 omit =
     traja/tests/*
-    traja/contrib/*
-    traja/models/*
-    traja/rutils.py
\ No newline at end of file
+    traja/contrib/*
\ No newline at end of file
diff --git a/codecov.yml b/codecov.yml
new file mode 100644
index 00000000..44819a03
--- /dev/null
+++ b/codecov.yml
@@ -0,0 +1,25 @@
+codecov:
+  require_ci_to_pass: yes
+
+coverage:
+  precision: 2
+  round: down
+  range: "70...100"
+
+parsers:
+  gcov:
+    branch_detection:
+      conditional: yes
+      loop: yes
+      method: yes
+      macro: yes
+
+comment:
+  layout: "reach,diff,flags,files,footer"
+  behavior: default
+  require_changes: no
+
+ignore:
+  - "test_*.py"
+  - "traja-gui.py*"
+
diff --git a/docs/neuralnets/train_lstm.py b/docs/neuralnets/train_lstm.py
index ca84b243..d3f2ba3a 100644
--- a/docs/neuralnets/train_lstm.py
+++ b/docs/neuralnets/train_lstm.py
@@ -4,7 +4,7 @@
 """
 import traja
 from traja.model import LSTM
-from traja.datasets import dataset
+from traja.dataset import dataset
 
 df = traja.TrajaDataFrame({"x": [0, 1, 2, 3, 4], "y": [1, 3, 2, 4, 5]})
diff --git a/docs/source/predictions.rst b/docs/source/predictions.rst
index 556ff06c..ec69fe88 100644
--- a/docs/source/predictions.rst
+++ b/docs/source/predictions.rst
@@ -29,13 +29,22 @@ via :class:`~traja.models.predictive_models.lstm.LSTM`.
     batch_size = 10 # How many sequences to train every step. Constrained by GPU memory.
     num_past = 10 # How many time steps from which to learn the time series
     num_future = 5 # How many time steps to predict
+    split_by_id = False # Whether to split data into training, test and validation sets based on
+                        # the animal's ID or not. If True, an animal's entire trajectory will only
+                        # be used for training, or only for testing and so on.
+                        # If your animals are territorial (like jaguars) and you want to forecast
+                        # their trajectories, you want this to be False. If, however, you want to
+                        # classify the group membership of an animal, you want this to be True,
+                        # so that you can verify that previously unseen animals get assigned to
+                        # the correct class.
 
     data_loaders, scalers = dataset.MultiModalDataLoader(df,
                                                          batch_size=batch_size,
                                                          n_past=num_past,
                                                          n_future=num_future,
-                                                         num_workers=1)
+                                                         num_workers=1,
+                                                         split_by_id=split_by_id)
 
 .. note::
 
@@ -78,4 +87,78 @@ via :class:`~traja.models.predictive_models.lstm.LSTM`.
     # Train the model
     trainer.fit(data_loaders, model_save_path, epochs=10, training_mode='forecasting')
 
-.. image:: _static/rnn_prediction.png
\ No newline at end of file
+After training, you can evaluate the network with test data if you want to pick the best model,
+or with validation data if you want to determine the final performance of your model.
+
+The data_loaders dictionary contains the 'sequential_test_loader' and 'sequential_validation_loader',
+which preserve the order of the original data. The dictionary also contains the 'test_loader' and
+'validation_loader' data loaders, where the order of the time series is randomised.
+
+.. code-block:: python
+
+    validation_loader = data_loaders['sequential_validation_loader']
+
+    trainer.validate(validation_loader)
+
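(Editor's note, not part of this changeset: ``trainer.validate`` wraps the evaluation loop, but it can help to see what a manual pass over the held-out data looks like. The sketch below assumes the trained forecaster ``model`` maps a ``(batch, num_past, 2)`` float tensor to a ``(batch, num_future, 2)`` tensor, and that the loaders yield ``(data, target, category, parameters)`` tuples as built by the ``TimeSeriesDataset`` introduced in this changeset.)

.. code-block:: python

    import torch
    from torch.nn.functional import mse_loss

    test_loader = data_loaders['sequential_test_loader']

    model.eval()
    total_error, batches = 0.0, 0
    with torch.no_grad():
        for data, target, category, parameters in test_loader:
            # The loaders yield scaled (batch, seq_len, dims) tensors; cast to
            # float to match the model parameters before the forward pass.
            prediction = model(data.float())
            total_error += mse_loss(prediction, target.float()).item()
            batches += 1

    print(f"Mean squared forecasting error (scaled units): {total_error / batches:.4f}")

Because the loaders return scaled values, apply the inverse transform of the returned scaler if you need the error in the original coordinate units.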
+Finally, you can display your training results using the built-in plotting functions.
+
+.. code-block:: python
+
+    from traja.plotting import plot_prediction
+
+    batch_index = 0  # The batch you want to plot
+    plot_prediction(model, validation_loader, batch_index)
+
+.. image:: _static/rnn_prediction.png
+
+Parameter searching
+-------------------
+
+When optimising neural networks, you often want to vary the hyperparameters. When training a forecaster,
+you have to reinitialise and retrain the whole model. When training a classifier or regressor, however,
+you can reset just those components on the fly, since they work directly on the latent space of your model.
+VAE models provide utility functions to make this easy.
+
+.. code-block:: python
+
+    from traja.models import MultiModelVAE
+    input_size = 2  # Number of input dimensions (normally x, y)
+    output_size = 2  # Same as input_size when predicting
+    num_layers = 2  # Number of LSTM layers. Deeper networks learn more complex patterns but can overfit.
+    hidden_size = 32  # Width of each layer. Wider layers learn larger patterns but can overfit. Try 32, 64, 128, 256, 512
+    dropout = 0.1  # Randomly ignore some network connections during training. Improves generalisation.
+
+    # Classifier parameters
+    classifier_hidden_size = 32
+    num_classifier_layers = 4
+    num_classes = 42
+
+    # Regressor parameters
+    regressor_hidden_size = 18
+    num_regressor_layers = 1
+    num_regressor_parameters = 3
+
+    model = MultiModelVAE(input_size=input_size,
+                          hidden_size=hidden_size,
+                          num_layers=num_layers,
+                          output_size=output_size,
+                          dropout=dropout,
+                          batch_size=batch_size,
+                          num_future=num_future,
+                          classifier_hidden_size=classifier_hidden_size,
+                          num_classifier_layers=num_classifier_layers,
+                          num_classes=num_classes,
+                          regressor_hidden_size=regressor_hidden_size,
+                          num_regressor_layers=num_regressor_layers,
+                          num_regressor_parameters=num_regressor_parameters)
+
+    new_classifier_hidden_size = 64
+    new_num_classifier_layers = 2
+
+    model.reset_classifier(classifier_hidden_size=new_classifier_hidden_size,
+                           num_classifier_layers=new_num_classifier_layers)
+
+    new_regressor_hidden_size = 64
+    new_num_regressor_layers = 2
+    model.reset_regressor(regressor_hidden_size=new_regressor_hidden_size,
+                          num_regressor_layers=new_num_regressor_layers)
\ No newline at end of file
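(Editor's note, not part of this changeset: a small sketch to make the parameter search above concrete. It reuses ``reset_classifier`` as documented and sweeps over classifier widths; for classification you would typically build the loaders with ``split_by_id=True``, as explained earlier. The ``training_mode='classification'`` string passed to ``trainer.fit`` is an assumption about the trainer API and may differ from the actual mode name.)

.. code-block:: python

    for classifier_hidden_size in (32, 64, 128):
        # Only the classifier head is rebuilt; the trained weights of the
        # VAE encoder and decoder are kept.
        model.reset_classifier(classifier_hidden_size=classifier_hidden_size,
                               num_classifier_layers=num_classifier_layers)

        # Assumed: the trainer exposes a classification training mode
        # analogous to the 'forecasting' mode used above.
        trainer.fit(data_loaders, model_save_path, epochs=10,
                    training_mode='classification')

        # Evaluate on data_loaders['test_loader'] here and keep the width
        # that gives the best accuracy.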
diff --git a/docs/source/reference.rst b/docs/source/reference.rst
index 32afd2a6..4f793973 100644
--- a/docs/source/reference.rst
+++ b/docs/source/reference.rst
@@ -52,7 +52,7 @@ The following methods are available via :mod:`traja.plotting`:
 
 .. automethod:: traja.plotting.polar_bar
 
-.. automethod:: traja.plotting.predict
+.. automethod:: traja.plotting.plot_prediction
 
 .. automethod:: traja.plotting.sans_serif
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 52d6cc2f..962c99a3 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -4,7 +4,7 @@ matplotlib
 shapely
 psutil
 scipy
-sklearn
+scikit-learn
 fastdtw
 plotly
 networkx
diff --git a/traja-gui.py b/traja-gui.py
index 5c7652b0..a84e9135 100644
--- a/traja-gui.py
+++ b/traja-gui.py
@@ -36,7 +36,7 @@ def __init__(self, filepath):
 
     @pyqtSlot()
     def read_in_chunks(self):
-        """ load datasets in parts and update the progess par """
+        """ load data in chunks and update the progress bar """
        chunksize = 10 ** 3
        lines_number = sum(1 for line in open(self.filepath))
        self.progressMaximum.emit(lines_number // chunksize)
diff --git a/traja/__init__.py b/traja/__init__.py
index 5a3f5355..300dbeb0 100644
--- a/traja/__init__.py
+++ b/traja/__init__.py
@@ -1,12 +1,12 @@
+import logging
+
+from traja import dataset
+from traja import models
 from .accessor import TrajaAccessor
 from .frame import TrajaDataFrame, TrajaCollection
 from .parsers import read_file, from_df
 from .plotting import *
 from .trajectory import *
-from traja import models
-from traja import datasets
-
-import logging
 
 __author__ = "justinshenk"
 __version__ = "0.2.3"
diff --git a/traja/accessor.py b/traja/accessor.py
index 0d9be885..4e82e31e 100644
--- a/traja/accessor.py
+++ b/traja/accessor.py
@@ -50,7 +50,7 @@ def bounds(self):
         return (xlim, ylim)
 
     def night(self, begin: str = "19:00", end: str = "7:00"):
-        """Get nighttime datasets between `begin` and `end`.
+        """Get nighttime data between `begin` and `end`.
 
         Args:
           begin (str): (Default value = '19:00')
@@ -63,7 +63,7 @@ def night(self, begin: str = "19:00", end: str = "7:00"):
         return self.between(begin, end)
 
     def day(self, begin: str = "7:00", end: str = "19:00"):
-        """Get daytime datasets between `begin` and `end`.
+        """Get daytime data between `begin` and `end`.
 
         Args:
           begin (str): (Default value = '7:00')
@@ -141,14 +141,14 @@ def rediscretize_points(self, R, **kwargs):
         return traja.trajectory.rediscretize_points(self, _obj, R=R, **kwargs)
 
     def trip_grid(
-            self,
-            bins: Union[int, tuple] = 10,
-            log: bool = False,
-            spatial_units=None,
-            normalize: bool = False,
-            hist_only: bool = False,
-            plot: bool = True,
-            **kwargs,
+        self,
+        bins: Union[int, tuple] = 10,
+        log: bool = False,
+        spatial_units=None,
+        normalize: bool = False,
+        hist_only: bool = False,
+        plot: bool = True,
+        **kwargs,
     ):
         """Returns a 2D histogram of trip.
@@ -325,9 +325,9 @@ def get_derivatives(self) -> pd.DataFrame:
         return derivs
 
     def speed_intervals(
-            self,
-            faster_than: Union[float, int] = None,
-            slower_than: Union[float, int] = None,
+        self,
+        faster_than: Union[float, int] = None,
+        slower_than: Union[float, int] = None,
     ):
         """Returns ``TrajaDataFrame`` with speed time intervals.
diff --git a/traja/contrib/rdp.py b/traja/contrib/rdp.py
index e00c4179..66e6ed37 100644
--- a/traja/contrib/rdp.py
+++ b/traja/contrib/rdp.py
@@ -105,10 +105,10 @@ def _rdp_iter(M, start_index, last_index, epsilon, dist=pldist):
 
 def rdp_iter(
-        M: Union[list, np.ndarray],
-        epsilon: float,
-        dist: Callable = pldist,
-        return_mask: bool = False,
+    M: Union[list, np.ndarray],
+    epsilon: float,
+    dist: Callable = pldist,
+    return_mask: bool = False,
 ):
     """
     Simplifies a given array of points.
@@ -135,11 +135,11 @@ def rdp_iter( def rdp( - M: Union[list, np.ndarray], - epsilon: float = 0, - dist: Callable = pldist, - algo: str = "iter", - return_mask: bool = False, + M: Union[list, np.ndarray], + epsilon: float = 0, + dist: Callable = pldist, + algo: str = "iter", + return_mask: bool = False, ): """ Simplifies a given array of points using the Ramer-Douglas-Peucker diff --git a/traja/data/__init__.py b/traja/data/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/traja/data/loader.py b/traja/data/loader.py deleted file mode 100644 index 2c691cc3..00000000 --- a/traja/data/loader.py +++ /dev/null @@ -1,22 +0,0 @@ -from torch.utils.data import DataLoader - -from ..datasets.dataset import TrajectoryDataset, seq_collate - - -def data_loader(args, path): - dset = TrajectoryDataset( - path, - obs_len=args.obs_len, - pred_len=args.pred_len, - skip=args.skip, - delim=args.delim, - ) - - loader = DataLoader( - dset, - batch_size=args.batch_size, - shuffle=True, - num_workers=args.loader_num_workers, - collate_fn=seq_collate, - ) - return dset, loader diff --git a/traja/dataset/__init__.py b/traja/dataset/__init__.py new file mode 100644 index 00000000..686575fe --- /dev/null +++ b/traja/dataset/__init__.py @@ -0,0 +1,2 @@ +from . import example +from .dataset import TimeSeriesDataset, MultiModalDataLoader diff --git a/traja/dataset/dataset.py b/traja/dataset/dataset.py new file mode 100644 index 00000000..8639a0c8 --- /dev/null +++ b/traja/dataset/dataset.py @@ -0,0 +1,341 @@ +""" +Modified from https://github.com/agrimgupta92/sgan/blob/master/sgan/data/trajectories.py. + +This module contains: + +Classes: +1. Pytorch Time series dataset class instance +2. Weighted train and test dataset loader with respect to class distribution + +Helpers: +1. Class distribution in the dataset + +""" +import logging +import math +from collections import defaultdict + +import numpy as np +import pandas as pd +import sklearn +import torch +from sklearn.base import TransformerMixin +from sklearn.preprocessing import MinMaxScaler +from torch.utils.data import Dataset +from torch.utils.data.sampler import SubsetRandomSampler, WeightedRandomSampler + +from traja.dataset import generator +from traja.dataset.generator import get_indices_from_sequence_ids + +logger = logging.getLogger(__name__) + + +class TimeSeriesDataset(Dataset): + r"""Pytorch Dataset object + + Args: + Dataset (torch.utils.data.Dataset): Pyptorch dataset object + """ + + def __init__(self, data, target, category=None, parameters=None, scaler: TransformerMixin = None): + r""" + Args: + data (array): Data + target (array): Target + category (array): Category + parameters (array): Parameters + scaler (sklearn.base.TransformerMixin) + """ + + self.data = data + self.target = target + self.category = category + self.parameters = parameters + self.scaler = scaler + + def __getitem__(self, index): + x = self.data[index] + y = self.target[index] + z = self.category[index] if self.category else torch.zeros(1) + w = self.parameters[index] if self.parameters else torch.zeros(1) + + if self.scaler is not None: + x = torch.tensor(self.scaler.transform(x)) + y = torch.tensor(self.scaler.transform(y)) + return x, y, z, w + + def __len__(self): + return len(self.data) + + +class MultiModalDataLoader: + """ + MultiModalDataLoader wraps the following data preparation steps, + + 1. Data generator: Extract x and y time series and corresponding ID (category) in the dataset. 
This process splits the dataset into
+            i) Input samples of sequence length n_past
+            ii) Target samples of sequence length n_future
+            iii) The target category (ID) of each input/target pair
+    2. Data scaling: Scale the input and target data columns to the range (-1, 1) using MinMaxScaler; TODO: it would be better to scale the data separately for each ID (category)
+    3. Data shuffling: Shuffle the order of samples in the dataset without losing the input<->target<->category correspondence
+    4. Train/test split: Split the shuffled samples into train (data, target, category) and test (data, target, category) sets
+    5. Weighted random sampling: Weight each sample by its category count (category_sample_weight = 1 / num_category_samples); this keeps the model from overfitting to categories that appear frequently in the dataset
+    6. Create PyTorch Dataset instances
+    7. Return the train, test and validation data loader instances as a dictionary, given the dataset instances and batch size
+
+    Args:
+        df (pd.DataFrame): Dataset
+        batch_size (int): Number of samples per batch of data
+        n_past (int): Input sequence length. Number of time steps from the past.
+        n_future (int): Target sequence length. Number of time steps to the future.
+        num_workers (int): Number of CPU subprocesses used during data loading
+        train_split_ratio (float): Should be between 0.0 and 1.0 and represents the proportion of the dataset
+                                   (excluding the validation split) to include in the train split.
+        validation_split_ratio (float): Should be between 0.0 and 1.0 and represents the proportion of the dataset
+                                        to include in the validation split.
+        stride (int): Step size of the sliding window. Defaults to the sequence length (n_past + n_future).
+        split_by_id (bool): If True (default), split into train, test and validation sets by sequence ID, so a given animal's trajectory only appears in one split; if False, split each trajectory along the time axis instead.
+        scale (bool): If True, scale the input and target data to the range (-1, 1); the fitted scaler is stored on the dataset.
+        parameter_columns (list): Columns in the data frame with regression parameters.
+        weighted_sampling (bool): Whether to weight each sample's probability of being drawn by the length of its sequence.
+            This balances training when trajectories have different lengths.
+ + Usage: + ------ + dataloaders, scalers = MultiModalDataLoader(df = data_frame, batch_size=32, n_past = 20, n_future = 10, num_workers=4) + """ + + def __init__( + self, + df: pd.DataFrame, + batch_size: int, + n_past: int, + n_future: int, + num_workers: int, + train_split_ratio: float = 0.4, + validation_split_ratio: float = 0.2, + stride: int = None, + split_by_id: bool = True, + scale: bool = True, + test: bool = True, + parameter_columns: list = (), + weighted_sampling: bool = False, + ): + self.df = df + self.batch_size = batch_size + self.n_past = n_past + self.n_future = n_future + self.num_workers = num_workers + self.test = test + self.train_split_ratio = train_split_ratio + self.validation_split_ratio = validation_split_ratio + self.split_by_id = split_by_id + self.scale = scale + self.stride = stride + + # Train and test data from df-val_df + train_data, target_data, target_ids, target_parameters, samples_in_sequence_id = generator.generate_dataset( + self.df, self.n_past, + self.n_future, stride=self.stride, + parameter_columns=parameter_columns + ) + + if self.scale: + scaler = MinMaxScaler(feature_range=(-1, 1)) + scaler.fit(np.vstack(train_data + target_data)) + else: + scaler = None + + # Dataset + dataset = TimeSeriesDataset(train_data, target_data, target_ids, target_parameters, scaler=scaler) + + # We initialise sample weights in case we need them to weigh samples. + train_weights = defaultdict(float) + test_weights = defaultdict(float) + validation_weights = defaultdict(float) + + if self.split_by_id: + ids = list(set(target_ids)) + np.random.shuffle(ids) + + train_split_index = round(train_split_ratio * len(ids)) + validation_split_index = round((1 - validation_split_ratio) * len(ids)) + + train_ids = np.sort(ids[:train_split_index]) + test_ids = np.sort(ids[train_split_index:validation_split_index]) + validation_ids = np.sort(ids[validation_split_index:]) + + train_indices, train_weights = get_indices_from_sequence_ids(train_ids, samples_in_sequence_id) + test_indices, test_weights = get_indices_from_sequence_ids(test_ids, samples_in_sequence_id) + validation_indices, validation_weights = get_indices_from_sequence_ids(validation_ids, + samples_in_sequence_id) + + else: # Do not sample by sequence ID + if stride is None: + stride = n_past + n_future + + sequence_length = n_past + n_future + train_indices = list() + test_indices = list() + validation_indices = list() + id_start_index = 0 + for sequence_index, sequence_count in enumerate(samples_in_sequence_id): + overlap = math.ceil(sequence_length / stride) + + start_test_index = round(sequence_count * train_split_ratio) + end_train_index = start_test_index - overlap + + start_validation_index = round(sequence_count * (1 - validation_split_ratio)) + end_test_index = start_validation_index - overlap + + train_indices.extend(list(range(id_start_index, id_start_index + end_train_index))) + test_indices.extend(list(range(id_start_index + start_test_index, id_start_index + end_test_index))) + validation_indices.extend( + list(range(id_start_index + start_validation_index, id_start_index + sequence_count))) + + train_weights[sequence_index] = 1.0 / end_train_index if end_train_index > 0 else 0 + test_weights[sequence_index] = 1.0 / (end_test_index - start_test_index) if ( + end_test_index - start_test_index) > 0 else 0 + validation_weights[sequence_index] = 1.0 / (sequence_count - start_validation_index) if ( + sequence_count - start_validation_index) > 0 else 0 + + id_start_index += sequence_count + + 
sequential_train_dataset = torch.utils.data.Subset(dataset, np.sort(train_indices[:])) + sequential_test_dataset = torch.utils.data.Subset(dataset, np.sort(test_indices[:])) + sequential_validation_dataset = torch.utils.data.Subset(dataset, np.sort(validation_indices[:])) + + if weighted_sampling: + train_index_weights = list() + test_index_weights = list() + validation_index_weights = list() + + for data, target, sequence_id, parameters in sequential_train_dataset: + train_index_weights.append(train_weights[sequence_id]) + for data, target, sequence_id, parameters in sequential_test_dataset: + test_index_weights.append(test_weights[sequence_id]) + for data, target, sequence_id, parameters in sequential_validation_dataset: + validation_index_weights.append(validation_weights[sequence_id]) + + train_dataset = sequential_train_dataset + test_dataset = sequential_test_dataset + validation_dataset = sequential_validation_dataset + + train_sampler = WeightedRandomSampler(weights=train_index_weights, num_samples=len(train_index_weights), + replacement=True) + test_sampler = WeightedRandomSampler(weights=test_index_weights, num_samples=len(test_index_weights), + replacement=True) + validation_sampler = WeightedRandomSampler(weights=validation_index_weights, + num_samples=len(validation_index_weights), replacement=True) + + else: + train_dataset = dataset + test_dataset = dataset + validation_dataset = dataset + + np.random.shuffle(train_indices) + np.random.shuffle(test_indices) + np.random.shuffle(validation_indices) + + train_sampler = SubsetRandomSampler(train_indices) + test_sampler = SubsetRandomSampler(test_indices) + validation_sampler = SubsetRandomSampler(validation_indices) + + # Dataloader + self.train_loader = torch.utils.data.DataLoader( + dataset=train_dataset, + shuffle=False, + batch_size=self.batch_size, + sampler=train_sampler, + drop_last=True, + num_workers=num_workers, + ) + self.test_loader = torch.utils.data.DataLoader( + dataset=test_dataset, + shuffle=False, + batch_size=self.batch_size, + sampler=test_sampler, + drop_last=True, + num_workers=num_workers, + ) + self.validation_loader = torch.utils.data.DataLoader( + dataset=validation_dataset, + shuffle=False, + batch_size=self.batch_size, + sampler=validation_sampler, + drop_last=True, + num_workers=num_workers, + ) + self.sequential_loader = torch.utils.data.DataLoader( + dataset=dataset, + shuffle=False, + batch_size=self.batch_size, + drop_last=True, + num_workers=num_workers, + ) + self.sequential_train_loader = torch.utils.data.DataLoader( + dataset=sequential_train_dataset, + shuffle=False, + batch_size=self.batch_size, + drop_last=True, + num_workers=num_workers, + ) + self.sequential_test_loader = torch.utils.data.DataLoader( + dataset=sequential_test_dataset, + shuffle=False, + batch_size=self.batch_size, + drop_last=True, + num_workers=num_workers, + ) + self.sequential_validation_loader = torch.utils.data.DataLoader( + dataset=sequential_validation_dataset, + shuffle=False, + batch_size=self.batch_size, + drop_last=True, + num_workers=num_workers, + ) + + self.dataloaders = { + "train_loader": self.train_loader, + "test_loader": self.test_loader, + "validation_loader": self.validation_loader, + "sequential_loader": self.sequential_loader, + "sequential_train_loader": self.sequential_train_loader, + "sequential_test_loader": self.sequential_test_loader, + "sequential_validation_loader": self.sequential_validation_loader + } + + def __new__( + cls, + df: pd.DataFrame, + batch_size: int, + n_past: int, + 
n_future: int,
+            num_workers: int,
+            split_by_id: bool = True,
+            stride: int = None,
+            train_split_ratio: float = 0.4,
+            validation_split_ratio: float = 0.2,
+            scale: bool = True,
+            parameter_columns: list = list(),
+            weighted_sampling: bool = False,
+    ):
+        """Constructor of MultiModalDataLoader"""
+        # Loader instance
+        loader_instance = super(MultiModalDataLoader, cls).__new__(cls)
+        loader_instance.__init__(
+            df,
+            batch_size,
+            n_past,
+            n_future,
+            num_workers,
+            train_split_ratio=train_split_ratio,
+            validation_split_ratio=validation_split_ratio,
+            split_by_id=split_by_id,
+            stride=stride,
+            scale=scale,
+            parameter_columns=parameter_columns,
+            weighted_sampling=weighted_sampling,
+        )
+        # Return the dictionary of train, test and validation loaders
+        return loader_instance.dataloaders
diff --git a/traja/datasets/example.py b/traja/dataset/example.py
similarity index 95%
rename from traja/datasets/example.py
rename to traja/dataset/example.py
index dda48f0f..9003ba5d 100644
--- a/traja/datasets/example.py
+++ b/traja/dataset/example.py
@@ -1,6 +1,5 @@
 import pandas as pd
 
-
 default_cache_url = 'dataset_cache'
@@ -8,4 +7,4 @@ def jaguar(cache_url=default_cache_url):
     # Sample data
     data_url = "https://raw.githubusercontent.com/traja-team/traja-research/dataset_und_notebooks/dataset_analysis/jaguar5.csv"
     df = pd.read_csv(data_url, error_bad_lines=False)
-    return df
\ No newline at end of file
+    return df
diff --git a/traja/dataset/generator.py b/traja/dataset/generator.py
new file mode 100644
index 00000000..a41a0157
--- /dev/null
+++ b/traja/dataset/generator.py
@@ -0,0 +1,91 @@
+import logging
+from collections import defaultdict
+
+import numpy as np
+
+logger = logging.getLogger(__name__)
+
+
+def generate_dataset(df, n_past: int, n_future: int, stride: int = None, parameter_columns: list = list()):
+    """
+    df: DataFrame containing the trajectories, with an ID column
+    n_past: Number of past observations
+    n_future: Number of future observations
+    stride: Step size of the sliding window. Defaults to the sequence length (n_past + n_future)
+    Returns:
+    X: Past steps
+    Y: Future steps (Sequence target)
+    Z: Sequence ID"""
+
+    # Split the dataframe with respect to IDs
+    sequence_ids = dict(
+        tuple(df.groupby("ID"))
+    )  # Dict with IDs as keys and the corresponding (x, y, ID) rows as values
+
+    train_data, target_data, target_category, target_parameters = list(), list(), list(), list()
+
+    if stride is None:
+        stride = n_past + n_future
+
+    assert n_past >= 1, 'n_past has to be positive!'
+    assert n_future >= 1, 'n_future has to be positive!'
+    assert stride >= 1, 'Stride has to be positive!'
+ + samples_in_sequence_id = list() + + for ID in sequence_ids.keys(): + xx, yy, zz, ww = list(), list(), list(), list() + # Drop the column ids and convert the pandas into arrays + non_parameter_columns = [column for column in df.columns if column not in parameter_columns] + series = sequence_ids[ID].drop(columns=['ID'] + parameter_columns).to_numpy() + parameters = sequence_ids[ID].drop(columns=non_parameter_columns).to_numpy()[0, :] + window_start = 0 + sequences_in_category = 0 + while window_start <= len(series): + past_end = window_start + n_past + future_end = past_end + n_future + if not future_end >= len(series): + # slicing the past and future parts of the window + past, future = series[window_start:past_end, :], series[past_end:future_end, :] + # past, future = series[window_start:future_end, :], series[past_end:future_end, :] + xx.append(past) + yy.append(future) + # For each sequence length set target category + zz.append(int(ID), ) + ww.append(parameters) + sequences_in_category += 1 + window_start += stride + + train_data.extend(np.array(xx)) + target_data.extend(np.array(yy)) + target_category.extend(np.array(zz)) + target_parameters.extend(np.array(ww)) + samples_in_sequence_id.append(sequences_in_category) + return train_data, target_data, target_category, target_parameters, samples_in_sequence_id + + +def get_indices_from_sequence_ids(sequence_ids: list, samples_in_sequence_id: list): + indices = list() + + # We compute weights since it is cheap and they are used when weighing samples. + weights = defaultdict(float) + sequence_index = 0 + start_index = 0 + + for sequence_id in sequence_ids: + # We need to compute the start of each sequence's samples. To do this, we + # compute the start of all sequences' sample starts. start_index + # keeps track of where each sequence's samples start. + while sequence_index < len(samples_in_sequence_id) and sequence_index < sequence_id: + start_index += samples_in_sequence_id[sequence_index] + sequence_index += 1 + if sequence_index >= len(samples_in_sequence_id): + break + if sequence_index == sequence_id: + # The weight is simply one over the number of samples in this sequence. + # We can never divide by zero - empty categories are implicitly excluded + weights[sequence_id] = 1.0 / samples_in_sequence_id[sequence_id] + indices += list(range(start_index, start_index + samples_in_sequence_id[sequence_id])) + start_index += samples_in_sequence_id[sequence_index] + sequence_index += 1 + return indices, weights diff --git a/traja/datasets/__init__.py b/traja/datasets/__init__.py deleted file mode 100644 index f091d76d..00000000 --- a/traja/datasets/__init__.py +++ /dev/null @@ -1,86 +0,0 @@ -import subprocess -import glob -import os -from typing import List -import pandas as pd -from traja.datasets import dataset -from traja.datasets import example -import traja - - -def load_ped_datasets() -> List[str]: - """Returns paths after downloading pedestrian datasets.""" - if not os.path.exists("datasets"): - subprocess.call( - ["wget", "https://www.dropbox.com/s/8n02xqv3l9q18r1/datasets.zip"] - ) - subprocess.call(["unzip", "-q", "datasets.zip"]) - subprocess.call(["rm", "-rf", "datasets.zip"]) - else: - print("Directory 'datasets' exists, skipping download") - - return glob.glob(f"datasets/*/*") - - -def load_ped_data(dataset_name=None, aspaths=False) -> dict: - """Returns pedestrian (ETH, Zara1, Zara2, Univ, Hotel) datasets as dataframe or as paths. 
- - Args: - dataset_name: Optional(str) - returns specific dataset - eth - zara1 - zara2 - univ - hotel - aspaths: (bool) - Returns paths only - - Returns: - paths/dfs (dict) - train/val/test split for paths or dfs, depending on `aspaths` value - - - Paths are .txt files with format . - """ - paths = load_ped_datasets() - - if dataset_name: - # Get subset of data - paths = [path for path in paths if dataset_name in path] - - train_dir = [path for path in paths if "train" in path][0] - val_dir = [path for path in paths if "val" in path][0] - test_dir = [path for path in paths if "test" in path][0] - - train_paths = glob.glob(os.path.join(train_dir, "*.txt")) - val_paths = glob.glob(os.path.join(val_dir, "*.txt")) - test_paths = glob.glob(os.path.join(test_dir, "*.txt")) - - paths = {"train": train_paths, "val": val_paths, "test": test_paths} - if aspaths: - return paths - - col_names = ["frame_id", "ped_id", "x", "y"] - dfs = { - "train": [pd.read_csv(path, sep="\t", names=col_names) for path in train_paths], - "val": [pd.read_csv(path, sep="\t", names=col_names) for path in train_paths], - "test": [pd.read_csv(path, sep="\t", names=col_names) for path in train_paths], - } - return dfs - - -def load_geolife(folder: str, as_traja=True, lat=(32, 48.0), lon=(114, 120)): - """Read geolife data from folder. Default mask in UTM Zone 50 (Beijing)""" - import traja.datasets.geolife as geolife - - df = geolife.read_all_users(folder) - if as_traja: - # Convert lat/long to utm coordinates - if lat and lon: - geomask = ( - (df["lon"] > lon[0]) - & (df["lon"] < lon[1]) - & (df["lat"] > lat[0]) - & (df["lat"] < lat[1]) - ) - df = df[geomask] - df = traja.to_utm(df) - return df diff --git a/traja/datasets/dataset.py b/traja/datasets/dataset.py deleted file mode 100644 index bb65a42b..00000000 --- a/traja/datasets/dataset.py +++ /dev/null @@ -1,367 +0,0 @@ -""" -Modified from https://github.com/agrimgupta92/sgan/blob/master/sgan/data/trajectories.py. - -This module contains: - -Classes: -1. Pytorch Time series dataset class instance -2. Weighted train and test dataset loader with respect to class distribution - -Helpers: -1. 
Class distribution in the dataset - -""" -import logging -import os -import math -import numpy as np -import torch -from torch.utils.data import Dataset -from collections import Counter -from torch.utils.data.sampler import WeightedRandomSampler -import pandas as pd -from sklearn.utils import shuffle -from traja.datasets import utils - -logger = logging.getLogger(__name__) - - -def seq_collate(data): - ( - obs_seq_list, - pred_seq_list, - obs_seq_rel_list, - pred_seq_rel_list, - non_linear_ped_list, - loss_mask_list, - ) = zip(*data) - - _len = [len(seq) for seq in obs_seq_list] - cum_start_idx = [0] + np.cumsum(_len).tolist() - seq_start_end = [ - [start, end] for start, end in zip(cum_start_idx, cum_start_idx[1:]) - ] - - # Data format: batch, input_size, seq_len - # LSTM input format: seq_len, batch, input_size - obs_traj = torch.cat(obs_seq_list, dim=0).permute(2, 0, 1) - pred_traj = torch.cat(pred_seq_list, dim=0).permute(2, 0, 1) - obs_traj_rel = torch.cat(obs_seq_rel_list, dim=0).permute(2, 0, 1) - pred_traj_rel = torch.cat(pred_seq_rel_list, dim=0).permute(2, 0, 1) - non_linear_ped = torch.cat(non_linear_ped_list) - loss_mask = torch.cat(loss_mask_list, dim=0) - seq_start_end = torch.LongTensor(seq_start_end) - out = [ - obs_traj, - pred_traj, - obs_traj_rel, - pred_traj_rel, - non_linear_ped, - loss_mask, - seq_start_end, - ] - - return tuple(out) - - -def read_file(_path, delim="\t"): - data = [] - if delim == "tab": - delim = "\t" - elif delim == "space": - delim = " " - with open(_path, "r") as f: - for line in f: - line = line.strip().split(delim) - line = [float(i) for i in line] - data.append(line) - return np.asarray(data) - - -def poly_fit(traj, traj_len, threshold): - """ - Input: - - traj: Numpy array of shape (2, traj_len) - - traj_len: Len of trajectory - - threshold: Minimum error to be considered for non linear traj - Output: - - int: 1 -> Non Linear 0-> Linear - """ - t = np.linspace(0, traj_len - 1, traj_len) - res_x = np.polyfit(t, traj[0, -traj_len:], 2, full=True)[1] - res_y = np.polyfit(t, traj[1, -traj_len:], 2, full=True)[1] - if res_x + res_y >= threshold: - return 1.0 - else: - return 0.0 - - -class TrajectoryDataset(Dataset): - """Dataloader for the Trajectory datasets""" - - def __init__( - self, - data_dir, - obs_len=8, - pred_len=12, - skip=1, - threshold=0.002, - min_ped=1, - delim="\t", - ): - """ - Args: - - data_dir: Directory containing dataset files in the format - - - obs_len: Number of time-steps in input trajectories - - pred_len: Number of time-steps in output trajectories - - skip: Number of frames to skip while making the dataset - - threshold: Minimum error to be considered for non linear traj - when using a linear predictor - - min_ped: Minimum number of pedestrians that should be in a seqeunce - - delim: Delimiter in the dataset files - """ - super(TrajectoryDataset, self).__init__() - - self.data_dir = data_dir - self.obs_len = obs_len - self.pred_len = pred_len - self.skip = skip - self.seq_len = self.obs_len + self.pred_len - self.delim = delim - - all_files = os.listdir(self.data_dir) - all_files = [os.path.join(self.data_dir, _path) for _path in all_files] - num_peds_in_seq = [] - seq_list = [] - seq_list_rel = [] - loss_mask_list = [] - non_linear_ped = [] - for path in all_files: - data = read_file(path, delim) - frames = np.unique(data[:, 0]).tolist() - frame_data = [] - for frame in frames: - frame_data.append(data[frame == data[:, 0], :]) - num_sequences = int(math.ceil((len(frames) - self.seq_len + 1) / skip)) - - for idx in 
range(0, num_sequences * self.skip + 1, skip): - curr_seq_data = np.concatenate( - frame_data[idx : idx + self.seq_len], axis=0 - ) - peds_in_curr_seq = np.unique(curr_seq_data[:, 1]) - curr_seq_rel = np.zeros((len(peds_in_curr_seq), 2, self.seq_len)) - curr_seq = np.zeros((len(peds_in_curr_seq), 2, self.seq_len)) - curr_loss_mask = np.zeros((len(peds_in_curr_seq), self.seq_len)) - num_peds_considered = 0 - _non_linear_ped = [] - for _, ped_id in enumerate(peds_in_curr_seq): - curr_ped_seq = curr_seq_data[curr_seq_data[:, 1] == ped_id, :] - curr_ped_seq = np.around(curr_ped_seq, decimals=4) - pad_front = frames.index(curr_ped_seq[0, 0]) - idx - pad_end = frames.index(curr_ped_seq[-1, 0]) - idx + 1 - if pad_end - pad_front != self.seq_len: - continue - curr_ped_seq = np.transpose(curr_ped_seq[:, 2:]) - curr_ped_seq = curr_ped_seq - # Make coordinates relative - rel_curr_ped_seq = np.zeros(curr_ped_seq.shape) - rel_curr_ped_seq[:, 1:] = curr_ped_seq[:, 1:] - curr_ped_seq[:, :-1] - _idx = num_peds_considered - curr_seq[_idx, :, pad_front:pad_end] = curr_ped_seq - curr_seq_rel[_idx, :, pad_front:pad_end] = rel_curr_ped_seq - # Linear vs Non-Linear Trajectory - _non_linear_ped.append(poly_fit(curr_ped_seq, pred_len, threshold)) - curr_loss_mask[_idx, pad_front:pad_end] = 1 - num_peds_considered += 1 - - if num_peds_considered > min_ped: - non_linear_ped += _non_linear_ped - num_peds_in_seq.append(num_peds_considered) - loss_mask_list.append(curr_loss_mask[:num_peds_considered]) - seq_list.append(curr_seq[:num_peds_considered]) - seq_list_rel.append(curr_seq_rel[:num_peds_considered]) - - self.num_seq = len(seq_list) - seq_list = np.concatenate(seq_list, axis=0) - seq_list_rel = np.concatenate(seq_list_rel, axis=0) - loss_mask_list = np.concatenate(loss_mask_list, axis=0) - non_linear_ped = np.asarray(non_linear_ped) - - # Convert numpy -> Torch Tensor - self.obs_traj = torch.from_numpy(seq_list[:, :, : self.obs_len]).type( - torch.float - ) - self.pred_traj = torch.from_numpy(seq_list[:, :, self.obs_len :]).type( - torch.float - ) - self.obs_traj_rel = torch.from_numpy(seq_list_rel[:, :, : self.obs_len]).type( - torch.float - ) - self.pred_traj_rel = torch.from_numpy(seq_list_rel[:, :, self.obs_len :]).type( - torch.float - ) - self.loss_mask = torch.from_numpy(loss_mask_list).type(torch.float) - self.non_linear_ped = torch.from_numpy(non_linear_ped).type(torch.float) - cum_start_idx = [0] + np.cumsum(num_peds_in_seq).tolist() - self.seq_start_end = [ - (start, end) for start, end in zip(cum_start_idx, cum_start_idx[1:]) - ] - - def __len__(self): - return self.num_seq - - def __getitem__(self, index): - start, end = self.seq_start_end[index] - out = [ - self.obs_traj[start:end, :], - self.pred_traj[start:end, :], - self.obs_traj_rel[start:end, :], - self.pred_traj_rel[start:end, :], - self.non_linear_ped[start:end], - self.loss_mask[start:end, :], - ] - return out - - -class TimeSeriesDataset(Dataset): - r"""Pytorch Dataset object - - Args: - Dataset (torch.utils.data.Dataset): Pyptorch dataset object - """ - - def __init__(self, data, target, category=None, parameters=None): - r""" - Args: - data (array): Data - target (array): Target - category (array): Category - parameters (array): Parameters - """ - - self.data = data - self.target = target - self.category = category - self.parameters = parameters - - def __getitem__(self, index): - x = self.data[index] - y = self.target[index] - z = self.category[index] if self.category else torch.zeros(1) - w = self.parameters[index] if self.parameters 
else torch.zeros(1) - return x, y, z, w - - def __len__(self): - return len(self.data) - - -class MultiModalDataLoader: - """ - MultiModalDataLoader wraps the following data preparation steps, - - 1. Data generator: Extract x and y time series and corresponding ID (category) in the dataset. This process split the dataset into - i) Train samples with sequence length equals n_past - ii) Target samples with sequence length equals n_future - iii) Target category(ID) of both train and target data - 2. Data scalling: Scale the train and target data columns between the range (-1,1) using MinMaxScalers; TODO: It is more optimal to scale data for each ID(category) - 3. Data shuffling: Shuffle the order of samples in the dataset without loosing the train<->target<->category combination - 4. Create train test split: Split the shuffled batches into train (data, target, category) and test(data, target, category) - 5. Weighted Random sampling: Apply weights with respect to category counts in the dataset: category_sample_weight = 1/num_category_samples; This avoid model overfit to category appear often in the dataset - 6. Create pytorch Dataset instances - 7. Returns the train and test data loader instances along with their scalers as a dictionaries given the dataset instances and batch size - - Args: - df (pd.DataFrame): Dataset - batch_size (int): Number of samples per batch of data - n_past (int): Input sequence length. Number of time steps from the past. - n_future (int): Target sequence length. Number of time steps to the future. - num_workers (int): Number of cpu subprocess occupied during data loading process - - Usage: - ------ - dataloaders, scalers = MultiModalDataLoader(df = data_frame, batch_size=32, n_past = 20, n_future = 10, num_workers=4) - """ - - def __init__( - self, - df: pd.DataFrame, - batch_size: int, - n_past: int, - n_future: int, - num_workers: int, - ): - - # Extract/generate data from the pandas df - train_data, target_data, target_category = utils.generate_dataset( - df, n_past, n_future - ) - - # Shuffle and split the data - [train_x, train_y, train_z], [test_x, test_y, test_z] = utils.shuffle_split( - train_data, target_data, target_category, train_ratio=0.75 - ) - - # Scale data - (train_x, self.train_x_scaler), (train_y, self.train_y_scaler) = ( - utils.scale_data(train_x, sequence_length=n_past), - utils.scale_data(train_y, sequence_length=n_future), - ) - (test_x, self.test_x_scaler), (test_y, self.test_y_scaler) = ( - utils.scale_data(test_x, sequence_length=n_past), - utils.scale_data(test_y, sequence_length=n_future), - ) - - # Weighted Random Sampler - train_weighted_sampler, test_weighted_sampler = utils.weighted_random_samplers( - train_z, test_z - ) - - # Dataset - train_dataset = TimeSeriesDataset(train_x, train_y, train_z) - test_dataset = TimeSeriesDataset(test_x, test_y, test_z) - - # Dataloader with weighted samplers - self.train_loader = torch.utils.data.DataLoader( - dataset=train_dataset, - shuffle=False, - batch_size=batch_size, - sampler=train_weighted_sampler, - drop_last=True, - num_workers=num_workers, - ) - self.test_loader = torch.utils.data.DataLoader( - dataset=test_dataset, - shuffle=False, - batch_size=batch_size, - sampler=test_weighted_sampler, - drop_last=True, - num_workers=num_workers, - ) - - self.dataloaders = { - "train_loader": self.train_loader, - "test_loader": self.test_loader, - } - self.scalers = { - "train_data_scaler": self.train_x_scaler, - "train_target_scaler": self.train_y_scaler, - "test_data_scaler": self.test_x_scaler, - 
"test_target_scaler": self.test_y_scaler, - } - - def __new__( - cls, - df: pd.DataFrame, - batch_size: int, - n_past: int, - n_future: int, - num_workers: int, - ): - """Constructor of MultiModalDataLoader""" - # Loader instance - loader_instance = super(MultiModalDataLoader, cls).__new__(cls) - loader_instance.__init__(df, batch_size, n_past, n_future, num_workers) - # Return train and test loader attributes - return loader_instance.dataloaders, loader_instance.scalers - diff --git a/traja/datasets/utils.py b/traja/datasets/utils.py deleted file mode 100644 index 0beba662..00000000 --- a/traja/datasets/utils.py +++ /dev/null @@ -1,164 +0,0 @@ -import logging -import os -import math -import numpy as np -import torch -from torch import long -from torch.utils.data import Dataset -from collections import Counter -from torch.utils.data.sampler import WeightedRandomSampler -import pandas as pd -from sklearn.utils import shuffle -from sklearn.preprocessing import MinMaxScaler -import torch - -logger = logging.getLogger(__name__) - - -def get_class_distribution(targets): - """Compute class distribution, returns number of classes and their count in the targets - - Args: - targets ([type]): [description] - - Returns: - [type]: [description] - """ - targets_ = np.unique(targets, return_counts=True) - return targets_[0], targets_[1] - - -def generate_dataset(df, n_past, n_future): - """ - df : Dataframe - n_past: Number of past observations - n_future: Number of future observations - Returns: - X: Past steps - Y: Future steps (Sequence target) - Z: Sequence category""" - - # Split the dataframe with respect to IDs - series_ids = dict(tuple(df.groupby('ID'))) # Dict of ids as keys and x,y,id as values - train_data, target_data, target_category = list(), list(), list() - - for id in series_ids.keys(): - X, Y, Z = list(), list(), list() - # Drop the column ids and convert the pandas into arrays - series = series_ids[id].drop(columns=['ID']).to_numpy() - for window_start in range(len(series)): - past_end = window_start + n_past - future_end = past_end + n_future - if not future_end > len(series): - # slicing the past and future parts of the window - past, future = series[window_start:past_end, :], series[past_end:future_end, :] - X.append(past) - Y.append(future) - # For each sequence length set target category - Z.append(int(id)) - - train_data.extend(np.array(X)) - target_data.extend(np.array(Y)) - target_category.extend(np.array(Z)) - - return train_data, target_data, target_category - - -def shuffle_split(train_data: np.array, target_data: np.array, target_category: np.array, train_ratio: float): - """[summary] - - Args: - train_data (np.array): [description] - target_data (np.array): [description] - target_category (np.array): [description] - train_ratio (float): [description] - - Returns: - [type]: [description] - """ - - # Shuffle the IDs and the corresponding sequence , preserving the order - train_data, target_data, target_category = shuffle(train_data, target_data, target_category) - - assert train_ratio > 0, "Train data ratio should be greater than zero" - assert train_ratio <= 1.0, "Train data ratio should be less than or equal to 1 " - - # Train test split - split = int(train_ratio * len(train_data)) - - train_x = train_data[:split] - train_y = target_data[:split] - train_z = target_category[:split] - - test_x = train_data[split:] - test_y = target_data[split:] - test_z = target_category[split:] - - return [train_x, train_y, train_z], [test_x, test_y, test_z] - - -def scale_data(data, 
sequence_length): - """[summary] - - Args: - data ([type]): [description] - sequence_length ([type]): [description] - - Returns: - [type]: [description] - """ - assert len(data[0].shape) == 2 - scalers = {} - data = np.vstack(data) - - for i in range(data.shape[1]): - scaler = MinMaxScaler(feature_range=(-1, 1)) - s_s = scaler.fit_transform(data[:, i].reshape(-1, 1)) - s_s = np.reshape(s_s, len(s_s)) - scalers['scaler_' + str(i)] = scaler - data[:, i] = s_s - # Slice the data into batches - data = [data[i:i + sequence_length] for i in range(0, len(data), sequence_length)] - return data, scalers - - -def weighted_random_samplers(train_z, test_z): - """[summary] - - Args: - train_z ([type]): [description] - test_z ([type]): [description] - - Returns: - [type]: [description] - """ - - # Prepare weighted random sampler: - train_target_list = torch.tensor(train_z).type(torch.LongTensor) - test_target_list = torch.tensor(test_z).type(torch.LongTensor) - - # Number of classes and their frequencies - train_targets_, train_class_count = get_class_distribution(train_target_list) - test_targets_, test_class_count = get_class_distribution(test_target_list) - - # Compute class weights - train_class_weights = 1. / torch.tensor(train_class_count, dtype=torch.float) - test_class_weights = 1. / torch.tensor(test_class_count, dtype=torch.float) - - # Assign weights to original target list - train_class_weights_all = train_class_weights[train_target_list - 1] # Note the targets start from 1, to python idx - # to apply,-1 - test_class_weights_all = test_class_weights[test_target_list - 1] - - # Weighted samplers - train_weighted_sampler = WeightedRandomSampler( - weights=train_class_weights_all, - num_samples=len(train_class_weights_all), - replacement=True - ) - test_weighted_sampler = WeightedRandomSampler( - weights=test_class_weights_all, - num_samples=len(test_class_weights_all), - replacement=True - ) - return train_weighted_sampler, test_weighted_sampler diff --git a/traja/frame.py b/traja/frame.py index 3c189b52..3a29d4fe 100644 --- a/traja/frame.py +++ b/traja/frame.py @@ -1,6 +1,5 @@ -import copy import logging -from typing import Optional, List, Union, Tuple +from typing import Optional, Union, Tuple import numpy as np import pandas as pd @@ -163,10 +162,10 @@ class TrajaCollection(TrajaDataFrame): ] def __init__( - self, - trjs: Union[TrajaDataFrame, pd.DataFrame, dict], - id_col: Optional[str] = None, - **kwargs, + self, + trjs: Union[TrajaDataFrame, pd.DataFrame, dict], + id_col: Optional[str] = None, + **kwargs, ): """Initialize with trajectories with x, y, and time columns. @@ -243,10 +242,10 @@ def apply_all(self, method, **kwargs): class StaticObject(object): def __init__( - self, - x: Optional[float] = None, - y: Optional[float] = None, - bounding_box: Tuple[float] = None, + self, + x: Optional[float] = None, + y: Optional[float] = None, + bounding_box: Tuple[float] = None, ): ... 
pass diff --git a/traja/models/__init__.py b/traja/models/__init__.py index 6f89c5c8..49969c7e 100644 --- a/traja/models/__init__.py +++ b/traja/models/__init__.py @@ -1,8 +1,8 @@ -# from .nn import LSTM from traja.models.generative_models.vae import MultiModelVAE from traja.models.generative_models.vaegan import MultiModelVAEGAN from traja.models.predictive_models.ae import MultiModelAE from traja.models.predictive_models.irl import MultiModelIRL from traja.models.predictive_models.lstm import LSTM from .inference import * +from .train import HybridTrainer from .utils import TimeDistributed, read_hyperparameters, save, load diff --git a/traja/models/base_models/MLPClassifier.py b/traja/models/base_models/MLPClassifier.py new file mode 100644 index 00000000..cd0240c8 --- /dev/null +++ b/traja/models/base_models/MLPClassifier.py @@ -0,0 +1,51 @@ +import torch +from torch import nn + + +class MLPClassifier(torch.nn.Module): + """ MLP classifier: Classify the input data using the latent embeddings + input_size: The number of expected latent size + hidden_size: The number of features in the hidden state h + output_size: Size of labels or the number of sequence_ids in the data + dropout: If non-zero, introduces a Dropout layer on the outputs of each LSTM layer except the last layer, + with dropout probability equal to dropout + num_layers: Number of hidden layers in the classifier + """ + + def __init__( + self, + input_size: int, + hidden_size: int, + output_size: int, + num_layers: int, + dropout: float, + ): + super(MLPClassifier, self).__init__() + + self.input_size = input_size + self.hidden_size = hidden_size + self.num_classes = output_size + self.num_layers = num_layers + self.dropout = dropout + + # Classifier layers + layers = list() + + layers.append(nn.Linear(self.input_size, self.hidden_size)) + layers.append(nn.ReLU()) + torch.nn.Dropout(p=dropout) + + for layer in range(1, self.num_layers): + layers.append(nn.Linear(self.hidden_size, self.hidden_size)) + layers.append(nn.ReLU()) + torch.nn.Dropout(p=dropout) + + layers.append(nn.Linear(self.hidden_size, self.num_classes)) + + self.hidden = nn.Sequential(*layers) + self.sigmoid = nn.Sigmoid() + + def forward(self, x): + x = self.hidden(x) + output = self.sigmoid(x) + return output diff --git a/traja/models/base_models/MLPRegressor.py b/traja/models/base_models/MLPRegressor.py new file mode 100644 index 00000000..527df236 --- /dev/null +++ b/traja/models/base_models/MLPRegressor.py @@ -0,0 +1,49 @@ +import torch +from torch import nn + + +class MLPRegressor(torch.nn.Module): + """ MLP regressor: Regress the input data using the latent embeddings + input_size: The number of expected latent size + hidden_size: The number of features in the hidden state h + output_size: Size of labels or the number of sequence_ids in the data + dropout: If non-zero, introduces a Dropout layer on the outputs of each LSTM layer except the last layer, + with dropout probability equal to dropout + num_layers: Number of hidden layers in the classifier + """ + + def __init__( + self, + input_size: int, + hidden_size: int, + output_size: int, + num_layers: int, + dropout: float, + ): + super(MLPRegressor, self).__init__() + + self.input_size = input_size + self.hidden_size = hidden_size + self.output_size = output_size + self.num_layers = num_layers + self.dropout = dropout + + # Classifier layers + layers = list() + + layers.append(nn.Linear(self.input_size, self.hidden_size)) + layers.append(nn.ReLU()) + torch.nn.Dropout(p=dropout) + + for layer in 
range(1, self.num_layers): + layers.append(nn.Linear(self.hidden_size, self.hidden_size)) + layers.append(nn.ReLU()) + torch.nn.Dropout(p=dropout) + + layers.append(nn.Linear(self.hidden_size, self.output_size)) + + self.hidden = nn.Sequential(*layers) + + def forward(self, x): + output = self.hidden(x) + return output diff --git a/traja/models/experiment.py b/traja/models/experiment.py deleted file mode 100644 index 05f596a1..00000000 --- a/traja/models/experiment.py +++ /dev/null @@ -1,929 +0,0 @@ -#! /usr/local/env python3 -"""Pytorch visualization code modified from Chad Jensen's implementation -(https://discuss.pytorch.org/t/lstm-for-sequence-prediction/22021/3).""" -import logging - -import matplotlib.pyplot as plt -import numpy as np - -try: - import torch -except ImportError: - raise ImportError( - "Missing optional dependency 'pytorch'. Install it via pytorch.org" - ) -import torch.nn as nn -import torch.optim as optim -import os -import pandas as pd -from time import time -from datetime import datetime - -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - - -class Trainer: - def __init__(self, model, - train_loader, - test_loader, - epochs=200, - batch_size=60, - run_id=0, - logs_dir='logs', - device='cpu', - optimizer='None', - plot=True, - downsampling=None): - self.device = device - self.model = model - self.epochs = epochs - self.plot = plot - - self.train_loader = train_loader - self.test_loader = test_loader - - self.criterion = torch.nn.MSELoss() - print('Checking for optimizer for {}'.format(optimizer)) - if optimizer == "adam": - print('Using adam') - self.optimizer = optim.Adam(model.parameters()) - elif optimizer == "adam_lr": - print("Using adam with higher learning rate") - self.optimizer = optim.Adam(model.parameters(), lr=0.01) - elif optimizer == 'adam_lr2': - print('Using adam with to large learning rate') - self.optimizer = optim.Adam(model.parameters(), lr=0.0001) - elif optimizer == "SGD": - print('Using SGD') - self.optimizer = optim.SGD(model.parameters(), momentum=0.9, weight_decay=5e-4) - elif optimizer == "LRS": - print('Using LRS') - self.optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4) - self.lr_scheduler = optim.lr_scheduler.StepLR(self.optimizer, self.epochs // 3) - elif optimizer == "radam": - print('Using radam') - self.optimizer = RAdam(model.parameters()) - elif optimizer == "RMSprop": - print('Using RMSprop') - self.optimizer = optim.RMSprop(model.parameters()) - else: - raise ValueError('Unknown optimizer {}'.format(optimizer)) - self.opt_name = optimizer - save_dir = os.path.join(logs_dir, model.name, train_loader.name) - if not os.path.exists(save_dir): - os.makedirs(save_dir) - - self.savepath = os.path.join(save_dir, - f'{model.name}_bs{batch_size}_e{epochs}_dspl{downsampling}_id{run_id}.csv') - self.experiment_done = False - if os.path.exists(self.savepath): - trained_epochs = len(pd.read_csv(self.savepath, sep=';')) - - if trained_epochs >= epochs: - self.experiment_done = True - print( - f'Experiment Logs for the exact same experiment with identical run_id was detected, training will be skipped, consider using another run_id') - if os.path.exists((self.savepath.replace('.csv', '.pt'))): - self.model.load_state_dict(torch.load(self.savepath.replace('.csv', '.pt'))['model_state_dict']) - self.model = self.model.to(self.device) - - self.optimizer.load_state_dict(torch.load(self.savepath.replace('.csv', '.pt'))['optimizer']) - self.start_epoch = torch.load(self.savepath.replace('.csv', 
'.pt'))['epoch'] + 1 - else: - - self.start_epoch = 0 - self.model = self.model.to(self.device) - - def _infer_initial_epoch(self, savepath): - if not os.path.exists(savepath): - return 0 - else: - df = pd.read_csv(savepath, sep=';', index_col=0) - print(len(df) + 1) - return len(df) - - def train(self): - if self.experiment_done: - return - for epoch in range(self.start_epoch, self.epochs): - - print('Start training epoch', epoch) - print("{} Epoch {}, training loss: {}".format(datetime.now(), epoch, self.train_epoch())) - self.test(epoch=epoch) - if self.opt_name == "LRS": - print('LRS step') - self.lr_scheduler.step() - return self.savepath + '.csv' - - def train_epoch(self): - self.model.train() - total = 0 - running_loss = 0 - old_time = time() - for batch, data in enumerate(self.train_loader): - inputs, targets = data[0].to(self.device).float(), data[1].to(self.device).float() - self.optimizer.zero_grad() - outputs = self.model(inputs) - loss = self.criterion(outputs, targets) - loss.backward() - self.optimizer.step() - running_loss += loss.item() - - if batch % 10 == 0 and batch != 0: - print(batch, 'of', len(self.train_loader), 'processing time', time() - old_time, 'loss:', - running_loss / total) - old_time = time() - - # Increment number of batches - total += 1 - return running_loss / total - - def test(self, epoch, save=True): - self.model.eval() - total = 0 - test_loss = 0 - with torch.no_grad(): - for batch, data in enumerate(self.test_loader): - if batch % 10 == 0: - print('Processing eval batch', batch, 'of', len(self.test_loader)) - inputs, targets = data[0].to(self.device).float(), data[1].to(self.device).float() - outputs = self.model(inputs) - loss = self.criterion(outputs, targets) - total += 1 - test_loss += loss.item() - - if save: - torch.save({ - 'model_state_dict': self.model.state_dict(), - 'optimizer': self.optimizer.state_dict(), - 'epoch': epoch, - 'test_loss': test_loss / total - }, self.savepath.replace('.csv', '.pt')) - return test_loss / total - - -class LSTM(nn.Module): - """ Deep LSTM network. This implementation - returns output_size outputs. - - - Args: - input_size: The number of expected features in the input `x` - hidden_size: The number of features in the hidden state `h` - num_layers: Number of recurrent layers. E.g., setting ``num_layers=2`` - would mean stacking two LSTMs together to form a `stacked LSTM`, - with the second LSTM taking in outputs of the first LSTM and - computing the final results. Default: 1 - output_size: The number of output dimensions - dropout: If non-zero, introduces a `Dropout` layer on the outputs of each - LSTM layer except the last layer, with dropout probability equal to - :attr:`dropout`. Default: 0 - bidirectional: If ``True``, becomes a bidirectional LSTM. 
Default: ``False`` - """ - - name = "LSTM" - - def __init__(self, input_size: int, hidden_size: int, num_layers: int, - output_size: int, dropout: float, bidirectional: bool): - super(LSTM, self).__init__() - - self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, - num_layers=num_layers, dropout=dropout, - bidirectional=bidirectional, ) - - self.head = nn.Linear(hidden_size, output_size) - - def forward(self, x): - x, state = self.lstm(x) - # Use the last hidden state of last layer - x = state[0][-1] - x = self.head(x) - return x - - -class TrajectoryLSTM: - def __init__( - self, xy, nb_steps=10, epochs=1000, batch_size=1, criterion=nn.MSELoss() - ): - fig, ax = plt.subplots(2, 1) - self.fig = fig - self.ax = ax - assert xy.shape[1] is 2, f"xy should be an N x 2 array, but is {xy.shape}" - self.xy = xy - self.nb_steps = nb_steps - self.epochs = epochs - self.batch_size = batch_size - self.criterion = criterion - self.rnn = LSTM() - - def load_batch(self, batch_size=32): - t_1_b = np.zeros((self.nb_steps, self.batch_size, 2)) - t_b = np.zeros((self.nb_steps * self.batch_size, 2)) - - inds = np.random.randint(0, len(self.xy) - self.nb_steps, (self.batch_size)) - for i, ind in enumerate(inds): - t_1_b[:, i] = self.xy[ind: ind + self.nb_steps] - t_b[i * nb_steps: (i + 1) * self.nb_steps] = self.xy[ - ind + 1: ind + nb_steps + 1 - ] - return torch.from_numpy(t_1_b).float(), torch.from_numpy(t_b).float() - - def train(self): - self.mean_loss = 0.0 - for epoch in range(1, self.epochs + 1): - t_1_b, t_b = self.load_batch(self.batch_size) - - def closure(): - global loss - optimizer.zero_grad() - pred = self.rnn(t_1_b) - shaped_pred = pred.reshape(-1, 2) - loss = self.criterion(abs(shaped_pred), abs(t_b)) - loss.backward() - - return loss - - optimizer = optim.Adam(self.rnn.parameters(), 1e-3) - optimizer.step(closure) - self.mean_loss += loss.item() - - if epoch % 100 == 0: - print("Epoch: {} | Loss: {:.6f}".format(epoch, self.mean_loss)) - self.mean_loss = 0 - - def savefig(self, filepath): - self.fig.savefig(filepath) - - def _plot(self): - t_1_b, t_b = self.load_batch(1) - pred = self.rnn(t_1_b).detach().numpy().reshape(-1, 2) - - real = t_1_b.numpy().reshape(-1, 2) - x, y = self.xy.T - self.ax[0].plot(x, y, label="Real") - self.ax[0].plot(real[:, 0], real[:, 1], label="Real batch") - self.ax[0].plot(pred[:, 0], pred[:, 1], label="Pred") - - self.ax[1].scatter(real[:, 0], real[:, 1], label="Real") - self.ax[1].scatter(pred[:, 0], pred[:, 1], label="Pred") - - for a in self.ax: - a.legend() - - def plot(self, interactive=True): - if interactive and (plt.get_backend() == "agg"): - logging.ERROR("Not able to use interactive plotting in mpl `agg` mode.") - # interactive = False - elif interactive: - while True: - for a in self.ax: - a.clear() - self._plot() - plt.pause(1) - plt.show(block=False) - else: - self._plot() - return self.fig - - -def make_mlp(dim_list, activation="relu", batch_norm=True, dropout=0): - layers = [] - for dim_in, dim_out in zip(dim_list[:-1], dim_list[1:]): - layers.append(nn.Linear(dim_in, dim_out)) - if batch_norm: - layers.append(nn.BatchNorm1d(dim_out)) - if activation == "relu": - layers.append(nn.ReLU()) - elif activation == "leakyrelu": - layers.append(nn.LeakyReLU()) - if dropout > 0: - layers.append(nn.Dropout(p=dropout)) - return nn.Sequential(*layers) - - -def get_noise(shape, noise_type): - if noise_type == "gaussian": - return torch.randn(*shape).cuda() - elif noise_type == "uniform": - return torch.rand(*shape).sub_(0.5).mul_(2.0).cuda() - raise 
ValueError('Unrecognized noise type "%s"' % noise_type) - - -class Encoder(nn.Module): - """Encoder is part of both TrajectoryGenerator and - TrajectoryDiscriminator""" - - def __init__( - self, embedding_dim=64, h_dim=64, mlp_dim=1024, num_layers=1, dropout=0.0 - ): - super(Encoder, self).__init__() - - self.mlp_dim = 1024 - self.h_dim = h_dim - self.embedding_dim = embedding_dim - self.num_layers = num_layers - - self.encoder = nn.LSTM(embedding_dim, h_dim, num_layers, dropout=dropout) - - self.spatial_embedding = nn.Linear(2, embedding_dim) - - def init_hidden(self, batch): - return ( - torch.zeros(self.num_layers, batch, self.h_dim).cuda(), - torch.zeros(self.num_layers, batch, self.h_dim).cuda(), - ) - - def forward(self, obs_traj): - """ - Inputs: - - obs_traj: Tensor of shape (obs_len, batch, 2) - Output: - - final_h: Tensor of shape (self.num_layers, batch, self.h_dim) - """ - # Encode observed Trajectory - batch = obs_traj.size(1) - obs_traj_embedding = self.spatial_embedding(obs_traj.view(-1, 2)) - obs_traj_embedding = obs_traj_embedding.view(-1, batch, self.embedding_dim) - state_tuple = self.init_hidden(batch) - output, state = self.encoder(obs_traj_embedding, state_tuple) - final_h = state[0] - return final_h - - -class Decoder(nn.Module): - """Decoder is part of TrajectoryGenerator""" - - def __init__( - self, - seq_len, - embedding_dim=64, - h_dim=128, - mlp_dim=1024, - num_layers=1, - pool_every_timestep=True, - dropout=0.0, - bottleneck_dim=1024, - activation="relu", - batch_norm=True, - pooling_type="pool_net", - neighborhood_size=2.0, - grid_size=8, - ): - super(Decoder, self).__init__() - - self.seq_len = seq_len - self.mlp_dim = mlp_dim - self.h_dim = h_dim - self.embedding_dim = embedding_dim - self.pool_every_timestep = pool_every_timestep - - self.decoder = nn.LSTM(embedding_dim, h_dim, num_layers, dropout=dropout) - - if pool_every_timestep: - if pooling_type == "pool_net": - self.pool_net = PoolHiddenNet( - embedding_dim=self.embedding_dim, - h_dim=self.h_dim, - mlp_dim=mlp_dim, - bottleneck_dim=bottleneck_dim, - activation=activation, - batch_norm=batch_norm, - dropout=dropout, - ) - elif pooling_type == "spool": - self.pool_net = SocialPooling( - h_dim=self.h_dim, - activation=activation, - batch_norm=batch_norm, - dropout=dropout, - neighborhood_size=neighborhood_size, - grid_size=grid_size, - ) - - mlp_dims = [h_dim + bottleneck_dim, mlp_dim, h_dim] - self.mlp = make_mlp( - mlp_dims, activation=activation, batch_norm=batch_norm, dropout=dropout - ) - - self.spatial_embedding = nn.Linear(2, embedding_dim) - self.hidden2pos = nn.Linear(h_dim, 2) - - def forward(self, last_pos, last_pos_rel, state_tuple, seq_start_end): - """ - Inputs: - - last_pos: Tensor of shape (batch, 2) - - last_pos_rel: Tensor of shape (batch, 2) - - state_tuple: (hh, ch) each tensor of shape (num_layers, batch, h_dim) - - seq_start_end: A list of tuples which delimit sequences within batch - Output: - - pred_traj: tensor of shape (self.seq_len, batch, 2) - """ - batch = last_pos.size(0) - pred_traj_fake_rel = [] - decoder_input = self.spatial_embedding(last_pos_rel) - decoder_input = decoder_input.view(1, batch, self.embedding_dim) - - for _ in range(self.seq_len): - output, state_tuple = self.decoder(decoder_input, state_tuple) - rel_pos = self.hidden2pos(output.view(-1, self.h_dim)) - curr_pos = rel_pos + last_pos - - if self.pool_every_timestep: - decoder_h = state_tuple[0] - pool_h = self.pool_net(decoder_h, seq_start_end, curr_pos) - decoder_h = torch.cat([decoder_h.view(-1, 
self.h_dim), pool_h], dim=1) - decoder_h = self.mlp(decoder_h) - decoder_h = torch.unsqueeze(decoder_h, 0) - state_tuple = (decoder_h, state_tuple[1]) - - embedding_input = rel_pos - - decoder_input = self.spatial_embedding(embedding_input) - decoder_input = decoder_input.view(1, batch, self.embedding_dim) - pred_traj_fake_rel.append(rel_pos.view(batch, -1)) - last_pos = curr_pos - - pred_traj_fake_rel = torch.stack(pred_traj_fake_rel, dim=0) - return pred_traj_fake_rel, state_tuple[0] - - -class PoolHiddenNet(nn.Module): - """Pooling module as proposed in our paper""" - - def __init__( - self, - embedding_dim=64, - h_dim=64, - mlp_dim=1024, - bottleneck_dim=1024, - activation="relu", - batch_norm=True, - dropout=0.0, - ): - super(PoolHiddenNet, self).__init__() - - self.mlp_dim = 1024 - self.h_dim = h_dim - self.bottleneck_dim = bottleneck_dim - self.embedding_dim = embedding_dim - - mlp_pre_dim = embedding_dim + h_dim - mlp_pre_pool_dims = [mlp_pre_dim, 512, bottleneck_dim] - - self.spatial_embedding = nn.Linear(2, embedding_dim) - self.mlp_pre_pool = make_mlp( - mlp_pre_pool_dims, - activation=activation, - batch_norm=batch_norm, - dropout=dropout, - ) - - def repeat(self, tensor, num_reps): - """ - Inputs: - -tensor: 2D tensor of any shape - -num_reps: Number of times to repeat each row - Outpus: - -repeat_tensor: Repeat each row such that: R1, R1, R2, R2 - """ - col_len = tensor.size(1) - tensor = tensor.unsqueeze(dim=1).repeat(1, num_reps, 1) - tensor = tensor.view(-1, col_len) - return tensor - - def forward(self, h_states, seq_start_end, end_pos): - """ - Inputs: - - h_states: Tensor of shape (num_layers, batch, h_dim) - - seq_start_end: A list of tuples which delimit sequences within batch - - end_pos: Tensor of shape (batch, 2) - Output: - - pool_h: Tensor of shape (batch, bottleneck_dim) - """ - pool_h = [] - for _, (start, end) in enumerate(seq_start_end): - start = start.item() - end = end.item() - num_ped = end - start - curr_hidden = h_states.view(-1, self.h_dim)[start:end] - curr_end_pos = end_pos[start:end] - # Repeat -> H1, H2, H1, H2 - curr_hidden_1 = curr_hidden.repeat(num_ped, 1) - # Repeat position -> P1, P2, P1, P2 - curr_end_pos_1 = curr_end_pos.repeat(num_ped, 1) - # Repeat position -> P1, P1, P2, P2 - curr_end_pos_2 = self.repeat(curr_end_pos, num_ped) - curr_rel_pos = curr_end_pos_1 - curr_end_pos_2 - curr_rel_embedding = self.spatial_embedding(curr_rel_pos) - mlp_h_input = torch.cat([curr_rel_embedding, curr_hidden_1], dim=1) - curr_pool_h = self.mlp_pre_pool(mlp_h_input) - curr_pool_h = curr_pool_h.view(num_ped, num_ped, -1).max(1)[0] - pool_h.append(curr_pool_h) - pool_h = torch.cat(pool_h, dim=0) - return pool_h - - -class SocialPooling(nn.Module): - """Current state of the art pooling mechanism: - http://cvgl.stanford.edu/papers/CVPR16_Social_LSTM.pdf""" - - def __init__( - self, - h_dim=64, - activation="relu", - batch_norm=True, - dropout=0.0, - neighborhood_size=2.0, - grid_size=8, - pool_dim=None, - ): - super(SocialPooling, self).__init__() - self.h_dim = h_dim - self.grid_size = grid_size - self.neighborhood_size = neighborhood_size - if pool_dim: - mlp_pool_dims = [grid_size * grid_size * h_dim, pool_dim] - else: - mlp_pool_dims = [grid_size * grid_size * h_dim, h_dim] - - self.mlp_pool = make_mlp( - mlp_pool_dims, activation=activation, batch_norm=batch_norm, dropout=dropout - ) - - def get_bounds(self, ped_pos): - top_left_x = ped_pos[:, 0] - self.neighborhood_size / 2 - top_left_y = ped_pos[:, 1] + self.neighborhood_size / 2 - bottom_right_x = 
ped_pos[:, 0] + self.neighborhood_size / 2 - bottom_right_y = ped_pos[:, 1] - self.neighborhood_size / 2 - top_left = torch.stack([top_left_x, top_left_y], dim=1) - bottom_right = torch.stack([bottom_right_x, bottom_right_y], dim=1) - return top_left, bottom_right - - def get_grid_locations(self, top_left, other_pos): - cell_x = torch.floor( - ((other_pos[:, 0] - top_left[:, 0]) / self.neighborhood_size) - * self.grid_size - ) - cell_y = torch.floor( - ((top_left[:, 1] - other_pos[:, 1]) / self.neighborhood_size) - * self.grid_size - ) - grid_pos = cell_x + cell_y * self.grid_size - return grid_pos - - def repeat(self, tensor, num_reps): - """ - Inputs: - -tensor: 2D tensor of any shape - -num_reps: Number of times to repeat each row - Outpus: - -repeat_tensor: Repeat each row such that: R1, R1, R2, R2 - """ - col_len = tensor.size(1) - tensor = tensor.unsqueeze(dim=1).repeat(1, num_reps, 1) - tensor = tensor.view(-1, col_len) - return tensor - - def forward(self, h_states, seq_start_end, end_pos): - """ - Inputs: - - h_states: Tesnsor of shape (num_layers, batch, h_dim) - - seq_start_end: A list of tuples which delimit sequences within batch. - - end_pos: Absolute end position of obs_traj (batch, 2) - Output: - - pool_h: Tensor of shape (batch, h_dim) - """ - pool_h = [] - for _, (start, end) in enumerate(seq_start_end): - start = start.item() - end = end.item() - num_ped = end - start - grid_size = self.grid_size * self.grid_size - curr_hidden = h_states.view(-1, self.h_dim)[start:end] - curr_hidden_repeat = curr_hidden.repeat(num_ped, 1) - curr_end_pos = end_pos[start:end] - curr_pool_h_size = (num_ped * grid_size) + 1 - curr_pool_h = curr_hidden.new_zeros((curr_pool_h_size, self.h_dim)) - # curr_end_pos = curr_end_pos.data - top_left, bottom_right = self.get_bounds(curr_end_pos) - - # Repeat position -> P1, P2, P1, P2 - curr_end_pos = curr_end_pos.repeat(num_ped, 1) - # Repeat bounds -> B1, B1, B2, B2 - top_left = self.repeat(top_left, num_ped) - bottom_right = self.repeat(bottom_right, num_ped) - - grid_pos = self.get_grid_locations(top_left, curr_end_pos).type_as( - seq_start_end - ) - # Make all positions to exclude as non-zero - # Find which peds to exclude - x_bound = (curr_end_pos[:, 0] >= bottom_right[:, 0]) + ( - curr_end_pos[:, 0] <= top_left[:, 0] - ) - y_bound = (curr_end_pos[:, 1] >= top_left[:, 1]) + ( - curr_end_pos[:, 1] <= bottom_right[:, 1] - ) - - within_bound = x_bound + y_bound - within_bound[0:: num_ped + 1] = 1 # Don't include the ped itself - within_bound = within_bound.view(-1) - - # This is a tricky way to get scatter add to work. Helps me avoid a - # for loop. Offset everything by 1. Use the initial 0 position to - # dump all uncessary adds. 
- grid_pos += 1 - total_grid_size = self.grid_size * self.grid_size - offset = torch.arange( - 0, total_grid_size * num_ped, total_grid_size - ).type_as(seq_start_end) - - offset = self.repeat(offset.view(-1, 1), num_ped).view(-1) - grid_pos += offset - grid_pos[within_bound != 0] = 0 - grid_pos = grid_pos.view(-1, 1).expand_as(curr_hidden_repeat) - - curr_pool_h = curr_pool_h.scatter_add(0, grid_pos, curr_hidden_repeat) - curr_pool_h = curr_pool_h[1:] - pool_h.append(curr_pool_h.view(num_ped, -1)) - - pool_h = torch.cat(pool_h, dim=0) - pool_h = self.mlp_pool(pool_h) - return pool_h - - -class TrajectoryGenerator(nn.Module): - """Modified from @agrimgupta92's https://github.com/agrimgupta92/sgan/blob/master/sgan/models.py.""" - - def __init__( - self, - obs_len, - pred_len, - embedding_dim=64, - encoder_h_dim=64, - decoder_h_dim=128, - mlp_dim=1024, - num_layers=1, - noise_dim=(0,), - noise_type="gaussian", - noise_mix_type="ped", - pooling_type=None, - pool_every_timestep=True, - dropout=0.0, - bottleneck_dim=1024, - activation="relu", - batch_norm=True, - neighborhood_size=2.0, - grid_size=8, - ): - super(TrajectoryGenerator, self).__init__() - - if pooling_type and pooling_type.lower() == "none": - pooling_type = None - - self.obs_len = obs_len - self.pred_len = pred_len - self.mlp_dim = mlp_dim - self.encoder_h_dim = encoder_h_dim - self.decoder_h_dim = decoder_h_dim - self.embedding_dim = embedding_dim - self.noise_dim = noise_dim - self.num_layers = num_layers - self.noise_type = noise_type - self.noise_mix_type = noise_mix_type - self.pooling_type = pooling_type - self.noise_first_dim = 0 - self.pool_every_timestep = pool_every_timestep - self.bottleneck_dim = 1024 - - self.encoder = Encoder( - embedding_dim=embedding_dim, - h_dim=encoder_h_dim, - mlp_dim=mlp_dim, - num_layers=num_layers, - dropout=dropout, - ) - - self.decoder = Decoder( - pred_len, - embedding_dim=embedding_dim, - h_dim=decoder_h_dim, - mlp_dim=mlp_dim, - num_layers=num_layers, - pool_every_timestep=pool_every_timestep, - dropout=dropout, - bottleneck_dim=bottleneck_dim, - activation=activation, - batch_norm=batch_norm, - pooling_type=pooling_type, - grid_size=grid_size, - neighborhood_size=neighborhood_size, - ) - - if pooling_type == "pool_net": - self.pool_net = PoolHiddenNet( - embedding_dim=self.embedding_dim, - h_dim=encoder_h_dim, - mlp_dim=mlp_dim, - bottleneck_dim=bottleneck_dim, - activation=activation, - batch_norm=batch_norm, - ) - elif pooling_type == "spool": - self.pool_net = SocialPooling( - h_dim=encoder_h_dim, - activation=activation, - batch_norm=batch_norm, - dropout=dropout, - neighborhood_size=neighborhood_size, - grid_size=grid_size, - ) - - if self.noise_dim[0] == 0: - self.noise_dim = None - else: - self.noise_first_dim = noise_dim[0] - - # Decoder Hidden - if pooling_type: - input_dim = encoder_h_dim + bottleneck_dim - else: - input_dim = encoder_h_dim - - if self.mlp_decoder_needed(): - mlp_decoder_context_dims = [ - input_dim, - mlp_dim, - decoder_h_dim - self.noise_first_dim, - ] - - self.mlp_decoder_context = make_mlp( - mlp_decoder_context_dims, - activation=activation, - batch_norm=batch_norm, - dropout=dropout, - ) - - def add_noise(self, _input, seq_start_end, user_noise=None): - """ - Inputs: - - _input: Tensor of shape (_, decoder_h_dim - noise_first_dim) - - seq_start_end: A list of tuples which delimit sequences within batch. - - user_noise: Generally used for inference when you want to see - relation between different types of noise and outputs. 
- Outputs: - - decoder_h: Tensor of shape (_, decoder_h_dim) - """ - if not self.noise_dim: - return _input - - if self.noise_mix_type == "global": - noise_shape = (seq_start_end.size(0),) + self.noise_dim - else: - noise_shape = (_input.size(0),) + self.noise_dim - - if user_noise is not None: - z_decoder = user_noise - else: - z_decoder = get_noise(noise_shape, self.noise_type) - - if self.noise_mix_type == "global": - _list = [] - for idx, (start, end) in enumerate(seq_start_end): - start = start.item() - end = end.item() - _vec = z_decoder[idx].view(1, -1) - _to_cat = _vec.repeat(end - start, 1) - _list.append(torch.cat([_input[start:end], _to_cat], dim=1)) - decoder_h = torch.cat(_list, dim=0) - return decoder_h - - decoder_h = torch.cat([_input, z_decoder], dim=1) - - return decoder_h - - def mlp_decoder_needed(self): - if ( - self.noise_dim - or self.pooling_type - or self.encoder_h_dim != self.decoder_h_dim - ): - return True - else: - return False - - def forward(self, obs_traj, obs_traj_rel, seq_start_end, user_noise=None): - """ - Inputs: - - obs_traj: Tensor of shape (obs_len, batch, 2) - - obs_traj_rel: Tensor of shape (obs_len, batch, 2) - - seq_start_end: A list of tuples which delimit sequences within batch. - - user_noise: Generally used for inference when you want to see - relation between different types of noise and outputs. - Output: - - pred_traj_rel: Tensor of shape (self.pred_len, batch, 2) - """ - batch = obs_traj_rel.size(1) - # Encode seq - final_encoder_h = self.encoder(obs_traj_rel) - # Pool States - if self.pooling_type: - end_pos = obs_traj[-1, :, :] - pool_h = self.pool_net(final_encoder_h, seq_start_end, end_pos) - # Construct input hidden states for decoder - mlp_decoder_context_input = torch.cat( - [final_encoder_h.view(-1, self.encoder_h_dim), pool_h], dim=1 - ) - else: - mlp_decoder_context_input = final_encoder_h.view(-1, self.encoder_h_dim) - - # Add Noise - if self.mlp_decoder_needed(): - noise_input = self.mlp_decoder_context(mlp_decoder_context_input) - else: - noise_input = mlp_decoder_context_input - decoder_h = self.add_noise(noise_input, seq_start_end, user_noise=user_noise) - decoder_h = torch.unsqueeze(decoder_h, 0) - - decoder_c = torch.zeros(self.num_layers, batch, self.decoder_h_dim).cuda() - - state_tuple = (decoder_h, decoder_c) - last_pos = obs_traj[-1] - last_pos_rel = obs_traj_rel[-1] - # Predict Trajectory - - decoder_out = self.decoder(last_pos, last_pos_rel, state_tuple, seq_start_end) - pred_traj_fake_rel, final_decoder_h = decoder_out - - return pred_traj_fake_rel - - -class TrajectoryDiscriminator(nn.Module): - def __init__( - self, - obs_len, - pred_len, - embedding_dim=64, - h_dim=64, - mlp_dim=1024, - num_layers=1, - activation="relu", - batch_norm=True, - dropout=0.0, - d_type="local", - ): - super(TrajectoryDiscriminator, self).__init__() - - self.obs_len = obs_len - self.pred_len = pred_len - self.seq_len = obs_len + pred_len - self.mlp_dim = mlp_dim - self.h_dim = h_dim - self.d_type = d_type - - self.encoder = Encoder( - embedding_dim=embedding_dim, - h_dim=h_dim, - mlp_dim=mlp_dim, - num_layers=num_layers, - dropout=dropout, - ) - - real_classifier_dims = [h_dim, mlp_dim, 1] - self.real_classifier = make_mlp( - real_classifier_dims, - activation=activation, - batch_norm=batch_norm, - dropout=dropout, - ) - if d_type == "global": - mlp_pool_dims = [h_dim + embedding_dim, mlp_dim, h_dim] - self.pool_net = PoolHiddenNet( - embedding_dim=embedding_dim, - h_dim=h_dim, - mlp_dim=mlp_pool_dims, - bottleneck_dim=h_dim, - 
activation=activation, - batch_norm=batch_norm, - ) - - def forward(self, traj, traj_rel, seq_start_end=None): - """ - Inputs: - - traj: Tensor of shape (obs_len + pred_len, batch, 2) - - traj_rel: Tensor of shape (obs_len + pred_len, batch, 2) - - seq_start_end: A list of tuples which delimit sequences within batch - Output: - - scores: Tensor of shape (batch,) with real/fake scores - """ - final_h = self.encoder(traj_rel) - # Note: In case of 'global' option we are using start_pos as opposed to - # end_pos. The intution being that hidden state has the whole - # trajectory and relative postion at the start when combined with - # trajectory information should help in discriminative behavior. - if self.d_type == "local": - classifier_input = final_h.squeeze() - else: - classifier_input = self.pool_net(final_h.squeeze(), seq_start_end, traj[0]) - scores = self.real_classifier(classifier_input) - return scores diff --git a/traja/models/generative_models/vae.py b/traja/models/generative_models/vae.py index 70027b00..8c0add07 100644 --- a/traja/models/generative_models/vae.py +++ b/traja/models/generative_models/vae.py @@ -1,31 +1,11 @@ """ This module implement the Variational Autoencoder model for both forecasting and classification of time series data. - -```USAGE``` to train AE model: -trainer = Trainer(model_type='vae', - device=device, - input_size=input_size, - output_size=output_size, - lstm_hidden_size=lstm_hidden_size, - lstm_num_layers=lstm_num_layers, - reset_state=True, - num_classes=num_classes, - latent_size=latent_size, - dropout=0.1, - num_layers=num_layers, - epochs=epochs, - batch_size=batch_size, - num_future=num_future, - num_past=num_past, - bidirectional =False, - batch_first =True, - loss_type = 'huber') - -trainer.train_latent_model(train_dataloader, test_dataloader, model_save_path=PATH)""" +""" import torch -from torch import nn +from traja.models.base_models.MLPClassifier import MLPClassifier +from traja.models.base_models.MLPRegressor import MLPRegressor from traja.models.utils import TimeDistributed device = "cuda" if torch.cuda.is_available() else "cpu" @@ -210,51 +190,6 @@ def forward(self, x, num_future=None): return output -class MLPClassifier(torch.nn.Module): - """ MLP classifier: Classify the input data using the latent embeddings - input_size: The number of expected latent size - hidden_size: The number of features in the hidden state h - num_classes: Size of labels or the number of categories in the data - dropout: If non-zero, introduces a Dropout layer on the outputs of each LSTM layer except the last layer, - with dropout probability equal to dropout - num_classifier_layers: Number of hidden layers in the classifier - """ - - def __init__( - self, - input_size: int, - hidden_size: int, - num_classes: int, - latent_size: int, - num_classifier_layers: int, - dropout: float, - ): - super(MLPClassifier, self).__init__() - - self.input_size = input_size - self.hidden_size = hidden_size - self.num_classes = num_classes - self.num_classifier_layers = num_classifier_layers - self.dropout = dropout - - # Classifier layers - self.hidden = nn.ModuleList([nn.Linear(self.input_size, self.hidden_size)]) - self.hidden.extend( - [ - nn.Linear(self.hidden_size, self.hidden_size) - for _ in range(1, self.num_classifier_layers - 1) - ] - ) - self.hidden = nn.Sequential(*self.hidden) - self.out = nn.Linear(self.hidden_size, self.num_classes) - self.dropout = torch.nn.Dropout(p=dropout) - - def forward(self, x): - x = self.dropout(self.hidden(x)) - out = self.out(x) - 
return out - - class MultiModelVAE(torch.nn.Module): """Implementation of Multimodel Variational autoencoders; This Module wraps the Variational Autoencoder models [Encoder,Latent[Sampler],Decoder]. If classify=True, then the wrapper also include classification layers @@ -314,6 +249,8 @@ def __init__( self.regressor_hidden_size = regressor_hidden_size self.num_regressor_parameters = num_regressor_parameters + self.latent_output_disabled = False # Manually override latent output + # Let the trainer know what kind of model this is self.model_type = 'vae' @@ -352,22 +289,67 @@ def __init__( self.classifier = MLPClassifier( input_size=self.latent_size, hidden_size=self.classifier_hidden_size, - num_classes=self.num_classes, - latent_size=self.latent_size, - num_classifier_layers=self.num_classifier_layers, + output_size=self.num_classes, + num_layers=self.num_classifier_layers, dropout=self.dropout, ) if self.num_regressor_parameters is not None: - self.regressor = MLPClassifier( + self.regressor = MLPRegressor( input_size=self.latent_size, hidden_size=self.regressor_hidden_size, - num_classes=self.num_regressor_parameters, - latent_size=self.latent_size, - num_classifier_layers=self.num_regressor_layers, + output_size=self.num_regressor_parameters, + num_layers=self.num_regressor_layers, dropout=self.dropout, ) + def reset_classifier(self, classifier_hidden_size: int, num_classifier_layers: int): + """Reset the classifier, with a new hidden size and depth. + This is useful when parameter searching. + + classifier_hidden_size: The number of units in each classifier layer + num_classifier_layers: Number of layers in the classifier + """ + self.classifier_hidden_size = classifier_hidden_size + self.num_classifier_layers = num_classifier_layers + + self.classifier = MLPClassifier( + input_size=self.latent_size, + hidden_size=self.classifier_hidden_size, + output_size=self.num_classes, + num_layers=self.num_classifier_layers, + dropout=self.dropout, + ) + + def reset_regressor(self, regressor_hidden_size: int, num_regressor_layers: int): + """Reset the regressor, with a new hidden size and depth. + This is useful when parameter searching. + + regressor_hidden_size: The number of units in each regressor layer + num_regressor_layers: Number of layers in the regressor + """ + self.num_regressor_layers = num_regressor_layers + self.regressor_hidden_size = regressor_hidden_size + + self.regressor = MLPRegressor( + input_size=self.latent_size, + hidden_size=self.regressor_hidden_size, + output_size=self.num_regressor_parameters, + num_layers=self.num_regressor_layers, + dropout=self.dropout, + ) + + def disable_latent_output(self): + """Disable latent output, to make the VAE behave like a standard autoencoder while training. + This modifies the training loss computed. """ + self.latent_output_disabled = True + + def enable_latent_output(self): + """Enable latent output, to make the VAE behave like a variational autoencoder while training. + This modifies the training loss computed. 
+ NOTE: By default, latent output is enabled.""" + self.latent_output_disabled = False + def forward(self, data, training=True, classify=False, regress=False, latent=True): """ Parameters: @@ -454,6 +436,11 @@ def forward(self, data, training=True, classify=False, regress=False, latent=Tru latent_out, mu, logvar = self.latent(enc_out, training=training) regressor_out = self.regressor(mu) # Deterministic + + if self.latent_output_disabled: + mu = None + logvar = None + if latent: return regressor_out, latent_out, mu, logvar else: diff --git a/traja/models/inference.py b/traja/models/inference.py index 6a39fabd..5e5cfa3d 100644 --- a/traja/models/inference.py +++ b/traja/models/inference.py @@ -15,11 +15,11 @@ class Generator: def __init__( - self, - model_type: str = None, - model_path: str = None, - model_hyperparameters: dict = None, - model: torch.nn.Module = None, + self, + model_type: str = None, + model_path: str = None, + model_hyperparameters: dict = None, + model: torch.nn.Module = None, ): """Generate a batch of future steps from a random latent state of Multi variate multi label models @@ -46,7 +46,7 @@ def __init__( (self.generated_category, self.generated_data,) = (None, None) - def generate(self, num_steps, classify=True, scaler=None): + def generate(self, num_steps, classify=True, scaler=None, plot_data=True): self.model.to(device) if self.model_type == "vae": @@ -56,8 +56,8 @@ def generate(self, num_steps, classify=True, scaler=None): self.model_hyperparameters["batch_size"], self.model_hyperparameters["latent_size"], ) - .normal_(mean=0, std=0.1) - .to(device) + .normal_(mean=0, std=0.1) + .to(device) ) # Generate trajectories from the noise self.generated_data = ( @@ -77,23 +77,15 @@ def generate(self, num_steps, classify=True, scaler=None): except Exception as error: print("Classifier not found: " + repr(error)) - fig, ax = plt.subplots(nrows=2, ncols=5, figsize=(16, 5), sharey=True) - fig.set_size_inches(20, 5) - # Scale original data and generated data # Rescaling predicted data - for i in range(self.generated_data.shape[1]): - s_s = scaler[f"scaler_{i}"].inverse_transform( - self.generated_data[:, i].reshape(-1, 1) - ) - s_s = np.reshape(s_s, len(s_s)) - self.generated_data[:, i] = s_s + self.generated_data = scaler.inverse_transform(self.generated_data) # TODO:Depreself.generated_categoryed;Slicing the data into batches self.generated_data = np.array( [ - self.generated_data[i : i + num_steps] + self.generated_data[i: i + num_steps] for i in range(0, len(self.generated_data), num_steps) ] ) @@ -104,31 +96,35 @@ def generate(self, num_steps, classify=True, scaler=None): self.generated_data.shape[2], ) - for i in range(2): - for j in range(5): - if classify: - try: - label = "Animal ID {}".format( - ( - torch.max(self.generated_category, 1).indices + 1 - ).detach()[i + j] - ) - except Exception as error: - print("Classifier not found:" + repr(error)) - else: - label = "" - ax[i, j].plot( - self.generated_data[:, 0][ - (i + j) * num_steps : (i + j) * num_steps + num_steps - ], - self.generated_data[:, 1][ - (i + j) * num_steps : (i + j) * num_steps + num_steps - ], - label=label, - color="g", - ) - ax[i, j].legend() - plt.show() + if plot_data: + fig, ax = plt.subplots(nrows=2, ncols=5, figsize=(16, 5), sharey=True) + fig.set_size_inches(20, 5) + + for i in range(2): + for j in range(5): + if classify: + try: + label = "Animal ID {}".format( + ( + torch.max(self.generated_category, 1).indices + 1 + ).detach()[i + j] + ) + except Exception as error: + print("Classifier 
not found:" + repr(error)) + else: + label = "" + ax[i, j].plot( + self.generated_data[:, 0][ + (i + j) * num_steps: (i + j) * num_steps + num_steps + ], + self.generated_data[:, 1][ + (i + j) * num_steps: (i + j) * num_steps + num_steps + ], + label=label, + color="g", + ) + ax[i, j].legend() + plt.show() return self.generated_data @@ -150,11 +146,11 @@ def generate_timeseries(self, num_steps): class Predictor: def __init__( - self, - model_type: str = None, - model_path: str = None, - model_hyperparameters: dict = None, - model: torch.nn.Module = None, + self, + model_type: str = None, + model_path: str = None, + model_hyperparameters: dict = None, + model: torch.nn.Module = None, ): """Generate a batch of future steps from a random latent state of Multi variate multi label models @@ -231,7 +227,7 @@ def predict(self, data_loader, num_steps, scaler, classify=True): # Rescaling predicted data for i in range(self.predicted_data.shape[1]): - s_s = scaler[f"scaler_{i}"].inverse_transform( + s_s = scaler.inverse_transform( self.predicted_data[:, i].reshape(-1, 1) ) s_s = np.reshape(s_s, len(s_s)) @@ -240,14 +236,14 @@ def predict(self, data_loader, num_steps, scaler, classify=True): # TODO:Depreself.generated_categoryed;Slicing the data into batches predicted_data = np.array( [ - self.predicted_data[i : i + num_steps] + self.predicted_data[i: i + num_steps] for i in range(0, len(self.predicted_data), num_steps) ] ) # Rescaling target data self.target_data = target.copy() for i in range(self.target_data.shape[1]): - s_s = scaler["scaler_{}".format(i)].inverse_transform( + s_s = scaler.inverse_transform( self.target_data[:, i].reshape(-1, 1) ) s_s = np.reshape(s_s, len(s_s)) @@ -255,7 +251,7 @@ def predict(self, data_loader, num_steps, scaler, classify=True): # TODO:Depreself.generated_categoryed;Slicing the data into batches self.target_data = np.array( [ - self.target_data[i : i + num_steps] + self.target_data[i: i + num_steps] for i in range(0, len(self.target_data), num_steps) ] ) @@ -276,20 +272,20 @@ def predict(self, data_loader, num_steps, scaler, classify=True): for j in range(5): ax[i, j].plot( predicted_data_[:, 0][ - (i + j) * num_steps : (i + j) * num_steps + num_steps + (i + j) * num_steps: (i + j) * num_steps + num_steps ], predicted_data_[:, 1][ - (i + j) * num_steps : (i + j) * num_steps + num_steps + (i + j) * num_steps: (i + j) * num_steps + num_steps ], - label=f"Predicted ID {self.generated_categoryegory[i+j]}", + label=f"Predicted ID {self.generated_categoryegory[i + j]}", ) ax[i, j].plot( self.target_data_[:, 0][ - (i + j) * num_steps : (i + j) * num_steps + num_steps + (i + j) * num_steps: (i + j) * num_steps + num_steps ], self.target_data_[:, 1][ - (i + j) * num_steps : (i + j) * num_steps + num_steps + (i + j) * num_steps: (i + j) * num_steps + num_steps ], label=f"Target ID {self.generated_category[i + j]}", color="g", diff --git a/traja/models/losses.py b/traja/models/losses.py index 0d05fd2d..66222e2b 100644 --- a/traja/models/losses.py +++ b/traja/models/losses.py @@ -14,26 +14,9 @@ def __init__(self): self.mse_loss = torch.nn.MSELoss() self.crossentropy_loss = torch.nn.CrossEntropyLoss() - def RMSELoss(self, predicted, target): - return torch.sqrt(self.mse_loss(predicted, target)) - - def ae_criterion(self, predicted, target, loss_type="huber"): - """ Implements the Autoencoder loss for time series forecasting - :param predicted: Predicted time series by the model - :param target: Target time series - :param loss_type: Type of criterion; Defaults: 'huber' - 
:return: - """ - - if loss_type == "huber": - loss = self.huber_loss(predicted, target) - return loss - else: # Root MSE - return torch.sqrt(torch.mean((predicted - target) ** 2)) - - def vae_criterion(self, predicted, target, mu, logvar, loss_type="huber"): - """ Time series generative model loss function - Provides both vae loss functions (huber, manhattan, mse) + def forecasting_criterion(self, predicted, target, mu=None, logvar=None, loss_type="huber"): + """ Time series forecasting model loss function + Provides loss functions huber, manhattan, mse. Adds KL divergence if mu and logvar specified. and ae loss functions (huber_ae, manhattan_ae, mse_ae). :param predicted: Predicted time series by the model :param target: Target time series @@ -43,20 +26,17 @@ def vae_criterion(self, predicted, target, mu, logvar, loss_type="huber"): :return: Reconstruction loss + KLD loss (if not ae) """ - KLD = -0.5 * torch.sum(1 + logvar - mu ** 2 - logvar.exp()) + if mu is not None and logvar is not None: + kld = -0.5 * torch.sum(1 + logvar - mu ** 2 - logvar.exp()) + else: + kld = 0 if loss_type == "huber": - loss = self.huber_loss(predicted, target) + KLD + loss = self.huber_loss(predicted, target) + kld elif loss_type == "manhattan": - loss = self.manhattan_loss(predicted, target) + KLD + loss = self.manhattan_loss(predicted, target) + kld elif loss_type == "mse": - loss = self.mse_loss(predicted, target) + KLD - elif loss_type == "huber_ae": - loss = self.huber_loss(predicted, target) - elif loss_type == "manhattan_ae": - loss = self.manhattan_loss(predicted, target) - elif loss_type == "mse_ae": - loss = self.mse_loss(predicted, target) + loss = self.mse_loss(predicted, target) + kld else: raise Exception("Loss type '{}' is unknown!".format(loss_type)) return loss @@ -82,11 +62,3 @@ def regressor_criterion(self, predicted, target): loss = self.mse_loss(predicted, target) return loss - - def lstm_criterion(self, predicted, target): - - loss = self.huber_loss(predicted, target) - return loss - - def vaegan_criterion(self): - return NotImplementedError diff --git a/traja/models/nn.py b/traja/models/nn.py deleted file mode 100644 index 6740d398..00000000 --- a/traja/models/nn.py +++ /dev/null @@ -1,949 +0,0 @@ -#! /usr/local/env python3 -"""Pytorch visualization code modified from Chad Jensen's implementation -(https://discuss.pytorch.org/t/lstm-for-sequence-prediction/22021/3).""" -import logging - -import matplotlib.pyplot as plt -import numpy as np - -try: - import torch -except ImportError: - raise ImportError( - "Missing optional dependency 'pytorch'. Install it via pytorch.org" - ) -import torch.nn as nn -import torch.optim as optim -import os -import pandas as pd -from time import time -from datetime import datetime - -nb_steps = 10 -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - - -class LossMse: - """ - Calculate the Mean Squared Error between y_true and y_pred - - y_true is the desired output. - y_pred is the model's output. - """ - - def __init__(self) -> None: - pass - - def __call__(self, y_pred, y_true): - # Calculate the Mean Squared Error and use it as loss. 
- mse = torch.mean(torch.square(y_true - y_pred)) - - return mse - - -class Trainer: - def __init__(self, model, - train_loader, - test_loader, - epochs=200, - batch_size=60, - run_id=0, - logs_dir='logs', - device='cpu', - optimizer='None', - plot=True, - downsampling=None): - self.device = device - self.model = model - self.epochs = epochs - self.plot = plot - - self.train_loader = train_loader - self.test_loader = test_loader - - self.criterion = LossMse() - print('Checking for optimizer for {}'.format(optimizer)) - if optimizer == "adam": - print('Using adam') - self.optimizer = optim.Adam(model.parameters()) - elif optimizer == "adam_lr": - print("Using adam with higher learning rate") - self.optimizer = optim.Adam(model.parameters(), lr=0.01) - elif optimizer == 'adam_lr2': - print('Using adam with to large learning rate') - self.optimizer = optim.Adam(model.parameters(), lr=0.0001) - elif optimizer == "SGD": - print('Using SGD') - self.optimizer = optim.SGD(model.parameters(), momentum=0.9, weight_decay=5e-4) - elif optimizer == "LRS": - print('Using LRS') - self.optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4) - self.lr_scheduler = optim.lr_scheduler.StepLR(self.optimizer, self.epochs // 3) - elif optimizer == "radam": - print('Using radam') - self.optimizer = RAdam(model.parameters()) - elif optimizer == "RMSprop": - print('Using RMSprop') - self.optimizer = optim.RMSprop(model.parameters()) - else: - raise ValueError('Unknown optimizer {}'.format(optimizer)) - self.opt_name = optimizer - save_dir = os.path.join(logs_dir, model.name, train_loader.name) - if not os.path.exists(save_dir): - os.makedirs(save_dir) - - self.savepath = os.path.join(save_dir, - f'{model.name}_bs{batch_size}_e{epochs}_dspl{downsampling}_id{run_id}.csv') - self.experiment_done = False - if os.path.exists(self.savepath): - trained_epochs = len(pd.read_csv(self.savepath, sep=';')) - - if trained_epochs >= epochs: - self.experiment_done = True - print( - f'Experiment Logs for the exact same experiment with identical run_id was detected, training will be skipped, consider using another run_id') - if os.path.exists((self.savepath.replace('.csv', '.pt'))): - self.model.load_state_dict(torch.load(self.savepath.replace('.csv', '.pt'))['model_state_dict']) - self.model = self.model.to(self.device) - - self.optimizer.load_state_dict(torch.load(self.savepath.replace('.csv', '.pt'))['optimizer']) - self.start_epoch = torch.load(self.savepath.replace('.csv', '.pt'))['epoch'] + 1 - else: - - self.start_epoch = 0 - self.model = self.model.to(self.device) - - def _infer_initial_epoch(self, savepath): - if not os.path.exists(savepath): - return 0 - else: - df = pd.read_csv(savepath, sep=';', index_col=0) - print(len(df) + 1) - return len(df) - - def train(self): - if self.experiment_done: - return - for epoch in range(self.start_epoch, self.epochs): - - print('Start training epoch', epoch) - print("{} Epoch {}, training loss: {}".format(datetime.now(), epoch, self.train_epoch())) - self.test(epoch=epoch) - if self.opt_name == "LRS": - print('LRS step') - self.lr_scheduler.step() - return self.savepath + '.csv' - - def train_epoch(self): - self.model.train() - total = 0 - running_loss = 0 - old_time = time() - for batch, data in enumerate(self.train_loader): - - inputs, targets = data[0].to(self.device).float(), data[1].to(self.device).float() - self.optimizer.zero_grad() - outputs = self.model(inputs) - loss = self.criterion(outputs, targets) - loss.backward() - self.optimizer.step() - 
running_loss += loss.item() - - if batch % 10 == 0 and batch != 0: - print(batch, 'of', len(self.train_loader), 'processing time', time() - old_time, 'loss:', - running_loss / total) - old_time = time() - - # Increment number of batches - total += 1 - return running_loss / total - - def test(self, epoch, save=True): - self.model.eval() - total = 0 - test_loss = 0 - with torch.no_grad(): - for batch, data in enumerate(self.test_loader): - if batch % 10 == 0: - print('Processing eval batch', batch, 'of', len(self.test_loader)) - inputs, targets = data[0].to(self.device).float(), data[1].to(self.device).float() - outputs = self.model(inputs) - loss = self.criterion(outputs, targets) - total += 1 - test_loss += loss.item() - - if save: - torch.save({ - 'model_state_dict': self.model.state_dict(), - 'optimizer': self.optimizer.state_dict(), - 'epoch': epoch, - 'test_loss': test_loss / total - }, self.savepath.replace('.csv', '.pt')) - return test_loss / total - - -class LSTM(nn.Module): - """ Deep LSTM network. This implementation - returns output_size outputs. - - - Args: - input_size: The number of expected features in the input `x` - hidden_size: The number of features in the hidden state `h` - num_layers: Number of recurrent layers. E.g., setting ``num_layers=2`` - would mean stacking two LSTMs together to form a `stacked LSTM`, - with the second LSTM taking in outputs of the first LSTM and - computing the final results. Default: 1 - output_size: The number of output dimensions - dropout: If non-zero, introduces a `Dropout` layer on the outputs of each - LSTM layer except the last layer, with dropout probability equal to - :attr:`dropout`. Default: 0 - bidirectional: If ``True``, becomes a bidirectional LSTM. Default: ``False`` - """ - - name = "LSTM" - - def __init__(self, input_size: int, hidden_size: int, num_layers: int, - output_size: int, dropout: float, bidirectional: bool): - super(LSTM, self).__init__() - - self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, - num_layers=num_layers, dropout=dropout, - bidirectional=bidirectional, ) - - self.head = nn.Linear(hidden_size, output_size) - - def forward(self, x): - x, state = self.lstm(x) - # Use the last hidden state of last layer - x = state[0][-1] - x = self.head(x) - return x - - -class TrajectoryLSTM: - def __init__( - self, xy, nb_steps=10, epochs=1000, batch_size=1, criterion=nn.MSELoss() - ): - fig, ax = plt.subplots(2, 1) - self.fig = fig - self.ax = ax - assert xy.shape[1] is 2, f"xy should be an N x 2 array, but is {xy.shape}" - self.xy = xy - self.nb_steps = nb_steps - self.epochs = epochs - self.batch_size = batch_size - self.criterion = criterion - self.rnn = LSTM() - - def load_batch(self, batch_size=32): - t_1_b = np.zeros((self.nb_steps, self.batch_size, 2)) - t_b = np.zeros((self.nb_steps * self.batch_size, 2)) - - inds = np.random.randint(0, len(self.xy) - self.nb_steps, (self.batch_size)) - for i, ind in enumerate(inds): - t_1_b[:, i] = self.xy[ind: ind + self.nb_steps] - t_b[i * nb_steps: (i + 1) * self.nb_steps] = self.xy[ - ind + 1: ind + nb_steps + 1 - ] - return torch.from_numpy(t_1_b).float(), torch.from_numpy(t_b).float() - - def train(self): - self.mean_loss = 0.0 - for epoch in range(1, self.epochs + 1): - t_1_b, t_b = self.load_batch(self.batch_size) - - def closure(): - global loss - optimizer.zero_grad() - pred = self.rnn(t_1_b) - shaped_pred = pred.reshape(-1, 2) - loss = self.criterion(abs(shaped_pred), abs(t_b)) - loss.backward() - - return loss - - optimizer = 
optim.Adam(self.rnn.parameters(), 1e-3) - optimizer.step(closure) - self.mean_loss += loss.item() - - if epoch % 100 == 0: - print("Epoch: {} | Loss: {:.6f}".format(epoch, self.mean_loss)) - self.mean_loss = 0 - - def savefig(self, filepath): - self.fig.savefig(filepath) - - def _plot(self): - t_1_b, t_b = self.load_batch(1) - pred = self.rnn(t_1_b).detach().numpy().reshape(-1, 2) - - real = t_1_b.numpy().reshape(-1, 2) - x, y = self.xy.T - self.ax[0].plot(x, y, label="Real") - self.ax[0].plot(real[:, 0], real[:, 1], label="Real batch") - self.ax[0].plot(pred[:, 0], pred[:, 1], label="Pred") - - self.ax[1].scatter(real[:, 0], real[:, 1], label="Real") - self.ax[1].scatter(pred[:, 0], pred[:, 1], label="Pred") - - for a in self.ax: - a.legend() - - def plot(self, interactive=True): - if interactive and (plt.get_backend() == "agg"): - logging.ERROR("Not able to use interactive plotting in mpl `agg` mode.") - # interactive = False - elif interactive: - while True: - for a in self.ax: - a.clear() - self._plot() - plt.pause(1) - plt.show(block=False) - else: - self._plot() - return self.fig - - -def make_mlp(dim_list, activation="relu", batch_norm=True, dropout=0): - layers = [] - for dim_in, dim_out in zip(dim_list[:-1], dim_list[1:]): - layers.append(nn.Linear(dim_in, dim_out)) - if batch_norm: - layers.append(nn.BatchNorm1d(dim_out)) - if activation == "relu": - layers.append(nn.ReLU()) - elif activation == "leakyrelu": - layers.append(nn.LeakyReLU()) - if dropout > 0: - layers.append(nn.Dropout(p=dropout)) - return nn.Sequential(*layers) - - -def get_noise(shape, noise_type): - if noise_type == "gaussian": - return torch.randn(*shape).cuda() - elif noise_type == "uniform": - return torch.rand(*shape).sub_(0.5).mul_(2.0).cuda() - raise ValueError('Unrecognized noise type "%s"' % noise_type) - - -class Encoder(nn.Module): - """Encoder is part of both TrajectoryGenerator and - TrajectoryDiscriminator""" - - def __init__( - self, embedding_dim=64, h_dim=64, mlp_dim=1024, num_layers=1, dropout=0.0 - ): - super(Encoder, self).__init__() - - self.mlp_dim = 1024 - self.h_dim = h_dim - self.embedding_dim = embedding_dim - self.num_layers = num_layers - - self.encoder = nn.LSTM(embedding_dim, h_dim, num_layers, dropout=dropout) - - self.spatial_embedding = nn.Linear(2, embedding_dim) - - def init_hidden(self, batch): - return ( - torch.zeros(self.num_layers, batch, self.h_dim).cuda(), - torch.zeros(self.num_layers, batch, self.h_dim).cuda(), - ) - - def forward(self, obs_traj): - """ - Inputs: - - obs_traj: Tensor of shape (obs_len, batch, 2) - Output: - - final_h: Tensor of shape (self.num_layers, batch, self.h_dim) - """ - # Encode observed Trajectory - batch = obs_traj.size(1) - obs_traj_embedding = self.spatial_embedding(obs_traj.view(-1, 2)) - obs_traj_embedding = obs_traj_embedding.view(-1, batch, self.embedding_dim) - state_tuple = self.init_hidden(batch) - output, state = self.encoder(obs_traj_embedding, state_tuple) - final_h = state[0] - return final_h - - -class Decoder(nn.Module): - """Decoder is part of TrajectoryGenerator""" - - def __init__( - self, - seq_len, - embedding_dim=64, - h_dim=128, - mlp_dim=1024, - num_layers=1, - pool_every_timestep=True, - dropout=0.0, - bottleneck_dim=1024, - activation="relu", - batch_norm=True, - pooling_type="pool_net", - neighborhood_size=2.0, - grid_size=8, - ): - super(Decoder, self).__init__() - - self.seq_len = seq_len - self.mlp_dim = mlp_dim - self.h_dim = h_dim - self.embedding_dim = embedding_dim - self.pool_every_timestep = 
pool_every_timestep - - self.decoder = nn.LSTM(embedding_dim, h_dim, num_layers, dropout=dropout) - - if pool_every_timestep: - if pooling_type == "pool_net": - self.pool_net = PoolHiddenNet( - embedding_dim=self.embedding_dim, - h_dim=self.h_dim, - mlp_dim=mlp_dim, - bottleneck_dim=bottleneck_dim, - activation=activation, - batch_norm=batch_norm, - dropout=dropout, - ) - elif pooling_type == "spool": - self.pool_net = SocialPooling( - h_dim=self.h_dim, - activation=activation, - batch_norm=batch_norm, - dropout=dropout, - neighborhood_size=neighborhood_size, - grid_size=grid_size, - ) - - mlp_dims = [h_dim + bottleneck_dim, mlp_dim, h_dim] - self.mlp = make_mlp( - mlp_dims, activation=activation, batch_norm=batch_norm, dropout=dropout - ) - - self.spatial_embedding = nn.Linear(2, embedding_dim) - self.hidden2pos = nn.Linear(h_dim, 2) - - def forward(self, last_pos, last_pos_rel, state_tuple, seq_start_end): - """ - Inputs: - - last_pos: Tensor of shape (batch, 2) - - last_pos_rel: Tensor of shape (batch, 2) - - state_tuple: (hh, ch) each tensor of shape (num_layers, batch, h_dim) - - seq_start_end: A list of tuples which delimit sequences within batch - Output: - - pred_traj: tensor of shape (self.seq_len, batch, 2) - """ - batch = last_pos.size(0) - pred_traj_fake_rel = [] - decoder_input = self.spatial_embedding(last_pos_rel) - decoder_input = decoder_input.view(1, batch, self.embedding_dim) - - for _ in range(self.seq_len): - output, state_tuple = self.decoder(decoder_input, state_tuple) - rel_pos = self.hidden2pos(output.view(-1, self.h_dim)) - curr_pos = rel_pos + last_pos - - if self.pool_every_timestep: - decoder_h = state_tuple[0] - pool_h = self.pool_net(decoder_h, seq_start_end, curr_pos) - decoder_h = torch.cat([decoder_h.view(-1, self.h_dim), pool_h], dim=1) - decoder_h = self.mlp(decoder_h) - decoder_h = torch.unsqueeze(decoder_h, 0) - state_tuple = (decoder_h, state_tuple[1]) - - embedding_input = rel_pos - - decoder_input = self.spatial_embedding(embedding_input) - decoder_input = decoder_input.view(1, batch, self.embedding_dim) - pred_traj_fake_rel.append(rel_pos.view(batch, -1)) - last_pos = curr_pos - - pred_traj_fake_rel = torch.stack(pred_traj_fake_rel, dim=0) - return pred_traj_fake_rel, state_tuple[0] - - -class PoolHiddenNet(nn.Module): - """Pooling module as proposed in our paper""" - - def __init__( - self, - embedding_dim=64, - h_dim=64, - mlp_dim=1024, - bottleneck_dim=1024, - activation="relu", - batch_norm=True, - dropout=0.0, - ): - super(PoolHiddenNet, self).__init__() - - self.mlp_dim = 1024 - self.h_dim = h_dim - self.bottleneck_dim = bottleneck_dim - self.embedding_dim = embedding_dim - - mlp_pre_dim = embedding_dim + h_dim - mlp_pre_pool_dims = [mlp_pre_dim, 512, bottleneck_dim] - - self.spatial_embedding = nn.Linear(2, embedding_dim) - self.mlp_pre_pool = make_mlp( - mlp_pre_pool_dims, - activation=activation, - batch_norm=batch_norm, - dropout=dropout, - ) - - def repeat(self, tensor, num_reps): - """ - Inputs: - -tensor: 2D tensor of any shape - -num_reps: Number of times to repeat each row - Outpus: - -repeat_tensor: Repeat each row such that: R1, R1, R2, R2 - """ - col_len = tensor.size(1) - tensor = tensor.unsqueeze(dim=1).repeat(1, num_reps, 1) - tensor = tensor.view(-1, col_len) - return tensor - - def forward(self, h_states, seq_start_end, end_pos): - """ - Inputs: - - h_states: Tensor of shape (num_layers, batch, h_dim) - - seq_start_end: A list of tuples which delimit sequences within batch - - end_pos: Tensor of shape (batch, 2) - Output: - - 
pool_h: Tensor of shape (batch, bottleneck_dim) - """ - pool_h = [] - for _, (start, end) in enumerate(seq_start_end): - start = start.item() - end = end.item() - num_ped = end - start - curr_hidden = h_states.view(-1, self.h_dim)[start:end] - curr_end_pos = end_pos[start:end] - # Repeat -> H1, H2, H1, H2 - curr_hidden_1 = curr_hidden.repeat(num_ped, 1) - # Repeat position -> P1, P2, P1, P2 - curr_end_pos_1 = curr_end_pos.repeat(num_ped, 1) - # Repeat position -> P1, P1, P2, P2 - curr_end_pos_2 = self.repeat(curr_end_pos, num_ped) - curr_rel_pos = curr_end_pos_1 - curr_end_pos_2 - curr_rel_embedding = self.spatial_embedding(curr_rel_pos) - mlp_h_input = torch.cat([curr_rel_embedding, curr_hidden_1], dim=1) - curr_pool_h = self.mlp_pre_pool(mlp_h_input) - curr_pool_h = curr_pool_h.view(num_ped, num_ped, -1).max(1)[0] - pool_h.append(curr_pool_h) - pool_h = torch.cat(pool_h, dim=0) - return pool_h - - -class SocialPooling(nn.Module): - """Current state of the art pooling mechanism: - http://cvgl.stanford.edu/papers/CVPR16_Social_LSTM.pdf""" - - def __init__( - self, - h_dim=64, - activation="relu", - batch_norm=True, - dropout=0.0, - neighborhood_size=2.0, - grid_size=8, - pool_dim=None, - ): - super(SocialPooling, self).__init__() - self.h_dim = h_dim - self.grid_size = grid_size - self.neighborhood_size = neighborhood_size - if pool_dim: - mlp_pool_dims = [grid_size * grid_size * h_dim, pool_dim] - else: - mlp_pool_dims = [grid_size * grid_size * h_dim, h_dim] - - self.mlp_pool = make_mlp( - mlp_pool_dims, activation=activation, batch_norm=batch_norm, dropout=dropout - ) - - def get_bounds(self, ped_pos): - top_left_x = ped_pos[:, 0] - self.neighborhood_size / 2 - top_left_y = ped_pos[:, 1] + self.neighborhood_size / 2 - bottom_right_x = ped_pos[:, 0] + self.neighborhood_size / 2 - bottom_right_y = ped_pos[:, 1] - self.neighborhood_size / 2 - top_left = torch.stack([top_left_x, top_left_y], dim=1) - bottom_right = torch.stack([bottom_right_x, bottom_right_y], dim=1) - return top_left, bottom_right - - def get_grid_locations(self, top_left, other_pos): - cell_x = torch.floor( - ((other_pos[:, 0] - top_left[:, 0]) / self.neighborhood_size) - * self.grid_size - ) - cell_y = torch.floor( - ((top_left[:, 1] - other_pos[:, 1]) / self.neighborhood_size) - * self.grid_size - ) - grid_pos = cell_x + cell_y * self.grid_size - return grid_pos - - def repeat(self, tensor, num_reps): - """ - Inputs: - -tensor: 2D tensor of any shape - -num_reps: Number of times to repeat each row - Outpus: - -repeat_tensor: Repeat each row such that: R1, R1, R2, R2 - """ - col_len = tensor.size(1) - tensor = tensor.unsqueeze(dim=1).repeat(1, num_reps, 1) - tensor = tensor.view(-1, col_len) - return tensor - - def forward(self, h_states, seq_start_end, end_pos): - """ - Inputs: - - h_states: Tesnsor of shape (num_layers, batch, h_dim) - - seq_start_end: A list of tuples which delimit sequences within batch. 
- - end_pos: Absolute end position of obs_traj (batch, 2) - Output: - - pool_h: Tensor of shape (batch, h_dim) - """ - pool_h = [] - for _, (start, end) in enumerate(seq_start_end): - start = start.item() - end = end.item() - num_ped = end - start - grid_size = self.grid_size * self.grid_size - curr_hidden = h_states.view(-1, self.h_dim)[start:end] - curr_hidden_repeat = curr_hidden.repeat(num_ped, 1) - curr_end_pos = end_pos[start:end] - curr_pool_h_size = (num_ped * grid_size) + 1 - curr_pool_h = curr_hidden.new_zeros((curr_pool_h_size, self.h_dim)) - # curr_end_pos = curr_end_pos.data - top_left, bottom_right = self.get_bounds(curr_end_pos) - - # Repeat position -> P1, P2, P1, P2 - curr_end_pos = curr_end_pos.repeat(num_ped, 1) - # Repeat bounds -> B1, B1, B2, B2 - top_left = self.repeat(top_left, num_ped) - bottom_right = self.repeat(bottom_right, num_ped) - - grid_pos = self.get_grid_locations(top_left, curr_end_pos).type_as( - seq_start_end - ) - # Make all positions to exclude as non-zero - # Find which peds to exclude - x_bound = (curr_end_pos[:, 0] >= bottom_right[:, 0]) + ( - curr_end_pos[:, 0] <= top_left[:, 0] - ) - y_bound = (curr_end_pos[:, 1] >= top_left[:, 1]) + ( - curr_end_pos[:, 1] <= bottom_right[:, 1] - ) - - within_bound = x_bound + y_bound - within_bound[0:: num_ped + 1] = 1 # Don't include the ped itself - within_bound = within_bound.view(-1) - - # This is a tricky way to get scatter add to work. Helps me avoid a - # for loop. Offset everything by 1. Use the initial 0 position to - # dump all uncessary adds. - grid_pos += 1 - total_grid_size = self.grid_size * self.grid_size - offset = torch.arange( - 0, total_grid_size * num_ped, total_grid_size - ).type_as(seq_start_end) - - offset = self.repeat(offset.view(-1, 1), num_ped).view(-1) - grid_pos += offset - grid_pos[within_bound != 0] = 0 - grid_pos = grid_pos.view(-1, 1).expand_as(curr_hidden_repeat) - - curr_pool_h = curr_pool_h.scatter_add(0, grid_pos, curr_hidden_repeat) - curr_pool_h = curr_pool_h[1:] - pool_h.append(curr_pool_h.view(num_ped, -1)) - - pool_h = torch.cat(pool_h, dim=0) - pool_h = self.mlp_pool(pool_h) - return pool_h - - -class TrajectoryGenerator(nn.Module): - """Modified from @agrimgupta92's https://github.com/agrimgupta92/sgan/blob/master/sgan/models.py.""" - - def __init__( - self, - obs_len, - pred_len, - embedding_dim=64, - encoder_h_dim=64, - decoder_h_dim=128, - mlp_dim=1024, - num_layers=1, - noise_dim=(0,), - noise_type="gaussian", - noise_mix_type="ped", - pooling_type=None, - pool_every_timestep=True, - dropout=0.0, - bottleneck_dim=1024, - activation="relu", - batch_norm=True, - neighborhood_size=2.0, - grid_size=8, - ): - super(TrajectoryGenerator, self).__init__() - - if pooling_type and pooling_type.lower() == "none": - pooling_type = None - - self.obs_len = obs_len - self.pred_len = pred_len - self.mlp_dim = mlp_dim - self.encoder_h_dim = encoder_h_dim - self.decoder_h_dim = decoder_h_dim - self.embedding_dim = embedding_dim - self.noise_dim = noise_dim - self.num_layers = num_layers - self.noise_type = noise_type - self.noise_mix_type = noise_mix_type - self.pooling_type = pooling_type - self.noise_first_dim = 0 - self.pool_every_timestep = pool_every_timestep - self.bottleneck_dim = 1024 - - self.encoder = Encoder( - embedding_dim=embedding_dim, - h_dim=encoder_h_dim, - mlp_dim=mlp_dim, - num_layers=num_layers, - dropout=dropout, - ) - - self.decoder = Decoder( - pred_len, - embedding_dim=embedding_dim, - h_dim=decoder_h_dim, - mlp_dim=mlp_dim, - num_layers=num_layers, - 
pool_every_timestep=pool_every_timestep, - dropout=dropout, - bottleneck_dim=bottleneck_dim, - activation=activation, - batch_norm=batch_norm, - pooling_type=pooling_type, - grid_size=grid_size, - neighborhood_size=neighborhood_size, - ) - - if pooling_type == "pool_net": - self.pool_net = PoolHiddenNet( - embedding_dim=self.embedding_dim, - h_dim=encoder_h_dim, - mlp_dim=mlp_dim, - bottleneck_dim=bottleneck_dim, - activation=activation, - batch_norm=batch_norm, - ) - elif pooling_type == "spool": - self.pool_net = SocialPooling( - h_dim=encoder_h_dim, - activation=activation, - batch_norm=batch_norm, - dropout=dropout, - neighborhood_size=neighborhood_size, - grid_size=grid_size, - ) - - if self.noise_dim[0] == 0: - self.noise_dim = None - else: - self.noise_first_dim = noise_dim[0] - - # Decoder Hidden - if pooling_type: - input_dim = encoder_h_dim + bottleneck_dim - else: - input_dim = encoder_h_dim - - if self.mlp_decoder_needed(): - mlp_decoder_context_dims = [ - input_dim, - mlp_dim, - decoder_h_dim - self.noise_first_dim, - ] - - self.mlp_decoder_context = make_mlp( - mlp_decoder_context_dims, - activation=activation, - batch_norm=batch_norm, - dropout=dropout, - ) - - def add_noise(self, _input, seq_start_end, user_noise=None): - """ - Inputs: - - _input: Tensor of shape (_, decoder_h_dim - noise_first_dim) - - seq_start_end: A list of tuples which delimit sequences within batch. - - user_noise: Generally used for inference when you want to see - relation between different types of noise and outputs. - Outputs: - - decoder_h: Tensor of shape (_, decoder_h_dim) - """ - if not self.noise_dim: - return _input - - if self.noise_mix_type == "global": - noise_shape = (seq_start_end.size(0),) + self.noise_dim - else: - noise_shape = (_input.size(0),) + self.noise_dim - - if user_noise is not None: - z_decoder = user_noise - else: - z_decoder = get_noise(noise_shape, self.noise_type) - - if self.noise_mix_type == "global": - _list = [] - for idx, (start, end) in enumerate(seq_start_end): - start = start.item() - end = end.item() - _vec = z_decoder[idx].view(1, -1) - _to_cat = _vec.repeat(end - start, 1) - _list.append(torch.cat([_input[start:end], _to_cat], dim=1)) - decoder_h = torch.cat(_list, dim=0) - return decoder_h - - decoder_h = torch.cat([_input, z_decoder], dim=1) - - return decoder_h - - def mlp_decoder_needed(self): - if ( - self.noise_dim - or self.pooling_type - or self.encoder_h_dim != self.decoder_h_dim - ): - return True - else: - return False - - def forward(self, obs_traj, obs_traj_rel, seq_start_end, user_noise=None): - """ - Inputs: - - obs_traj: Tensor of shape (obs_len, batch, 2) - - obs_traj_rel: Tensor of shape (obs_len, batch, 2) - - seq_start_end: A list of tuples which delimit sequences within batch. - - user_noise: Generally used for inference when you want to see - relation between different types of noise and outputs. 
- Output: - - pred_traj_rel: Tensor of shape (self.pred_len, batch, 2) - """ - batch = obs_traj_rel.size(1) - # Encode seq - final_encoder_h = self.encoder(obs_traj_rel) - # Pool States - if self.pooling_type: - end_pos = obs_traj[-1, :, :] - pool_h = self.pool_net(final_encoder_h, seq_start_end, end_pos) - # Construct input hidden states for decoder - mlp_decoder_context_input = torch.cat( - [final_encoder_h.view(-1, self.encoder_h_dim), pool_h], dim=1 - ) - else: - mlp_decoder_context_input = final_encoder_h.view(-1, self.encoder_h_dim) - - # Add Noise - if self.mlp_decoder_needed(): - noise_input = self.mlp_decoder_context(mlp_decoder_context_input) - else: - noise_input = mlp_decoder_context_input - decoder_h = self.add_noise(noise_input, seq_start_end, user_noise=user_noise) - decoder_h = torch.unsqueeze(decoder_h, 0) - - decoder_c = torch.zeros(self.num_layers, batch, self.decoder_h_dim).cuda() - - state_tuple = (decoder_h, decoder_c) - last_pos = obs_traj[-1] - last_pos_rel = obs_traj_rel[-1] - # Predict Trajectory - - decoder_out = self.decoder(last_pos, last_pos_rel, state_tuple, seq_start_end) - pred_traj_fake_rel, final_decoder_h = decoder_out - - return pred_traj_fake_rel - - -class TrajectoryDiscriminator(nn.Module): - def __init__( - self, - obs_len, - pred_len, - embedding_dim=64, - h_dim=64, - mlp_dim=1024, - num_layers=1, - activation="relu", - batch_norm=True, - dropout=0.0, - d_type="local", - ): - super(TrajectoryDiscriminator, self).__init__() - - self.obs_len = obs_len - self.pred_len = pred_len - self.seq_len = obs_len + pred_len - self.mlp_dim = mlp_dim - self.h_dim = h_dim - self.d_type = d_type - - self.encoder = Encoder( - embedding_dim=embedding_dim, - h_dim=h_dim, - mlp_dim=mlp_dim, - num_layers=num_layers, - dropout=dropout, - ) - - real_classifier_dims = [h_dim, mlp_dim, 1] - self.real_classifier = make_mlp( - real_classifier_dims, - activation=activation, - batch_norm=batch_norm, - dropout=dropout, - ) - if d_type == "global": - mlp_pool_dims = [h_dim + embedding_dim, mlp_dim, h_dim] - self.pool_net = PoolHiddenNet( - embedding_dim=embedding_dim, - h_dim=h_dim, - mlp_dim=mlp_pool_dims, - bottleneck_dim=h_dim, - activation=activation, - batch_norm=batch_norm, - ) - - def forward(self, traj, traj_rel, seq_start_end=None): - """ - Inputs: - - traj: Tensor of shape (obs_len + pred_len, batch, 2) - - traj_rel: Tensor of shape (obs_len + pred_len, batch, 2) - - seq_start_end: A list of tuples which delimit sequences within batch - Output: - - scores: Tensor of shape (batch,) with real/fake scores - """ - final_h = self.encoder(traj_rel) - # Note: In case of 'global' option we are using start_pos as opposed to - # end_pos. The intution being that hidden state has the whole - # trajectory and relative postion at the start when combined with - # trajectory information should help in discriminative behavior. 
- if self.d_type == "local": - classifier_input = final_h.squeeze() - else: - classifier_input = self.pool_net(final_h.squeeze(), seq_start_end, traj[0]) - scores = self.real_classifier(classifier_input) - return scores diff --git a/traja/models/predictive_models/ae.py b/traja/models/predictive_models/ae.py index f0dddbfa..91418c16 100644 --- a/traja/models/predictive_models/ae.py +++ b/traja/models/predictive_models/ae.py @@ -1,6 +1,7 @@ import torch -from torch import nn +from traja.models.base_models.MLPClassifier import MLPClassifier +from traja.models.base_models.MLPRegressor import MLPRegressor from traja.models.utils import TimeDistributed device = "cuda" if torch.cuda.is_available() else "cpu" @@ -177,53 +178,6 @@ def forward(self, x, num_future=None): return output -class MLPClassifier(torch.nn.Module): - """ MLP classifier: Classify the input data using the latent embeddings - Parameters: - ----------- - input_size: The number of expected latent size - hidden_size: The number of features in the hidden state h - num_classes: Size of labels or the number of categories in the data - dropout: If non-zero, introduces a Dropout layer on the outputs of each LSTM layer except the last layer, - with dropout probability equal to dropout - num_classifier_layers: Number of hidden layers in the classifier - """ - - def __init__( - self, - input_size: int, - hidden_size: int, - num_classes: int, - latent_size: int, - num_classifier_layers: int, - dropout: float, - ): - super(MLPClassifier, self).__init__() - - self.input_size = input_size - self.hidden_size = hidden_size - self.num_classes = num_classes - self.num_classifier_layers = num_classifier_layers - self.dropout = dropout - - # Classifier layers - self.hidden = nn.ModuleList([nn.Linear(self.input_size, self.hidden_size)]) - self.hidden.extend( - [ - nn.Linear(self.hidden_size, self.hidden_size) - for _ in range(1, self.num_classifier_layers - 1) - ] - ) - self.hidden = nn.Sequential(*self.hidden) - self.out = nn.Linear(self.hidden_size, self.num_classes) - self.dropout = torch.nn.Dropout(p=dropout) - - def forward(self, x): - x = self.dropout(self.hidden(x)) - out = self.out(x) - return out - - class MultiModelAE(torch.nn.Module): """Implementation of Multimodel autoencoders; This Module wraps the Autoencoder models [Encoder,Latent,Decoder]. If classify=True, then the wrapper also include classification layers @@ -324,22 +278,56 @@ def __init__( self.classifier = MLPClassifier( input_size=self.latent_size, hidden_size=self.classifier_hidden_size, - num_classes=self.num_classes, - latent_size=self.latent_size, - num_classifier_layers=self.num_classifier_layers, + output_size=self.num_classes, + num_layers=self.num_classifier_layers, dropout=self.dropout, ) if self.num_regressor_parameters is not None: - self.regressor = MLPClassifier( + self.regressor = MLPRegressor( input_size=self.latent_size, hidden_size=self.regressor_hidden_size, - num_classes=self.num_regressor_parameters, - latent_size=self.latent_size, - num_classifier_layers=self.num_regressor_layers, + output_size=self.num_regressor_parameters, + num_layers=self.num_regressor_layers, dropout=self.dropout, ) + def reset_classifier(self, classifier_hidden_size: int, num_classifier_layers: int): + """Reset the classifier, with a new hidden size and depth. + This is useful when parameter searching. 
+ + classifier_hidden_size: The number of units in each classifier layer + num_layers: Number of layers in the classifier + """ + self.classifier_hidden_size = classifier_hidden_size + self.num_classifier_layers = num_classifier_layers + + self.classifier = MLPClassifier( + input_size=self.latent_size, + hidden_size=self.classifier_hidden_size, + output_size=self.num_classes, + num_layers=self.num_classifier_layers, + dropout=self.dropout, + ) + + def reset_regressor(self, regressor_hidden_size: int, num_regressor_layers: int): + """Reset the regressor, with a new hidden size and depth. + This is useful when parameter searching. + + regressor_hidden_size: The number of units in each classifier layer + num_regressor_layers: Number of layers in the classifier + """ + self.num_regressor_layers = num_regressor_layers + self.regressor_hidden_size = regressor_hidden_size + + self.regressor = MLPRegressor( + input_size=self.latent_size, + hidden_size=self.regressor_hidden_size, + output_size=self.num_regressor_parameters, + num_layers=self.num_regressor_layers, + dropout=self.dropout, + ) + def get_ae_parameters(self): """ Return: diff --git a/traja/models/predictive_models/lstm.py b/traja/models/predictive_models/lstm.py index 3c245343..0f97630c 100644 --- a/traja/models/predictive_models/lstm.py +++ b/traja/models/predictive_models/lstm.py @@ -42,6 +42,7 @@ def __init__( self.batch_size = batch_size self.input_size = input_size + self.num_past = num_future # num_past and num_future are equal self.num_future = num_future self.hidden_size = hidden_size self.num_layers = num_layers @@ -87,6 +88,6 @@ def forward(self, x, training=True, classify=False, regress=False, latent=False) # Decoder input Shape(batch_size, num_futures, latent_size) out, (dec_hidden, dec_cell) = self.lstm(x, (h0.detach(), c0.detach())) - # Map the decoder output: Shape(batch_size, sequence_len, hidden_dim) to Time Dsitributed Linear Layer + # Map the decoder output: Shape(batch_size, sequence_len, hidden_dim) to Time Distributed Linear Layer out = self.output(out) return out diff --git a/traja/models/train.py b/traja/models/train.py index e1ab9ead..bfaad956 100644 --- a/traja/models/train.py +++ b/traja/models/train.py @@ -1,8 +1,6 @@ -import matplotlib.pyplot as plt import torch from . import utils -from . import visualizer from .losses import Criterion from .optimizers import Optimizer @@ -23,11 +21,11 @@ class HybridTrainer(object): lstm_hidden_size: The number of features in the hidden state h num_lstm_layers: Number of layers in the LSTM model reset_state: If True, will reset the hidden and cell state for each batch of data - num_classes: Number of categories/labels + output_size: Number of sequence_ids/labels latent_size: Latent space dimension dropout: If non-zero, introduces a Dropout layer on the outputs of each LSTM layer except the last layer, with dropout probability equal to dropout - num_classifier_layers: Number of layers in the classifier + num_layers: Number of layers in the classifier batch_size: Number of samples in a batch num_future: Number of time steps to be predicted forward num_past: Number of past time steps otherwise, length of sequences in each batch of data. 
@@ -42,7 +40,7 @@ class HybridTrainer(object): """ - valid_models = ['ae', 'vae', 'lstm'] + valid_models = ["ae", "vae", "lstm"] def __init__( self, @@ -67,7 +65,7 @@ def __init__( self.lr_factor = lr_factor self.scheduler_patience = scheduler_patience - if model.model_type == 'lstm': + if model.model_type == "lstm": self.model_hyperparameters = { "input_size": model.input_size, "batch_size": model.batch_size, @@ -100,26 +98,42 @@ def __init__( } self.model = model - # Classification, regression task checks - self.classify = True if model.model_type != 'lstm' and model.classifier_hidden_size is not None else False - self.regress = True if model.model_type != 'lstm' and model.regressor_hidden_size is not None else False + self.classify = ( + True + if model.model_type != "lstm" and model.classifier_hidden_size is not None + else False + ) + self.regress = ( + True + if model.model_type != "lstm" and model.regressor_hidden_size is not None + else False + ) # Model optimizer and the learning rate scheduler optimizer = Optimizer( self.model_type, self.model, self.optimizer_type, classify=self.classify ) - self.forecasting_optimizers, self.classification_optimizers, self.regression_optimizers = optimizer.get_optimizers( - lr=self.lr) - self.forecasting_schedulers, self.classification_schedulers, self.regression_schedulers = optimizer.get_lrschedulers( + ( + self.forecasting_optimizers, + self.classification_optimizers, + self.regression_optimizers, + ) = optimizer.get_optimizers(lr=self.lr) + ( + self.forecasting_schedulers, + self.classification_schedulers, + self.regression_schedulers, + ) = optimizer.get_lrschedulers( factor=self.lr_factor, patience=self.scheduler_patience ) def __str__(self): return f"Training model type {self.model_type}" - def fit(self, dataloaders, model_save_path=None, training_mode='forecasting', epochs=50): + def fit( + self, dataloaders, model_save_path=None, training_mode="forecasting", epochs=50 + ): """ This method implements the batch- wise training and testing protocol for both time series forecasting and classification of the timeseriesis_classification @@ -127,22 +141,27 @@ def fit(self, dataloaders, model_save_path=None, training_mode='forecasting', ep Parameters: ----------- dataloaders: Dictionary containing train and test dataloaders - train_loader: Dataloader object of train dataset with batch data [data,target,category] - test_loader: Dataloader object of test dataset with [data,target,category] + train_loader: Dataloader object of train dataset with batch data [data,target,ids] + test_loader: Dataloader object of test dataset with [data,target,ids] model_save_path: Directory path to save the model training_mode: Type of training ('forecasting', 'classification') epochs: Number of epochs to train """ assert model_save_path is not None, f"Model path {model_save_path} unknown" - assert training_mode in ['forecasting', 'classification', - 'regression'], f'Training mode {training_mode} unknown' + assert training_mode in [ + "forecasting", + "classification", + "regression", + ], f"Training mode {training_mode} unknown" self.model.to(device) - train_loader, test_loader = dataloaders.values() + train_loader = dataloaders['train_loader'] + test_loader = dataloaders['test_loader'] + # Training - for epoch in range(epochs): + for epoch in range(epochs + 1): test_loss_forecasting = 0 test_loss_classification = 0 test_loss_regression = 0 @@ -150,7 +169,9 @@ def fit(self, dataloaders, model_save_path=None, training_mode='forecasting', ep # Training 
self.model.train() total_loss = 0 - for idx, (data, target, category, parameters) in enumerate(train_loader): + for idx, (data, target, ids, parameters) in enumerate( + train_loader + ): # Reset optimizer states for optimizer in self.forecasting_optimizers: optimizer.zero_grad() @@ -161,27 +182,27 @@ def fit(self, dataloaders, model_save_path=None, training_mode='forecasting', ep for optimizer in self.regression_optimizers: optimizer.zero_grad() - if type(category) == list: - category = category[0] - data, target, category, parameters = ( + if type(ids) == list: + ids = ids[0] + data, target, ids, parameters = ( data.float().to(device), target.float().to(device), - category.to(device), - parameters.float().to(device) + ids.to(device), + parameters.float().to(device), ) if training_mode == "forecasting": - if self.model_type == "ae" or self.model_type == 'lstm': + if self.model_type == "ae" or self.model_type == "lstm": decoder_out = self.model( data, training=True, classify=False, latent=False ) - loss = Criterion().ae_criterion(decoder_out, target) + loss = Criterion().forecasting_criterion(decoder_out, target, loss_type=self.loss_type) else: # vae decoder_out, latent_out, mu, logvar = self.model( data, training=True, classify=False ) - loss = Criterion().vae_criterion( - decoder_out, target, mu, logvar + loss = Criterion().forecasting_criterion( + decoder_out, target, mu=mu, logvar=logvar, loss_type=self.loss_type ) loss.backward() @@ -193,15 +214,17 @@ def fit(self, dataloaders, model_save_path=None, training_mode='forecasting', ep data, training=True, classify=True, latent=False ) loss = Criterion().classifier_criterion( - classifier_out, (category - 1).long() + classifier_out, (ids - 1).long() ) loss.backward() for optimizer in self.classification_optimizers: optimizer.step() - elif training_mode == 'regression': - regressor_out = self.model(data, training=True, regress=True, latent=False) + elif training_mode == "regression": + regressor_out = self.model( + data, training=True, regress=True, latent=False + ) loss = Criterion().regressor_criterion( regressor_out, parameters ) @@ -214,75 +237,74 @@ def fit(self, dataloaders, model_save_path=None, training_mode='forecasting', ep print( "Epoch {} | {} loss {}".format( - epoch, training_mode, total_loss / (idx + 1) + epoch, training_mode, total_loss / len(train_loader.dataset) ) ) # Testing - if epoch % 10 == 0: + if epoch % 10 == 9 and epoch != 0: with torch.no_grad(): if self.classify: total = 0.0 correct = 0.0 self.model.eval() - for idx, (data, target, category, parameters) in enumerate(test_loader): - if type(category) == list: - category = category[0] - data, target, category, parameters = ( + for idx, (data, target, ids, parameters) in enumerate( + test_loader + ): + if type(ids) == list: + ids = ids[0] + data, target, ids, parameters = ( data.float().to(device), target.float().to(device), - category.to(device), - parameters.float().to(device) + ids.to(device), + parameters.float().to(device), ) # Time series forecasting test - if self.model_type == 'ae' or self.model_type == 'lstm': + if self.model_type == "ae" or self.model_type == "lstm": out = self.model( data, training=False, classify=False, latent=False ) test_loss_forecasting += ( - Criterion().ae_criterion(out, target).item() + Criterion().forecasting_criterion(out, target, loss_type=self.loss_type).item() ) else: decoder_out, latent_out, mu, logvar = self.model( - data, training=False, classify=False + data, training=False, classify=False, latent=True ) - 
test_loss_forecasting += Criterion().vae_criterion( - decoder_out, target, mu, logvar + test_loss_forecasting += Criterion().forecasting_criterion( + decoder_out, target, mu=mu, logvar=logvar, loss_type=self.loss_type ) # Classification test if self.classify: - category = category.long() - if self.model_type == 'ae' or self.model_type == 'lstm': - classifier_out = self.model( - data, training=False, classify=True - ) - else: - classifier_out, latent_out, mu, logvar = self.model( - data, training=False, classify=True - ) + ids = ids.long() + classifier_out = self.model( + data, training=False, classify=True, latent=False + ) test_loss_classification += ( Criterion() - .classifier_criterion(classifier_out, category - 1) + .classifier_criterion(classifier_out, ids - 1) .item() ) # Compute number of correct samples - total += category.size(0) + total += ids.size(0) _, predicted = torch.max(classifier_out.data, 1) - correct += (predicted == (category - 1)).sum().item() + correct += (predicted == (ids - 1)).sum().item() if self.regress: - regressor_out = self.model(data, training=True, regress=True, latent=False) + regressor_out = self.model( + data, training=False, regress=True, latent=False + ) test_loss_regression += Criterion().regressor_criterion( regressor_out, parameters ) test_loss_forecasting /= len(test_loader.dataset) print( - f"====> Mean test set generator loss: {test_loss_forecasting:.4f}" + f"====> Mean test set forecasting loss: {test_loss_forecasting:.4f}" ) if self.classify: accuracy = correct / total @@ -293,236 +315,99 @@ def fit(self, dataloaders, model_save_path=None, training_mode='forecasting', ep ) if self.regress: - print(f'====> Mean test set regressor loss: {test_loss_regression:.4f}') + print( + f"====> Mean test set regressor loss: {test_loss_regression:.4f}" + ) # Scheduler metric is test set loss if training_mode == "forecasting": for scheduler in self.forecasting_schedulers.values(): scheduler.step(test_loss_forecasting) - elif training_mode == 'classification': + elif training_mode == "classification": for scheduler in self.classification_schedulers.values(): scheduler.step(test_loss_classification) - elif training_mode == 'regression': + elif training_mode == "regression": for scheduler in self.regression_schedulers.values(): scheduler.step(test_loss_regression) # Save the model at target path utils.save(self.model, self.model_hyperparameters, PATH=model_save_path) + def validate(self, validation_loader): + # Perform model validation + validation_loss_forecasting = 0.0 + validation_loss_classification = 0.0 + validation_loss_regression = 0.0 + with torch.no_grad(): + if self.classify: + total = 0.0 + correct = 0.0 + self.model.eval() + for idx, (data, target, ids, parameters) in enumerate(validation_loader): + if type(ids) == list: + ids = ids[0] + data, target, ids, parameters = ( + data.float().to(device), + target.float().to(device), + ids.to(device), + parameters.float().to(device), + ) + # Time series forecasting test + if self.model_type == "ae" or self.model_type == "lstm": + out = self.model( + data, training=False, classify=False, latent=False + ) + validation_loss_forecasting += ( + Criterion().forecasting_criterion(out, target, loss_type=self.loss_type).item() + ) -class CustomTrainer: - """ - Wrapper for training and testing user defined models - Args: - model: Custom/User-defined model - optimizer_type: Type of optimizer to use for training.Should be from ['Adam', 'Adadelta', 'Adagrad', - 'AdamW', 'SparseAdam', 'RMSprop', ' - Rprop', 'LBFGS', 
'ASGD', 'Adamax'] - device: Selected device; 'cuda' or 'cpu' - epochs: Number of epochs to train the network - lr:Optimizer learning rate - lr_factor: Factor by which the learning rate will be reduced - scheduler_patience: Number of epochs with no improvement after which learning rate will be reduced. - For example, if patience = 2, then we will ignore the first 2 epochs with no - improvement, and will only decrease the LR after the 3rd epoch if the loss still - hasn’t improved then. - """ - - def __init__( - self, - model: torch.nn.Module, - optimizer_type: None, - criterion: None, - epochs: int, - lr: float = 0.001, - lr_factor: float = 0.001, - scheduler_patience: int = 10, - ): - self.model = model - self.optimizer_type = optimizer_type - self.criterion = criterion - self.epochs = epochs - self.lr = lr - self.lr_factor = lr_factor - self.scheduler_patience = scheduler_patience - - self.model_type = "custom" - optimizer = Optimizer(self.model_type, self.model, self.optimizer_type) - self.optimizer = optimizer.get_optimizers(lr=self.lr) - self.scheduler = optimizer.get_lrschedulers( - factor=self.lr_factor, patience=self.scheduler_patience - ) - self.viz = True - - def fit(self, dataloaders, model_save_path): - - """ Implements the batch wise training and testing for time series forecasting - Save train, test and validation performance in forecasting/classification tasks as a performance.csv - Args: - dataloaders: Dictionary containing train and test dataloaders - train_loader: Dataloader object of train dataset with batch data [data,target,category] - test_loader: Dataloader object of test dataset with [data,target,category] - model_save_path: Directory path to save the model - Return: - None - """ - - # Init Visualization - if self.viz == "True": - self.fig = plt.figure(num="Latent Network Activity") - self.plt_close = False - self.directednetwork = visualizer.DirectedNetwork() - - self.fig2 = plt.figure(num="Local Linear Embedded Trajectory") - self.plt2_close = False - self.lle = visualizer.LocalLinearEmbedding() - - self.fig3 = plt.figure(num="Spectral Embedded Latent") - self.plt3_close = False - self.spectral_clustering = visualizer.SpectralEmbedding() - - plt.pause(0.00001) - - # Training loop - self.model.to(device) - train_loader, test_loader = dataloaders.values() - for epoch in range(self.epochs): - if epoch > 0: - self.model.train() - total_loss = 0 - for idx, (data, target, _) in enumerate(train_loader): - self.optimizer.zero_grad() - data, target = data.float().to(device), target.float().to(device) - activations, output = self.model(data) - loss = self.criterion(output, target) - loss.backward() - self.optimizer.step() - total_loss += loss - - # TODO: Wrapper for visualization at visualizer. 
- if self.viz == "True": - # Visualize the network during training - if not self.plt_close: - # Get the hidden to hidden weights in the network and plot the connections - # TODO: Visualization of multiple layer activations in a window - hidden_weights = dict( - self.model.lstm.w_hhl0.clone().detach().numpy() - ) - - # Hidden activativations - hidden_activ = list(activations.clone().detach().numpy()[0]) - - try: - plt_close = self.directednetwork.show( - hidden_activ, hidden_weights, self.fig4 - ) - except Exception: - plt_close = True - pass + else: + decoder_out, latent_out, mu, logvar = self.model( + data, training=False, classify=False + ) + validation_loss_forecasting += Criterion().forecasting_criterion( + decoder_out, target, mu=mu, logvar=logvar, loss_type=self.loss_type + ) - plt_close = self.directednetwork.show( - hidden_activ, hidden_weights, self.fig - ) + # Classification test + if self.classify: + ids = ids.long() + classifier_out = self.model( + data, training=False, classify=True, latent=False + ) - # # Visualize the network during training - if not self.plt2_close: - # Get the principle components - pc = self.lle.local_linear_embedding( - X=activations.clone().detach().numpy(), - d=3, - k=20, - alpha=0.1, - ) - plt2_close = self.lle.show(pc, self.fig2) + validation_loss_classification += ( + Criterion() + .classifier_criterion(classifier_out, ids - 1) + .item() + ) - # Visualize the graph embedding using spectral clusters - if not self.plt3_close: - # Get the principle components - embeddings = self.spectral_clustering.spectral_embedding( - X=activations.clone().detach().numpy(), rad=0.8 - ) - plt3_close = self.spectral_clustering.show( - activations.clone().detach().numpy(), - embeddings, - self.fig3, - ) + # Compute number of correct samples + total += ids.size(0) + _, predicted = torch.max(classifier_out.data, 1) + correct += (predicted == (ids - 1)).sum().item() - print("Epoch {} | loss {}".format(epoch, total_loss / (idx + 1))) + if self.regress: + regressor_out = self.model( + data, training=True, regress=True, latent=False + ) + validation_loss_regression += Criterion().regressor_criterion( + regressor_out, parameters + ) - # Testing loop - if epoch % 10 == 0: - with torch.no_grad(): - self.model.eval() - self.test_loss_forecasting = 0 - for idx, (data, target, _) in enumerate(list(test_loader)): - data, target = ( - data.float().to(device), - target.float().to(device), - ) - activations, out = self.model(data) - self.test_loss_forecasting += self.criterion(out, target).item() + validation_loss_forecasting /= len(validation_loader.dataset) + print( + f"====> Mean Validation set generator loss: {validation_loss_forecasting:.4f}" + ) + if self.classify: + accuracy = correct / total + if validation_loss_classification != 0: + validation_loss_classification /= len(validation_loader.dataset) + print( + f"====> Mean Validation set classifier loss: {validation_loss_classification:.4f}; accuracy: {accuracy:.4f}" + ) - self.test_loss_forecasting /= len(test_loader.dataset) + if self.regress: print( - f"====> Test set generator loss: {self.test_loss_forecasting:.4f}" + f"====> Mean Validation set regressor loss: {validation_loss_regression:.4f}" ) - - # Scheduler metric is test set loss - self.scheduler.step(self.test_loss_forecasting) - - # Save the model at target path - utils.save(self.model, hyperparameters=None, PATH=model_save_path) - - -class VAEGANTrainer: - def __init__(self): - pass - - def train(self): - return NotImplementedError - - -class IRLTrainer: - def 
__init__(self): - pass - - def train(self): - return NotImplementedError - - -# TODO -class Trainer: - """Wraps all above Trainers. Instantiate and return the Trainer of model type """ - - def __init__(self, *model_hyperparameters, **kwargs): - self.model_type = model_hyperparameters["model_type"] - self.TrainerType = None - - @property - def TrainerType(self): - return self.__TrainerType - - @TrainerType.setter - def TrainerType(self, model_type): - """[summary] - - Args: - model_type ([type]): [description] - """ - if model_type in ["ae", "vae"]: - self.__TrainerType = HybridTrainer - elif model_type in ["lstm"]: - self.__TrainerType = LSTMTrainer - else: - self.__TrainerType = CustomTrainer - - # Check model type, instantiate and set corresponding trainer as traja trainer: - def __new__(cls): - # Generative model trainer(model_type) - - # Predictive model trainer(model_type) - - # Custom trainer(classify=False) - - # Return the instance of the trainer - return NotImplementedError diff --git a/traja/models/visualizer.py b/traja/models/visualizer.py index 832cd43f..0f450bd9 100644 --- a/traja/models/visualizer.py +++ b/traja/models/visualizer.py @@ -20,7 +20,7 @@ def DisplayLatentDynamics(latent): - """Visualize the dynamics of combination of latents + """Visualize the dynamics in latent space. Compatible only with the RNN latents Args: latent(tensor): Each point in the list is latent's state at the end of a sequence of each batch. Latent shape (batch_size, latent_dim) @@ -262,7 +262,7 @@ def show(self, X, spec_embed, fig3): if __name__ == "__main__": # create the coordinates - numebr_of_points = 21 + number_of_points = 21 small_range = -1.0 large_range = 1.0 @@ -270,7 +270,7 @@ def show(self, X, spec_embed, fig3): ycoordinates = np.linspace(small_range, large_range, num=numebr_of_points) xcoord_mesh, ycoord_mesh = np.meshgrid(xcoordinates, ycoordinates) - inds = np.array(range(numebr_of_points ** 2)) + inds = np.array(range(number_of_points ** 2)) s1 = xcoord_mesh.ravel()[inds] s2 = ycoord_mesh.ravel()[inds] coordinate = np.c_[s1, s2] @@ -280,9 +280,9 @@ def show(self, X, spec_embed, fig3): " to ", large_range, " with ", - numebr_of_points, + number_of_points, " total number of coordinate: ", - numebr_of_points ** 2, + number_of_points ** 2, ) diff --git a/traja/parsers.py b/traja/parsers.py index 20574edb..e907d76e 100644 --- a/traja/parsers.py +++ b/traja/parsers.py @@ -53,16 +53,16 @@ def from_df(df: pd.DataFrame, xcol=None, ycol=None, time_col=None, **kwargs): def read_file( - filepath: str, - id: Optional[str] = None, - xcol: Optional[str] = None, - ycol: Optional[str] = None, - parse_dates: Union[str, bool] = False, - xlim: Optional[tuple] = None, - ylim: Optional[tuple] = None, - spatial_units: str = "m", - fps: Optional[float] = None, - **kwargs, + filepath: str, + id: Optional[str] = None, + xcol: Optional[str] = None, + ycol: Optional[str] = None, + parse_dates: Union[str, bool] = False, + xlim: Optional[tuple] = None, + ylim: Optional[tuple] = None, + spatial_units: str = "m", + fps: Optional[float] = None, + **kwargs, ): """Convenience method wrapping pandas `read_csv` and initializing metadata. 
@@ -101,11 +101,11 @@ def read_file( stripped_cols = {c: lambda x: x.strip() for c in whitespace_cols} converters = {**stripped_cols, **kwargs.pop("converters", {})} - # Downcast to float32 # TODO: Benchmark float32 vs float64 for very big datasets + # Downcast to float32 # TODO: Benchmark float32 vs float64 for very big dataset float_cols = df_test.select_dtypes(include=[np.float]).columns float32_cols = {c: np.float32 for c in float_cols} - # Convert string columns to categories + # Convert string columns to sequence_ids string_cols = [c for c in df_test if df_test[c].dtype == str] category_cols = {c: "category" for c in string_cols} dtype = {**float32_cols, **category_cols, **kwargs.pop("dtype", {})} diff --git a/traja/plotting.py b/traja/plotting.py index 83dd5fd1..eaf1162c 100644 --- a/traja/plotting.py +++ b/traja/plotting.py @@ -1,16 +1,17 @@ +import logging from collections import OrderedDict from datetime import timedelta -import logging from typing import Union, Optional, Tuple, List import matplotlib import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import torch +from matplotlib import dates as md from matplotlib.axes import Axes from matplotlib.collections import PathCollection -from matplotlib import dates as md from matplotlib.figure import Figure -import numpy as np -import pandas as pd from pandas.core.dtypes.common import ( is_datetime_or_timedelta_dtype, is_datetime64_any_dtype, @@ -21,7 +22,6 @@ from traja.frame import TrajaDataFrame from traja.trajectory import coords_to_flow - __all__ = [ "_get_after_plot_args", "_label_axes", @@ -48,7 +48,7 @@ "plot_transition_matrix", "plot_xy", "polar_bar", - "predict", + "plot_prediction", "sans_serif", "stylize_axes", "trip_grid", @@ -75,22 +75,44 @@ def _rolling(df, window, step): count = 0 df_length = len(df) while count < (df_length - window): - yield count, df[count : window + count] + yield count, df[count: window + count] count += step -def predict( - xy: np.ndarray, - nb_steps: int = 10, - epochs: int = 1000, - batch_size: int = 1, - model="lstm", -): # pragma: no cover - """Method for training and visualizing LSTM with trajectory datasets.""" - if model == "lstm": - from traja.models.nn import TrajectoryLSTM +def plot_prediction(model, dataloader, index, scaler=None): + device = "cuda" if torch.cuda.is_available() else "cpu" + fig, ax = plt.subplots(2, 1, figsize=(10, 10)) + model = model.to(device) + batch_size = model.batch_size + num_past = model.num_past + input_size = model.input_size + + data, target, category, parameters = list(iter(dataloader))[index] + data = data.float().to(device) + prediction = model(data, latent=False) + + # Send tensors to CPU so numpy can work with them + pred = prediction[batch_size - 1:batch_size, :].cpu().squeeze().detach().numpy() + target = target.clone().detach()[batch_size - 1:batch_size, :].squeeze() + real = target.cpu() + + data = data.cpu().reshape(batch_size * num_past, input_size).detach().numpy() - TrajectoryLSTM(xy, nb_steps=nb_steps, epochs=epochs, batch_size=batch_size) + if scaler: + data = scaler.inverse_transform(data) + real = scaler.inverse_transform(real) + pred = scaler.inverse_transform(pred) + + ax[0].plot(data[:, 0], data[:, 1], label="History") + ax[0].plot(real[:, 0], real[:, 1], label="Real") + ax[0].plot(pred[:, 0], pred[:, 1], label="Pred") + + ax[1].scatter(real[:, 0], real[:, 1], label="Real") + ax[1].scatter(pred[:, 0], pred[:, 1], label="Pred") + + for a in ax: + a.legend() + plt.show() def bar_plot(trj: TrajaDataFrame, bins: 
Union[int, tuple] = None, **kwargs) -> Axes: @@ -106,7 +128,6 @@ def bar_plot(trj: TrajaDataFrame, bins: Union[int, tuple] = None, **kwargs) -> A """ # TODO: Add time component - from mpl_toolkits.mplot3d import Axes3D bins = traja.trajectory._bins_to_tuple(trj, bins) @@ -154,7 +175,7 @@ def plot_rolling_hull(trj: TrajaDataFrame, window=100, step=20, areas=False, **k hull_areas.append(hull.area) plt.plot(hull_areas, **kwargs) plt.title(f"Rolling Trajectory Convex Hull Area\nWindow={window},Step={step}") - plt.ylabel(f"Area {trj.__dict__.get('spatial_units','m')}") + plt.ylabel(f"Area {trj.__dict__.get('spatial_units', 'm')}") plt.xlabel("Frame") else: xlim, ylim = traja.trajectory._get_xylim(trj) @@ -162,14 +183,14 @@ def plot_rolling_hull(trj: TrajaDataFrame, window=100, step=20, areas=False, **k plt.ylim = ylim for idx, hull in enumerate(hulls): if hasattr( - hull, "exterior" + hull, "exterior" ): # Occassionally a Point object without it reaches plt.plot(*hull.exterior.xy, alpha=idx / len(hulls), c="k", **kwargs) ax = plt.gca() ax.set_aspect("equal") ax.set( - xlabel=f"x ({trj.__dict__.get('spatial_units','m')})", - ylabel=f"y ({trj.__dict__.get('spatial_units','m')})", + xlabel=f"x ({trj.__dict__.get('spatial_units', 'm')})", + ylabel=f"y ({trj.__dict__.get('spatial_units', 'm')})", title="Rolling Trajectory Convex Hull\nWindow={window},Step={step}", ) @@ -203,8 +224,6 @@ def plot_period(trj: TrajaDataFrame, col="x", dark=(7, 19), **kwargs): def plot_rolling_hull_3d(trj: TrajaDataFrame, window=100, step=20, **kwargs): - from mpl_toolkits.mplot3d import Axes3D - hulls = [] fig = plt.figure() @@ -233,8 +252,8 @@ def plot_rolling_hull_3d(trj: TrajaDataFrame, window=100, step=20, **kwargs): ax.plot(*xy, z) ax.set( - xlabel=f"{trj.__dict__.get('spatial_units','m')}", - ylabel=f"{trj.__dict__.get('spatial_units','m')}", + xlabel=f"{trj.__dict__.get('spatial_units', 'm')}", + ylabel=f"{trj.__dict__.get('spatial_units', 'm')}", title=f"Rolling Trajectory Convex Hull\nWindow={window},Step={step}", ) @@ -260,7 +279,6 @@ def plot_3d(trj: TrajaDataFrame, **kwargs) -> matplotlib.collections.PathCollect rt.traja.plot_3d() """ - from mpl_toolkits.mplot3d import Axes3D fig = plt.figure() ax = fig.add_subplot(111, projection="3d") @@ -275,7 +293,7 @@ def plot_3d(trj: TrajaDataFrame, **kwargs) -> matplotlib.collections.PathCollect NPOINTS = len(trj) ax.set_prop_cycle(color=[cm(1.0 * i / (NPOINTS - 1)) for i in range(NPOINTS - 1)]) for i in range(NPOINTS - 1): - ax.plot(trj.x[i : i + 2], trj.y[i : i + 2], trj.index[i : i + 2]) + ax.plot(trj.x[i: i + 2], trj.y[i: i + 2], trj.index[i: i + 2]) dist = kwargs.pop("dist", None) if dist: @@ -288,13 +306,14 @@ def plot_3d(trj: TrajaDataFrame, **kwargs) -> matplotlib.collections.PathCollect return ax + def plot( - trj: TrajaDataFrame, - n_coords: Optional[int] = None, - show_time: bool = False, - accessor: Optional[traja.TrajaAccessor] = None, - ax=None, - **kwargs, + trj: TrajaDataFrame, + n_coords: Optional[int] = None, + show_time: bool = False, + accessor: Optional[traja.TrajaAccessor] = None, + ax=None, + **kwargs, ) -> matplotlib.collections.PathCollection: """Plot trajectory for single animal over period. 
@@ -433,9 +452,9 @@ def plot( elif time_col and is_datetime: cbar_labels = ( trj[time_col] - .iloc[indices] - .dt.strftime("%Y-%m-%d %H:%M:%S") - .values.astype(str) + .iloc[indices] + .dt.strftime("%Y-%m-%d %H:%M:%S") + .values.astype(str) ) else: # Convert frames to time @@ -479,12 +498,12 @@ def plot_periodogram(trj, coord: str = "y", fs: int = 1, interactive: bool = Tru def plot_autocorrelation( - trj: TrajaDataFrame, - coord: str = "y", - unit: str = "Days", - sample_rate: float = 3.0, - xmax: int = 1000, - interactive: bool = True, + trj: TrajaDataFrame, + coord: str = "y", + unit: str = "Days", + sample_rate: float = 3.0, + xmax: int = 1000, + interactive: bool = True, ): """Plot autocorrelation of given coordinate. @@ -515,10 +534,10 @@ def plot_autocorrelation( def plot_collection( - trjs: Union[pd.DataFrame, TrajaDataFrame], - id_col: str = "id", - colors: Optional[Union[dict, List[str]]] = None, - **kwargs, + trjs: Union[pd.DataFrame, TrajaDataFrame], + id_col: str = "id", + colors: Optional[Union[dict, List[str]]] = None, + **kwargs, ): """Plot trajectories of multiple subjects identified by `id`. @@ -602,10 +621,10 @@ def _label_axes(trj: TrajaDataFrame, ax) -> Axes: def plot_quiver( - trj: TrajaDataFrame, - bins: Optional[Union[int, tuple]] = None, - quiverplot_kws: dict = {}, - **kwargs, + trj: TrajaDataFrame, + bins: Optional[Union[int, tuple]] = None, + quiverplot_kws: dict = {}, + **kwargs, ) -> Axes: """Plot average flow from each grid cell to neighbor. @@ -634,14 +653,14 @@ def plot_quiver( def plot_contour( - trj: TrajaDataFrame, - bins: Optional[Union[int, tuple]] = None, - filled: bool = True, - quiver: bool = True, - contourplot_kws: dict = {}, - contourfplot_kws: dict = {}, - quiverplot_kws: dict = {}, - **kwargs, + trj: TrajaDataFrame, + bins: Optional[Union[int, tuple]] = None, + filled: bool = True, + quiver: bool = True, + contourplot_kws: dict = {}, + contourfplot_kws: dict = {}, + quiverplot_kws: dict = {}, + **kwargs, ) -> Axes: """Plot average flow from each grid cell to neighbor. @@ -680,10 +699,10 @@ def plot_contour( def plot_surface( - trj: TrajaDataFrame, - bins: Optional[Union[int, tuple]] = None, - cmap: str = "jet", - **surfaceplot_kws: dict, + trj: TrajaDataFrame, + bins: Optional[Union[int, tuple]] = None, + cmap: str = "jet", + **surfaceplot_kws: dict, ) -> Figure: """Plot surface of flow from each grid cell to neighbor in 3D. @@ -696,7 +715,6 @@ def plot_surface( Returns: ax (:class:`~matplotlib.axes.Axes`): Axes of quiver plot """ - from mpl_toolkits.mplot3d import Axes3D after_plot_args, surfaceplot_kws = _get_after_plot_args(**surfaceplot_kws) @@ -721,13 +739,13 @@ def plot_surface( def plot_stream( - trj: TrajaDataFrame, - bins: Optional[Union[int, tuple]] = None, - cmap: str = "jet", - contourfplot_kws: dict = {}, - contourplot_kws: dict = {}, - streamplot_kws: dict = {}, - **kwargs, + trj: TrajaDataFrame, + bins: Optional[Union[int, tuple]] = None, + cmap: str = "jet", + contourfplot_kws: dict = {}, + contourplot_kws: dict = {}, + streamplot_kws: dict = {}, + **kwargs, ) -> Figure: """Plot average flow from each grid cell to neighbor. 
@@ -763,15 +781,15 @@ def plot_stream( def plot_flow( - trj: TrajaDataFrame, - kind: str = "quiver", - *args, - contourplot_kws: dict = {}, - contourfplot_kws: dict = {}, - streamplot_kws: dict = {}, - quiverplot_kws: dict = {}, - surfaceplot_kws: dict = {}, - **kwargs, + trj: TrajaDataFrame, + kind: str = "quiver", + *args, + contourplot_kws: dict = {}, + contourfplot_kws: dict = {}, + streamplot_kws: dict = {}, + quiverplot_kws: dict = {}, + surfaceplot_kws: dict = {}, + **kwargs, ) -> Figure: """Plot average flow from each grid cell to neighbor. @@ -818,13 +836,13 @@ def _get_after_plot_args(**kwargs: dict) -> (dict, dict): def trip_grid( - trj: TrajaDataFrame, - bins: Union[tuple, int] = 10, - log: bool = False, - spatial_units: str = None, - normalize: bool = False, - hist_only: bool = False, - **kwargs, + trj: TrajaDataFrame, + bins: Union[tuple, int] = 10, + log: bool = False, + spatial_units: str = None, + normalize: bool = False, + hist_only: bool = False, + **kwargs, ) -> Tuple[np.ndarray, PathCollection]: """Generate a heatmap of time spent by point-to-cell gridding. @@ -890,7 +908,7 @@ def _process_after_plot_args(**after_plot_args): def color_dark( - series: pd.Series, ax: matplotlib.axes.Axes = None, start: int = 19, end: int = 7 + series: pd.Series, ax: matplotlib.axes.Axes = None, start: int = 19, end: int = 7 ): """Color dark phase in plot.""" assert is_datetime_or_timedelta_dtype( @@ -899,7 +917,7 @@ def color_dark( if not ax: ax = plt.gca() - + # get boundaries for dark times dark_mask = (series.index.hour >= start) | (series.index.hour < end) run_values, run_starts, run_lengths = find_runs(dark_mask) @@ -980,7 +998,7 @@ def plot_xy(xy: np.ndarray, *args: Optional, **kwargs: Optional): def plot_actogram( - series: pd.Series, dark=(19, 7), ax: matplotlib.axes.Axes = None, **kwargs + series: pd.Series, dark=(19, 7), ax: matplotlib.axes.Axes = None, **kwargs ): """Plot activity or displacement as an actogram. @@ -994,8 +1012,8 @@ def plot_actogram( assert is_datetime_or_timedelta_dtype( series.index ), f"Series must have datetime index but has {type(series.index)}" - - after_plot_args, _ = _get_after_plot_args(**kwargs) + + after_plot_args, _ = _get_after_plot_args(**kwargs) ax = series.plot(ax=ax) ax.set_ylabel(series.name) @@ -1006,12 +1024,12 @@ def plot_actogram( def _polar_bar( - radii: np.ndarray, - theta: np.ndarray, - bin_size: int = 2, - ax: Optional[matplotlib.axes.Axes] = None, - overlap: bool = True, - **kwargs: str, + radii: np.ndarray, + theta: np.ndarray, + bin_size: int = 2, + ax: Optional[matplotlib.axes.Axes] = None, + overlap: bool = True, + **kwargs: str, ) -> Axes: after_plot_args, kwargs = _get_after_plot_args(**kwargs) @@ -1045,13 +1063,13 @@ def _polar_bar( def polar_bar( - trj: TrajaDataFrame, - feature: str = "turn_angle", - bin_size: int = 2, - threshold: float = 0.001, - overlap: bool = True, - ax: Optional[matplotlib.axes.Axes] = None, - **plot_kws: str, + trj: TrajaDataFrame, + feature: str = "turn_angle", + bin_size: int = 2, + threshold: float = 0.001, + overlap: bool = True, + ax: Optional[matplotlib.axes.Axes] = None, + **plot_kws: str, ) -> Axes: """Plot polar bar chart. 
@@ -1082,7 +1100,7 @@ def polar_bar( trj = trj[pd.notnull(trj.displacement)] assert ( - len(trj) > 0 + len(trj) > 0 ), f"Dataframe is empty after filtering for step distance threshold {threshold}" ax = _polar_bar( @@ -1097,11 +1115,11 @@ def polar_bar( def plot_clustermap( - displacements: List[pd.Series], - rule: Optional[str] = None, - nr_steps=None, - colors: Optional[List[Union[int, str]]] = None, - **kwargs, + displacements: List[pd.Series], + rule: Optional[str] = None, + nr_steps=None, + colors: Optional[List[Union[int, str]]] = None, + **kwargs, ): """Plot cluster map / dendrogram of trajectories with DatetimeIndex. @@ -1167,9 +1185,9 @@ def _get_markov_edges(Q: pd.DataFrame, greater_than=0.1): def plot_transition_graph( - data: Union[pd.DataFrame, traja.TrajaDataFrame, np.ndarray], - outpath="markov.dot", - interactive=True, + data: Union[pd.DataFrame, traja.TrajaDataFrame, np.ndarray], + outpath="markov.dot", + interactive=True, ): """Plot transition graph with networkx. @@ -1188,9 +1206,9 @@ def plot_transition_graph( raise ImportError(f"{e} - please install it with pip") if ( - isinstance(data, (traja.TrajaDataFrame)) - or isinstance(data, pd.DataFrame) - and "x" in data + isinstance(data, (traja.TrajaDataFrame)) + or isinstance(data, pd.DataFrame) + and "x" in data ): transition_matrix = traja.transitions(data) edges_wts = _get_markov_edges(pd.DataFrame(transition_matrix)) @@ -1226,9 +1244,9 @@ def plot_transition_graph( def plot_transition_matrix( - data: Union[pd.DataFrame, traja.TrajaDataFrame, np.ndarray], - interactive=True, - **kwargs, + data: Union[pd.DataFrame, traja.TrajaDataFrame, np.ndarray], + interactive=True, + **kwargs, ) -> matplotlib.image.AxesImage: """Plot transition matrix. diff --git a/traja/test/test_data/test_plotting.py b/traja/test/test_data/test_plotting.py deleted file mode 100644 index e69de29b..00000000 diff --git a/traja/tests/test_accessor.py b/traja/tests/test_accessor.py index ac97b44b..1b7c45a5 100644 --- a/traja/tests/test_accessor.py +++ b/traja/tests/test_accessor.py @@ -1,5 +1,5 @@ -import shapely import pandas as pd +import shapely import traja @@ -30,15 +30,15 @@ def test_xy(): assert xy.shape == (20, 2) -#def test_calc_derivatives(): +# def test_calc_derivatives(): # df.traja.calc_derivatives() -#def test_get_derivatives(): +# def test_get_derivatives(): # df.traja.get_derivatives() -#def test_speed_intervals(): +# def test_speed_intervals(): # si = df.traja.speed_intervals(faster_than=100) # assert isinstance(si, traja.TrajaDataFrame) diff --git a/traja/tests/test_dataset.py b/traja/tests/test_dataset.py new file mode 100644 index 00000000..a1b42d77 --- /dev/null +++ b/traja/tests/test_dataset.py @@ -0,0 +1,560 @@ +import pandas as pd + +from traja.dataset import dataset + + +def test_time_based_sampling_dataloaders_do_not_overlap(): + data = list() + num_ids = 140 + sequence_length = 2000 + + # Hyperparameters + batch_size = 10 + num_past = 10 + num_future = 5 + train_split_ratio = 0.501 + validation_split_ratio = 0.25 + + split_by_id = False # The test condition + + # The train[0] column should contain only 1s, the test column should contain 2s and the + # validation column set should contain 3s. + # When scaled, this translates to -1., 0 and 1. respectively. 
+ for sample_id in range(num_ids): + for element in range(round(sequence_length * train_split_ratio)): + data.append([1, element, sample_id]) + for element in range(round(sequence_length * (1 - train_split_ratio - validation_split_ratio))): + data.append([2, element, sample_id]) + for element in range(round(sequence_length * validation_split_ratio)): + data.append([3, element, sample_id]) + + df = pd.DataFrame(data, columns=['x', 'y', 'ID']) + + dataloaders = dataset.MultiModalDataLoader(df, + batch_size=batch_size, + n_past=num_past, + n_future=num_future, + num_workers=1, + train_split_ratio=train_split_ratio, + validation_split_ratio=validation_split_ratio, + split_by_id=split_by_id) + + for data, target, ids, parameters in dataloaders['train_loader']: + for sequence in data: + for sample in sequence: + assert sample[0] == -1. + for sequence in target: + for sample in sequence: + assert sample[0] == -1. + + for data, target, ids, parameters in dataloaders['test_loader']: + for sequence in data: + for sample in sequence: + assert sample[0] == 0 + for sequence in target: + for sample in sequence: + assert sample[0] == 0 + + for data, target, ids, parameters in dataloaders['validation_loader']: + for sequence in data: + for sample in sequence: + assert sample[0] == 1 + for sequence in target: + for sample in sequence: + assert sample[0] == 1 + + +def test_time_based_sampling_dataloaders_with_short_stride_do_not_overlap(): + data = list() + num_ids = 140 + sequence_length = 2000 + + # Hyperparameters + batch_size = 15 + num_past = 10 + num_future = 5 + train_split_ratio = 0.498 + validation_split_ratio = 0.25 + + stride = 5 + + split_by_id = False # The test condition + + # The train[0] column should contain only 1s, the test column should contain 2s and the + # validation column set should contain 3s. + # When scaled, this translates to -1., 0 and 1. respectively. + for sample_id in range(num_ids): + for element in range(round(sequence_length * train_split_ratio) - 6): + data.append([1, element, sample_id]) + for element in range(round(sequence_length * (1 - train_split_ratio - validation_split_ratio)) + -4): + data.append([2, element, sample_id]) + for element in range(round(sequence_length * validation_split_ratio) + 10): + data.append([3, element, sample_id]) + + df = pd.DataFrame(data, columns=['x', 'y', 'ID']) + + dataloaders = dataset.MultiModalDataLoader(df, + batch_size=batch_size, + n_past=num_past, + n_future=num_future, + num_workers=1, + train_split_ratio=train_split_ratio, + validation_split_ratio=validation_split_ratio, + split_by_id=split_by_id, + stride=stride) + + for data, target, ids, parameters in dataloaders['train_loader']: + for sequence in data: + for sample in sequence: + assert sample[0] == -1. + for sequence in target: + for sample in sequence: + assert sample[0] == -1. 
+ + for data, target, ids, parameters in dataloaders['test_loader']: + for sequence in data: + for sample in sequence: + assert sample[0] == 0 + for sequence in target: + for sample in sequence: + assert sample[0] == 0 + + for data, target, ids, parameters in dataloaders['validation_loader']: + for sequence in data: + for sample in sequence: + assert sample[0] == 1 + for sequence in target: + for sample in sequence: + assert sample[0] == 1 + + +def test_time_based_sampling_dataloaders_with_stride_one_do_not_overlap(): + data = list() + num_ids = 2 + sequence_length = 200 + + # Hyperparameters + batch_size = 15 + num_past = 10 + num_future = 5 + train_split_ratio = 0.5 + validation_split_ratio = 0.25 + + stride = 1 + + split_by_id = False # The test condition + + # The train[0] column should contain only 1s, the test column should contain 2s and the + # validation column set should contain 3s. + # When scaled, this translates to -1., 0 and 1. respectively. + for sample_id in range(num_ids): + for element in range(round(sequence_length * train_split_ratio) - 8): + data.append([1, element, sample_id]) + for element in range(round(sequence_length * (1 - train_split_ratio - validation_split_ratio)) - 4): + data.append([2, element, sample_id]) + for element in range(round(sequence_length * validation_split_ratio) + 12): + data.append([3, element, sample_id]) + + df = pd.DataFrame(data, columns=['x', 'y', 'ID']) + + dataloaders = dataset.MultiModalDataLoader(df, + batch_size=batch_size, + n_past=num_past, + n_future=num_future, + num_workers=4, + train_split_ratio=train_split_ratio, + validation_split_ratio=validation_split_ratio, + split_by_id=split_by_id, + stride=stride) + + for data, target, ids, parameters in dataloaders['train_loader']: + for sequence in data: + for sample in sequence: + assert sample[0] == -1. + for sequence in target: + for sample in sequence: + assert sample[0] == -1. 
+ + for data, target, ids, parameters in dataloaders['test_loader']: + for sequence in data: + for sample in sequence: + assert sample[0] == 0 + for sequence in target: + for sample in sequence: + assert sample[0] == 0 + + for data, target, ids, parameters in dataloaders['validation_loader']: + for sequence in data: + for sample in sequence: + assert sample[0] == 1 + for sequence in target: + for sample in sequence: + assert sample[0] == 1 + + +def test_time_based_weighted_sampling_dataloaders_do_not_overlap(): + data = list() + num_ids = 232 + sample_id = 0 + + for sequence_id in range(num_ids): + for sequence in range(40 + (int(sequence_id * 2.234) % 117)): + data.append([sequence, sample_id, sequence_id]) + sample_id += 1 + + df = pd.DataFrame(data, columns=['x', 'y', 'ID']) + + # Hyperparameters + batch_size = 10 + num_past = 10 + num_future = 5 + train_split_ratio = 0.333 + validation_split_ratio = 0.333 + + dataloaders = dataset.MultiModalDataLoader(df, + batch_size=batch_size, + n_past=num_past, + n_future=num_future, + num_workers=1, + train_split_ratio=train_split_ratio, + validation_split_ratio=validation_split_ratio, + scale=False, + split_by_id=False, + weighted_sampling=True, + stride=1) + + train_ids = extract_sample_ids_from_dataloader(dataloaders['train_loader']) + test_ids = extract_sample_ids_from_dataloader(dataloaders['test_loader']) + validation_ids = extract_sample_ids_from_dataloader(dataloaders['validation_loader']) + sequential_train_ids = extract_sample_ids_from_dataloader(dataloaders['sequential_train_loader']) + sequential_test_ids = extract_sample_ids_from_dataloader(dataloaders['sequential_test_loader']) + sequential_validation_ids = extract_sample_ids_from_dataloader(dataloaders['sequential_validation_loader']) + + verify_that_indices_belong_to_precisely_one_loader(train_ids, test_ids, validation_ids) + verify_that_indices_belong_to_precisely_one_loader(sequential_train_ids, sequential_test_ids, + sequential_validation_ids) + + +def test_id_wise_sampling_with_few_ids_does_not_put_id_in_multiple_dataloaders(): + data = list() + num_ids = 5 + sample_id = 0 + + for sequence_id in range(num_ids): + for sequence in range(40 + int(sequence_id / 14)): + data.append([sequence, sample_id, sequence_id]) + sample_id += 1 + + df = pd.DataFrame(data, columns=['x', 'y', 'ID']) + + # Hyperparameters + batch_size = 1 + num_past = 10 + num_future = 5 + train_split_ratio = 0.5 + validation_split_ratio = 0.2 + + dataloaders = dataset.MultiModalDataLoader(df, + batch_size=batch_size, + n_past=num_past, + n_future=num_future, + num_workers=1, + train_split_ratio=train_split_ratio, + validation_split_ratio=validation_split_ratio, + scale=False) + + verify_sequential_id_sampled_sequential_dataloaders_equal_dataloaders(dataloaders, train_split_ratio, + validation_split_ratio, num_ids) + + +def test_id_wise_sampling_with_short_sequences_does_not_divide_by_zero(): + data = list() + num_ids = 283 + sample_id = 0 + + for sequence_id in range(num_ids): + for sequence in range(1 + (sequence_id % 74)): # Some sequences will generate zero time series + data.append([sequence, sample_id, sequence_id]) + sample_id += 1 + + df = pd.DataFrame(data, columns=['x', 'y', 'ID']) + + # Hyperparameters + batch_size = 1 + num_past = 10 + num_future = 5 + train_split_ratio = 0.333 + validation_split_ratio = 0.333 + + dataloaders = dataset.MultiModalDataLoader(df, + batch_size=batch_size, + n_past=num_past, + n_future=num_future, + num_workers=1, + train_split_ratio=train_split_ratio, + 
validation_split_ratio=validation_split_ratio, + scale=False) + + verify_sequential_id_sampled_sequential_dataloaders_equal_dataloaders(dataloaders, train_split_ratio, + validation_split_ratio, num_ids, + expect_all_ids=False) + + +def test_id_wise_sampling_does_not_put_id_in_multiple_dataloaders(): + data = list() + num_ids = 150 + sample_id = 0 + + for sequence_id in range(num_ids): + for sequence in range(40): + data.append([sequence, sample_id, sequence_id]) + sample_id += 1 + + df = pd.DataFrame(data, columns=['x', 'y', 'ID']) + + # Hyperparameters + batch_size = 10 + num_past = 10 + num_future = 5 + train_split_ratio = 0.333 + validation_split_ratio = 0.333 + + dataloaders = dataset.MultiModalDataLoader(df, + batch_size=batch_size, + n_past=num_past, + n_future=num_future, + num_workers=1, + train_split_ratio=train_split_ratio, + validation_split_ratio=validation_split_ratio, + scale=False) + + verify_sequential_id_sampled_sequential_dataloaders_equal_dataloaders(dataloaders, train_split_ratio, + validation_split_ratio, num_ids) + + +def test_id_wise_weighted_sampling_does_not_put_id_in_multiple_dataloaders(): + data = list() + num_ids = 150 + sample_id = 0 + + for sequence_id in range(num_ids): + for sequence in range(40 + (int(sequence_id * 2.234) % 117)): + data.append([sequence, sample_id, sequence_id]) + sample_id += 1 + + df = pd.DataFrame(data, columns=['x', 'y', 'ID']) + + # Hyperparameters + batch_size = 10 + num_past = 10 + num_future = 5 + train_split_ratio = 0.333 + validation_split_ratio = 0.333 + + dataloaders = dataset.MultiModalDataLoader(df, + batch_size=batch_size, + n_past=num_past, + n_future=num_future, + num_workers=1, + train_split_ratio=train_split_ratio, + validation_split_ratio=validation_split_ratio, + scale=False, + weighted_sampling=True, + stride=1) + + verify_id_wise_sampled_dataloaders_do_not_overlap(dataloaders, train_split_ratio, validation_split_ratio, num_ids) + + +def extract_sample_ids_from_dataloader(dataloader): + sample_ids = list() + for data, target, ids, parameters in dataloader: + for index, sequence_id in enumerate(ids): + sample_ids.append(int(data[index][0][1])) + return sample_ids + + +def verify_id_wise_sampled_dataloaders_do_not_overlap(dataloaders, train_split_ratio, validation_split_ratio, num_ids, + expect_all_ids=True): + train_ids = [] # We check that the sequence IDs are not mixed + train_sample_ids = [] # We also check that the sample IDs do not overlap + for data, target, ids, parameters in dataloaders['train_loader']: + for index, sequence_id in enumerate(ids): + sequence_id = int(sequence_id) + if sequence_id not in train_ids: + train_ids.append(sequence_id) + train_sample_ids.append(int(data[index][0][1])) + + test_ids = [] + test_sample_ids = [] + for data, target, ids, parameters in dataloaders['test_loader']: + for index, sequence_id in enumerate(ids): + sequence_id = int(sequence_id) + if sequence_id not in test_ids: + test_ids.append(sequence_id) + test_sample_ids.append(int(data[index][0][1])) + + assert sequence_id not in train_ids, 'Found test data in train loader!' + + validation_ids = [] + validation_sample_ids = [] + for data, target, ids, parameters in dataloaders['validation_loader']: + for index, sequence_id in enumerate(ids): + sequence_id = int(sequence_id) + if sequence_id not in validation_ids: + validation_ids.append(sequence_id) + validation_sample_ids.append(int(data[index][0][1])) + + assert sequence_id not in train_ids, 'Found validation data in train loader!' 
+ assert sequence_id not in test_ids, 'Found validation data in test loader!' + + if expect_all_ids: + assert len(train_ids) == round(train_split_ratio * num_ids), 'Wrong number of training ids!' + assert len(validation_ids) == round( + validation_split_ratio * num_ids), 'Wrong number of validation ids!' + assert len(train_ids) + len(test_ids) + len( + validation_ids) == num_ids, 'Wrong number of ids!' + + return train_ids, train_sample_ids, test_ids, test_sample_ids, validation_ids, validation_sample_ids + + +def verify_sequential_id_sampled_sequential_dataloaders_equal_dataloaders(dataloaders, train_split_ratio, + validation_split_ratio, num_ids, + expect_all_ids=True): + train_ids, train_sample_ids, test_ids, test_sample_ids, validation_ids, validation_sample_ids = verify_id_wise_sampled_dataloaders_do_not_overlap( + dataloaders, train_split_ratio, validation_split_ratio, num_ids, expect_all_ids) + + # We check that all sample IDs are present in the sequential samplers and vice versa + train_sequential_sample_ids = [] + for data, target, ids, parameters in dataloaders['sequential_train_loader']: + for index, sequence_id in enumerate(ids): + sequence_id = int(sequence_id) + train_sequential_sample_ids.append(int(data[index][0][1])) + assert sequence_id in train_ids, f'train_ids missing id {sequence_id}!' + + train_sample_ids = sorted(train_sample_ids) + assert len(train_sample_ids) == len( + train_sequential_sample_ids), 'train and sequential_train loaders have different lengths!' + for index in range(len(train_sample_ids)): + assert train_sample_ids[index] == train_sequential_sample_ids[ + index], f'Index {train_sample_ids[index]} is not equal to {train_sequential_sample_ids[index]}!' + + test_sequential_sample_ids = [] + for data, target, ids, parameters in dataloaders['sequential_test_loader']: + for index, sequence_id in enumerate(ids): + sequence_id = int(sequence_id) + test_sequential_sample_ids.append(int(data[index][0][1])) + assert sequence_id in test_ids, f'test_ids missing id {sequence_id}!' + + test_sample_ids = sorted(test_sample_ids) + assert len(test_sample_ids) == len( + test_sequential_sample_ids), 'test and sequential_test loaders have different lengths!' + for index in range(len(test_sample_ids)): + assert test_sample_ids[index] == test_sequential_sample_ids[ + index], f'Index {test_sample_ids[index]} is not equal to {test_sequential_sample_ids[index]}!' + + validation_sequential_sample_ids = [] + for data, target, ids, parameters in dataloaders['sequential_validation_loader']: + for index, sequence_id in enumerate(ids): + sequence_id = int(sequence_id) + validation_sequential_sample_ids.append(int(data[index][0][1])) + assert sequence_id in validation_ids, f'validation_ids missing id {sequence_id}!' + + validation_sample_ids = sorted(validation_sample_ids) + assert len(validation_sample_ids) == len( + validation_sequential_sample_ids), 'validation and sequential_validation loaders have different lengths!' + for index in range(len(validation_sample_ids)): + assert validation_sample_ids[index] == validation_sequential_sample_ids[ + index], f'Index {validation_sample_ids[index]} is not equal to {validation_sequential_sample_ids[index]}!' + + verify_that_indices_belong_to_precisely_one_loader(train_sample_ids, test_sample_ids, validation_sample_ids) + # Check that all indices belong to precisely one loader + # Note that (because some samples are dropped and because we only check the first value in data) + # not all indices are in a loader. 
+
+
+def verify_that_indices_belong_to_precisely_one_loader(train_sample_ids, test_sample_ids, validation_sample_ids):
+    # Check that all indices belong to precisely one loader
+    # Note that (because some samples are dropped and because we only check the first value in data)
+    # not all indices are in a loader.
+    train_index = 0
+    test_index = 0
+    validation_index = 0
+    for index in range(len(train_sample_ids) + len(test_sample_ids) + len(validation_sample_ids)):
+        if train_sample_ids[train_index] < index:
+            train_index += 1
+        if test_sample_ids[test_index] < index:
+            test_index += 1
+        if validation_sample_ids[validation_index] < index:
+            validation_index += 1
+        index_is_in_train = train_sample_ids[train_index] == index
+        index_is_in_test = test_sample_ids[test_index] == index
+        index_is_in_validation = validation_sample_ids[validation_index] == index
+
+        assert not (index_is_in_train and index_is_in_test), f'Index {index} is in both the train and test loaders!'
+        assert not (
+            index_is_in_train and index_is_in_validation), f'Index {index} is in both the train and validation loaders!'
+        assert not (
+            index_is_in_test and index_is_in_validation), f'Index {index} is in both the test and validation loaders!'
+
+
+def test_sequential_data_loader_indices_are_sequential():
+    data = list()
+    num_ids = 46
+
+    for sample_id in range(num_ids):
+        for sequence in range(40 + int(sample_id / 14)):
+            data.append([sequence, sequence, sample_id])
+
+    df = pd.DataFrame(data, columns=['x', 'y', 'ID'])
+
+    # Hyperparameters
+    batch_size = 18
+    num_past = 13
+    num_future = 8
+    train_split_ratio = 0.5
+    validation_split_ratio = 0.2
+    stride = 1
+
+    dataloaders = dataset.MultiModalDataLoader(df,
+                                               batch_size=batch_size,
+                                               n_past=num_past,
+                                               n_future=num_future,
+                                               num_workers=1,
+                                               train_split_ratio=train_split_ratio,
+                                               validation_split_ratio=validation_split_ratio,
+                                               stride=stride)
+
+    current_id = 0
+    for data, target, ids, parameters in dataloaders['sequential_train_loader']:
+        for id in ids:
+            id = int(id)
+            if id > current_id:
+                current_id = id
+            assert id == current_id, 'IDs in sequential train loader should increase monotonically!'
+
+    current_id = 0
+    for data, target, ids, parameters in dataloaders['sequential_test_loader']:
+        for id in ids:
+            id = int(id)
+            if id > current_id:
+                current_id = id
+            assert id == current_id, 'IDs in sequential test loader should increase monotonically!'
diff --git a/traja/tests/test_losses.py b/traja/tests/test_losses.py new file mode 100644 index 00000000..b2adcf03 --- /dev/null +++ b/traja/tests/test_losses.py @@ -0,0 +1,22 @@ +import torch + +from traja.models.losses import Criterion + + +def test_forecasting_loss_yields_correct_value(): + criterion = Criterion() + + predicted = torch.ones((1, 8)) + target = torch.zeros((1, 8)) + + manhattan_loss = criterion.forecasting_criterion(predicted, target, loss_type='manhattan') # 8 + huber_low_loss = criterion.forecasting_criterion(predicted * 0.5, target, loss_type='huber') # ~1 + huber_high_loss = criterion.forecasting_criterion(predicted * 2, target, loss_type='huber') # ~12 + mse_low_loss = criterion.forecasting_criterion(predicted * 0.5, target, loss_type='mse') # 0.25 + mse_high_loss = criterion.forecasting_criterion(predicted * 2, target, loss_type='mse') # 4 + + assert manhattan_loss == 8 + assert huber_low_loss == 1 + assert huber_high_loss == 12 + assert mse_low_loss == 0.25 + assert mse_high_loss == 4 diff --git a/traja/tests/test_models.py b/traja/tests/test_models.py index 42d36e86..c09839fd 100644 --- a/traja/tests/test_models.py +++ b/traja/tests/test_models.py @@ -1,30 +1,37 @@ -from traja.datasets import dataset -from traja.datasets.example import jaguar -from traja.models.generative_models.vae import MultiModelVAE -from traja.models.predictive_models.ae import MultiModelAE -from traja.models.predictive_models.lstm import LSTM -from traja.models.train import HybridTrainer +import pandas as pd -# Sample data -df = jaguar() +import traja +from traja.dataset import dataset +from traja.dataset.example import jaguar +from traja.models import LSTM +from traja.models import MultiModelAE +from traja.models import MultiModelVAE +from traja.models.losses import Criterion +from traja.models.train import HybridTrainer -def test_aevae(): +def test_aevae_jaguar(): """ Test Autoencoder and variational auto encoder models for training/testing/generative network and classification networks """ + + # Sample data + df = jaguar() + # Hyperparameters batch_size = 10 num_past = 10 num_future = 5 # Prepare the dataloader - data_loaders, scalers = dataset.MultiModalDataLoader(df, - batch_size=batch_size, - n_past=num_past, - n_future=num_future, - num_workers=1) + data_loaders = dataset.MultiModalDataLoader(df, + batch_size=batch_size, + n_past=num_past, + n_future=num_future, + train_split_ratio=0.5, + num_workers=1, + split_by_id=False) model_save_path = './model.pt' @@ -44,6 +51,13 @@ def test_aevae(): batch_first=True, reset_state=True) + # Test that we can run functions on our network. 
+ model.disable_latent_output() + model.enable_latent_output() + + # Test that we can reset the classifier + model.reset_classifier(classifier_hidden_size=32, num_classifier_layers=4) + # Model Trainer # Model types; "ae" or "vae" trainer = HybridTrainer(model=model, @@ -54,23 +68,43 @@ def test_aevae(): trainer.fit(data_loaders, model_save_path, epochs=10, training_mode='forecasting') trainer.fit(data_loaders, model_save_path, epochs=10, training_mode='classification') + scaler = data_loaders['train_loader'].dataset.scaler + + # Load the trained model given the path + model_path = './model.pt' + hyperparams = './hypers.json' + model_hyperparameters = traja.models.read_hyperparameters(hyperparams) -def test_ae(): + # For prebuild traja generative models + generator = traja.models.inference.Generator(model_type='vae', model_hyperparameters=model_hyperparameters, + model_path=model_path, model=None) + out = generator.generate(num_future, classify=False, scaler=scaler, plot_data=False) + + trainer.validate(data_loaders['validation_loader']) + + +def test_ae_jaguar(): """ Test Autoencoder and variational auto encoder models for training/testing/generative network and classification networks """ + + # Sample data + df = jaguar() + # Hyperparameters batch_size = 10 num_past = 10 num_future = 5 # Prepare the dataloader - data_loaders, scalers = dataset.MultiModalDataLoader(df, - batch_size=batch_size, - n_past=num_past, - n_future=num_future, - num_workers=1) + data_loaders = dataset.MultiModalDataLoader(df, + batch_size=batch_size, + n_past=num_past, + n_future=num_future, + num_workers=1, + train_split_ratio=0.5, + validation_split_ratio=0.2) model_save_path = './model.pt' @@ -79,6 +113,10 @@ def test_ae(): dropout=0.1, reset_state=True, bidirectional=False, num_classifier_layers=4, classifier_hidden_size=32, num_classes=9) + # Test that we can reset the classifier + model.reset_classifier(classifier_hidden_size=32, num_classifier_layers=4) + + # Model Trainer # Model types; "ae" or "vae" trainer = HybridTrainer(model=model, @@ -89,11 +127,17 @@ def test_ae(): trainer.fit(data_loaders, model_save_path, epochs=10, training_mode='forecasting') trainer.fit(data_loaders, model_save_path, epochs=10, training_mode='classification') + trainer.validate(data_loaders['sequential_validation_loader']) + -def test_lstm(): +def test_lstm_jaguar(): """ Testing method for lstm model used for forecasting. 
""" + + # Sample data + df = jaguar() + # Hyperparameters batch_size = 10 num_past = 10 @@ -103,11 +147,11 @@ def test_lstm(): assert num_past == num_future # Prepare the dataloader - data_loaders, scalers = dataset.MultiModalDataLoader(df, - batch_size=batch_size, - n_past=num_past, - n_future=num_future, - num_workers=1) + data_loaders = dataset.MultiModalDataLoader(df, + batch_size=batch_size, + n_past=num_past, + n_future=num_future, + num_workers=1) model_save_path = './model.pt' @@ -128,4 +172,172 @@ def test_lstm(): optimizer_type='Adam', loss_type='huber') # Train the model - trainer.fit(data_loaders, model_save_path, epochs=10, training_mode='forecasting') + trainer.fit(data_loaders, model_save_path, epochs=2, training_mode='forecasting') + + +def test_aevae_regression_network_converges(): + """ + Test Autoencoder and variational auto encoder models for training/testing/generative network and + classification networks + + """ + + data = list() + num_ids = 3 + + for sample_id in range(num_ids): + for sequence in range(70 + sample_id * 4): + parameter_one = 0.2 * sample_id + parameter_two = 91.235 * sample_id + data.append([sequence, sequence, sample_id, parameter_one, parameter_two]) + # Sample data + df = pd.DataFrame(data, columns=['x', 'y', 'ID', 'parameter_one', 'parameter_two']) + + parameter_columns = ['parameter_one', 'parameter_two'] + + # Hyperparameters + batch_size = 1 + num_past = 10 + num_future = 5 + # Prepare the dataloader + data_loaders = dataset.MultiModalDataLoader(df, + batch_size=batch_size, + n_past=num_past, + n_future=num_future, + train_split_ratio=0.333, + validation_split_ratio=0.333, + num_workers=1, + parameter_columns=parameter_columns, + split_by_id=False, + stride=1) + + model_save_path = './model.pt' + + model = MultiModelVAE(input_size=2, + output_size=2, + lstm_hidden_size=32, + num_lstm_layers=2, + num_regressor_parameters=len(parameter_columns), + latent_size=10, + dropout=0.1, + num_regressor_layers=4, + regressor_hidden_size=32, + batch_size=batch_size, + num_future=num_future, + num_past=num_past, + bidirectional=False, + batch_first=True, + reset_state=True) + + # Test resetting the regressor, to make sure this function works + model.reset_regressor(regressor_hidden_size=32, num_regressor_layers=4) + + # Model Trainer + # Model types; "ae" or "vae" + trainer = HybridTrainer(model=model, + optimizer_type='Adam', + loss_type='mse') + + criterion = Criterion() + loss_pre_training = 0. + for data, _, _, parameters in data_loaders['train_loader']: + prediction = model(data.float(), regress=True, latent=False) + loss_pre_training += criterion.regressor_criterion(prediction, parameters) + + print(f'Loss pre training: {loss_pre_training}') + + # Train the model + trainer.fit(data_loaders, model_save_path, epochs=2, training_mode='forecasting') + trainer.fit(data_loaders, model_save_path, epochs=2, training_mode='regression') + + loss_post_training = 0. 
+ for data, _, _, parameters in data_loaders['train_loader']: + prediction = model(data.float(), regress=True, latent=False) + loss_post_training += criterion.regressor_criterion(prediction, parameters) + + print(f'Loss post training: {loss_post_training}') + assert loss_post_training < loss_pre_training + + +def test_ae_regression_network_converges(): + """ + Test Autoencoder and variational auto encoder models for training/testing/generative network and + classification networks + + """ + + data = list() + num_ids = 3 + + for sample_id in range(num_ids): + for sequence in range(70 + sample_id * 4): + parameter_one = 0.2 * sample_id + parameter_two = 91.235 * sample_id + data.append([sequence, sequence, sample_id, parameter_one, parameter_two]) + # Sample data + df = pd.DataFrame(data, columns=['x', 'y', 'ID', 'parameter_one', 'parameter_two']) + + parameter_columns = ['parameter_one', 'parameter_two'] + + # Hyperparameters + batch_size = 1 + num_past = 10 + num_future = 5 + # Prepare the dataloader + data_loaders = dataset.MultiModalDataLoader(df, + batch_size=batch_size, + n_past=num_past, + n_future=num_future, + train_split_ratio=0.333, + validation_split_ratio=0.333, + num_workers=1, + parameter_columns=parameter_columns, + split_by_id=False, + stride=1) + + model_save_path = './model.pt' + + model = MultiModelAE(input_size=2, + output_size=2, + lstm_hidden_size=32, + num_lstm_layers=2, + num_regressor_parameters=len(parameter_columns), + latent_size=10, + dropout=0.1, + num_regressor_layers=4, + regressor_hidden_size=32, + batch_size=batch_size, + num_future=num_future, + num_past=num_past, + bidirectional=False, + batch_first=True, + reset_state=True) + + # Test resetting the regressor, to make sure this function works + model.reset_regressor(regressor_hidden_size=32, num_regressor_layers=4) + + # Model Trainer + # Model types; "ae" or "vae" + trainer = HybridTrainer(model=model, + optimizer_type='Adam', + loss_type='mse') + + criterion = Criterion() + loss_pre_training = 0. + for data, _, _, parameters in data_loaders['train_loader']: + prediction = model(data.float(), regress=True, latent=False) + loss_pre_training += criterion.regressor_criterion(prediction, parameters) + + print(f'Loss pre training: {loss_pre_training}') + + # Train the model + trainer.fit(data_loaders, model_save_path, epochs=2, training_mode='forecasting') + trainer.fit(data_loaders, model_save_path, epochs=2, training_mode='regression') + + loss_post_training = 0. 
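+    # Recompute the regression loss on the training data after fitting; it should have dropped.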
+ for data, _, _, parameters in data_loaders['train_loader']: + prediction = model(data.float(), regress=True, latent=False) + loss_post_training += criterion.regressor_criterion(prediction, parameters) + + print(f'Loss post training: {loss_post_training}') + assert loss_post_training < loss_pre_training \ No newline at end of file diff --git a/traja/tests/test_optimizers.py b/traja/tests/test_optimizers.py index e343bc20..d41dbd18 100644 --- a/traja/tests/test_optimizers.py +++ b/traja/tests/test_optimizers.py @@ -15,4 +15,4 @@ def test_get_optimizers(): model_optimizers = opt.get_optimizers(lr=0.1) model_schedulers = opt.get_lrschedulers(factor=0.1, patience=10) - print(model_optimizers, model_schedulers) \ No newline at end of file + print(model_optimizers, model_schedulers) diff --git a/traja/tests/test_parsers.py b/traja/tests/test_parsers.py index 1f0230da..7d468f17 100644 --- a/traja/tests/test_parsers.py +++ b/traja/tests/test_parsers.py @@ -5,7 +5,6 @@ import traja - df = traja.generate(n=20) diff --git a/traja/tests/test_plotting.py b/traja/tests/test_plotting.py index 2b86468e..5fdbfb76 100644 --- a/traja/tests/test_plotting.py +++ b/traja/tests/test_plotting.py @@ -1,8 +1,14 @@ import warnings +import matplotlib import numpy as np import numpy.testing as npt -import matplotlib + +from traja.dataset import dataset +from traja.dataset.example import jaguar +from traja.models.generative_models.vae import MultiModelVAE +from traja.models.train import HybridTrainer +from traja.plotting import plot_prediction matplotlib.use("Agg") import matplotlib.pyplot as plt @@ -119,3 +125,53 @@ def test_plot(): ] ), ) + + +def test_plot_prediction(): + # Hyperparameters + batch_size = 10 + num_past = 10 + num_future = 10 + + input_size = 2 + lstm_hidden_size = 512 + lstm_num_layers = 4 + batch_first = True + reset_state = True + output_size = 2 + num_classes = 9 + latent_size = 20 + dropout = 0.1 + bidirectional = False + + # Prepare the dataloader + df = jaguar() + data_loaders = dataset.MultiModalDataLoader(df, + batch_size=batch_size, + n_past=num_past, + n_future=num_future, + num_workers=1) + + model = MultiModelVAE(input_size=input_size, + output_size=output_size, + lstm_hidden_size=lstm_hidden_size, + num_lstm_layers=lstm_num_layers, + num_classes=num_classes, + latent_size=latent_size, + dropout=dropout, + num_classifier_layers=4, + classifier_hidden_size=32, + batch_size=batch_size, + num_future=num_future, + num_past=num_past, + bidirectional=bidirectional, + batch_first=batch_first, + reset_state=reset_state) + + trainer = HybridTrainer(model=model, + optimizer_type='Adam', + loss_type='huber') + + model_save_path = './model.pt' + + plot_prediction(model, data_loaders['sequential_test_loader'], 1) diff --git a/traja/trajectory.py b/traja/trajectory.py index 285d7f68..ea429260 100644 --- a/traja/trajectory.py +++ b/traja/trajectory.py @@ -16,7 +16,6 @@ import traja from traja import TrajaDataFrame - __all__ = [ "_bins_to_tuple", "_get_time_col", @@ -198,7 +197,7 @@ def length(trj: TrajaDataFrame) -> float: def expected_sq_displacement( - trj: TrajaDataFrame, n: int = 0, eqn1: bool = True + trj: TrajaDataFrame, n: int = 0, eqn1: bool = True ) -> float: """Expected displacement. 
@@ -219,15 +218,15 @@ def expected_sq_displacement( # Eqn 1 alpha = np.arctan2(s, c) gamma = ((1 - c) ** 2 - s2) * np.cos((n + 1) * alpha) - 2 * s * ( - 1 - c + 1 - c ) * np.sin((n + 1) * alpha) esd = ( - n * l2 - + 2 * l ** 2 * ((c - c ** 2 - s2) * n - c) / ((1 - c) ** 2 + s2) - + 2 - * l ** 2 - * ((2 * s2 + (c + s2) ** ((n + 1) / 2)) / ((1 - c) ** 2 + s2) ** 2) - * gamma + n * l2 + + 2 * l ** 2 * ((c - c ** 2 - s2) * n - c) / ((1 - c) ** 2 + s2) + + 2 + * l ** 2 + * ((2 * s2 + (c + s2) ** ((n + 1) / 2)) / ((1 - c) ** 2 + s2) ** 2) + * gamma ) return abs(esd) else: @@ -254,13 +253,13 @@ def to_utm(trj, lat="lat", lon="lon"): def traj_from_coords( - track: Union[np.ndarray, pd.DataFrame], - x_col=1, - y_col=2, - time_col: Optional[str] = None, - fps: Union[float, int] = 4, - spatial_units: str = "m", - time_units: str = "s", + track: Union[np.ndarray, pd.DataFrame], + x_col=1, + y_col=2, + time_col: Optional[str] = None, + fps: Union[float, int] = 4, + spatial_units: str = "m", + time_units: str = "s", ) -> TrajaDataFrame: """Create TrajaDataFrame from coordinates. @@ -529,11 +528,11 @@ def transitions(trj: TrajaDataFrame, **kwargs): def grid_coordinates( - trj: TrajaDataFrame, - bins: Union[int, tuple] = None, - xlim: tuple = None, - ylim: tuple = None, - assign: bool = False, + trj: TrajaDataFrame, + bins: Union[int, tuple] = None, + xlim: tuple = None, + ylim: tuple = None, + assign: bool = False, ): """Returns ``DataFrame`` of trajectory discretized into 2D lattice grid coordinates. Args: @@ -578,17 +577,17 @@ def grid_coordinates( def generate( - n: int = 1000, - random: bool = True, - step_length: int = 2, - angular_error_sd: float = 0.5, - angular_error_dist: Callable = None, - linear_error_sd: float = 0.2, - linear_error_dist: Callable = None, - fps: float = 50, - spatial_units: str = "m", - seed: int = None, - **kwargs, + n: int = 1000, + random: bool = True, + step_length: int = 2, + angular_error_sd: float = 0.5, + angular_error_dist: Callable = None, + linear_error_sd: float = 0.2, + linear_error_dist: Callable = None, + fps: float = 50, + spatial_units: str = "m", + seed: int = None, + **kwargs, ): """Generates a trajectory. @@ -685,7 +684,7 @@ def generate( def _resample_time( - trj: TrajaDataFrame, step_time: Union[float, int, str], errors="coerce" + trj: TrajaDataFrame, step_time: Union[float, int, str], errors="coerce" ): if not is_datetime_or_timedelta_dtype(trj.index): raise Exception(f"{trj.index.dtype} is not datetime or timedelta.") @@ -698,8 +697,8 @@ def _resample_time( trj = trj.loc[~trj.index.duplicated(keep="first")] df = ( trj.resample(step_time) - .bfill(limit=1) - .interpolate(method="spline", order=2) + .bfill(limit=1) + .interpolate(method="spline", order=2) ) else: logger.error("Error: duplicate time indices") @@ -765,7 +764,7 @@ def resample_time(trj: TrajaDataFrame, step_time: str, new_fps: Optional[bool] = _trj = _resample_time(_trj, step_time) else: raise NotImplementedError( - f"Time column ({time_col}) not of expected datasets type." + f"Time column ({time_col}) not of expected dataset type." ) return _trj @@ -845,7 +844,7 @@ def rediscretize_points(trj: TrajaDataFrame, R: Union[float, int], time_out=Fals def _rediscretize_points( - trj: TrajaDataFrame, R: Union[float, int], time_out=False + trj: TrajaDataFrame, R: Union[float, int], time_out=False ) -> dict: """Helper function for :func:`traja.trajectory.rediscretize`. 
@@ -878,7 +877,7 @@ def _rediscretize_points( # Find the first point `curr_ind` for which |points[curr_ind] - p_0| >= R curr_ind = np.NaN for i in range( - candidate_start, n_points + candidate_start, n_points ): # range of search space for next point d = np.linalg.norm(points[i] - result[step_nr]) if d >= R: @@ -1100,7 +1099,7 @@ def calc_heading(trj: TrajaDataFrame): def speed_intervals( - trj: TrajaDataFrame, faster_than: float = None, slower_than: float = None + trj: TrajaDataFrame, faster_than: float = None, slower_than: float = None ) -> pd.DataFrame: """Calculate speed time intervals. @@ -1156,13 +1155,13 @@ def speed_intervals( if len(start_frames) > 0 or len(stop_frames) > 0: # Assume interval started at beginning of trajectory, since we don't know what happened before that if len(stop_frames) > 0 and ( - len(start_frames) == 0 or stop_frames[0] < start_frames[0] + len(start_frames) == 0 or stop_frames[0] < start_frames[0] ): start_frames = np.append(1, start_frames) # Similarly, assume that interval can't extend past end of trajectory if ( - len(stop_frames) == 0 - or start_frames[len(start_frames) - 1] > stop_frames[len(stop_frames) - 1] + len(stop_frames) == 0 + or start_frames[len(start_frames) - 1] > stop_frames[len(stop_frames) - 1] ): stop_frames = np.append(stop_frames, len(speed) - 1) @@ -1214,12 +1213,12 @@ def get_derivatives(trj: TrajaDataFrame): # Convert to float divisible series # TODO: Add support for other time units t = t.dt.total_seconds() - v = d[1 : len(d)] / t.diff() + v = d[1: len(d)] / t.diff() v.rename("speed") - vt = t[1 : len(t)].rename("speed_times") + vt = t[1: len(t)].rename("speed_times") # Calculate linear acceleration a = v.diff() / vt.diff().rename("acceleration") - at = vt[1 : len(vt)].rename("accleration_times") + at = vt[1: len(vt)].rename("accleration_times") data = dict(speed=v, speed_times=vt, acceleration=a, acceleration_times=at) derivs = derivs.merge(pd.DataFrame(data), left_index=True, right_index=True) @@ -1231,9 +1230,9 @@ def get_derivatives(trj: TrajaDataFrame): def _get_xylim(trj: TrajaDataFrame) -> Tuple[Tuple, Tuple]: if ( - "xlim" in trj.__dict__ - and "ylim" in trj.__dict__ - and isinstance(trj.xlim, (list, tuple)) + "xlim" in trj.__dict__ + and "ylim" in trj.__dict__ + and isinstance(trj.xlim, (list, tuple)) ): return trj.xlim, trj.ylim else: @@ -1252,8 +1251,8 @@ def coords_to_flow(trj: TrajaDataFrame, bins: Union[int, tuple] = None): Returns: X (:class:`~numpy.ndarray`): X coordinates of arrow locations Y (:class:`~numpy.ndarray`): Y coordinates of arrow locations - U (:class:`~numpy.ndarray`): X component of vector datasets - V (:class:`~numpy.ndarray`): Y component of vector datasets + U (:class:`~numpy.ndarray`): X component of vector dataset + V (:class:`~numpy.ndarray`): Y component of vector dataset """ xlim, ylim = _get_xylim(trj)