main.py

import sys
from typing import Tuple, Optional, Union
import numpy as np
import pandas as pd
from darts import TimeSeries
from darts.models import TCNModel
from darts.utils.data import PastCovariatesSequentialDataset, PastCovariatesInferenceDataset
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping
from sklearn.metrics import mean_squared_error as mse


class CustomDataset:
    """
    A custom dataset class that contains common attributes and methods for training and inference datasets.
    """
    def __init__(self):
        # Initialize all attributes to None
        self.labels_timestamps = None
        self.labels = None
        self.timestamps = None
        self.data = None
        self.features = None
        self.target = None
        self.data_windows = None

    def check_unique_timestamps(self):
        """
        Check the uniqueness of timestamps in the data windows.
        If timestamps are not unique, print the count of each timestamp and raise a ValueError.
        """
        timestamps_unique = np.unique(self.timestamps, axis=0)
        if timestamps_unique.shape != self.timestamps.shape:
            for timestamp, count in zip(
                    *np.unique(self.timestamps, return_counts=True, axis=0)
            ):
                if count > 1:
                    print(f"{int(timestamp)}: {count}.")

            raise ValueError("Timestamps in the data windows are not unique.")

    def init_common_attributes(self, **kwargs):
        """
        Initialize common attributes for the dataset.
        """
        self.data_windows = kwargs["data_windows"]
        self.target = kwargs["target_feature"]
        self.features = len(self.data_windows["data"][0][0])
        self.data = self.data_windows["data"]
        self.timestamps = self.data_windows["timestamps"]
        self.labels = self.data_windows["labels"]
        self.labels_timestamps = self.data_windows["labels_timestamps"]

    def common_init(self):
        """
        This method should be implemented in subclasses.
        """
        raise NotImplementedError


class CustomTrainingDataset(CustomDataset):
    def __init__(self):
        super().__init__()
        self.output_chunk_length = None
        self.input_chunk_length = None

    def common_init(self, **kwargs):
        """
        Initialize common attributes for the training dataset.
        """
        self.input_chunk_length = kwargs["input_chunk_length"]
        self.output_chunk_length = kwargs["output_chunk_length"]

        self.init_common_attributes(**kwargs)
        self.check_unique_timestamps()

        print(f"Initialized dataset with {np.unique(self.timestamps).shape[0]} timestamps.", flush=True)


class CustomInferenceDataset(CustomDataset):

    def __init__(self):
        super().__init__()

    def _common_init(self, **kwargs):
        """
        Initialize common attributes for the inference dataset.
        """
        self.input_time_series = kwargs["target_series"]
        target_series = kwargs["target_series"]
        target_feature = kwargs["target_feature"]
        if isinstance(target_feature, str):
            print(
                "Target feature passed as string. Getting index of the feature from the TimeSeries object.",
                file=sys.stderr,
            )
            target = target_series.features.index(target_feature)
        else:
            target = target_feature

        kwargs["target_feature"] = target

        self.init_common_attributes(
            **kwargs
        )
        self.check_unique_timestamps()

        print(f"Initialized dataset with {np.unique(self.timestamps).shape[0]} timestamps.", flush=True)


class CustomPastCovariatesSequentialDataset(CustomTrainingDataset, PastCovariatesSequentialDataset):
    def __init__(
            self,
            target_series: TimeSeries,
            covariates: TimeSeries,
            target_feature: int,
            input_chunk_length: int,
            output_chunk_length: int,
            data_windows: dict,
    ):
        """
        Initialize the CustomPastCovariatesSequentialDataset.
        """
        CustomTrainingDataset.__init__(self)
        PastCovariatesSequentialDataset.__init__(
            self,
            target_series=target_series,
            covariates=covariates,
            input_chunk_length=input_chunk_length,
            output_chunk_length=output_chunk_length
        )

        self.common_init(target_series=target_series,
                         target_feature=target_feature,
                         data_windows=data_windows,
                         input_chunk_length=input_chunk_length,
                         output_chunk_length=output_chunk_length
                         )

    def __len__(self):
        """
        Return the length of the dataset.
        """
        return len(self.data_windows["data"])

    def __getitem__(
            self, item
    ) -> Tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray], np.ndarray]:
        """
        Get an item from the dataset.
        """
        past_target = self.data[item][:, [self.target]]
        covariate = self.data[item][
                    :, [i for i in range(self.features) if i != self.target]
                    ]
        static_covariate = None
        future_target = self.labels[item].reshape(1, 1)

        # copied these names from PastCovariatesSequentialDataset as a reference
        return past_target, covariate, static_covariate, future_target


class CustomPastCovariatesInferenceDataset(CustomInferenceDataset, PastCovariatesInferenceDataset):
    def __init__(
            self,
            target_series: TimeSeries,
            covariates: TimeSeries,
            target_feature: int,
            input_chunk_length: int,
            output_chunk_length: int,
            data_windows: dict,
    ):
        """
        Initialize the CustomPastCovariatesInferenceDataset.
        """
        CustomInferenceDataset.__init__(self)
        PastCovariatesInferenceDataset.__init__(
            self,
            target_series=target_series,
            covariates=covariates,
            input_chunk_length=input_chunk_length,
            output_chunk_length=output_chunk_length,
        )

        self._common_init(target_series=target_series,
                          target_feature=target_feature,
                          data_windows=data_windows,
                          input_chunk_length=input_chunk_length,
                          output_chunk_length=output_chunk_length
                          )

    def __len__(self):
        """
        Return the length of the dataset.
        """
        return len(self.data_windows["data"])

    def __getitem__(
            self, item
    ) -> Tuple[
        np.ndarray,
        Optional[np.ndarray],
        Optional[np.ndarray],
        Optional[np.ndarray],
        TimeSeries,
        Union[pd.Timestamp, int],
    ]:
        """
        Get an item from the dataset.
        """
        # extract only target feature
        # darts sanity checking wants the predict sample to be the same shape as the train sample
        past_target = self.data[item][:, [self.target]]
        # extract all features except target
        past_covariates = self.data[item][
                          :, [i for i in range(self.features) if i != self.target]
                          ]
        static_covariates = None
        future_past_covariates = None
        input_time_series = self.input_time_series
        if isinstance(input_time_series.pd_dataframe().index[0], pd.Timestamp):
            prediction_start_time_step = pd.to_datetime(self.labels_timestamps[item][0])
        else:
            prediction_start_time_step = self.labels_timestamps[item].astype(int)[0]

        return (
            past_target,
            past_covariates,
            future_past_covariates,
            static_covariates,
            input_time_series,
            prediction_start_time_step,
        )


def generate_dataset(data: pd.DataFrame) -> dict:
    """
        This function generates a dataset for training, validation and testing.
        The dataset is a dictionary containing data, timestamps, labels and labels_timestamps.

        Args:
            data (pd.DataFrame): The input data as a pandas DataFrame.

        Returns:
            dict: The generated dataset.
        """
    ...  # data is used to generate the sliding window datasets
    return {
        "data": np.array(
            [
                [[1, 6, 11], [2, 7, 12], [3, 8, 13], [4, 9, 14], [5, 10, 15]],
                [[2, 7, 12], [3, 8, 13], [4, 9, 14], [5, 10, 15], [6, 11, 16]],
                [[3, 8, 13], [4, 9, 14], [5, 10, 15], [6, 11, 16], [7, 12, 17]],
                [[4, 9, 14], [5, 10, 15], [6, 11, 16], [7, 12, 17], [8, 13, 18]],
                [[5, 10, 15], [6, 11, 16], [7, 12, 17], [8, 13, 18], [9, 14, 19]],
                [[6, 11, 16], [7, 12, 17], [8, 13, 18], [9, 14, 19], [10, 15, 20]],
                [[7, 12, 17], [8, 13, 18], [9, 14, 19], [10, 15, 20], [11, 16, 21]],
                [[8, 13, 18], [9, 14, 19], [10, 15, 20], [11, 16, 21], [12, 17, 22]],
                [[9, 14, 19], [10, 15, 20], [11, 16, 21], [12, 17, 22], [13, 18, 23]],
                [[10, 15, 20], [11, 16, 21], [12, 17, 22], [13, 18, 23], [14, 19, 24]],
                [[11, 16, 21], [12, 17, 22], [13, 18, 23], [14, 19, 24], [15, 20, 25]],
                [[12, 17, 22], [13, 18, 23], [14, 19, 24], [15, 20, 25], [16, 21, 26]],
                [[13, 18, 23], [14, 19, 24], [15, 20, 25], [16, 21, 26], [17, 22, 27]],
                [[14, 19, 24], [15, 20, 25], [16, 21, 26], [17, 22, 27], [18, 23, 28]],
                [[15, 20, 25], [16, 21, 26], [17, 22, 27], [18, 23, 28], [19, 24, 29]],
            ],
            dtype=np.float32,
        ),
        "timestamps": np.array(
            [
                [0, 1, 2, 3, 4],
                [1, 2, 3, 4, 5],
                [2, 3, 4, 5, 6],
                [3, 4, 5, 6, 7],
                [4, 5, 6, 7, 8],
                [5, 6, 7, 8, 9],
                [6, 7, 8, 9, 10],
                [7, 8, 9, 10, 11],
                [8, 9, 10, 11, 12],
                [9, 10, 11, 12, 13],
                [10, 11, 12, 13, 14],
                [11, 12, 13, 14, 15],
                [12, 13, 14, 15, 16],
                [13, 14, 15, 16, 17],
                [14, 15, 16, 17, 18],
            ],
            dtype=np.float32,
        ),
        "labels": np.array(
            [
                [6], [7], [8], [9], [10], [11], [12], [13], [14], [15], [16], [17], [18], [19], [20]
            ], dtype=np.float32
        ),
        "labels_timestamps": np.array(
            [
                [5], [6], [7], [8], [9], [10], [11], [12], [13], [14], [15], [16], [17], [18], [19]
            ], dtype=np.float32
        ),
    }


def main():
    """
    The main function of the script. It prepares the data, trains the model, and makes predictions.
    """
    input_chunk_length = 5
    output_chunk_length = 1
    model = TCNModel(input_chunk_length=input_chunk_length, output_chunk_length=output_chunk_length)
    target_feature = 0
    target_feature_name = 'feature1'
    data = pd.DataFrame({
        "feature1": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
        "feature2": [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25],
        "feature3": [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30],
    }, index=range(20))

    # split train, val and test data. Not necessary for this example as we are using a fixed dataset
    # train_data, val_test_data = target_series.split_after(0.6)  # up to [6, 11, 16], 6 elements
    # val_data, test_data = val_test_data.split_after(0.5)  # up to [8, 13, 18] and [10, 15, 20], 2 elements each

    data_windows = generate_dataset(data)
    train_data_windows = {
        "data": data_windows["data"][:6], "timestamps": data_windows["timestamps"][:6],
        "labels": data_windows["labels"][:6], "labels_timestamps": data_windows["labels_timestamps"][:6]
    }
    val_data_windows = {
        "data": data_windows["data"][6:8], "timestamps": data_windows["timestamps"][6:8],
        "labels": data_windows["labels"][6:8], "labels_timestamps": data_windows["labels_timestamps"][6:8]
    }

    test_data_windows = {
        "data": data_windows["data"][8:], "timestamps": data_windows["timestamps"][8:],
        "labels": data_windows["labels"][8:], "labels_timestamps": data_windows["labels_timestamps"][8:]
    }

    train_target_series = TimeSeries.from_dataframe(
        data.iloc[:6][[target_feature_name]]
    )
    val_target_series = TimeSeries.from_dataframe(
        data.iloc[6:8][[target_feature_name]]
    )
    test_target_series = TimeSeries.from_dataframe(
        data.iloc[8:][[target_feature_name]]
    )

    train_covariates = TimeSeries.from_dataframe(
        data.iloc[:6][['feature2', 'feature3']]
    )
    val_covariates = TimeSeries.from_dataframe(
        data.iloc[6:8][['feature2', 'feature3']]
    )
    test_covariates = TimeSeries.from_dataframe(
        data.iloc[8:][['feature2', 'feature3']]
    )

    train_dataset = CustomPastCovariatesSequentialDataset(
        target_series=train_target_series,
        covariates=train_covariates,
        target_feature=target_feature,
        input_chunk_length=input_chunk_length,
        output_chunk_length=output_chunk_length,
        data_windows=train_data_windows,
    )

    val_dataset = CustomPastCovariatesSequentialDataset(
        target_series=val_target_series,
        covariates=val_covariates,
        target_feature=target_feature,
        input_chunk_length=input_chunk_length,
        output_chunk_length=output_chunk_length,
        data_windows=val_data_windows,
    )

    early_stopping = EarlyStopping(
        monitor="val_loss",
        patience=10,
        min_delta=0.05,
        verbose=True,
        mode="min",
    )

    trainer = Trainer(
        accelerator='auto',
        devices=1,
        callbacks=[early_stopping],
        max_epochs=100,
    )

    model.fit_from_dataset(
        train_dataset=train_dataset,
        val_dataset=val_dataset,
        trainer=trainer,
        verbose=True,
    )

    inference_dataset = CustomPastCovariatesInferenceDataset(
        target_series=test_target_series,
        covariates=test_covariates,
        target_feature=target_feature,
        input_chunk_length=input_chunk_length,
        output_chunk_length=output_chunk_length,
        data_windows=test_data_windows,
    )

    tester = Trainer(accelerator='auto', devices=1)

    forecast_horizon = 1
    y_pred: list[TimeSeries] = model.predict_from_dataset(
        n=forecast_horizon,
        input_series_dataset=inference_dataset,
        trainer=tester,
        verbose=True,
    )

    y_pred = np.concatenate([[ts.values()[0]] for ts in y_pred])
    y_true = test_data_windows["labels"].flatten()
    print("MSE:", mse(y_true, y_pred))


if __name__ == '__main__':
    main()