feat: add ARIMA model (#577)

Closes #570

### Summary of Changes

- added `timeseries_from_csv_file` to the `TimeSeries` class.
- added `split_rows` to the `TimeSeries` class.
- added `ArimaModelRegressor`.
- added `plot_compare_time_series` to the `TimeSeries` class.

The test function `test_arima_model()` demonstrates the complete workflow.

---------

Co-authored-by: megalinter-bot <[email protected]>
Co-authored-by: WinPlay02 <[email protected]>
Co-authored-by: Alexander <[email protected]>
Safe-DS · Apr 9, 2024 · 8b9c7a9 · 8b9c7a9
1 parent ffb8304
commit 8b9c7a9
Show file tree

Hide file tree

Showing 16 changed files with 2,606 additions and 701 deletions.
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -26,6 +26,7 @@ seaborn = "^0.13.0"
 torch = {version = "^2.2.0", source = "torch_cuda121"}
 torchvision = {version = "^0.17.0", source = "torch_cuda121"}
 xxhash = "^3.4.1"
+statsmodels = "^0.14.1"
 
 [tool.poetry.group.dev.dependencies]
 pytest = ">=7.2.1,<9.0.0"

diff --git a/src/safeds/data/tabular/containers/_time_series.py b/src/safeds/data/tabular/containers/_time_series.py
@@ -21,6 +21,7 @@
 
 if TYPE_CHECKING:
     from collections.abc import Callable, Mapping, Sequence
+    from pathlib import Path
     from typing import Any
 
 
@@ -29,6 +30,57 @@ class TimeSeries(Table):
     # ------------------------------------------------------------------------------------------------------------------
     # Creation
     # ------------------------------------------------------------------------------------------------------------------
+
+    @staticmethod
+    def timeseries_from_csv_file(
+        path: str | Path,
+        target_name: str,
+        time_name: str,
+        feature_names: list[str] | None = None,
+    ) -> TimeSeries:
+        """
+        Create a time series from the contents of a CSV file.
+
+        The file is first read into a plain table with `Table.from_csv_file`; the time, target, and feature roles
+        are then attached to the columns of that table.
+
+        Parameters
+        ----------
+        path:
+            The path to the CSV file.
+        target_name:
+            The name of the target column.
+        time_name:
+            The name of the time column.
+        feature_names:
+            The names of the feature columns. If None or empty, the time series has no feature columns.
+
+        Returns
+        -------
+        time_series:
+            The time series created from the CSV file.
+
+        Raises
+        ------
+        FileNotFoundError
+            If the specified file does not exist.
+        WrongFileExtensionError
+            If the file is not a csv file.
+        UnknownColumnNameError
+            If target_name or time_name matches none of the column names.
+        ValueError
+            If one column is both target and feature.
+        ValueError
+            If one column is both time and feature.
+        """
+        csv_table = Table.from_csv_file(path=path)
+        return TimeSeries._from_table(
+            csv_table,
+            target_name=target_name,
+            time_name=time_name,
+            feature_names=feature_names,
+        )
+
     @staticmethod
     def _from_tagged_table(
         tagged_table: TaggedTable,
@@ -128,12 +180,17 @@ def _from_table(
 
         if target_name not in table.column_names:
             raise UnknownColumnNameError([target_name])
-
         result = object.__new__(TimeSeries)
         result._data = table._data
+
         result._schema = table._schema
         result._time = table.get_column(time_name)
         result._target = table.get_column(target_name)
+        # empty Columns have dtype Object
+        if len(result._time._data) == 0:
+            result._time._data = pd.Series(name=time_name)
+        if len(result.target._data) == 0:
+            result.target._data = pd.Series(name=target_name)
         if feature_names is None or len(feature_names) == 0:
             result._feature_names = []
             result._features = Table()
@@ -203,6 +260,11 @@ def __init__(
             raise UnknownColumnNameError([time_name])
         self._time: Column = _data.get_column(time_name)
         self._target: Column = _data.get_column(target_name)
+        # empty Columns have dtype Object
+        if len(self._time._data) == 0:
+            self._time._data = pd.Series(name=time_name)
+        if len(self.target._data) == 0:
+            self.target._data = pd.Series(name=target_name)
 
     def __eq__(self, other: object) -> bool:
         """
@@ -216,6 +278,7 @@ def __eq__(self, other: object) -> bool:
             return NotImplemented
         if self is other:
             return True
+
         return (
             self.time == other.time
             and self.target == other.target
@@ -1113,3 +1176,102 @@ def plot_scatterplot(
         buffer.seek(0)
         self._data = self._data.reset_index()
         return Image.from_bytes(buffer.read())
+
+    def split_rows(self, percentage_in_first: float) -> tuple[TimeSeries, TimeSeries]:
+        """
+        Split the time series into two new time series.
+
+        The rows are divided by `Table.split_rows`; both parts keep the time, target, and feature columns of this
+        time series. The original time series is not modified.
+
+        Parameters
+        ----------
+        percentage_in_first:
+            The desired size of the first time series as a fraction of this time series; must be between 0 and 1.
+
+        Returns
+        -------
+        result:
+            A tuple containing the two resulting time series. The first time series has the specified size, the
+            second time series contains the rest of the data.
+
+        Raises
+        ------
+        ValueError
+            If `percentage_in_first` is not between 0 and 1.
+
+        Examples
+        --------
+        >>> from safeds.data.tabular.containers import TimeSeries
+        >>> time_series = TimeSeries({"time":[0, 1, 2, 3, 4], "temperature": [10, 15, 20, 25, 30], "sales": [54, 74, 90, 206, 210]}, time_name="time", target_name="sales")
+        >>> slices = time_series.split_rows(0.4)
+        >>> slices[0]
+           time  temperature  sales
+        0     0           10     54
+        1     1           15     74
+        >>> slices[1]
+           time  temperature  sales
+        0     2           20     90
+        1     3           25    206
+        2     4           30    210
+        """
+        first_part, second_part = self._as_table().split_rows(percentage_in_first=percentage_in_first)
+
+        def _as_time_series(part: Table) -> TimeSeries:
+            # Re-attach the time/target/feature roles of this time series to a plain table.
+            return TimeSeries._from_table(
+                part,
+                time_name=self.time.name,
+                target_name=self._target.name,
+                feature_names=self._feature_names,
+            )
+
+        return _as_time_series(first_part), _as_time_series(second_part)
+
+    def plot_compare_time_series(self, time_series: list[TimeSeries]) -> Image:
+        """
+        Plot the target of this time series together with the targets of the given time series.
+
+        Every target is drawn as a line over the time column of this time series; the time columns of the other
+        time series are not read. The original time series is not modified.
+
+        Parameters
+        ----------
+        time_series:
+            A list of time series whose targets are plotted alongside the target of this time series.
+
+        Returns
+        -------
+        plot:
+            A plot with all the time series targets plotted by the time on the x-axis.
+
+        Raises
+        ------
+        NonNumericColumnError
+            If the target column of this time series or of any given time series contains non-numerical values.
+        """
+        if not self._target.type.is_numeric():
+            raise NonNumericColumnError("The time series plotted column contains non-numerical columns.")
+
+        # Collect the shared time axis and every target in one wide frame.
+        data = pd.DataFrame()
+        data[self.time.name] = self.time._data
+        data[self.target.name] = self.target._data
+        for index, ts in enumerate(time_series):
+            if not ts.target.type.is_numeric():
+                raise NonNumericColumnError("The time series plotted column contains non-numerical columns.")
+            # The position in the list is appended so that equally named targets get distinct columns.
+            data[ts.target.name + " " + str(index)] = ts.target._data
+        fig = plt.figure()
+
+        # Reshape to long format (time, variable, value) so that each target becomes one hue in the line plot.
+        data = pd.melt(data, [self.time.name])
+        sns.lineplot(x=self.time.name, y="value", hue="variable", data=data)
+        plt.title("Multiple Series Plot")
+        plt.xlabel("Time")
+
+        plt.tight_layout()
+        buffer = io.BytesIO()
+        fig.savefig(buffer, format="png")
+        plt.close(fig)  # Prevents the figure from being displayed directly
+        buffer.seek(0)
+        # Plotting must not touch self._data: the previous trailing `reset_index()` call inserted an extra
+        # "index" column into this time series on every call.
+        return Image.from_bytes(buffer.read())
diff --git a/src/safeds/exceptions/__init__.py b/src/safeds/exceptions/__init__.py
@@ -28,6 +28,7 @@
     DatasetMissesFeaturesError,
     LearningError,
     ModelNotFittedError,
+    NonTimeSeriesError,
     PredictionError,
     UntaggedTableError,
 )
@@ -56,6 +57,7 @@
     "DatasetMissesFeaturesError",
     "LearningError",
     "ModelNotFittedError",
+    "NonTimeSeriesError",
     "PredictionError",
     "UntaggedTableError",
     # Other

diff --git a/src/safeds/exceptions/_ml.py b/src/safeds/exceptions/_ml.py
@@ -78,3 +78,15 @@ def __init__(self) -> None:
                 " features and which are the target to predict.\nUse Table.tag_column() to create a tagged table."
             ),
         )
+
+
+class NonTimeSeriesError(Exception):
+    """Raised when a plain table is passed to a regression or classification that requires a TimeSeries."""
+
+    def __init__(self) -> None:
+        # The message is fixed, so the exception is constructed without arguments.
+        message = (
+            "This method needs a time series.\nA time series is a table that additionally knows which columns are"
+            " time and which are the target to predict.\n"
+        )
+        super().__init__(message)
diff --git a/src/safeds/ml/classical/regression/__init__.py b/src/safeds/ml/classical/regression/__init__.py
@@ -1,6 +1,7 @@
 """Models for regression tasks."""
 
 from ._ada_boost import AdaBoostRegressor
+from ._arima import ArimaModelRegressor
 from ._decision_tree import DecisionTreeRegressor
 from ._elastic_net_regression import ElasticNetRegressor
 from ._gradient_boosting import GradientBoostingRegressor
@@ -14,6 +15,7 @@
 
 __all__ = [
     "AdaBoostRegressor",
+    "ArimaModelRegressor",
     "DecisionTreeRegressor",
     "ElasticNetRegressor",
     "GradientBoostingRegressor",