feat: easily create a baseline model (#811)
Closes #710

### Summary of Changes

Added BaselineClassifier, BaselineRegressor and test cases.

---------

Co-authored-by: megalinter-bot <[email protected]>
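As a quick orientation for reviewers, here is a minimal usage sketch of the new class. It is not part of the commit; it assumes the Safe-DS `Table` constructor and `Table.to_tabular_dataset` API, and the column names and values are made up for illustration. `BaselineRegressor` follows the same pattern.

```python
from safeds.data.tabular.containers import Table
from safeds.ml.classical.classification import BaselineClassifier

# Toy data with made-up column names, purely for illustration.
train_set = Table(
    {
        "feature_1": [1, 2, 3, 4, 5, 6],
        "feature_2": [6, 5, 4, 3, 2, 1],
        "target": [0, 0, 0, 1, 1, 1],
    },
).to_tabular_dataset("target")

test_set = Table(
    {
        "feature_1": [7, 8],
        "feature_2": [1, 0],
        "target": [1, 1],
    },
).to_tabular_dataset("target")

classifier = BaselineClassifier()              # extended_search=True adds more (slower) models
fitted_classifier = classifier.fit(train_set)  # fit() returns a new, fitted copy
best_metrics = fitted_classifier.predict(test_set)
# e.g. {"accuracy": ..., "f1score": ..., "precision": ..., "recall": ...}
```

Note that `predict` returns the best value reached for each metric across the fitted models, not the predictions themselves.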
1 parent 3878751 · commit 8e1b995
Showing 9 changed files with 595 additions and 7 deletions.
src/safeds/ml/classical/classification/_baseline_classifier.py
188 changes: 188 additions & 0 deletions
@@ -0,0 +1,188 @@
import copy
from concurrent.futures import ALL_COMPLETED, wait
from typing import Self

from safeds._validation._check_columns_are_numeric import _check_columns_are_numeric
from safeds.data.labeled.containers import TabularDataset
from safeds.exceptions import (
    DatasetMissesDataError,
    FeatureDataMismatchError,
    ModelNotFittedError,
    TargetDataMismatchError,
)
from safeds.ml.classical.classification import (
    AdaBoostClassifier,
    Classifier,
    DecisionTreeClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    SupportVectorClassifier,
)


def _fit_single_model(model: Classifier, train_data: TabularDataset) -> Classifier:
    return model.fit(train_data)  # pragma: no cover


def _predict_single_model(model: Classifier, test_data: TabularDataset) -> TabularDataset:
    return model.predict(test_data)  # pragma: no cover


class BaselineClassifier:
    """
    Baseline Classifier.

    Get a baseline by fitting data on multiple different models and comparing the best metrics.

    Parameters
    ----------
    extended_search:
        If set to true, an extended set of models will be used to fit the classifier.
        This might result in significantly higher runtime.
    """

    def __init__(self, extended_search: bool = False):
        self._is_fitted = False
        self._list_of_model_types = [
            AdaBoostClassifier(),
            DecisionTreeClassifier(),
            SupportVectorClassifier(),
            RandomForestClassifier(),
        ]
        if extended_search:
            self._list_of_model_types.extend([GradientBoostingClassifier()])  # pragma: no cover

        self._fitted_models: list[Classifier] = []
        self._feature_names: list[str] | None = None
        self._target_name: str = "none"

    def fit(self, train_data: TabularDataset) -> Self:
        """
        Train the classifier with the given training data.

        The original model is not modified.

        Parameters
        ----------
        train_data:
            The data the classifier should be trained on.

        Returns
        -------
        trained_classifier:
            The trained classifier.

        Raises
        ------
        DatasetMissesDataError
            If the given train_data contains no data.
        ColumnTypeError
            If one or more columns contain non-numeric values.
        """
        from concurrent.futures import ProcessPoolExecutor

        # Validate data
        train_data_as_table = train_data.to_table()
        if train_data_as_table.row_count == 0:
            raise DatasetMissesDataError
        _check_columns_are_numeric(train_data_as_table, train_data.features.add_columns(train_data.target).column_names)

        copied_model = copy.deepcopy(self)

        with ProcessPoolExecutor(max_workers=len(self._list_of_model_types)) as executor:
            futures = []
            for model in self._list_of_model_types:
                futures.append(executor.submit(_fit_single_model, model, train_data))
            [done, _] = wait(futures, return_when=ALL_COMPLETED)
            for future in done:
                copied_model._fitted_models.append(future.result())
        executor.shutdown()

        copied_model._is_fitted = True
        copied_model._feature_names = train_data.features.column_names
        copied_model._target_name = train_data.target.name
        return copied_model

    def predict(self, test_data: TabularDataset) -> dict[str, float]:
        """
        Make a prediction for the given test data and calculate the best metrics.

        The original model is not modified.

        Parameters
        ----------
        test_data:
            The data the classifier should predict.

        Returns
        -------
        best_metrics:
            A dictionary with the best metrics that were achieved.

        Raises
        ------
        ModelNotFittedError
            If the model has not been fitted yet.
        FeatureDataMismatchError
            If the features of the test data do not match with the features of the trained classifier.
        DatasetMissesDataError
            If the given test_data contains no data.
        TargetDataMismatchError
            If the target column of the test data does not match the target column of the training data.
        ColumnTypeError
            If one or more columns contain non-numeric values.
        """
        from concurrent.futures import ProcessPoolExecutor

        from safeds.ml.metrics import ClassificationMetrics

        if not self._is_fitted:
            raise ModelNotFittedError

        # Validate data
        if not self._feature_names == test_data.features.column_names:
            raise FeatureDataMismatchError
        if not self._target_name == test_data.target.name:
            raise TargetDataMismatchError(
                actual_target_name=test_data.target.name,
                missing_target_name=self._target_name,
            )
        test_data_as_table = test_data.to_table()
        if test_data_as_table.row_count == 0:
            raise DatasetMissesDataError
        _check_columns_are_numeric(test_data_as_table, test_data.features.add_columns(test_data.target).column_names)

        with ProcessPoolExecutor(max_workers=len(self._list_of_model_types)) as executor:
            results = []
            futures = []
            for model in self._fitted_models:
                futures.append(executor.submit(_predict_single_model, model, test_data))
            [done, _] = wait(futures, return_when=ALL_COMPLETED)
            for future in done:
                results.append(future.result())
        executor.shutdown()

        max_metrics = {"accuracy": 0.0, "f1score": 0.0, "precision": 0.0, "recall": 0.0}
        for result in results:
            accuracy = ClassificationMetrics.accuracy(result, test_data)

            positive_class = test_data.target.get_value(0)
            f1score = ClassificationMetrics.f1_score(result, test_data, positive_class)
            precision = ClassificationMetrics.precision(result, test_data, positive_class)
            recall = ClassificationMetrics.recall(result, test_data, positive_class)

            if max_metrics.get("accuracy", 0.0) < accuracy:
                max_metrics.update({"accuracy": accuracy})

            if max_metrics.get("f1score", 0.0) < f1score:
                max_metrics.update({"f1score": f1score})

            if max_metrics.get("precision", 0.0) < precision:
                max_metrics.update({"precision": precision})

            if max_metrics.get("recall", 0.0) < recall:
                max_metrics.update({"recall": recall})

        return max_metrics

    @property
    def is_fitted(self) -> bool:
        """Whether the model is fitted."""
        return self._is_fitted
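Both `fit` and `predict` above fan the work out over a process pool and gather the results with `wait`. The following standalone sketch isolates that fan-out/fan-in pattern using only the standard-library `concurrent.futures` module; the model names and the `_train` helper are dummy stand-ins for the Safe-DS classifiers, not part of the commit.

```python
from concurrent.futures import ALL_COMPLETED, ProcessPoolExecutor, wait


def _train(model_name: str, row_count: int) -> str:
    # Placeholder for model.fit(train_data); pretends to train a model.
    return f"{model_name} fitted on {row_count} rows"


if __name__ == "__main__":
    model_names = ["AdaBoost", "DecisionTree", "SupportVector", "RandomForest"]

    with ProcessPoolExecutor(max_workers=len(model_names)) as executor:
        # Submit one training job per model, then block until all of them finish.
        futures = [executor.submit(_train, name, 100) for name in model_names]
        done, _ = wait(futures, return_when=ALL_COMPLETED)
        fitted = [future.result() for future in done]

    print(fitted)  # completion order, not submission order
```

Because the `done` set is unordered, results arrive in completion order rather than submission order, which is fine here since the classifier only aggregates the best metric values across all models.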