Skip to content

Commit

Permalink
feat: Improve Error Handling of classifiers and regressors (#355)
Browse files Browse the repository at this point in the history
Closes #153.

### Summary of Changes

The error handling of classifiers and regressors was not ideal. Every
classifier and regressor now validates its data before fitting or predicting
and raises a specific error if the table contains non-numerical values,
missing values, or no rows.

<!-- Please provide a summary of changes in this pull request, ensuring
all changes are explained. -->

---------

Co-authored-by: Alexander Gréus <[email protected]>
Co-authored-by: megalinter-bot <[email protected]>
Co-authored-by: Severin Paul Höfer <[email protected]>
Co-authored-by: Alexander <[email protected]>
Co-authored-by: Junior Atemebang <[email protected]>
  • Loading branch information
6 people authored Jun 16, 2023
1 parent 54f4ae1 commit 66f5f64
Show file tree
Hide file tree
Showing 25 changed files with 604 additions and 45 deletions.
4 changes: 4 additions & 0 deletions src/safeds/exceptions/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
ColumnSizeError,
DuplicateColumnNameError,
IndexOutOfBoundsError,
MissingValuesColumnError,
NonNumericColumnError,
SchemaMismatchError,
TransformerNotFittedError,
Expand All @@ -14,6 +15,7 @@
)
from safeds.exceptions._ml import (
DatasetContainsTargetError,
DatasetMissesDataError,
DatasetMissesFeaturesError,
LearningError,
ModelNotFittedError,
Expand All @@ -33,11 +35,13 @@
"UnknownColumnNameError",
"ValueNotPresentWhenFittedError",
"WrongFileExtensionError",
"MissingValuesColumnError",
# ML exceptions
"DatasetContainsTargetError",
"DatasetMissesFeaturesError",
"LearningError",
"ModelNotFittedError",
"PredictionError",
"UntaggedTableError",
"DatasetMissesDataError",
]
23 changes: 21 additions & 2 deletions src/safeds/exceptions/_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,27 @@ def __init__(self, column_names: list[str]):
class NonNumericColumnError(Exception):
    """
    Exception raised for trying to do numerical operations on a non-numerical column.

    Parameters
    ----------
    column_info : str
        Description of the offending column(s). It is inserted verbatim into the error message.
    help_msg : str | None
        Optional hint on how to resolve the problem. If given, it is appended to the message on its own line.
    """

    def __init__(self, column_info: str, help_msg: str | None = None) -> None:
        message = f"Tried to do a numerical operation on one or multiple non-numerical columns: \n{column_info}"
        if help_msg is not None:
            # Only add the line break together with the hint, so the message never ends in a dangling newline.
            message += f"\n{help_msg}"
        super().__init__(message)


class MissingValuesColumnError(Exception):
    """
    Exception raised for trying to do operations on columns containing missing values.

    Parameters
    ----------
    column_info : str
        Description of the offending column(s). It is inserted verbatim into the error message.
    help_msg : str | None
        Optional hint on how to resolve the problem. If given, it is appended to the message on its own line.
    """

    def __init__(self, column_info: str, help_msg: str | None = None) -> None:
        message = "Tried to do an operation on one or multiple columns containing missing values:"
        message += f" \n{column_info}"
        if help_msg is not None:
            # The hint goes on a separate line below the column description.
            message += "\n" + help_msg
        super().__init__(message)


class DuplicateColumnNameError(Exception):
Expand Down
7 changes: 7 additions & 0 deletions src/safeds/exceptions/_ml.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,13 @@ def __init__(self, missing_feature_names: list[str]):
super().__init__(f"Dataset misses the feature columns '{missing_feature_names}'.")


class DatasetMissesDataError(ValueError):
    """Raised when a dataset contains no rows; takes no arguments and always carries the same fixed message."""

    def __init__(self) -> None:
        message = "Dataset contains no rows"
        super().__init__(message)


class LearningError(Exception):
"""
Raised when an error occurred while training a model.
Expand Down
72 changes: 72 additions & 0 deletions src/safeds/ml/classical/_util_sklearn.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,12 @@
from safeds.data.tabular.containers import Table, TaggedTable
from safeds.exceptions import (
DatasetContainsTargetError,
DatasetMissesDataError,
DatasetMissesFeaturesError,
LearningError,
MissingValuesColumnError,
ModelNotFittedError,
NonNumericColumnError,
PredictionError,
UntaggedTableError,
)
Expand All @@ -30,9 +33,44 @@ def fit(model: Any, tagged_table: TaggedTable) -> None:
If the tagged table contains invalid values or if the training failed.
UntaggedTableError
If the table is untagged.
NonNumericColumnError
If the training data contains non-numerical values.
MissingValuesColumnError
If the training data contains missing values.
DatasetMissesDataError
If the training data contains no rows.
"""
if not isinstance(tagged_table, TaggedTable) and isinstance(tagged_table, Table):
raise UntaggedTableError

if tagged_table.number_of_rows == 0:
raise DatasetMissesDataError

non_numerical_column_names = set(tagged_table.features.column_names) - set(
tagged_table.features.remove_columns_with_non_numerical_values().column_names,
)
if len(non_numerical_column_names) != 0:
raise NonNumericColumnError(
str(non_numerical_column_names),
(
"You can use the LabelEncoder or OneHotEncoder to transform your non-numerical data to numerical"
" data.\nThe OneHotEncoder should be used if you work with nominal data. If your data contains too many"
" different values\nor is ordinal, you should use the LabelEncoder."
),
)

null_containing_column_names = set(tagged_table.features.column_names) - set(
tagged_table.features.remove_columns_with_missing_values().column_names,
)
if len(null_containing_column_names) != 0:
raise MissingValuesColumnError(
str(null_containing_column_names),
(
"You can use the Imputer to replace the missing values based on different strategies.\nIf you want to"
" remove the missing values entirely you can use the method `Table.remove_rows_with_missing_values`."
),
)

try:
model.fit(
tagged_table.features._data,
Expand Down Expand Up @@ -73,6 +111,12 @@ def predict(model: Any, dataset: Table, feature_names: list[str] | None, target_
If the dataset misses feature columns.
PredictionError
If predicting with the given dataset failed.
NonNumericColumnError
If the dataset contains non-numerical values.
MissingValuesColumnError
If the dataset contains missing values.
DatasetMissesDataError
If the dataset contains no rows.
"""
# Validation
if model is None or target_name is None or feature_names is None:
Expand All @@ -83,6 +127,34 @@ def predict(model: Any, dataset: Table, feature_names: list[str] | None, target_
if missing_feature_names:
raise DatasetMissesFeaturesError(missing_feature_names)

if dataset.number_of_rows == 0:
raise DatasetMissesDataError

non_numerical_column_names = set(dataset.keep_only_columns(feature_names).column_names) - set(
dataset.keep_only_columns(feature_names).remove_columns_with_non_numerical_values().column_names,
)
if len(non_numerical_column_names) != 0:
raise NonNumericColumnError(
str(non_numerical_column_names),
(
"You can use the LabelEncoder or OneHotEncoder to transform your non-numerical data to numerical"
" data.\nThe OneHotEncoder should be used if you work with nominal data. If your data contains too many"
" different values\nor is ordinal, you should use the LabelEncoder.\n"
),
)

null_containing_column_names = set(dataset.keep_only_columns(feature_names).column_names) - set(
dataset.keep_only_columns(feature_names).remove_columns_with_missing_values().column_names,
)
if len(null_containing_column_names) != 0:
raise MissingValuesColumnError(
str(null_containing_column_names),
(
"You can use the Imputer to replace the missing values based on different strategies.\nIf you want to"
" remove the missing values entirely you can use the method `Table.remove_rows_with_missing_values`."
),
)

dataset_df = dataset.keep_only_columns(feature_names)._data
dataset_df.columns = feature_names

Expand Down
14 changes: 14 additions & 0 deletions src/safeds/ml/classical/classification/_ada_boost.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,14 @@ def fit(self, training_set: TaggedTable) -> AdaBoost:
------
LearningError
If the training data contains invalid values or if the training failed.
UntaggedTableError
If the table is untagged.
NonNumericColumnError
If the training data contains non-numerical values.
MissingValuesColumnError
If the training data contains missing values.
DatasetMissesDataError
If the training data contains no rows.
"""
wrapped_classifier = self._get_sklearn_classifier()
fit(wrapped_classifier, training_set)
Expand Down Expand Up @@ -129,6 +137,12 @@ def predict(self, dataset: Table) -> TaggedTable:
If the dataset misses feature columns.
PredictionError
If predicting with the given dataset failed.
NonNumericColumnError
If the dataset contains non-numerical values.
MissingValuesColumnError
If the dataset contains missing values.
DatasetMissesDataError
If the dataset contains no rows.
"""
return predict(self._wrapped_classifier, dataset, self._feature_names, self._target_name)

Expand Down
14 changes: 14 additions & 0 deletions src/safeds/ml/classical/classification/_decision_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,14 @@ def fit(self, training_set: TaggedTable) -> DecisionTree:
------
LearningError
If the training data contains invalid values or if the training failed.
UntaggedTableError
If the table is untagged.
NonNumericColumnError
If the training data contains non-numerical values.
MissingValuesColumnError
If the training data contains missing values.
DatasetMissesDataError
If the training data contains no rows.
"""
wrapped_classifier = self._get_sklearn_classifier()
fit(wrapped_classifier, training_set)
Expand Down Expand Up @@ -78,6 +86,12 @@ def predict(self, dataset: Table) -> TaggedTable:
If the dataset misses feature columns.
PredictionError
If predicting with the given dataset failed.
NonNumericColumnError
If the dataset contains non-numerical values.
MissingValuesColumnError
If the dataset contains missing values.
DatasetMissesDataError
If the dataset contains no rows.
"""
return predict(self._wrapped_classifier, dataset, self._feature_names, self._target_name)

Expand Down
14 changes: 14 additions & 0 deletions src/safeds/ml/classical/classification/_gradient_boosting.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,14 @@ def fit(self, training_set: TaggedTable) -> GradientBoosting:
------
LearningError
If the training data contains invalid values or if the training failed.
UntaggedTableError
If the table is untagged.
NonNumericColumnError
If the training data contains non-numerical values.
MissingValuesColumnError
If the training data contains missing values.
DatasetMissesDataError
If the training data contains no rows.
"""
wrapped_classifier = self._get_sklearn_classifier()
fit(wrapped_classifier, training_set)
Expand Down Expand Up @@ -112,6 +120,12 @@ def predict(self, dataset: Table) -> TaggedTable:
If the dataset misses feature columns.
PredictionError
If predicting with the given dataset failed.
NonNumericColumnError
If the dataset contains non-numerical values.
MissingValuesColumnError
If the dataset contains missing values.
DatasetMissesDataError
If the dataset contains no rows.
"""
return predict(self._wrapped_classifier, dataset, self._feature_names, self._target_name)

Expand Down
17 changes: 17 additions & 0 deletions src/safeds/ml/classical/classification/_k_nearest_neighbors.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from sklearn.neighbors import KNeighborsClassifier as sk_KNeighborsClassifier

from safeds.exceptions import DatasetMissesDataError
from safeds.ml.classical._util_sklearn import fit, predict

from ._classifier import Classifier
Expand Down Expand Up @@ -69,7 +70,17 @@ def fit(self, training_set: TaggedTable) -> KNearestNeighbors:
If `number_of_neighbors` is greater than the sample size.
LearningError
If the training data contains invalid values or if the training failed.
UntaggedTableError
If the table is untagged.
NonNumericColumnError
If the training data contains non-numerical values.
MissingValuesColumnError
If the training data contains missing values.
DatasetMissesDataError
If the training data contains no rows.
"""
if training_set.number_of_rows == 0:
raise DatasetMissesDataError
if self._number_of_neighbors > training_set.number_of_rows:
raise ValueError(
(
Expand Down Expand Up @@ -111,6 +122,12 @@ def predict(self, dataset: Table) -> TaggedTable:
If the dataset misses feature columns.
PredictionError
If predicting with the given dataset failed.
NonNumericColumnError
If the dataset contains non-numerical values.
MissingValuesColumnError
If the dataset contains missing values.
DatasetMissesDataError
If the dataset contains no rows.
"""
return predict(self._wrapped_classifier, dataset, self._feature_names, self._target_name)

Expand Down
14 changes: 14 additions & 0 deletions src/safeds/ml/classical/classification/_logistic_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,14 @@ def fit(self, training_set: TaggedTable) -> LogisticRegression:
------
LearningError
If the training data contains invalid values or if the training failed.
UntaggedTableError
If the table is untagged.
NonNumericColumnError
If the training data contains non-numerical values.
MissingValuesColumnError
If the training data contains missing values.
DatasetMissesDataError
If the training data contains no rows.
"""
wrapped_classifier = self._get_sklearn_classifier()
fit(wrapped_classifier, training_set)
Expand Down Expand Up @@ -78,6 +86,12 @@ def predict(self, dataset: Table) -> TaggedTable:
If the dataset misses feature columns.
PredictionError
If predicting with the given dataset failed.
NonNumericColumnError
If the dataset contains non-numerical values.
MissingValuesColumnError
If the dataset contains missing values.
DatasetMissesDataError
If the dataset contains no rows.
"""
return predict(self._wrapped_classifier, dataset, self._feature_names, self._target_name)

Expand Down
14 changes: 14 additions & 0 deletions src/safeds/ml/classical/classification/_random_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,14 @@ def fit(self, training_set: TaggedTable) -> RandomForest:
------
LearningError
If the training data contains invalid values or if the training failed.
UntaggedTableError
If the table is untagged.
NonNumericColumnError
If the training data contains non-numerical values.
MissingValuesColumnError
If the training data contains missing values.
DatasetMissesDataError
If the training data contains no rows.
"""
wrapped_classifier = self._get_sklearn_classifier()
fit(wrapped_classifier, training_set)
Expand Down Expand Up @@ -100,6 +108,12 @@ def predict(self, dataset: Table) -> TaggedTable:
If the dataset misses feature columns.
PredictionError
If predicting with the given dataset failed.
NonNumericColumnError
If the dataset contains non-numerical values.
MissingValuesColumnError
If the dataset contains missing values.
DatasetMissesDataError
If the dataset contains no rows.
"""
return predict(self._wrapped_classifier, dataset, self._feature_names, self._target_name)

Expand Down
14 changes: 14 additions & 0 deletions src/safeds/ml/classical/classification/_support_vector_machine.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,14 @@ def fit(self, training_set: TaggedTable) -> SupportVectorMachine:
------
LearningError
If the training data contains invalid values or if the training failed.
UntaggedTableError
If the table is untagged.
NonNumericColumnError
If the training data contains non-numerical values.
MissingValuesColumnError
If the training data contains missing values.
DatasetMissesDataError
If the training data contains no rows.
"""
wrapped_classifier = self._get_sklearn_classifier()
fit(wrapped_classifier, training_set)
Expand Down Expand Up @@ -154,6 +162,12 @@ def predict(self, dataset: Table) -> TaggedTable:
If the dataset misses feature columns.
PredictionError
If predicting with the given dataset failed.
NonNumericColumnError
If the dataset contains non-numerical values.
MissingValuesColumnError
If the dataset contains missing values.
DatasetMissesDataError
If the dataset contains no rows.
"""
return predict(self._wrapped_classifier, dataset, self._feature_names, self._target_name)

Expand Down
Loading

0 comments on commit 66f5f64

Please sign in to comment.