feat: improve error handling of table (#308)

Closes #147. ### Summary of Changes feat: Added `WrongFileExtensionError` * Validate inputs of a function as the first step * Raise an appropriate exception with an appropriate message if issues are found * Document in the docstring that the exception is raised and under which condition * Test that the correct exception with the correct message is raised Co-authored-by: Marsmaennchen221 <[email protected]> --------- Co-authored-by: Alexander Gréus <[email protected]> Co-authored-by: megalinter-bot <[email protected]> Co-authored-by: Alexander <[email protected]> Co-authored-by: Philip Gutberlet <[email protected]>
Safe-DS · May 25, 2023 · ef87cc4 · ef87cc4
1 parent 5c7a662
commit ef87cc4
Show file tree

Hide file tree

Showing 34 changed files with 390 additions and 111 deletions.
diff --git a/src/safeds/data/tabular/containers/_table.py b/src/safeds/data/tabular/containers/_table.py
@@ -25,6 +25,7 @@
     NonNumericColumnError,
     SchemaMismatchError,
     UnknownColumnNameError,
+    WrongFileExtensionError,
 )
 
 from ._column import Column
@@ -92,9 +93,12 @@ def from_csv_file(path: str | Path) -> Table:
         ------
         FileNotFoundError
             If the specified file does not exist.
-        ValueError
-            If the file could not be read.
+        WrongFileExtensionError
+            If the file is not a CSV file.
         """
+        path = Path(path)
+        if path.suffix != ".csv":
+            raise WrongFileExtensionError(path, ".csv")
         try:
             return Table._from_pandas_dataframe(pd.read_csv(path))
         except FileNotFoundError as exception:
@@ -105,6 +109,8 @@ def from_excel_file(path: str | Path) -> Table:
         """
         Read data from an Excel file into a table.
 
+        Valid file extensions are `.xls`, `.xlsx`, `.xlsm`, `.xlsb`, `.odf`, `.ods` and `.odt`.
+
         Parameters
         ----------
         path : str | Path
@@ -119,9 +125,13 @@ def from_excel_file(path: str | Path) -> Table:
         ------
         FileNotFoundError
             If the specified file does not exist.
-        ValueError
-            If the file could not be read.
+        WrongFileExtensionError
+            If the file is not an Excel file.
         """
+        path = Path(path)
+        excel_extensions = [".xls", ".xlsx", ".xlsm", ".xlsb", ".odf", ".ods", ".odt"]
+        if path.suffix not in excel_extensions:
+            raise WrongFileExtensionError(path, excel_extensions)
         try:
             return Table._from_pandas_dataframe(
                 pd.read_excel(path, engine="openpyxl", usecols=lambda colname: "Unnamed" not in colname),
@@ -148,9 +158,12 @@ def from_json_file(path: str | Path) -> Table:
         ------
         FileNotFoundError
             If the specified file does not exist.
-        ValueError
-            If the file could not be read.
+        WrongFileExtensionError
+            If the file is not a JSON file.
         """
+        path = Path(path)
+        if path.suffix != ".json":
+            raise WrongFileExtensionError(path, ".json")
         try:
             return Table._from_pandas_dataframe(pd.read_json(path))
         except FileNotFoundError as exception:
@@ -197,14 +210,20 @@ def from_columns(columns: list[Column]) -> Table:
         ------
         ColumnLengthMismatchError
             If any of the column sizes does not match with the others.
+        DuplicateColumnNameError
+            If multiple columns have the same name.
         """
         dataframe: DataFrame = pd.DataFrame()
+        column_names = []
 
         for column in columns:
             if column._data.size != columns[0]._data.size:
                 raise ColumnLengthMismatchError(
                     "\n".join(f"{column.name}: {column._data.size}" for column in columns),
                 )
+            if column.name in column_names:
+                raise DuplicateColumnNameError(column.name)
+            column_names.append(column.name)
             dataframe[column.name] = column._data
 
         return Table._from_pandas_dataframe(dataframe)
@@ -328,6 +347,8 @@ def __eq__(self, other: Any) -> bool:
             return NotImplemented
         if self is other:
             return True
+        if self.number_of_rows == 0 and other.number_of_rows == 0:
+            return self.column_names == other.column_names
         table1 = self.sort_columns()
         table2 = other.sort_columns()
         return table1._schema == table2._schema and table1._data.equals(table2._data)
@@ -463,7 +484,7 @@ def get_column_type(self, column_name: str) -> ColumnType:
 
         Raises
         ------
-        ColumnNameError
+        UnknownColumnNameError
             If the specified target column name does not exist.
         """
         return self._schema.get_column_type(column_name)
@@ -627,6 +648,10 @@ def add_row(self, row: Row) -> Table:
         table : Table
             A new table with the added row at the end.
 
+        Raises
+        ------
+        SchemaMismatchError
+            If the schema of the row does not match the table schema.
         """
         if self._schema != row.schema:
             raise SchemaMismatchError
@@ -650,6 +675,11 @@ def add_rows(self, rows: list[Row] | Table) -> Table:
         -------
         result : Table
             A new table which combines the original table and the given rows.
+
+        Raises
+        ------
+        SchemaMismatchError
+            If the schema of one of the rows does not match the table schema.
         """
         if isinstance(rows, Table):
             rows = rows.to_rows()
@@ -705,7 +735,7 @@ def keep_only_columns(self, column_names: list[str]) -> Table:
 
         Raises
         ------
-        ColumnNameError
+        UnknownColumnNameError
             If any of the given columns does not exist.
         """
         invalid_columns = []
@@ -737,7 +767,7 @@ def remove_columns(self, column_names: list[str]) -> Table:
 
         Raises
         ------
-        ColumnNameError
+        UnknownColumnNameError
             If any of the given columns does not exist.
         """
         invalid_columns = []
@@ -851,7 +881,7 @@ def rename_column(self, old_name: str, new_name: str) -> Table:
 
         Raises
         ------
-        ColumnNameError
+        UnknownColumnNameError
             If the specified old target column name does not exist.
         DuplicateColumnNameError
             If the specified new target column name already exists.
@@ -960,7 +990,7 @@ def slice_rows(
 
         Raises
         ------
-        ValueError
+        IndexOutOfBoundsError
             If the index is out of bounds.
         """
         if start is None:
@@ -969,8 +999,10 @@ def slice_rows(
         if end is None:
             end = self.number_of_rows
 
-        if start < 0 or end < 0 or start >= self.number_of_rows or end > self.number_of_rows or end < start:
-            raise ValueError("The given index is out of bounds")
+        if end < start:
+            raise IndexOutOfBoundsError(slice(start, end))
+        if start < 0 or end < 0 or start > self.number_of_rows or end > self.number_of_rows:
+            raise IndexOutOfBoundsError(start if start < 0 or start > self.number_of_rows else end)
 
         new_df = self._data.iloc[start:end:step]
         new_df.columns = self._schema.column_names
@@ -1053,10 +1085,13 @@ def split(self, percentage_in_first: float) -> tuple[Table, Table]:
             A tuple containing the two resulting tables. The first table has the specified size, the second table
             contains the rest of the data.
 
-
+        Raises
+        ------
+        ValueError
+            If `percentage_in_first` is not between 0 and 1 (inclusive).
         """
-        if percentage_in_first <= 0 or percentage_in_first >= 1:
-            raise ValueError("the given percentage is not in range")
+        if percentage_in_first < 0 or percentage_in_first > 1:
+            raise ValueError("The given percentage is not between 0 and 1")
         return (
             self.slice_rows(0, round(percentage_in_first * self.number_of_rows)),
             self.slice_rows(round(percentage_in_first * self.number_of_rows)),
@@ -1079,6 +1114,13 @@ def tag_columns(self, target_name: str, feature_names: list[str] | None = None)
         -------
         tagged_table : TaggedTable
             A new tagged table with the given target and feature names.
+
+        Raises
+        ------
+        ValueError
+            If the target column is also a feature column.
+        ValueError
+            If no feature columns are specified.
         """
         from ._tagged_table import TaggedTable
 
@@ -1241,10 +1283,11 @@ def plot_lineplot(self, x_column_name: str, y_column_name: str) -> Image:
         UnknownColumnNameError
             If either of the columns do not exist.
         """
-        if not self.has_column(x_column_name):
-            raise UnknownColumnNameError([x_column_name])
-        if not self.has_column(y_column_name):
-            raise UnknownColumnNameError([y_column_name])
+        if not self.has_column(x_column_name) or not self.has_column(y_column_name):
+            raise UnknownColumnNameError(
+                ([x_column_name] if not self.has_column(x_column_name) else [])
+                + ([y_column_name] if not self.has_column(y_column_name) else []),
+            )
 
         fig = plt.figure()
         ax = sns.lineplot(
@@ -1288,10 +1331,11 @@ def plot_scatterplot(self, x_column_name: str, y_column_name: str) -> Image:
         UnknownColumnNameError
             If either of the columns do not exist.
         """
-        if not self.has_column(x_column_name):
-            raise UnknownColumnNameError([x_column_name])
-        if not self.has_column(y_column_name):
-            raise UnknownColumnNameError([y_column_name])
+        if not self.has_column(x_column_name) or not self.has_column(y_column_name):
+            raise UnknownColumnNameError(
+                ([x_column_name] if not self.has_column(x_column_name) else [])
+                + ([y_column_name] if not self.has_column(y_column_name) else []),
+            )
 
         fig = plt.figure()
         ax = sns.scatterplot(
@@ -1399,8 +1443,16 @@ def to_csv_file(self, path: str | Path) -> None:
         ----------
         path : str | Path
             The path to the output file.
+
+        Raises
+        ------
+        WrongFileExtensionError
+            If the file is not a CSV file.
         """
-        Path(path).parent.mkdir(parents=True, exist_ok=True)
+        path = Path(path)
+        if path.suffix != ".csv":
+            raise WrongFileExtensionError(path, ".csv")
+        path.parent.mkdir(parents=True, exist_ok=True)
         data_to_csv = self._data.copy()
         data_to_csv.columns = self._schema.column_names
         data_to_csv.to_csv(path, index=False)
@@ -1409,19 +1461,30 @@ def to_excel_file(self, path: str | Path) -> None:
         """
         Write the data from the table into an Excel file.
 
+        Valid file extensions are `.xls`, `.xlsx`, `.xlsm`, `.xlsb`, `.odf`, `.ods` and `.odt`.
         If the file and/or the directories do not exist, they will be created. If the file already exists, it will be
         overwritten.
 
         Parameters
         ----------
         path : str | Path
             The path to the output file.
+
+        Raises
+        ------
+        WrongFileExtensionError
+            If the file is not an Excel file.
         """
+        path = Path(path)
+        excel_extensions = [".xls", ".xlsx", ".xlsm", ".xlsb", ".odf", ".ods", ".odt"]
+        if path.suffix not in excel_extensions:
+            raise WrongFileExtensionError(path, excel_extensions)
+
         # Create Excel metadata in the file
         tmp_table_file = openpyxl.Workbook()
         tmp_table_file.save(path)
 
-        Path(path).parent.mkdir(parents=True, exist_ok=True)
+        path.parent.mkdir(parents=True, exist_ok=True)
         data_to_excel = self._data.copy()
         data_to_excel.columns = self._schema.column_names
         data_to_excel.to_excel(path)
@@ -1437,8 +1500,16 @@ def to_json_file(self, path: str | Path) -> None:
         ----------
         path : str | Path
             The path to the output file.
+
+        Raises
+        ------
+        WrongFileExtensionError
+            If the file is not a JSON file.
         """
-        Path(path).parent.mkdir(parents=True, exist_ok=True)
+        path = Path(path)
+        if path.suffix != ".json":
+            raise WrongFileExtensionError(path, ".json")
+        path.parent.mkdir(parents=True, exist_ok=True)
         data_to_json = self._data.copy()
         data_to_json.columns = self._schema.column_names
         data_to_json.to_json(path)

diff --git a/src/safeds/data/tabular/typing/_schema.py b/src/safeds/data/tabular/typing/_schema.py
@@ -184,7 +184,7 @@ def get_column_type(self, column_name: str) -> ColumnType:
 
         Raises
         ------
-        ColumnNameError
+        UnknownColumnNameError
             If the specified column name does not exist.
 
         Examples

diff --git a/src/safeds/exceptions/__init__.py b/src/safeds/exceptions/__init__.py
@@ -10,6 +10,7 @@
     TransformerNotFittedError,
     UnknownColumnNameError,
     ValueNotPresentWhenFittedError,
+    WrongFileExtensionError,
 )
 from safeds.exceptions._ml import (
     DatasetContainsTargetError,
@@ -31,6 +32,7 @@
     "TransformerNotFittedError",
     "UnknownColumnNameError",
     "ValueNotPresentWhenFittedError",
+    "WrongFileExtensionError",
     # ML exceptions
     "DatasetContainsTargetError",
     "DatasetMissesFeaturesError",

diff --git a/src/safeds/exceptions/_data.py b/src/safeds/exceptions/_data.py
@@ -1,5 +1,10 @@
 from __future__ import annotations
 
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
 
 class UnknownColumnNameError(KeyError):
     """
@@ -47,7 +52,10 @@ class IndexOutOfBoundsError(IndexError):
     """
 
     def __init__(self, index: int | slice):
-        super().__init__(f"There is no element at index '{index}'.")
+        if isinstance(index, int):
+            super().__init__(f"There is no element at index '{index}'.")
+        else:
+            super().__init__(f"There is no element in the range [{index.start}, {index.stop}]")
 
 
 class ColumnSizeError(Exception):
@@ -92,3 +100,15 @@ class ValueNotPresentWhenFittedError(Exception):
 
     def __init__(self, value: str, column: str) -> None:
         super().__init__(f"Value not present in the table the transformer was fitted on: \n{value} in column {column}.")
+
+
+class WrongFileExtensionError(Exception):
+    """Exception raised when the file has the wrong file extension."""
+
+    def __init__(self, file: str | Path, file_extension: str | list[str]) -> None:
+        super().__init__(
+            (
+                f"The file {file} has a wrong file extension. Please provide a file with the following extension(s):"
+                f" {file_extension}"
+            ),
+        )
diff --git a/tests/safeds/data/tabular/containers/_table/test_add_column.py b/tests/safeds/data/tabular/containers/_table/test_add_column.py
@@ -26,11 +26,11 @@ def test_should_add_column(table1: Table, column: Column, expected: Table) -> No
 
 def test_should_raise_error_if_column_name_exists() -> None:
     table1 = Table({"col1": [1, 2, 1], "col2": [1, 2, 4]})
-    with pytest.raises(DuplicateColumnNameError):
+    with pytest.raises(DuplicateColumnNameError, match=r"Column 'col1' already exists."):
         table1.add_column(Column("col1", ["a", "b", "c"]))
 
 
 def test_should_raise_error_if_column_size_invalid() -> None:
     table1 = Table({"col1": [1, 2, 1], "col2": [1, 2, 4]})
-    with pytest.raises(ColumnSizeError):
+    with pytest.raises(ColumnSizeError, match=r"Expected a column of size 3 but got column of size 4."):
         table1.add_column(Column("col3", ["a", "b", "c", "d"]))