perf: treat Tables specially when calling add_rows (#606)

Partially fixes #575.

### Summary of Changes

Add a special case to `add_rows` for adding a `Table`. The table is no longer split into a list of rows; instead, it is added directly to the `DataFrame`. In special cases, this may improve performance by up to 1000x.

---------

Co-authored-by: Lars Reimann <[email protected]>
Safe-DS · Apr 3, 2024 · e555b85 · e555b85
1 parent 98bccd0
commit e555b85
Show file tree

Hide file tree

Showing 2 changed files with 29 additions and 3 deletions.
diff --git a/src/safeds/data/tabular/containers/_table.py b/src/safeds/data/tabular/containers/_table.py
@@ -1057,7 +1057,23 @@ def add_rows(self, rows: list[Row] | Table) -> Table:
         2  5  6
         """
         if isinstance(rows, Table):
-            rows = rows.to_rows()
+            if rows.number_of_rows == 0:
+                return self
+            if self.number_of_columns == 0:
+                return rows
+            different_column_names = set(self.column_names) - set(rows.column_names)
+            if len(different_column_names) > 0:
+                raise UnknownColumnNameError(
+                    sorted(
+                        different_column_names,
+                        key={val: ix for ix, val in enumerate(self.column_names)}.__getitem__,
+                    ),
+                )
+
+            new_df = pd.concat([self._data, rows._data]).infer_objects()
+            new_df.columns = self.column_names
+            schema = Schema._merge_multiple_schemas([self.schema, rows.schema])
+            return Table._from_pandas_dataframe(new_df, schema)
 
         if len(rows) == 0:
             return self

diff --git a/tests/safeds/data/tabular/containers/_table/test_add_rows.py b/tests/safeds/data/tabular/containers/_table/test_add_rows.py
@@ -21,8 +21,13 @@
             [Row({"col1": "d", "col2": 6}), Row({"col1": "e", "col2": 8})],
             Table({"col1": ["d", "e"], "col2": [6, 8]}),
         ),
+        (
+            Table({"col1": ["a", "b", "c"], "col2": [1, 2, 4]}),
+            [],
+            Table({"col1": ["a", "b", "c"], "col2": [1, 2, 4]}),
+        )
     ],
-    ids=["Rows with string and integer values", "different schema", "empty"],
+    ids=["Rows with string and integer values", "different schema", "empty", "add empty"],
 )
 def test_should_add_rows(table1: Table, rows: list[Row], table2: Table) -> None:
     table1 = table1.add_rows(rows)
@@ -80,8 +85,13 @@ def test_should_add_rows_from_table(table1: Table, table2: Table, expected: Tabl
             [Row({"col1": 2, "col3": 4}), Row({"colA": 5, "col2": "Hallo"})],
             r"Could not find column\(s\) 'col1, col2'",
         ),
+        (
+            Table({"col1": [1, 2, 1], "col2": [1, 2, 4]}),
+            Table({"col1": [2, 5], "col3": [4, "Hallo"]}),
+            r"Could not find column\(s\) 'col2'",
+        ),
     ],
-    ids=["column names do not match", "multiple columns missing"],
+    ids=["column names do not match", "multiple columns missing", "column missing from other table"],
 )
 def test_should_raise_error_if_row_column_names_invalid(table: Table, rows: list[Row], expected_error_msg: str) -> None:
     with pytest.raises(UnknownColumnNameError, match=expected_error_msg):