diff --git a/Runtime/safe-ds/safeds/data/tabular/_table.py b/Runtime/safe-ds/safeds/data/tabular/_table.py index efb9ebb04..0c2a6b482 100644 --- a/Runtime/safe-ds/safeds/data/tabular/_table.py +++ b/Runtime/safe-ds/safeds/data/tabular/_table.py @@ -835,14 +835,14 @@ def summary(self) -> Table: for function in statistics.values(): try: - values.append(function()) + values.append(str(function())) except NonNumericColumnError: values.append("-") result = pd.concat([result, pd.DataFrame(values)], axis=1) result = pd.concat([pd.DataFrame(list(statistics.keys())), result], axis=1) - result.columns = [""] + self.get_column_names() + result.columns = ["metrics"] + self.get_column_names() return Table(result) diff --git a/Runtime/safe-ds/safeds/plotting/_correlation_heatmap.py b/Runtime/safe-ds/safeds/plotting/_correlation_heatmap.py index bcfd2ab01..a80716e76 100644 --- a/Runtime/safe-ds/safeds/plotting/_correlation_heatmap.py +++ b/Runtime/safe-ds/safeds/plotting/_correlation_heatmap.py @@ -29,6 +29,7 @@ def correlation_heatmap(table: Table) -> None: vmax=1, xticklabels=table.get_column_names(), yticklabels=table.get_column_names(), + cmap="vlag", ) plt.tight_layout() plt.show() diff --git a/Runtime/safe-ds/tests/data/tabular/_table/test_summary.py b/Runtime/safe-ds/tests/data/tabular/_table/test_summary.py index 630cb5d20..dd1db75ba 100644 --- a/Runtime/safe-ds/tests/data/tabular/_table/test_summary.py +++ b/Runtime/safe-ds/tests/data/tabular/_table/test_summary.py @@ -22,19 +22,31 @@ def test_summary() -> None: "row count", ], "col1": [ - 2.0, - 1.0, - 4.0 / 3, - 1.0, - 1.0, - 4.0, - 1.0 / 3, - table._data[0].std(), - 2.0 / 3, - 2.0 / 3, - 3.0, + "2", + "1", + str(4.0 / 3), + "1", + "1.0", + "4", + str(1.0 / 3), + str(table._data[0].std()), + str(2.0 / 3), + str(2.0 / 3), + "3", + ], + "col2": [ + "-", + "-", + "-", + "a", + "-", + "-", + "-", + "-", + "1.0", + str(1.0 / 3), + "3", ], - "col2": ["-", "-", "-", "a", "-", "-", "-", "-", 1.0, 1.0 / 3, 3], } ) ) diff --git 
a/docs/Stdlib/python/Tutorials/Resources/Heatmap.png b/docs/Stdlib/python/Tutorials/Resources/Heatmap.png index 181cd4420..e58486485 100644 Binary files a/docs/Stdlib/python/Tutorials/Resources/Heatmap.png and b/docs/Stdlib/python/Tutorials/Resources/Heatmap.png differ diff --git a/docs/Stdlib/python/Tutorials/Resources/Summary.png b/docs/Stdlib/python/Tutorials/Resources/Summary.png index a40810a81..5af1d4eb4 100644 Binary files a/docs/Stdlib/python/Tutorials/Resources/Summary.png and b/docs/Stdlib/python/Tutorials/Resources/Summary.png differ diff --git a/docs/Stdlib/python/Tutorials/visualization.md b/docs/Stdlib/python/Tutorials/visualization.md index fb8163405..b9e43d334 100644 --- a/docs/Stdlib/python/Tutorials/visualization.md +++ b/docs/Stdlib/python/Tutorials/visualization.md @@ -2,7 +2,8 @@ The following code will use a Jupyter Notebook environment. -First we need some data to visualize. For this we use the common example of the titanic disaster. +## Table & Statistics +First, we need some data to visualize. For this, we use the common example of the titanic disaster. !!! note You can download that dataset on [kaggle](https://www.kaggle.com/c/titanic). @@ -12,10 +13,10 @@ from safeds.data.tabular import Table data = Table.from_csv("path/to/your/data.csv") ``` -Now we want to have look at what our dataset looks like. For this we use Jupyter Notebooks native display function. +Now we want to have a look at what our dataset looks like. For this, we use Jupyter Notebooks native display function. ```python -data # calls display(data) +data    # calls display(data) ``` ![Table](./Resources/Table.png) @@ -23,22 +24,22 @@ data # calls display(data) Next some statistics. ```python -data.summary() # returns a table with various statistics for each column +data.summary()  # returns a table with various statistics for each column ``` ![Summary](./Resources/Summary.png) -As you can see here, the **idness** of the column _PassangerId_ is 1. 
This means, that every row has a unique value for +As you can see here, the **idness** of the column _PassengerId_ is 1. This means that every row has a unique value for this column. Since this isn't helpful for our usecase we can drop it. ```python -data_cleaned = data.drop_columns(["PassangerId"]) +data_cleaned = data.drop_columns(["PassengerId"]) ``` - +## Heatmap Now we have a rough idea of what we are looking at. But we still don't really know a lot about our dataset. -So next we can start to plot a our columns against each other in a so called Heatmap, to understand which values relate to each other. +So next we can start to plot our columns against each other in a so-called Heatmap, to understand which values relate to each other. -But since this type of diagramm only works for numerical values, we are going to use only those. +But since this type of diagram only works for numerical values, we are going to use only those. ```python from safeds.plotting import correlation_heatmap @@ -49,8 +50,9 @@ correlation_heatmap(data_only_numerics) ![Heatmap](./Resources/Heatmap.png) -As you can see, the columns _Fare_ and _Pclass_ (Passanger Class) seem to heavily correlate. Let's have another look at that. -We'll use a linechart to better understand their relationship. +As you can see, the columns _Fare_ and _Pclass_ (Passenger Class) seem to heavily correlate. Let's have another look at that. +## Lineplot +We'll use a lineplot to better understand their relationship. ```python from safeds.plotting import lineplot @@ -61,19 +63,31 @@ lineplot(data_cleaned, "Pclass", "Fare") The line itself represents the central tendency and the hued area around it a confidence interval for that estimate. -We can conclude that tickets for first class rooms are much more expensive compared to second and third class. -Also the difference between second and third is less pronounced. 
+We can conclude that tickets for first class rooms are much more expensive compared to second and third class. +Also, the difference between second and third is less pronounced. -Some other plots that might be useful are boxplots, histogams and scatterplots. +## Other plots +Some other plots that might be useful are boxplots, histograms and scatterplots. ```python -from safeds.plotting import boxplot, histogram, scatterplot +from safeds.plotting import boxplot boxplot(data_cleaned.get_column("Age")) -histogram(data_cleaned.get_column("Fare")) -scatterplot(data_cleaned, "Age", "Fare") ``` ![Boxplot](./Resources/Boxplot.png) + +```python +from safeds.plotting import histogram + +histogram(data_cleaned.get_column("Fare")) +``` ![Histogram](./Resources/Histogram.png) + +```python +from safeds.plotting import scatterplot + +scatterplot(data_cleaned, "Age", "Fare") +``` + ![Scatterplot](./Resources/Scatterplot.png)