Merge pull request #551 from sfu-db/feat/plot_diff

feat(eda): added new function plot_diff
sfu-db · Apr 2, 2021 · b6997b8 · b6997b8
2 parents 9c5f74d + 79523c3
commit b6997b8
Show file tree

Hide file tree

Showing 23 changed files with 1,633 additions and 437 deletions.
diff --git a/dataprep/eda/__init__.py b/dataprep/eda/__init__.py
@@ -20,6 +20,7 @@
     Text,
 )
 from .missing import compute_missing, plot_missing, render_missing
+from .diff import plot_diff, compute_diff, render_diff
 
 __all__ = [
     "plot_correlation",
@@ -41,6 +42,9 @@
     "DateTime",
     "Text",
     "create_report",
+    "plot_diff",
+    "compute_diff",
+    "render_diff",
 ]
 
 

diff --git a/dataprep/eda/configs.py b/dataprep/eda/configs.py
@@ -74,6 +74,15 @@ class Plot(BaseModel):
     report: bool = False
 
 
+class Diff(BaseModel):
+    """
+    Parameters of plot_diff: labels for the input DataFrames and the baseline index.
+    """
+
+    label: Union[List[str], None] = None  # one name per DataFrame; None means unset
+    baseline: int = 0  # index into the DataFrame list; presumably the reference df -- confirm
+
+
 class Stats(BaseModel):
     """
     enable: bool, default True
@@ -1117,6 +1126,7 @@ class Config(BaseModel):
     interactions: Interactions = Field(default_factory=Interactions)
     correlations: Correlations = Field(default_factory=Correlations)
     missingvalues: MissingValues = Field(default_factory=MissingValues)
+    diff: Diff = Field(default_factory=Diff)
 
     @classmethod
     def from_dict(
@@ -1129,8 +1139,8 @@ def from_dict(
         if display:
             try:
                 display = [DISPLAY_MAP[disp] for disp in display]
-                # set all plots not in display list to enable=False except for Plot class
-                for plot in set(vars(cfg).keys()) - set(display) - {"plot"}:
+                # set all plots not in display list to enable=False except for Plot and Diff class
+                for plot in set(vars(cfg).keys()) - set(display) - {"plot"} - {"diff"}:
                     setattr(getattr(cfg, plot), "enable", False)
             except KeyError:
                 display = [DISPLAY_REPORT_MAP[disp] for disp in display]

diff --git a/dataprep/eda/container.py b/dataprep/eda/container.py
@@ -39,7 +39,7 @@
     "correlation_scatter",
 }
 
-GRID_VISUAL_TYPES = {"distribution_grid", "missing_impact_1vn"}
+GRID_VISUAL_TYPES = {"distribution_grid", "missing_impact_1vn", "comparison_grid"}
 
 
 class Container:

diff --git a/dataprep/eda/create_report/formatter.py b/dataprep/eda/create_report/formatter.py
@@ -14,7 +14,7 @@
 from ..correlation.compute.overview import correlation_nxn
 from ..data_array import DataArray
 from ..distribution import render
-from ..distribution.compute.common import _calc_line_dt
+from ..utils import _calc_line_dt
 from ..distribution.compute.overview import calc_stats
 from ..distribution.compute.univariate import calc_stats_dt, cont_comps, nom_comps
 from ..distribution.render import format_cat_stats, format_num_stats, format_ov_stats, stats_viz_dt

diff --git a/dataprep/eda/diff/__init__.py b/dataprep/eda/diff/__init__.py
@@ -0,0 +1,64 @@
+"""
+    This module implements the plot_diff function.
+"""
+
+from typing import Optional, Union, List, Dict, Any
+import dask.dataframe as dd
+import pandas as pd
+
+from ..configs import Config
+from ..container import Container
+from ..dtypes import DTypeDef
+from ..progress_bar import ProgressBar
+from .compute import compute_diff
+from .render import render_diff
+
+__all__ = ["plot_diff", "compute_diff", "render_diff"]
+
+
+def plot_diff(
+    df: Union[List[Union[pd.DataFrame, dd.DataFrame]], Union[pd.DataFrame, dd.DataFrame]],
+    x: Optional[str] = None,
+    config: Optional[Dict[str, Any]] = None,
+    display: Optional[List[str]] = None,
+    dtype: Optional[DTypeDef] = None,
+    progress: bool = True,
+) -> Container:
+    """
+    Compute and visualize the differences between 2 to 5 datasets.
+
+    Parameters
+    ----------
+    df
+        The DataFrames to be compared, passed as a list of 2 to 5 DataFrames.
+    x
+        The column to be emphasized in the comparison (not implemented yet).
+    config
+        A dictionary for configuring the visualizations
+        E.g. config={"hist.bins": 20}
+    display
+        A list containing the names of the visualizations to display
+        E.g. display=["Histogram"]
+    dtype: str or DType or dict of str or dict of DType, default None
+        Specify Data Types for designated column or all columns.
+        E.g.  dtype = {"a": Continuous, "b": "Nominal"} or
+        dtype = {"a": Continuous(), "b": "nominal"}
+        or dtype = Continuous() or dtype = "Continuous".
+    progress
+        Whether to show the progress bar.
+
+    Examples
+    --------
+    >>> from dataprep.datasets import load_dataset
+    >>> from dataprep.eda import plot_diff
+    >>> df_train = load_dataset('house_prices_train')
+    >>> df_test = load_dataset('house_prices_test')
+    >>> plot_diff([df_train, df_test])
+    """
+    # pylint: disable=too-many-arguments
+    cfg = Config.from_dict(display, config)
+
+    with ProgressBar(minimum=1, disable=not progress):
+        intermediate = compute_diff(df, x=x, cfg=cfg, dtype=dtype)
+    to_render = render_diff(intermediate, cfg=cfg)
+    return Container(to_render, intermediate.visual_type, cfg)
diff --git a/dataprep/eda/diff/compute/__init__.py b/dataprep/eda/diff/compute/__init__.py
@@ -0,0 +1,80 @@
+"""Computations for plot_diff([df...])."""
+
+from typing import Optional, Union, List, Dict, Any
+import dask.dataframe as dd
+import pandas as pd
+from ....errors import DataprepError
+from ...intermediate import Intermediate
+from ...utils import to_dask
+from ...dtypes import DTypeDef, string_dtype_to_object
+from ...configs import Config
+from .multiple_df import compare_multiple_df  # type: ignore
+
+__all__ = ["compute_diff"]
+
+
+def compute_diff(
+    df: Union[List[Union[pd.DataFrame, dd.DataFrame]], Union[pd.DataFrame, dd.DataFrame]],
+    x: Optional[str] = None,
+    *,
+    cfg: Union[Config, Dict[str, Any], None] = None,
+    display: Optional[List[str]] = None,
+    dtype: Optional[DTypeDef] = None,
+) -> Intermediate:
+    """
+    Compute the intermediate data that plot_diff renders for a list of DataFrames.
+
+    Parameters
+    ----------
+    df
+        A list of 2 to 5 DataFrames to compare; any non-list input raises TypeError
+    cfg: Union[Config, Dict[str, Any], None], default None
+        When a user calls plot_diff(), the created Config object is passed to compute_diff().
+        When a user calls compute_diff() directly and wants to customize the output,
+        cfg is a dictionary for configuring. If not, cfg is None and
+        default values will be used for parameters.
+    display: Optional[List[str]], default None
+        A list containing the names of the visualizations to display. Only used when
+        a user calls compute_diff() directly with a dictionary cfg
+    x: Optional[str], default None
+        A column name; when given, an empty Intermediate is currently returned
+    dtype: str or DType or dict of str or dict of DType, default None
+        Specify Data Types for designated column or all columns.
+        E.g.  dtype = {"a": Continuous, "b": "Nominal"} or
+        dtype = {"a": Continuous(), "b": "nominal"}
+        or dtype = Continuous() or dtype = "Continuous"
+    """
+    if isinstance(cfg, dict):
+        cfg = Config.from_dict(display, cfg)
+    elif not cfg:
+        cfg = Config()
+
+    if isinstance(df, list):
+        # only 2 to 5 DataFrames can be compared
+        if len(df) < 2:
+            raise DataprepError("plot_diff needs at least 2 DataFrames.")
+        if len(df) > 5:
+            raise DataprepError("Too many DataFrames, max: 5.")
+
+        label = cfg.diff.label
+        if not label:
+            cfg.diff.label = [f"df{i+1}" for i in range(len(df))]  # default: df1, df2, ...
+        elif len(df) != len(label):
+            raise ValueError("Number of the given label doesn't match the number of DataFrames.")
+
+        if cfg.diff.baseline > len(df) - 1:  # NOTE(review): negative values pass this check
+            raise ValueError("Baseline is out of the boundary of the input.")
+
+        df_list = list(map(to_dask, df))
+        for i, _ in enumerate(df_list):
+            df_list[i].columns = df_list[i].columns.astype(str)  # force string column names
+        df_list = list(map(string_dtype_to_object, df_list))
+
+        if x:
+            # TODO: column-wise comparison (compare_multiple_on_column) is not implemented yet
+            return Intermediate()
+        else:
+            return compare_multiple_df(df_list, cfg, dtype)  # type: ignore
+
+    else:
+        raise TypeError(f"Invalid input type: {type(df)}")