Skip to content

Commit

Permalink
Merge pull request #551 from sfu-db/feat/plot_diff
Browse files Browse the repository at this point in the history
feat(eda): added new function plot_diff
  • Loading branch information
brandonlockhart authored Apr 2, 2021
2 parents 9c5f74d + 79523c3 commit b6997b8
Show file tree
Hide file tree
Showing 23 changed files with 1,633 additions and 437 deletions.
4 changes: 4 additions & 0 deletions dataprep/eda/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
Text,
)
from .missing import compute_missing, plot_missing, render_missing
from .diff import plot_diff, compute_diff, render_diff

__all__ = [
"plot_correlation",
Expand All @@ -41,6 +42,9 @@
"DateTime",
"Text",
"create_report",
"plot_diff",
"compute_diff",
"render_diff",
]


Expand Down
14 changes: 12 additions & 2 deletions dataprep/eda/configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,15 @@ class Plot(BaseModel):
report: bool = False


class Diff(BaseModel):
    """
    Define the parameters in the plot_diff

    label: Union[List[str], None], default None
        Names for the compared DataFrames, one per DataFrame. When left
        unset, compute_diff fills in "df1", "df2", ... and it raises a
        ValueError when the number of labels differs from the number of
        DataFrames.
    baseline: int, default 0
        Presumably the position (0-based) of the DataFrame used as the
        baseline of the comparison -- TODO confirm against the code that
        consumes it. compute_diff raises a ValueError when this value is
        greater than the position of the last DataFrame.
    """

    label: Union[List[str], None] = None
    baseline: int = 0


class Stats(BaseModel):
"""
enable: bool, default True
Expand Down Expand Up @@ -1117,6 +1126,7 @@ class Config(BaseModel):
interactions: Interactions = Field(default_factory=Interactions)
correlations: Correlations = Field(default_factory=Correlations)
missingvalues: MissingValues = Field(default_factory=MissingValues)
diff: Diff = Field(default_factory=Diff)

@classmethod
def from_dict(
Expand All @@ -1129,8 +1139,8 @@ def from_dict(
if display:
try:
display = [DISPLAY_MAP[disp] for disp in display]
# set all plots not in display list to enable=False except for Plot class
for plot in set(vars(cfg).keys()) - set(display) - {"plot"}:
# set all plots not in display list to enable=False except for Plot and Diff class
for plot in set(vars(cfg).keys()) - set(display) - {"plot"} - {"diff"}:
setattr(getattr(cfg, plot), "enable", False)
except KeyError:
display = [DISPLAY_REPORT_MAP[disp] for disp in display]
Expand Down
2 changes: 1 addition & 1 deletion dataprep/eda/container.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
"correlation_scatter",
}

GRID_VISUAL_TYPES = {"distribution_grid", "missing_impact_1vn"}
GRID_VISUAL_TYPES = {"distribution_grid", "missing_impact_1vn", "comparison_grid"}


class Container:
Expand Down
2 changes: 1 addition & 1 deletion dataprep/eda/create_report/formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from ..correlation.compute.overview import correlation_nxn
from ..data_array import DataArray
from ..distribution import render
from ..distribution.compute.common import _calc_line_dt
from ..utils import _calc_line_dt
from ..distribution.compute.overview import calc_stats
from ..distribution.compute.univariate import calc_stats_dt, cont_comps, nom_comps
from ..distribution.render import format_cat_stats, format_num_stats, format_ov_stats, stats_viz_dt
Expand Down
64 changes: 64 additions & 0 deletions dataprep/eda/diff/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
"""
This module implements the plot_diff function.
"""

from typing import Optional, Union, List, Dict, Any
import dask.dataframe as dd
import pandas as pd

from ..configs import Config
from ..container import Container
from ..dtypes import DTypeDef
from ..progress_bar import ProgressBar
from .compute import compute_diff
from .render import render_diff

__all__ = ["plot_diff", "compute_diff", "render_diff"]


def plot_diff(
    df: Union[List[Union[pd.DataFrame, dd.DataFrame]], Union[pd.DataFrame, dd.DataFrame]],
    x: Optional[str] = None,
    config: Optional[Dict[str, Any]] = None,
    display: Optional[List[str]] = None,
    dtype: Optional[DTypeDef] = None,
    progress: bool = True,
) -> Container:
    """
    Compute and visualize the differences between 2 to 5 datasets.

    Parameters
    ----------
    df
        The DataFrames to be compared, given as a list. compute_diff
        rejects anything that is not a list with a TypeError.
    x
        The column to be emphasized in the comparison.
    config
        A dictionary for configuring the visualizations
        E.g. config={"hist.bins": 20}
    display
        A list containing the names of the visualizations to display
        E.g. display=["Histogram"]
    dtype: str or DType or dict of str or dict of DType, default None
        Specify Data Types for designated column or all columns.
        E.g.  dtype = {"a": Continuous, "b": "Nominal"} or
        dtype = {"a": Continuous(), "b": "nominal"}
        or dtype = Continuous() or dtype = "Continuous" or dtype = Continuous().
    progress
        Whether to show the progress bar.

    Returns
    -------
    Container
        Built from the rendered output, the visual type of the computed
        intermediate and the configuration in use.

    Examples
    --------
    >>> from dataprep.datasets import load_dataset
    >>> from dataprep.eda import plot_diff
    >>> df_train = load_dataset('house_prices_train')
    >>> df_test = load_dataset('house_prices_test')
    >>> plot_diff([df_train, df_test])
    """
    # pylint: disable=too-many-arguments
    settings = Config.from_dict(display, config)

    with ProgressBar(minimum=1, disable=not progress):
        itmdt = compute_diff(df, x=x, cfg=settings, dtype=dtype)

    return Container(render_diff(itmdt, cfg=settings), itmdt.visual_type, settings)
80 changes: 80 additions & 0 deletions dataprep/eda/diff/compute/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
"""Computations for plot_diff([df...])."""

from typing import Optional, Union, List, Dict, Any
import dask.dataframe as dd
import pandas as pd
from ....errors import DataprepError
from ...intermediate import Intermediate
from ...utils import to_dask
from ...dtypes import DTypeDef, string_dtype_to_object
from ...configs import Config
from .multiple_df import compare_multiple_df # type: ignore

__all__ = ["compute_diff"]


def compute_diff(
    df: Union[List[Union[pd.DataFrame, dd.DataFrame]], Union[pd.DataFrame, dd.DataFrame]],
    x: Optional[str] = None,
    *,
    cfg: Union[Config, Dict[str, Any], None] = None,
    display: Optional[List[str]] = None,
    dtype: Optional[DTypeDef] = None,
) -> Intermediate:
    """
    Validate the inputs of plot_diff and compute the comparison data.

    Parameters
    ----------
    df
        The DataFrames to compare, given as a list of 2 to 5 items.
    x: Optional[str], default None
        A valid column name from the dataframe. The column-wise comparison
        is not implemented yet: an empty Intermediate is returned when x
        is given.
    cfg: Union[Config, Dict[str, Any], None], default None
        A Config object is used as is. A dictionary is turned into a Config
        together with ``display``. When nothing is given, a Config with
        default values is created.
    display: Optional[List[str]], default None
        A list containing the names of the visualizations to display. Only
        taken into account when cfg is a dictionary.
    dtype: str or DType or dict of str or dict of DType, default None
        Specify Data Types for designated column or all columns.
        E.g.  dtype = {"a": Continuous, "b": "Nominal"} or
        dtype = {"a": Continuous(), "b": "nominal"}
        or dtype = Continuous() or dtype = "Continuous" or dtype = Continuous()

    Raises
    ------
    TypeError
        If df is not a list.
    DataprepError
        If fewer than 2 or more than 5 DataFrames are given.
    ValueError
        If the number of labels in the configuration differs from the number
        of DataFrames, or if the configured baseline is larger than the
        position of the last DataFrame.
    """
    if isinstance(cfg, dict):
        cfg = Config.from_dict(display, cfg)
    elif not cfg:
        cfg = Config()

    if not isinstance(df, list):
        raise TypeError(f"Invalid input type: {type(df)}")

    n_frames = len(df)
    if n_frames < 2:
        raise DataprepError("plot_diff needs at least 2 DataFrames.")
    if n_frames > 5:
        raise DataprepError("Too many DataFrames, max: 5.")

    # Fall back to generated names when the user supplied no labels.
    if not cfg.diff.label:
        cfg.diff.label = [f"df{i+1}" for i in range(n_frames)]
    elif n_frames != len(cfg.diff.label):
        raise ValueError("Number of the given label doesn't match the number of DataFrames.")

    if cfg.diff.baseline > n_frames - 1:
        raise ValueError("Baseline is out of the boundary of the input.")

    frames = [to_dask(frame) for frame in df]
    for frame in frames:
        # Column names are converted to strings before the comparison.
        frame.columns = frame.columns.astype(str)
    frames = [string_dtype_to_object(frame) for frame in frames]

    if x:
        # Placeholder until the comparison on a single column exists:
        # return compare_multiple_on_column(df_list, x)
        return Intermediate()
    return compare_multiple_df(frames, cfg, dtype)  # type: ignore
Loading

0 comments on commit b6997b8

Please sign in to comment.