-
Notifications
You must be signed in to change notification settings - Fork 206
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #551 from sfu-db/feat/plot_diff
feat(eda): added new function plot_diff
- Loading branch information
Showing
23 changed files
with
1,633 additions
and
437 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
""" | ||
This module implements the plot_diff function. | ||
""" | ||
|
||
from typing import Optional, Union, List, Dict, Any | ||
import dask.dataframe as dd | ||
import pandas as pd | ||
|
||
from ..configs import Config | ||
from ..container import Container | ||
from ..dtypes import DTypeDef | ||
from ..progress_bar import ProgressBar | ||
from .compute import compute_diff | ||
from .render import render_diff | ||
|
||
__all__ = ["plot_diff", "compute_diff", "render_diff"] | ||
|
||
|
||
def plot_diff(
    df: Union[List[Union[pd.DataFrame, dd.DataFrame]], Union[pd.DataFrame, dd.DataFrame]],
    x: Optional[str] = None,
    config: Optional[Dict[str, Any]] = None,
    display: Optional[List[str]] = None,
    dtype: Optional[DTypeDef] = None,
    progress: bool = True,
) -> Container:
    """
    Compute and visualize the differences between 2 or more (up to 5) datasets.

    Parameters
    ----------
    df
        The DataFrame(s) to be compared.
    x
        The column to be emphasized in the comparison.
    config
        A dictionary for configuring the visualizations.
        E.g. config={"hist.bins": 20}
    display
        A list containing the names of the visualizations to display.
        E.g. display=["Histogram"]
    dtype: str or DType or dict of str or dict of DType, default None
        Specify data types for a designated column or for all columns.
        E.g. dtype = {"a": Continuous, "b": "Nominal"} or
        dtype = {"a": Continuous(), "b": "nominal"}
        or dtype = Continuous() or dtype = "Continuous".
    progress
        Whether to show the progress bar.

    Returns
    -------
    Container
        Built from the rendered output, the intermediate's ``visual_type``
        and the configuration created from ``display`` and ``config``.

    Examples
    --------
    >>> from dataprep.datasets import load_dataset
    >>> from dataprep.eda import plot_diff
    >>> df_train = load_dataset('house_prices_train')
    >>> df_test = load_dataset('house_prices_test')
    >>> plot_diff([df_train, df_test])
    """
    # pylint: disable=too-many-arguments
    settings = Config.from_dict(display, config)
    progress_bar = ProgressBar(minimum=1, disable=not progress)

    # NOTE(review): the original indentation was not recoverable; the
    # computation is assumed to be the only step run under the progress bar.
    with progress_bar:
        itmdt = compute_diff(df, x=x, cfg=settings, dtype=dtype)

    figure = render_diff(itmdt, cfg=settings)
    return Container(figure, itmdt.visual_type, settings)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
"""Computations for plot_diff([df...]).""" | ||
|
||
from typing import Optional, Union, List, Dict, Any | ||
import dask.dataframe as dd | ||
import pandas as pd | ||
from ....errors import DataprepError | ||
from ...intermediate import Intermediate | ||
from ...utils import to_dask | ||
from ...dtypes import DTypeDef, string_dtype_to_object | ||
from ...configs import Config | ||
from .multiple_df import compare_multiple_df # type: ignore | ||
|
||
__all__ = ["compute_diff"] | ||
|
||
|
||
def compute_diff(
    df: Union[List[Union[pd.DataFrame, dd.DataFrame]], Union[pd.DataFrame, dd.DataFrame]],
    x: Optional[str] = None,
    *,
    cfg: Union[Config, Dict[str, Any], None] = None,
    display: Optional[List[str]] = None,
    dtype: Optional[DTypeDef] = None,
) -> Intermediate:
    """
    Validate the inputs of plot_diff and compute the comparison intermediate.

    Parameters
    ----------
    df
        The DataFrames to compare. Must be a list holding between 2 and 5
        DataFrames; any other input type is rejected.
    x: Optional[str], default None
        A column name from the DataFrames. When given, an empty
        Intermediate is returned instead of the multi-DataFrame comparison.
    cfg: Union[Config, Dict[str, Any], None], default None
        A Config object is used as is. A dictionary is converted with
        ``Config.from_dict(display, cfg)``. Any other falsy value (such as
        None) yields a default ``Config()``.
    display: Optional[List[str]], default None
        A list containing the names of the visualizations to display. Only
        consulted when ``cfg`` is a dictionary.
    dtype: str or DType or dict of str or dict of DType, default None
        Specify data types for a designated column or for all columns.
        E.g. dtype = {"a": Continuous, "b": "Nominal"} or
        dtype = {"a": Continuous(), "b": "nominal"}
        or dtype = Continuous() or dtype = "Continuous".

    Raises
    ------
    TypeError
        If ``df`` is not a list.
    DataprepError
        If fewer than 2 or more than 5 DataFrames are given.
    ValueError
        If the number of configured labels differs from the number of
        DataFrames, or the configured baseline exceeds the last index.
    """
    # Settle on a Config instance before any input validation takes place.
    if isinstance(cfg, dict):
        settings = Config.from_dict(display, cfg)
    elif not cfg:
        settings = Config()
    else:
        settings = cfg

    if not isinstance(df, list):
        raise TypeError(f"Invalid input type: {type(df)}")

    n_frames = len(df)
    if n_frames < 2:
        raise DataprepError("plot_diff needs at least 2 DataFrames.")
    if n_frames > 5:
        raise DataprepError("Too many DataFrames, max: 5.")

    names = settings.diff.label
    if not names:
        # No labels configured: store generated ones (df1, df2, ...) on the config.
        settings.diff.label = [f"df{idx + 1}" for idx in range(n_frames)]
    elif n_frames != len(names):
        raise ValueError("Number of the given label doesn't match the number of DataFrames.")

    if settings.diff.baseline > n_frames - 1:
        raise ValueError("Baseline is out of the boundary of the input.")

    frames = [to_dask(frame) for frame in df]
    for frame in frames:
        # Column labels are cast to str on every frame.
        frame.columns = frame.columns.astype(str)
    frames = [string_dtype_to_object(frame) for frame in frames]

    if x:
        # NOTE(review): the column-focused comparison (compare_multiple_on_column)
        # is not wired in; an empty Intermediate is returned for now.
        return Intermediate()
    return compare_multiple_df(frames, settings, dtype)  # type: ignore
Oops, something went wrong.