From 323ae6b09d20ac63d23b2e65864b6cd8518c26d2 Mon Sep 17 00:00:00 2001 From: Devin Lu Date: Tue, 5 Oct 2021 00:10:01 -0700 Subject: [PATCH] feat(eda): refactored code and added density parameter to plot_diff(df) feat(eda): added density parameter to plot_diff(df) refactor(eda): ran just ci tests refactor: changed kde_viz_panel() into a wrapper --- dataprep/eda/configs.py | 1 + dataprep/eda/diff/compute/multiple_df.py | 46 +++++++++++++++++- dataprep/eda/diff/render.py | 60 +++++++++++++++++------- 3 files changed, 87 insertions(+), 20 deletions(-) diff --git a/dataprep/eda/configs.py b/dataprep/eda/configs.py index 859c2b358..b58e5a72d 100644 --- a/dataprep/eda/configs.py +++ b/dataprep/eda/configs.py @@ -81,6 +81,7 @@ class Diff(BaseModel): label: Union[List[str], None] = None baseline: int = 0 + density: bool = False class Stats(BaseModel): diff --git a/dataprep/eda/diff/compute/multiple_df.py b/dataprep/eda/diff/compute/multiple_df.py index 403ff34a1..b8b1a3224 100644 --- a/dataprep/eda/diff/compute/multiple_df.py +++ b/dataprep/eda/diff/compute/multiple_df.py @@ -5,6 +5,7 @@ from collections import UserList, OrderedDict from typing import Any, Callable, Dict, List, Tuple, Union, Optional +import math import pandas as pd import numpy as np import dask @@ -23,6 +24,7 @@ drop_null, DTypeDef, ) +from ...utils import gaussian_kde from ...configs import Config @@ -124,10 +126,23 @@ def getmask( return output - def self_map(self, func: Callable[[dd.Series], Any], **kwargs: Any) -> List[Any]: + def self_map( + self, + func: Callable[[dd.Series], Any], + condition: Optional[List[bool]] = None, + **kwargs: Any, + ) -> List[Any]: """ Map the data to the given function. """ + if condition: + rslt = [] + for cond, data in zip(condition, self.data): + if not cond: + rslt.append(func(data, **kwargs)) + else: + rslt.append(None) + return rslt return [func(srs, **kwargs) for srs in self.data] @@ -198,7 +213,9 @@ def compare_multiple_df( for col, dtp, datum, orig in data: if is_dtype(dtp, Continuous()): - if cfg.hist.enable: + if cfg.diff.density: + plot_data.append((col, dtp, (datum["kde"], datum["dens"]), orig)) + elif cfg.hist.enable: plot_data.append((col, dtp, datum["hist"], orig)) elif is_dtype(dtp, Nominal()): if cfg.bar.enable: @@ -266,6 +283,10 @@ def _cont_calcs(srs: Srs, cfg: Config) -> Dict[str, List[Any]]: min_max = srs.apply( "map_partitions", lambda x: pd.Series([x.max(), x.min()]), meta=pd.Series([], dtype=float) ).data + min_max_comp = [] + if cfg.diff.density: + for min_max_value in dask.compute(min_max)[0]: + min_max_comp.append(math.isclose(min_max_value.min(), min_max_value.max())) min_max = dd.concat(min_max).repartition(npartitions=1) # histogram @@ -273,6 +294,27 @@ def _cont_calcs(srs: Srs, cfg: Config) -> Dict[str, List[Any]]: da.histogram, bins=cfg.hist.bins, range=(min_max.min(), min_max.max()) ) + # compute the density histogram + if cfg.diff.density: + data["dens"] = srs.self_map( + da.histogram, + condition=min_max_comp, + bins=cfg.kde.bins, + range=(min_max.min(), min_max.max()), + density=True, + ) + # gaussian kernel density estimate + data["kde"] = [] + sample_data = dask.compute( + srs.apply( + "map_partitions", + lambda x: x.sample(min(1000, x.shape[0])), + meta=pd.Series([], dtype=float), + ).data + ) + for ind in range(len(sample_data[0])): + data["kde"].append(gaussian_kde(sample_data[0][ind])) + return data diff --git a/dataprep/eda/diff/render.py b/dataprep/eda/diff/render.py index 9bc79c2b9..dffe842c4 100644 --- a/dataprep/eda/diff/render.py +++ b/dataprep/eda/diff/render.py @@ -227,14 +227,14 @@ def hist_viz( return fig -def kde_viz( +def kde_viz_figure( hist: List[Tuple[np.ndarray, np.ndarray]], kde: np.ndarray, col: str, plot_width: int, plot_height: int, cfg: Config, -) -> Panel: +) -> Figure: """ Render histogram with overlayed kde """ @@ -285,6 +285,22 @@ def kde_viz( _format_axis(fig, df.iloc[0]["left"], df.iloc[-1]["right"], "x") if cfg.kde.yscale == "linear": _format_axis(fig, 0, max(df["dens"].max(), pdf.max()), "y") + return fig + + +def kde_viz_panel( + hist: List[Tuple[np.ndarray, np.ndarray]], + kde: np.ndarray, + col: str, + plot_width: int, + plot_height: int, + cfg: Config, +) -> Panel: + """ + Render histogram with overlayed kde + """ + # pylint: disable=too-many-arguments, too-many-locals + fig = kde_viz_figure(hist, kde, col, plot_width, plot_height, cfg) return Panel(child=row(fig), title="KDE Plot") @@ -593,7 +609,9 @@ def render_comparison_grid(itmdt: Intermediate, cfg: Config) -> Dict[str, Any]: figs: List[Figure] = [] nrows = itmdt["stats"]["nrows"] titles: List[str] = [] + for col, dtp, data, orig in itmdt["data"]: + fig = None if is_dtype(dtp, Nominal()): df, ttl_grps = data fig = bar_viz( @@ -610,17 +628,22 @@ def render_comparison_grid(itmdt: Intermediate, cfg: Config) -> Dict[str, Any]: baseline if len(df) > 1 else 0, ) elif is_dtype(dtp, Continuous()): - fig = hist_viz( - data, - nrows, - col, - cfg.hist.yscale, - plot_width, - plot_height, - False, - df_labels, - orig, - ) + if cfg.diff.density: + kde, dens = data + if kde is not None and not isinstance(dens, np.integer): + fig = kde_viz_figure(dens, kde, col, plot_width, plot_height, cfg) + else: + fig = hist_viz( + data, + nrows, + col, + cfg.hist.yscale, + plot_width, + plot_height, + False, + df_labels, + orig, + ) elif is_dtype(dtp, DateTime()): df, timeunit = data fig = dt_line_viz( @@ -634,10 +657,11 @@ def render_comparison_grid(itmdt: Intermediate, cfg: Config) -> Dict[str, Any]: orig, df_labels, ) - fig.frame_height = plot_height - titles.append(fig.title.text) - fig.title.text = "" - figs.append(fig) + if fig is not None: + fig.frame_height = plot_height + titles.append(fig.title.text) + fig.title.text = "" + figs.append(fig) if cfg.stats.enable: toggle_content = "Stats" @@ -682,7 +706,7 @@ def render_comparison_continous(itmdt: Intermediate, cfg: Config) -> Dict[str, A not math.isclose(itmdt["stats"]["min"][0], itmdt["stats"]["max"][0]) ): dens, kde = data["dens"], data["kde"] - tabs.append(kde_viz(dens, kde, col, plot_width, plot_height, cfg)) + tabs.append(kde_viz_panel(dens, kde, col, plot_width, plot_height, cfg)) # htgs["KDE Plot"] = cfg.kde.how_to_guide(plot_height, plot_width) if cfg.box.enable: df_list = []