Skip to content

Commit

Permalink
feat(eda): refactored code and added density parameter to plot_diff(df)
Browse files Browse the repository at this point in the history
feat(eda): added density parameter to plot_diff(df)

refactor(eda): ran just ci tests

refactor: changed kde_viz_panel() into a wrapper
  • Loading branch information
Devin Lu authored and jinglinpeng committed Oct 26, 2021
1 parent 255b6a9 commit 323ae6b
Show file tree
Hide file tree
Showing 3 changed files with 87 additions and 20 deletions.
1 change: 1 addition & 0 deletions dataprep/eda/configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ class Diff(BaseModel):

label: Union[List[str], None] = None
baseline: int = 0
density: bool = False


class Stats(BaseModel):
Expand Down
46 changes: 44 additions & 2 deletions dataprep/eda/diff/compute/multiple_df.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from collections import UserList, OrderedDict
from typing import Any, Callable, Dict, List, Tuple, Union, Optional

import math
import pandas as pd
import numpy as np
import dask
Expand All @@ -23,6 +24,7 @@
drop_null,
DTypeDef,
)
from ...utils import gaussian_kde
from ...configs import Config


Expand Down Expand Up @@ -124,10 +126,23 @@ def getmask(

return output

def self_map(self, func: Callable[[dd.Series], Any], **kwargs: Any) -> List[Any]:
def self_map(
    self,
    func: Callable[[dd.Series], Any],
    condition: Optional[List[bool]] = None,
    **kwargs: Any,
) -> List[Any]:
    """
    Map the data to the given function.

    Parameters
    ----------
    func
        Callable applied to each series held in ``self.data``.
    condition
        Optional per-series skip flags, aligned with ``self.data``.
        NOTE: a truthy flag means the series is *skipped* and ``None`` is
        stored in its slot; a falsy flag means ``func`` is applied.
        When omitted (or empty) every series is mapped.
    **kwargs
        Extra keyword arguments forwarded unchanged to ``func``.

    Returns
    -------
    List[Any]
        One entry per series, in the order of ``self.data``.

    Raises
    ------
    ValueError
        If a non-empty ``condition`` does not have exactly one flag per
        series. Without this check ``zip`` would silently truncate and the
        result would no longer line up with ``self.data``.
    """
    # No flags supplied: plain map over every series.
    if not condition:
        return [func(srs, **kwargs) for srs in self.data]

    if len(condition) != len(self.data):
        raise ValueError(
            f"condition has {len(condition)} flags but there are "
            f"{len(self.data)} series to map"
        )

    # Keep the output positionally aligned with self.data by emitting a
    # None placeholder for every skipped series.
    return [
        None if skip else func(srs, **kwargs)
        for skip, srs in zip(condition, self.data)
    ]


Expand Down Expand Up @@ -198,7 +213,9 @@ def compare_multiple_df(

for col, dtp, datum, orig in data:
if is_dtype(dtp, Continuous()):
if cfg.hist.enable:
if cfg.diff.density:
plot_data.append((col, dtp, (datum["kde"], datum["dens"]), orig))
elif cfg.hist.enable:
plot_data.append((col, dtp, datum["hist"], orig))
elif is_dtype(dtp, Nominal()):
if cfg.bar.enable:
Expand Down Expand Up @@ -266,13 +283,38 @@ def _cont_calcs(srs: Srs, cfg: Config) -> Dict[str, List[Any]]:
min_max = srs.apply(
"map_partitions", lambda x: pd.Series([x.max(), x.min()]), meta=pd.Series([], dtype=float)
).data
min_max_comp = []
if cfg.diff.density:
for min_max_value in dask.compute(min_max)[0]:
min_max_comp.append(math.isclose(min_max_value.min(), min_max_value.max()))
min_max = dd.concat(min_max).repartition(npartitions=1)

# histogram
data["hist"] = srs.self_map(
da.histogram, bins=cfg.hist.bins, range=(min_max.min(), min_max.max())
)

# compute the density histogram
if cfg.diff.density:
data["dens"] = srs.self_map(
da.histogram,
condition=min_max_comp,
bins=cfg.kde.bins,
range=(min_max.min(), min_max.max()),
density=True,
)
# gaussian kernel density estimate
data["kde"] = []
sample_data = dask.compute(
srs.apply(
"map_partitions",
lambda x: x.sample(min(1000, x.shape[0])),
meta=pd.Series([], dtype=float),
).data
)
for ind in range(len(sample_data[0])):
data["kde"].append(gaussian_kde(sample_data[0][ind]))

return data


Expand Down
60 changes: 42 additions & 18 deletions dataprep/eda/diff/render.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,14 +227,14 @@ def hist_viz(
return fig


def kde_viz(
def kde_viz_figure(
hist: List[Tuple[np.ndarray, np.ndarray]],
kde: np.ndarray,
col: str,
plot_width: int,
plot_height: int,
cfg: Config,
) -> Panel:
) -> Figure:
"""
Render histogram with overlayed kde
"""
Expand Down Expand Up @@ -285,6 +285,22 @@ def kde_viz(
_format_axis(fig, df.iloc[0]["left"], df.iloc[-1]["right"], "x")
if cfg.kde.yscale == "linear":
_format_axis(fig, 0, max(df["dens"].max(), pdf.max()), "y")
return fig


def kde_viz_panel(
    hist: List[Tuple[np.ndarray, np.ndarray]],
    kde: np.ndarray,
    col: str,
    plot_width: int,
    plot_height: int,
    cfg: Config,
) -> Panel:
    """
    Wrap the figure produced by kde_viz_figure in a tab panel.

    All arguments are passed straight through to kde_viz_figure; the
    resulting figure is placed in a row layout and returned as a Panel
    titled "KDE Plot".
    """
    # pylint: disable=too-many-arguments
    kde_figure = kde_viz_figure(
        hist=hist,
        kde=kde,
        col=col,
        plot_width=plot_width,
        plot_height=plot_height,
        cfg=cfg,
    )
    return Panel(title="KDE Plot", child=row(kde_figure))


Expand Down Expand Up @@ -593,7 +609,9 @@ def render_comparison_grid(itmdt: Intermediate, cfg: Config) -> Dict[str, Any]:
figs: List[Figure] = []
nrows = itmdt["stats"]["nrows"]
titles: List[str] = []

for col, dtp, data, orig in itmdt["data"]:
fig = None
if is_dtype(dtp, Nominal()):
df, ttl_grps = data
fig = bar_viz(
Expand All @@ -610,17 +628,22 @@ def render_comparison_grid(itmdt: Intermediate, cfg: Config) -> Dict[str, Any]:
baseline if len(df) > 1 else 0,
)
elif is_dtype(dtp, Continuous()):
fig = hist_viz(
data,
nrows,
col,
cfg.hist.yscale,
plot_width,
plot_height,
False,
df_labels,
orig,
)
if cfg.diff.density:
kde, dens = data
if kde is not None and not isinstance(dens, np.integer):
fig = kde_viz_figure(dens, kde, col, plot_width, plot_height, cfg)
else:
fig = hist_viz(
data,
nrows,
col,
cfg.hist.yscale,
plot_width,
plot_height,
False,
df_labels,
orig,
)
elif is_dtype(dtp, DateTime()):
df, timeunit = data
fig = dt_line_viz(
Expand All @@ -634,10 +657,11 @@ def render_comparison_grid(itmdt: Intermediate, cfg: Config) -> Dict[str, Any]:
orig,
df_labels,
)
fig.frame_height = plot_height
titles.append(fig.title.text)
fig.title.text = ""
figs.append(fig)
if fig is not None:
fig.frame_height = plot_height
titles.append(fig.title.text)
fig.title.text = ""
figs.append(fig)

if cfg.stats.enable:
toggle_content = "Stats"
Expand Down Expand Up @@ -682,7 +706,7 @@ def render_comparison_continous(itmdt: Intermediate, cfg: Config) -> Dict[str, A
not math.isclose(itmdt["stats"]["min"][0], itmdt["stats"]["max"][0])
):
dens, kde = data["dens"], data["kde"]
tabs.append(kde_viz(dens, kde, col, plot_width, plot_height, cfg))
tabs.append(kde_viz_panel(dens, kde, col, plot_width, plot_height, cfg))
# htgs["KDE Plot"] = cfg.kde.how_to_guide(plot_height, plot_width)
if cfg.box.enable:
df_list = []
Expand Down

0 comments on commit 323ae6b

Please sign in to comment.