From eeb210dbe5804d1dd7782e995de29ad5bfc439be Mon Sep 17 00:00:00 2001 From: Devin Date: Tue, 14 Dec 2021 18:07:09 -0800 Subject: [PATCH] feat(eda): enriched show details tab by adding plots and overview statistics feat(eda): displayed overview statistics in show details tab feat(eda): added titles in show details tab, styled HTML too feat(eda): added sub plots in show details tab --- .gitignore | 3 + .../eda/create_diff_report/diff_formatter.py | 27 ++++- .../create_diff_report/templates/scripts.html | 24 ++++ .../create_diff_report/templates/styles.html | 64 ++++++++++ .../templates/variables.html | 114 ++++++++++++++++++ dataprep/eda/diff/render.py | 1 - 6 files changed, 229 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index c626da821..e97ae7ff0 100644 --- a/.gitignore +++ b/.gitignore @@ -116,3 +116,6 @@ profiling report.xml .vim .DS_Store + +# personal testing +Untitled.ipynb \ No newline at end of file diff --git a/dataprep/eda/create_diff_report/diff_formatter.py b/dataprep/eda/create_diff_report/diff_formatter.py index 0a345693e..3d25d6bfa 100644 --- a/dataprep/eda/create_diff_report/diff_formatter.py +++ b/dataprep/eda/create_diff_report/diff_formatter.py @@ -16,7 +16,13 @@ from ..configs import Config from ..distribution.compute.overview import calc_stats from ..distribution.compute.univariate import cont_comps, nom_comps -from ..distribution.render import format_cat_stats, format_num_stats, format_ov_stats, stats_viz_dt +from ..distribution.render import ( + format_cat_stats, + format_num_stats, + format_ov_stats, + stats_viz_dt, +) +from ..distribution import render from ..distribution.compute.overview import ( _nom_calcs, _cont_calcs, @@ -137,8 +143,6 @@ def format_basic(df_list: List[pd.DataFrame], cfg: Config) -> Dict[str, Any]: for df in df_list: df = EDAFrame(df) setattr(getattr(cfg, "plot"), "report", True) - # data, completions = basic_computations(df, cfg) - # data = dask.delayed(basic_computations)(df, cfg) data = basic_computations(df, cfg) with catch_warnings(): filterwarnings( @@ -354,14 +358,17 @@ def _format_variables(df: EDAFrame, cfg: Config, data: Dict[str, Any]) -> Dict[s try: stats: Any = None # needed for pylint dtp = df.get_eda_dtype(col) + tab_names: List[str] = [] if isinstance(dtp, Continuous): itmdt = Intermediate(col=col, data=data[col], visual_type="numerical_column") stats = format_num_stats(data[col]) + tab_names = ["Stats", "KDE Plot", "Normal Q-Q Plot", "Box Plot"] elif type(dtp) in [Nominal, SmallCardNum, GeoGraphy, GeoPoint]: itmdt = Intermediate(col=col, data=data[col], visual_type="categorical_column") stats = format_cat_stats( data[col]["stats"], data[col]["len_stats"], data[col]["letter_stats"] ) + tab_names = ["Stats", "Pie Chart", "Word Cloud", "Word Frequency", "Word Length"] elif isinstance(dtp, DateTime): itmdt = Intermediate( col=col, @@ -373,9 +380,23 @@ def _format_variables(df: EDAFrame, cfg: Config, data: Dict[str, Any]) -> Dict[s else: raise RuntimeError(f"the type of column {col} is unknown: {type(dtp)}") + rndrd = render(itmdt, cfg) + layout = rndrd["layout"] + figs_var: List[Figure] = [] + for tab in layout: + try: + fig = tab.children[0] + except AttributeError: + fig = tab + # fig.title = Title(text=tab.title, align="center") + figs_var.append(fig) + comp = components(figs_var) + res["variables"][col] = { "tabledata": stats, "col_type": itmdt.visual_type.replace("_column", ""), + "tab_names": tab_names, + "plots": comp, } except: diff --git a/dataprep/eda/create_diff_report/templates/scripts.html b/dataprep/eda/create_diff_report/templates/scripts.html index 66f4379af..7af36063d 100644 --- a/dataprep/eda/create_diff_report/templates/scripts.html +++ b/dataprep/eda/create_diff_report/templates/scripts.html @@ -1,5 +1,13 @@ {{ context.components.graphs[0] }} +{% for var in context.components.dfs[0].variables.values() %} +{{ var.plots[0] }} +{% endfor %} + +{% for var in context.components.dfs[1].variables.values() %} +{{ var.plots[0] }} +{% endfor %} +