From 4696e598aa25a6e2a1b264b4125cedccce6845a0 Mon Sep 17 00:00:00 2001 From: jinglinpeng Date: Wed, 10 Mar 2021 17:37:22 -0800 Subject: [PATCH] feat(type): detect column as categorical for small unique values --- dataprep/eda/create_report/formatter.py | 4 ++-- dataprep/eda/dtypes.py | 7 ++++++- dataprep/eda/missing/compute/common.py | 10 ++++------ dataprep/tests/eda/random_data_generator.py | 2 +- 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/dataprep/eda/create_report/formatter.py b/dataprep/eda/create_report/formatter.py index a3d72cc49..09788cb45 100644 --- a/dataprep/eda/create_report/formatter.py +++ b/dataprep/eda/create_report/formatter.py @@ -27,7 +27,6 @@ _insight_pagination, ) from ..dtypes import ( - CATEGORICAL_DTYPES, Continuous, DateTime, Nominal, @@ -263,7 +262,8 @@ def basic_computations( df_num = df.select_num_columns() data["num_cols"] = df_num.columns - first_rows = df.select_dtypes(CATEGORICAL_DTYPES).head + first_rows = df.head + # variables if cfg.variables.enable: for col in df.columns: diff --git a/dataprep/eda/dtypes.py b/dataprep/eda/dtypes.py index c4f33ba82..133ab1f55 100644 --- a/dataprep/eda/dtypes.py +++ b/dataprep/eda/dtypes.py @@ -199,7 +199,12 @@ def detect_without_known(col: dd.Series) -> DType: return Nominal() elif is_continuous(col.dtype): - return Continuous() + # detect as categorical if distinct value is small + nuniques = col.nunique_approx().compute() + if nuniques < 10: + return Nominal() + else: + return Continuous() elif is_datetime(col.dtype): return DateTime() diff --git a/dataprep/eda/missing/compute/common.py b/dataprep/eda/missing/compute/common.py index 427957c56..5dfccff49 100644 --- a/dataprep/eda/missing/compute/common.py +++ b/dataprep/eda/missing/compute/common.py @@ -51,8 +51,9 @@ def histogram( """Calculate "histogram" for both numerical and categorical.""" if len(arr.shape) != 1: raise ValueError("Histogram only supports 1-d array.") - - if is_dtype(detect_dtype(arr, dtype), 
Continuous()): + srs = dd.from_dask_array(arr) + detected_type = detect_dtype(srs, dtype) + if is_dtype(detected_type, Continuous()): if range is not None: minimum, maximum = range else: @@ -67,14 +68,11 @@ def histogram( if not return_edges: return counts, centers return counts, centers, edges - elif is_dtype(detect_dtype(arr, dtype), Nominal()) or is_dtype( - detect_dtype(arr, dtype), GeoGraphy() - ): + elif is_dtype(detected_type, Nominal()) or is_dtype(detected_type, GeoGraphy()): # Dask array's unique is way slower than the values_counts on Series # See https://github.com/dask/dask/issues/2851 # centers, counts = da.unique(arr, return_counts=True) - srs = dd.from_dask_array(arr) value_counts = srs.value_counts() counts = value_counts.to_dask_array() diff --git a/dataprep/tests/eda/random_data_generator.py b/dataprep/tests/eda/random_data_generator.py index 22c75ebc8..05d8d50a5 100644 --- a/dataprep/tests/eda/random_data_generator.py +++ b/dataprep/tests/eda/random_data_generator.py @@ -23,7 +23,7 @@ def _gen_random_int_series( ) -> pd.Series: """Return a randonly generated int Series, where the value is in [low, high]""" rand = _resolve_random_state(random_state) - arr = rand.random_integers(low=low, high=high, size=size) + arr = rand.randint(low=low, high=high + 1, size=size) return pd.Series(arr)