From 4696e598aa25a6e2a1b264b4125cedccce6845a0 Mon Sep 17 00:00:00 2001
From: jinglinpeng <jlpengcs@gmail.com>
Date: Wed, 10 Mar 2021 17:37:22 -0800
Subject: [PATCH] feat(type): detect column as categorical for small unique
 values

---
 dataprep/eda/create_report/formatter.py     |  4 ++--
 dataprep/eda/dtypes.py                      |  7 ++++++-
 dataprep/eda/missing/compute/common.py      | 10 ++++------
 dataprep/tests/eda/random_data_generator.py |  2 +-
 4 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/dataprep/eda/create_report/formatter.py b/dataprep/eda/create_report/formatter.py
index a3d72cc49..09788cb45 100644
--- a/dataprep/eda/create_report/formatter.py
+++ b/dataprep/eda/create_report/formatter.py
@@ -27,7 +27,6 @@
     _insight_pagination,
 )
 from ..dtypes import (
-    CATEGORICAL_DTYPES,
     Continuous,
     DateTime,
     Nominal,
@@ -263,7 +262,8 @@ def basic_computations(
 
     df_num = df.select_num_columns()
     data["num_cols"] = df_num.columns
-    first_rows = df.select_dtypes(CATEGORICAL_DTYPES).head
+    first_rows = df.head
+
     # variables
     if cfg.variables.enable:
         for col in df.columns:
diff --git a/dataprep/eda/dtypes.py b/dataprep/eda/dtypes.py
index c4f33ba82..133ab1f55 100644
--- a/dataprep/eda/dtypes.py
+++ b/dataprep/eda/dtypes.py
@@ -199,7 +199,12 @@ def detect_without_known(col: dd.Series) -> DType:
             return Nominal()
 
     elif is_continuous(col.dtype):
-        return Continuous()
+        # detect as categorical if the number of distinct values is small
+        nuniques = col.nunique_approx().compute()
+        if nuniques < 10:
+            return Nominal()
+        else:
+            return Continuous()
 
     elif is_datetime(col.dtype):
         return DateTime()
diff --git a/dataprep/eda/missing/compute/common.py b/dataprep/eda/missing/compute/common.py
index 427957c56..5dfccff49 100644
--- a/dataprep/eda/missing/compute/common.py
+++ b/dataprep/eda/missing/compute/common.py
@@ -51,8 +51,9 @@ def histogram(
     """Calculate "histogram" for both numerical and categorical."""
     if len(arr.shape) != 1:
         raise ValueError("Histogram only supports 1-d array.")
-
-    if is_dtype(detect_dtype(arr, dtype), Continuous()):
+    srs = dd.from_dask_array(arr)
+    detected_type = detect_dtype(srs, dtype)
+    if is_dtype(detected_type, Continuous()):
         if range is not None:
             minimum, maximum = range
         else:
@@ -67,14 +68,11 @@ def histogram(
         if not return_edges:
             return counts, centers
         return counts, centers, edges
-    elif is_dtype(detect_dtype(arr, dtype), Nominal()) or is_dtype(
-        detect_dtype(arr, dtype), GeoGraphy()
-    ):
+    elif is_dtype(detected_type, Nominal()) or is_dtype(detected_type, GeoGraphy()):
         # Dask array's unique is way slower than the values_counts on Series
         # See https://github.com/dask/dask/issues/2851
         # centers, counts = da.unique(arr, return_counts=True)
 
-        srs = dd.from_dask_array(arr)
         value_counts = srs.value_counts()
 
         counts = value_counts.to_dask_array()
diff --git a/dataprep/tests/eda/random_data_generator.py b/dataprep/tests/eda/random_data_generator.py
index 22c75ebc8..05d8d50a5 100644
--- a/dataprep/tests/eda/random_data_generator.py
+++ b/dataprep/tests/eda/random_data_generator.py
@@ -23,7 +23,7 @@ def _gen_random_int_series(
 ) -> pd.Series:
     """Return a randonly generated int Series, where the value is in [low, high]"""
     rand = _resolve_random_state(random_state)
-    arr = rand.random_integers(low=low, high=high, size=size)
+    arr = rand.randint(low=low, high=high, size=size)
     return pd.Series(arr)