Skip to content

Commit

Permalink
fix(eda): change dtype 'string' to 'object'
Browse files Browse the repository at this point in the history
  • Loading branch information
yuzhenmao authored and brandonlockhart committed Sep 30, 2020
1 parent df32e1d commit 8ddddbc
Show file tree
Hide file tree
Showing 4 changed files with 27 additions and 4 deletions.
10 changes: 9 additions & 1 deletion dataprep/eda/create_report/formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,14 @@
from ..distribution.compute.overview import calc_stats
from ..distribution.compute.univariate import cont_comps, nom_comps
from ..distribution.render import format_cat_stats, format_num_stats, format_ov_stats
from ..dtypes import CATEGORICAL_DTYPES, Continuous, Nominal, detect_dtype, is_dtype
from ..dtypes import (
CATEGORICAL_DTYPES,
Continuous,
Nominal,
detect_dtype,
is_dtype,
string_dtype_to_object,
)
from ..intermediate import Intermediate
from ..missing import render_missing
from ..missing.compute.nullivariate import compute_missing_nullivariate
Expand Down Expand Up @@ -51,6 +58,7 @@ def format_report(
# pylint: disable=too-many-locals,too-many-statements
with ProgressBar(minimum=1, disable=not progress):
df = to_dask(df)
df = string_dtype_to_object(df)
if mode == "basic":
comps = format_basic(df)
# elif mode == "full":
Expand Down
4 changes: 2 additions & 2 deletions dataprep/eda/distribution/compute/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import dask.dataframe as dd
import pandas as pd

from ...dtypes import DTypeDef
from ...dtypes import DTypeDef, string_dtype_to_object
from ...intermediate import Intermediate
from ...utils import to_dask
from .bivariate import compute_bivariate
Expand Down Expand Up @@ -93,9 +93,9 @@ def compute(
dtype = {"a": Continuous(), "b": "nominal"}
or dtype = Continuous() or dtype = "Continuous"
""" # pylint: disable=too-many-locals

df = to_dask(df)
df.columns = df.columns.astype(str)
df = string_dtype_to_object(df)

if not any((x, y, z)):
return compute_overview(df, bins, ngroups, largest, timeunit, dtype)
Expand Down
14 changes: 14 additions & 0 deletions dataprep/eda/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@
# pandas extension dtype classes that are grouped with the categorical dtypes.
CATEGORICAL_PANDAS_DTYPES = [pd.CategoricalDtype, pd.PeriodDtype]
# Combined list: the numpy categorical dtypes followed by the pandas ones.
CATEGORICAL_DTYPES = CATEGORICAL_NUMPY_DTYPES + CATEGORICAL_PANDAS_DTYPES

# pandas extension dtype classes regarded as "string" dtypes. Entries are
# classes (not instances) so that they can be used in isinstance() checks.
STRING_PANDAS_DTYPES = [pd.StringDtype]
# Only pandas entries exist for strings, so this name is bound to the very
# same list object as STRING_PANDAS_DTYPES (an alias, not a copy).
STRING_DTYPES = STRING_PANDAS_DTYPES

# np.number is numpy's abstract base class for all numeric scalar types.
NUMERICAL_NUMPY_DTYPES = [np.number]
# Alias of the numpy list above (same list object, not a copy).
NUMERICAL_DTYPES = NUMERICAL_NUMPY_DTYPES

Expand Down Expand Up @@ -256,6 +259,17 @@ def is_pandas_categorical(dtype: Any) -> bool:
return any(isinstance(dtype, c) for c in CATEGORICAL_PANDAS_DTYPES)


def string_dtype_to_object(df: dd.DataFrame) -> dd.DataFrame:
    """
    Convert string dtype to object dtype

    Every column whose dtype is an instance of one of the classes listed in
    ``STRING_DTYPES`` is cast to ``object``; all other columns are left as
    they are.

    The input frame is never modified. Converting with item assignment
    (``df[col] = ...``) would rewrite the caller's DataFrame in place, which
    is a visible side effect when the caller hands over its own frame.
    Instead a new frame is produced with a single ``astype`` call.

    Parameters
    ----------
    df
        The DataFrame to convert. Only ``.dtypes`` and ``.astype`` are used.

    Returns
    -------
    The same ``df`` object when it has no string-dtype columns, otherwise a
    new DataFrame in which those columns have dtype ``object``.
    """
    string_classes = tuple(STRING_DTYPES)
    # Map each string-typed column name to its replacement dtype.
    casts = {
        col: object
        for col, dtype in df.dtypes.items()
        if isinstance(dtype, string_classes)
    }

    if not casts:
        # Nothing to convert: skip the no-op astype entirely.
        return df

    # One astype call for all affected columns; returns a new frame.
    return df.astype(casts)


def drop_null(
var: Union[dd.Series, pd.DataFrame, dd.DataFrame]
) -> Union[pd.Series, dd.Series, pd.DataFrame, dd.DataFrame]:
Expand Down
3 changes: 2 additions & 1 deletion dataprep/eda/missing/compute/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from warnings import catch_warnings, filterwarnings

from ...data_array import DataArray, DataFrame
from ...dtypes import DTypeDef
from ...dtypes import DTypeDef, string_dtype_to_object
from ...intermediate import Intermediate
from .bivariate import compute_missing_bivariate
from .nullivariate import compute_missing_nullivariate
Expand Down Expand Up @@ -53,6 +53,7 @@ def compute_missing( # pylint: disable=too-many-arguments
>>> plot_missing(df, "HDI_for_year")
>>> plot_missing(df, "HDI_for_year", "population")
"""
df = string_dtype_to_object(df)
df = DataArray(df)

# pylint: disable=no-else-raise
Expand Down

0 comments on commit 8ddddbc

Please sign in to comment.