Skip to content

Commit

Permalink
fix(eda): fix the error of numerical cell in object column
Browse files Browse the repository at this point in the history
  • Loading branch information
jinglinpeng committed Feb 13, 2021
1 parent 08e73c0 commit 91c4f9d
Show file tree
Hide file tree
Showing 4 changed files with 16 additions and 28 deletions.
8 changes: 3 additions & 5 deletions dataprep/eda/create_report/formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,11 +228,9 @@ def basic_computations(df: dd.DataFrame, cfg: Config) -> Tuple[Dict[str, Any], D
if is_dtype(detect_dtype(df.frame[col]), Continuous()):
data[col] = cont_comps(df.frame[col], cfg)
elif is_dtype(detect_dtype(df.frame[col]), Nominal()):
# cast the column as string type if it contains a mutable type
try:
first_rows[col].apply(hash)
except TypeError:
df.frame[col] = df.frame[col].astype(str)
# An object-dtype column whose cells include numerical values raises an
# error in the nominal computations, so cast the column to string first.
df.frame[col] = df.frame[col].astype(str)
data[col] = nom_comps(df.frame[col], first_rows[col], cfg)
elif is_dtype(detect_dtype(df.frame[col]), DateTime()):
data[col] = {}
Expand Down
20 changes: 7 additions & 13 deletions dataprep/eda/distribution/compute/bivariate.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,10 +56,9 @@ def compute_bivariate(
):
x, y = (x, y) if is_dtype(xtype, Nominal()) else (y, x)
df = df[[x, y]]
try:
df.head()[x].apply(hash)
except TypeError:
df[x] = df[x].astype(str)
# An object-dtype column whose cells include numerical values raises an
# error in the computations below, so cast the column to string first.
df[x] = df[x].astype(str)

(comps,) = dask.compute(_nom_cont_comps(df.dropna(), cfg))

Expand Down Expand Up @@ -155,15 +154,10 @@ def compute_bivariate(
)
elif is_dtype(xtype, Nominal()) and is_dtype(ytype, Nominal()):
df = df[[x, y]]
head = df.head()
try:
head[x].apply(hash)
except TypeError:
df[x] = df[x].astype(str)
try:
head[y].apply(hash)
except TypeError:
df[y] = df[y].astype(str)
# An object-dtype column whose cells include numerical values raises an
# error in the computations below, so cast both columns to string first.
df[x] = df[x].astype(str)
df[y] = df[y].astype(str)

(comps,) = dask.compute(df.dropna().groupby([x, y]).size())
return Intermediate(
Expand Down
8 changes: 3 additions & 5 deletions dataprep/eda/distribution/compute/overview.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,11 +53,9 @@ def compute_overview(df: dd.DataFrame, cfg: Config, dtype: Optional[DTypeDef]) -
if is_dtype(col_dtype, Continuous()) and (cfg.hist.enable or cfg.insight.enable):
data.append((col, Continuous(), _cont_calcs(df[col].dropna(), cfg)))
elif is_dtype(col_dtype, Nominal()) and (cfg.bar.enable or cfg.insight.enable):
# cast the column as string type if it contains a mutable type
try:
head[col].apply(hash)
except TypeError:
df[col] = df[col].astype(str)
# An object-dtype column whose cells include numerical values raises an
# error in the nominal computations, so cast the column to string first.
df[col] = df[col].astype(str)
data.append((col, Nominal(), _nom_calcs(df[col].dropna(), head[col], cfg)))
elif is_dtype(col_dtype, DateTime()) and (cfg.line.enable or cfg.insight.enable):
data.append((col, DateTime(), dask.delayed(_calc_line_dt)(df[[col]], cfg.line.unit)))
Expand Down
8 changes: 3 additions & 5 deletions dataprep/eda/distribution/compute/univariate.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,12 +47,10 @@ def compute_univariate(

if is_dtype(col_dtype, Nominal()):
head = df[x].head() # dd.Series.head() triggers a (small) data read
# cast the column as string type if it contains a mutable type
try:
head.apply(hash)
except TypeError:
df[x] = df[x].astype(str)

# An object-dtype column whose cells include numerical values raises an
# error in the nominal computations, so cast the column to string first.
df[x] = df[x].astype(str)
# all computations for plot(df, Nominal())
(data,) = dask.compute(nom_comps(df[x], head, cfg))

Expand Down

0 comments on commit 91c4f9d

Please sign in to comment.