Skip to content

Commit

Permalink
Bug fixing for hasnans when non-DoubleType. (#1681)
Browse files Browse the repository at this point in the history
`ks.Series.hasnans` and `ks.Index.hasnans` do not work properly for types other than DoubleType and FloatType: the `isnan` check they rely on only accepts double or float columns.


```python
>>> ks.Series([True, True, np.nan]).hasnans
Traceback (most recent call last):
...
pyspark.sql.utils.AnalysisException: cannot resolve 'isnan(`0`)' due to data type mismatch: argument 1 requires (double or float) type, however, '`0`' is of boolean type.;;
'Aggregate [max((isnull(0#12) OR isnan(0#12))) AS max(((0 IS NULL) OR isnan(0)))#21]
+- Project [__index_level_0__#11L, 0#12, monotonically_increasing_id() AS __natural_order__#15L]
   +- LogicalRDD [__index_level_0__#11L, 0#12], false
```

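This PR fixes it: `isnan` is now applied only when the column is DoubleType or FloatType; for every other type `hasnans` checks for nulls only.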
  • Loading branch information
itholic authored Jul 31, 2020
1 parent b5aacc7 commit fb2eedc
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 2 deletions.
14 changes: 12 additions & 2 deletions databricks/koalas/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,14 @@
from pandas.core.accessor import CachedAccessor
from pyspark import sql as spark
from pyspark.sql import functions as F, Window, Column
from pyspark.sql.types import DateType, DoubleType, FloatType, LongType, StringType, TimestampType
from pyspark.sql.types import (
DateType,
DoubleType,
FloatType,
LongType,
StringType,
TimestampType,
)

from databricks import koalas as ks # For running doctests and reference resolution in PyCharm.
from databricks.koalas import numpy_compat
Expand Down Expand Up @@ -391,7 +398,10 @@ def hasnans(self):
sdf = self._internal.spark_frame
scol = self.spark.column

return sdf.select(F.max(scol.isNull() | F.isnan(scol))).collect()[0][0]
if isinstance(self.spark.data_type, (DoubleType, FloatType)):
return sdf.select(F.max(scol.isNull() | F.isnan(scol))).collect()[0][0]
else:
return sdf.select(F.max(scol.isNull())).collect()[0][0]

@property
def is_monotonic(self):
Expand Down
19 changes: 19 additions & 0 deletions databricks/koalas/tests/test_indexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -1368,3 +1368,22 @@ def test_abs(self):
kidx = ks.MultiIndex.from_tuples([(1, 2)], names=["level1", "level2"])
with self.assertRaisesRegex(TypeError, "perform __abs__ with this index"):
abs(kidx)

def test_hasnans(self):
    """Check that Koalas ``hasnans`` agrees with pandas for non-floating-point data."""
    # BooleanType: first without any missing value, then with one ``np.nan``.
    boolean_cases = (
        [True, False, True, True],
        [True, False, np.nan, True],
    )
    for values in boolean_cases:
        pidx = pd.Index(values)
        kidx = ks.from_pandas(pidx)
        self.assert_eq(pidx.hasnans, kidx.hasnans)

    # TimestampType: first without any missing value, then with one ``np.nan``.
    # NOTE(review): this half builds Series rather than Index objects even though
    # it lives in the index tests — confirm whether an Index case was intended.
    timestamp_cases = (
        [pd.Timestamp("2020-07-30") for _ in range(3)],
        [pd.Timestamp("2020-07-30"), np.nan, pd.Timestamp("2020-07-30")],
    )
    for values in timestamp_cases:
        pser = pd.Series(values)
        kser = ks.from_pandas(pser)
        self.assert_eq(pser.hasnans, kser.hasnans)
19 changes: 19 additions & 0 deletions databricks/koalas/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1894,3 +1894,22 @@ def test_tail(self):
self.assert_eq(pser.tail(-1001), kser.tail(-1001))
with self.assertRaisesRegex(TypeError, "bad operand type for unary -: 'str'"):
kser.tail("10")

def test_hasnans(self):
    """Check that Koalas ``Series.hasnans`` agrees with pandas for non-floating-point data."""
    same_day = "2020-07-30"
    cases = [
        # BooleanType: without and with a missing value.
        [True, False, True, True],
        [True, False, np.nan, True],
        # TimestampType: without and with a missing value.
        [pd.Timestamp(same_day) for _ in range(3)],
        [pd.Timestamp(same_day), np.nan, pd.Timestamp(same_day)],
    ]
    for values in cases:
        pser = pd.Series(values)
        kser = ks.from_pandas(pser)
        # pandas is the reference; Koalas must report the same flag.
        self.assert_eq(pser.hasnans, kser.hasnans)

0 comments on commit fb2eedc

Please sign in to comment.