Bug fixing for hasnans when non-DoubleType. (#1681)

`ks.Series.hasnans` and `ks.Index.hasnans` do not work properly for columns whose type is not DoubleType or FloatType, because Spark's `isnan` only accepts double or float arguments:

```python
>>> ks.Series([True, True, np.nan]).hasnans
Traceback (most recent call last):
...
pyspark.sql.utils.AnalysisException: cannot resolve 'isnan(`0`)' due to data type mismatch: argument 1 requires (double or float) type, however, '`0`' is of boolean type.;;
'Aggregate [max((isnull(0#12) OR isnan(0#12))) AS max(((0 IS NULL) OR isnan(0)))#21]
+- Project [__index_level_0__#11L, 0#12, monotonically_increasing_id() AS __natural_order__#15L]
   +- LogicalRDD [__index_level_0__#11L, 0#12], false
```

This PR fixes it.
databricks · Jul 31, 2020 · fb2eedc · fb2eedc
1 parent b5aacc7
commit fb2eedc
Show file tree

Hide file tree

Showing 3 changed files with 50 additions and 2 deletions.
diff --git a/databricks/koalas/base.py b/databricks/koalas/base.py
@@ -29,7 +29,14 @@
 from pandas.core.accessor import CachedAccessor
 from pyspark import sql as spark
 from pyspark.sql import functions as F, Window, Column
-from pyspark.sql.types import DateType, DoubleType, FloatType, LongType, StringType, TimestampType
+from pyspark.sql.types import (
+    DateType,
+    DoubleType,
+    FloatType,
+    LongType,
+    StringType,
+    TimestampType,
+)
 
 from databricks import koalas as ks  # For running doctests and reference resolution in PyCharm.
 from databricks.koalas import numpy_compat
@@ -391,7 +398,10 @@ def hasnans(self):
         sdf = self._internal.spark_frame
         scol = self.spark.column
 
-        return sdf.select(F.max(scol.isNull() | F.isnan(scol))).collect()[0][0]
+        if isinstance(self.spark.data_type, (DoubleType, FloatType)):
+            return sdf.select(F.max(scol.isNull() | F.isnan(scol))).collect()[0][0]
+        else:
+            return sdf.select(F.max(scol.isNull())).collect()[0][0]
 
     @property
     def is_monotonic(self):

diff --git a/databricks/koalas/tests/test_indexes.py b/databricks/koalas/tests/test_indexes.py
@@ -1368,3 +1368,22 @@ def test_abs(self):
         kidx = ks.MultiIndex.from_tuples([(1, 2)], names=["level1", "level2"])
         with self.assertRaisesRegex(TypeError, "perform __abs__ with this index"):
             abs(kidx)
+
+    def test_hasnans(self):
+        # BooleanType
+        pidx = pd.Index([True, False, True, True])
+        kidx = ks.from_pandas(pidx)
+        self.assert_eq(pidx.hasnans, kidx.hasnans)
+
+        pidx = pd.Index([True, False, np.nan, True])
+        kidx = ks.from_pandas(pidx)
+        self.assert_eq(pidx.hasnans, kidx.hasnans)
+
+        # TimestampType
+        pser = pd.Series([pd.Timestamp("2020-07-30") for _ in range(3)])
+        kser = ks.from_pandas(pser)
+        self.assert_eq(pser.hasnans, kser.hasnans)
+
+        pser = pd.Series([pd.Timestamp("2020-07-30"), np.nan, pd.Timestamp("2020-07-30")])
+        kser = ks.from_pandas(pser)
+        self.assert_eq(pser.hasnans, kser.hasnans)
diff --git a/databricks/koalas/tests/test_series.py b/databricks/koalas/tests/test_series.py
@@ -1894,3 +1894,22 @@ def test_tail(self):
             self.assert_eq(pser.tail(-1001), kser.tail(-1001))
             with self.assertRaisesRegex(TypeError, "bad operand type for unary -: 'str'"):
                 kser.tail("10")
+
+    def test_hasnans(self):
+        # BooleanType
+        pser = pd.Series([True, False, True, True])
+        kser = ks.from_pandas(pser)
+        self.assert_eq(pser.hasnans, kser.hasnans)
+
+        pser = pd.Series([True, False, np.nan, True])
+        kser = ks.from_pandas(pser)
+        self.assert_eq(pser.hasnans, kser.hasnans)
+
+        # TimestampType
+        pser = pd.Series([pd.Timestamp("2020-07-30") for _ in range(3)])
+        kser = ks.from_pandas(pser)
+        self.assert_eq(pser.hasnans, kser.hasnans)
+
+        pser = pd.Series([pd.Timestamp("2020-07-30"), np.nan, pd.Timestamp("2020-07-30")])
+        kser = ks.from_pandas(pser)
+        self.assert_eq(pser.hasnans, kser.hasnans)