From fb2eedcfad66635039c05009586f310cc79fe17a Mon Sep 17 00:00:00 2001 From: Haejoon Lee <44108233+itholic@users.noreply.github.com> Date: Fri, 31 Jul 2020 17:32:12 +0900 Subject: [PATCH] Bug fixing for hasnans when non-DoubleType. (#1681) `ks.Series.hasnans` and `ks.Index.hasnans` do not work properly for non-DoubleType data. ```python >>> ks.Series([True, True, np.nan]).hasnans Traceback (most recent call last): ... pyspark.sql.utils.AnalysisException: cannot resolve 'isnan(`0`)' due to data type mismatch: argument 1 requires (double or float) type, however, '`0`' is of boolean type.;; 'Aggregate [max((isnull(0#12) OR isnan(0#12))) AS max(((0 IS NULL) OR isnan(0)))#21] +- Project [__index_level_0__#11L, 0#12, monotonically_increasing_id() AS __natural_order__#15L] +- LogicalRDD [__index_level_0__#11L, 0#12], false ``` This PR fixes it by applying `isnan` only when the Spark data type is DoubleType or FloatType, and checking only for nulls otherwise. --- databricks/koalas/base.py | 14 ++++++++++++-- databricks/koalas/tests/test_indexes.py | 19 +++++++++++++++++++ databricks/koalas/tests/test_series.py | 19 +++++++++++++++++++ 3 files changed, 50 insertions(+), 2 deletions(-) diff --git a/databricks/koalas/base.py b/databricks/koalas/base.py index 072eb86fc3..7a9a6caf26 100644 --- a/databricks/koalas/base.py +++ b/databricks/koalas/base.py @@ -29,7 +29,14 @@ from pandas.core.accessor import CachedAccessor from pyspark import sql as spark from pyspark.sql import functions as F, Window, Column -from pyspark.sql.types import DateType, DoubleType, FloatType, LongType, StringType, TimestampType +from pyspark.sql.types import ( + DateType, + DoubleType, + FloatType, + LongType, + StringType, + TimestampType, +) from databricks import koalas as ks # For running doctests and reference resolution in PyCharm. 
from databricks.koalas import numpy_compat @@ -391,7 +398,10 @@ def hasnans(self): sdf = self._internal.spark_frame scol = self.spark.column - return sdf.select(F.max(scol.isNull() | F.isnan(scol))).collect()[0][0] + if isinstance(self.spark.data_type, (DoubleType, FloatType)): + return sdf.select(F.max(scol.isNull() | F.isnan(scol))).collect()[0][0] + else: + return sdf.select(F.max(scol.isNull())).collect()[0][0] @property def is_monotonic(self): diff --git a/databricks/koalas/tests/test_indexes.py b/databricks/koalas/tests/test_indexes.py index 604e43e0b9..e3bc25bc5b 100644 --- a/databricks/koalas/tests/test_indexes.py +++ b/databricks/koalas/tests/test_indexes.py @@ -1368,3 +1368,22 @@ def test_abs(self): kidx = ks.MultiIndex.from_tuples([(1, 2)], names=["level1", "level2"]) with self.assertRaisesRegex(TypeError, "perform __abs__ with this index"): abs(kidx) + + def test_hasnans(self): + # BooleanType + pidx = pd.Index([True, False, True, True]) + kidx = ks.from_pandas(pidx) + self.assert_eq(pidx.hasnans, kidx.hasnans) + + pidx = pd.Index([True, False, np.nan, True]) + kidx = ks.from_pandas(pidx) + self.assert_eq(pidx.hasnans, kidx.hasnans) + + # TimestampType + pser = pd.Series([pd.Timestamp("2020-07-30") for _ in range(3)]) + kser = ks.from_pandas(pser) + self.assert_eq(pser.hasnans, kser.hasnans) + + pser = pd.Series([pd.Timestamp("2020-07-30"), np.nan, pd.Timestamp("2020-07-30")]) + kser = ks.from_pandas(pser) + self.assert_eq(pser.hasnans, kser.hasnans) diff --git a/databricks/koalas/tests/test_series.py b/databricks/koalas/tests/test_series.py index 82d03d8027..10f475250d 100644 --- a/databricks/koalas/tests/test_series.py +++ b/databricks/koalas/tests/test_series.py @@ -1894,3 +1894,22 @@ def test_tail(self): self.assert_eq(pser.tail(-1001), kser.tail(-1001)) with self.assertRaisesRegex(TypeError, "bad operand type for unary -: 'str'"): kser.tail("10") + + def test_hasnans(self): + # BooleanType + pser = pd.Series([True, False, True, True]) + kser = 
ks.from_pandas(pser) + self.assert_eq(pser.hasnans, kser.hasnans) + + pser = pd.Series([True, False, np.nan, True]) + kser = ks.from_pandas(pser) + self.assert_eq(pser.hasnans, kser.hasnans) + + # TimestampType + pser = pd.Series([pd.Timestamp("2020-07-30") for _ in range(3)]) + kser = ks.from_pandas(pser) + self.assert_eq(pser.hasnans, kser.hasnans) + + pser = pd.Series([pd.Timestamp("2020-07-30"), np.nan, pd.Timestamp("2020-07-30")]) + kser = ks.from_pandas(pser) + self.assert_eq(pser.hasnans, kser.hasnans)