Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix hasnans for non-floating-point data types (anything other than DoubleType/FloatType). #1681

Merged
merged 6 commits into from
Jul 31, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 12 additions & 2 deletions databricks/koalas/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,14 @@
from pandas.core.accessor import CachedAccessor
from pyspark import sql as spark
from pyspark.sql import functions as F, Window, Column
from pyspark.sql.types import DateType, DoubleType, FloatType, LongType, StringType, TimestampType
from pyspark.sql.types import (
DateType,
DoubleType,
FloatType,
LongType,
StringType,
TimestampType,
)

from databricks import koalas as ks # For running doctests and reference resolution in PyCharm.
from databricks.koalas import numpy_compat
Expand Down Expand Up @@ -391,7 +398,10 @@ def hasnans(self):
sdf = self._internal.spark_frame
scol = self.spark.column

return sdf.select(F.max(scol.isNull() | F.isnan(scol))).collect()[0][0]
itholic marked this conversation as resolved.
Show resolved Hide resolved
if isinstance(self.spark.data_type, (DoubleType, FloatType)):
return sdf.select(F.max(scol.isNull() | F.isnan(scol))).collect()[0][0]
else:
return sdf.select(F.max(scol.isNull())).collect()[0][0]

@property
def is_monotonic(self):
Expand Down
19 changes: 19 additions & 0 deletions databricks/koalas/tests/test_indexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -1364,3 +1364,22 @@ def test_abs(self):
kidx = ks.MultiIndex.from_tuples([(1, 2)], names=["level1", "level2"])
with self.assertRaisesRegex(TypeError, "perform __abs__ with this index"):
abs(kidx)

def test_hasnans(self):
    """Check that ``Index.hasnans`` matches pandas for non-floating-point data.

    Each case builds a pandas Index, converts it with ``ks.from_pandas`` and
    compares the ``hasnans`` result of both, with and without a missing value.
    """
    # BooleanType
    pidx = pd.Index([True, False, True, True])
    kidx = ks.from_pandas(pidx)
    self.assert_eq(pidx.hasnans, kidx.hasnans)

    pidx = pd.Index([True, False, np.nan, True])
    kidx = ks.from_pandas(pidx)
    self.assert_eq(pidx.hasnans, kidx.hasnans)

    # TimestampType
    # Use an Index here (not a Series) so that this index test actually
    # exercises ``Index.hasnans`` for timestamp data.
    pidx = pd.Index([pd.Timestamp("2020-07-30") for _ in range(3)])
    kidx = ks.from_pandas(pidx)
    self.assert_eq(pidx.hasnans, kidx.hasnans)

    pidx = pd.Index([pd.Timestamp("2020-07-30"), np.nan, pd.Timestamp("2020-07-30")])
    kidx = ks.from_pandas(pidx)
    self.assert_eq(pidx.hasnans, kidx.hasnans)
19 changes: 19 additions & 0 deletions databricks/koalas/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1894,3 +1894,22 @@ def test_tail(self):
self.assert_eq(pser.tail(-1001), kser.tail(-1001))
with self.assertRaisesRegex(TypeError, "bad operand type for unary -: 'str'"):
kser.tail("10")

def test_hasnans(self):
    """Compare ``Series.hasnans`` between pandas and Koalas.

    Covers boolean and timestamp data, each with and without a missing value.
    """
    stamp = pd.Timestamp("2020-07-30")
    value_sets = [
        # BooleanType
        [True, False, True, True],
        [True, False, np.nan, True],
        # TimestampType
        [stamp, stamp, stamp],
        [stamp, np.nan, stamp],
    ]

    for values in value_sets:
        expected_series = pd.Series(values)
        actual_series = ks.from_pandas(expected_series)
        self.assert_eq(expected_series.hasnans, actual_series.hasnans)