Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix Series.div when divide by zero #1412

Merged
merged 10 commits into from
Apr 16, 2020
38 changes: 28 additions & 10 deletions databricks/koalas/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,8 +184,14 @@ def __sub__(self, other):
return _column_op(spark.Column.__sub__)(self, other)

__mul__ = _column_op(spark.Column.__mul__)
__div__ = _numpy_column_op(spark.Column.__div__)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FYI: In Python 3.x, __div__ and __rdiv__ are no longer supported; __truediv__ and __rtruediv__ are used instead.

__truediv__ = _numpy_column_op(spark.Column.__truediv__)

def __truediv__(self, other):
def truediv(left, right):
itholic marked this conversation as resolved.
Show resolved Hide resolved
return F.when(F.lit(right == 0), F.lit(np.inf).__div__(left)).otherwise(
left.__truediv__(right)
)

return _numpy_column_op(truediv)(self, other)

def __mod__(self, other):
def mod(left, right):
Expand All @@ -202,18 +208,30 @@ def __radd__(self, other):

__rsub__ = _column_op(spark.Column.__rsub__)
__rmul__ = _column_op(spark.Column.__rmul__)
__rdiv__ = _numpy_column_op(spark.Column.__rdiv__)
__rtruediv__ = _numpy_column_op(spark.Column.__rtruediv__)

def __rtruediv__(self, other):
def rtruediv(left, right):
return F.when(left == 0, F.lit(np.inf).__div__(right)).otherwise(
F.lit(right).__truediv__(left)
)

return _numpy_column_op(rtruediv)(self, other)

def __floordiv__(self, other):
return self._with_new_scol(
F.floor(_numpy_column_op(spark.Column.__div__)(self, other)._scol)
)
def floordiv(left, right):
return F.when(F.lit(right == 0), F.lit(np.inf).__div__(left)).otherwise(
F.floor(left.__div__(right))
)

return _numpy_column_op(floordiv)(self, other)

def __rfloordiv__(self, other):
return self._with_new_scol(
F.floor(_numpy_column_op(spark.Column.__rdiv__)(self, other)._scol)
)
def rfloordiv(left, right):
return F.when(F.lit(left == 0), F.lit(np.inf).__div__(right)).otherwise(
F.floor(F.lit(right).__div__(left))
)

return _numpy_column_op(rfloordiv)(self, other)

def __rmod__(self, other):
def rmod(left, right):
Expand Down
22 changes: 11 additions & 11 deletions databricks/koalas/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@

>>> df.rdiv(10)
angles degrees
circle NaN 0.027778
circle inf 0.027778
triangle 3.333333 0.055556
rectangle 2.500000 0.027778

Expand All @@ -180,7 +180,7 @@

>>> df.rtruediv(10)
angles degrees
circle NaN 0.027778
circle inf 0.027778
triangle 3.333333 0.055556
rectangle 2.500000 0.027778

Expand Down Expand Up @@ -228,21 +228,21 @@

>>> df // 10
angles degrees
circle 0 36
triangle 0 18
rectangle 0 36
circle 0.0 36.0
triangle 0.0 18.0
rectangle 0.0 36.0

>>> df.floordiv(10)
angles degrees
circle 0 36
triangle 0 18
rectangle 0 36
circle 0.0 36.0
triangle 0.0 18.0
rectangle 0.0 36.0
Copy link
Contributor Author

@itholic itholic Apr 10, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For floordiv and rfloordiv, the result will always be a float, since Spark determines the result type of each column before it executes the job, so we cannot predict the type from the resulting values.

Copy link
Contributor Author

@itholic itholic Apr 10, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In short, since the result may be Infinity, which is a float, the result will always be a float.

Could someone give an opinion on this?


>>> df.rfloordiv(10)
angles degrees
circle NaN 0
triangle 3.0 0
rectangle 2.0 0
circle inf 0.0
triangle 3.0 0.0
rectangle 2.0 0.0

Mod by constant with reverse version.

Expand Down
17 changes: 17 additions & 0 deletions databricks/koalas/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1446,3 +1446,20 @@ def test_squeeze(self):
kser = ks.Series([90, 91, 85], index=midx)
pser = kser.to_pandas()
self.assert_eq(kser.squeeze(), pser.squeeze())

def test_div_zero(self):
pser = pd.Series([100, None, -300, None, 500, -700], name="Koalas")
kser = ks.from_pandas(pser)

self.assert_eq(repr(pser.div(0)), repr(kser.div(0)))
self.assert_eq(repr(pser.truediv(0)), repr(kser.truediv(0)))
self.assert_eq(repr(pser / 0), repr(kser / 0))

# floordiv has different behavior in pandas >= 1.0.0
if LooseVersion(pd.__version__) >= LooseVersion("1.0.0"):
self.assert_eq(repr(pser.floordiv(0)), repr(kser.floordiv(0)))
self.assert_eq(repr(pser // 0), repr(kser // 0))
else:
result = ks.Series([np.inf, np.nan, -np.inf, np.nan, np.inf, -np.inf], name="Koalas")
itholic marked this conversation as resolved.
Show resolved Hide resolved
self.assert_eq(repr(kser.floordiv(0)), repr(result))
self.assert_eq(repr(kser // 0), repr(result))