Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

dispatch frame methods to series versions instead of re-implementing masking etc #19611

Merged
merged 13 commits into from
Feb 18, 2018
Merged
78 changes: 17 additions & 61 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3944,71 +3944,39 @@ def _combine_frame(self, other, func, fill_value=None, level=None):
new_index, new_columns = this.index, this.columns

def _arith_op(left, right):
# for the mixed_type case where we iterate over columns,
# _arith_op(left, right) is equivalent to
# left._binop(right, func, fill_value=fill_value)
left, right = ops.fill_binop(left, right, fill_value)
return func(left, right)

if this._is_mixed_type or other._is_mixed_type:

# unique
# iterate over columns
if this.columns.is_unique:

def f(col):
r = _arith_op(this[col].values, other[col].values)
return self._constructor_sliced(r, index=new_index,
dtype=r.dtype)

result = {col: f(col) for col in this}

# non-unique
# unique columns
result = {col: _arith_op(this[col], other[col])
for col in this}
result = self._constructor(result, index=new_index,
columns=new_columns, copy=False)
else:

def f(i):
r = _arith_op(this.iloc[:, i].values,
other.iloc[:, i].values)
return self._constructor_sliced(r, index=new_index,
dtype=r.dtype)

result = {i: f(i) for i, col in enumerate(this.columns)}
# non-unique columns
result = {i: _arith_op(this.iloc[:, i], other.iloc[:, i])
for i, col in enumerate(this.columns)}
result = self._constructor(result, index=new_index, copy=False)
result.columns = new_columns
return result
return result

else:
result = _arith_op(this.values, other.values)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd like to dispatch to Series here, but presumably doing the operation once instead of per-column has a perf benefit. One idea that didn't work on the first try was to ravel() this.values and other.values, wrap them in Series, then operate and re-shape.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is pretty tricky, yes that would work for most ufuncs, but is not generally applicable e.g. concat is a ufunc too.
so hold off on this as other low hanging fruit w/o opening up this box.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

more to the point is to push this down to the block manager for execution which handles the mixed dtypes case, this is just a special case of that.


return self._constructor(result, index=new_index, columns=new_columns,
copy=False)

def _combine_series(self, other, func, fill_value=None, axis=None,
level=None, try_cast=True):
if fill_value is not None:
raise NotImplementedError("fill_value {fill} not supported."
.format(fill=fill_value))

if axis is not None:
axis = self._get_axis_name(axis)
if axis == 'index':
return self._combine_match_index(other, func, level=level)
else:
return self._combine_match_columns(other, func, level=level,
try_cast=try_cast)
else:
if not len(other):
return self * np.nan

if not len(self):
# Ambiguous case, use _series so works with DataFrame
return self._constructor(data=self._series, index=self.index,
columns=self.columns)

# default axis is columns
return self._combine_match_columns(other, func, level=level,
try_cast=try_cast)

def _combine_match_index(self, other, func, level=None):
left, right = self.align(other, join='outer', axis=0, level=level,
copy=False)
return self._constructor(func(left.values.T, right.values).T,
new_data = func(left.values.T, right.values).T
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we can do better than operating on left.values.T and right.values. Especially with mixed-dtypes this seems like low-hanging fruit.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

most of these types of operations need to be pushed down to internals via block operations. this should dispatch to ._data.eval to do this (like the other match functions)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yah, but I'm not there yet. These particular cases are tricky because SparseDataFrame implements _combine_match_foo differently

return self._constructor(new_data,
index=left.index, columns=self.columns,
copy=False)

Expand All @@ -4027,7 +3995,8 @@ def _combine_const(self, other, func, errors='raise', try_cast=True):
try_cast=try_cast)
return self._constructor(new_data)

def _compare_frame_evaluate(self, other, func, str_rep, try_cast=True):
def _compare_frame(self, other, func, str_rep, try_cast=True):
# compare_frame assumes self._indexed_same(other)

import pandas.core.computation.expressions as expressions
# unique
Expand All @@ -4052,19 +4021,6 @@ def _compare(a, b):
result.columns = self.columns
return result

def _compare_frame(self, other, func, str_rep, try_cast=True):
if not self._indexed_same(other):
raise ValueError('Can only compare identically-labeled '
'DataFrame objects')
return self._compare_frame_evaluate(other, func, str_rep,
try_cast=try_cast)

def _flex_compare_frame(self, other, func, str_rep, level, try_cast=True):
if not self._indexed_same(other):
self, other = self.align(other, 'outer', level=level, copy=False)
return self._compare_frame_evaluate(other, func, str_rep,
try_cast=try_cast)

def combine(self, other, func, fill_value=None, overwrite=True):
"""
Add two DataFrame objects and do not propagate NaN values, so if for a
Expand Down
49 changes: 15 additions & 34 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@
import pandas.core.algorithms as algos
import pandas.core.sorting as sorting
from pandas.io.formats.printing import pprint_thing
from pandas.core.ops import _comp_method_OBJECT_ARRAY
from pandas.core.ops import _comp_method_OBJECT_ARRAY, make_invalid_op
from pandas.core.config import get_option
from pandas.core.strings import StringMethods

Expand All @@ -82,26 +82,6 @@ def _try_get_item(x):
return x


def _make_invalid_op(name):
"""
Return a binary method that always raises a TypeError.

Parameters
----------
name : str

Returns
-------
invalid_op : function
"""
def invalid_op(self, other=None):
raise TypeError("cannot perform {name} with this index type: "
"{typ}".format(name=name, typ=type(self)))

invalid_op.__name__ = name
return invalid_op


class InvalidIndexError(Exception):
pass

Expand Down Expand Up @@ -3994,22 +3974,23 @@ def _evaluate_compare(self, other):
@classmethod
def _add_numeric_methods_add_sub_disabled(cls):
""" add in the numeric add/sub methods to disable """
cls.__add__ = cls.__radd__ = __iadd__ = _make_invalid_op('__add__') # noqa
cls.__sub__ = __isub__ = _make_invalid_op('__sub__') # noqa
cls.__add__ = cls.__radd__ = __iadd__ = make_invalid_op('__add__') # noqa
cls.__sub__ = __isub__ = make_invalid_op('__sub__') # noqa

@classmethod
def _add_numeric_methods_disabled(cls):
""" add in numeric methods to disable other than add/sub """
cls.__pow__ = cls.__rpow__ = _make_invalid_op('__pow__')
cls.__mul__ = cls.__rmul__ = _make_invalid_op('__mul__')
cls.__floordiv__ = cls.__rfloordiv__ = _make_invalid_op('__floordiv__')
cls.__truediv__ = cls.__rtruediv__ = _make_invalid_op('__truediv__')
cls.__pow__ = make_invalid_op('__pow__')
cls.__rpow__ = make_invalid_op('__rpow__')
cls.__mul__ = cls.__rmul__ = make_invalid_op('__mul__')
cls.__floordiv__ = cls.__rfloordiv__ = make_invalid_op('__floordiv__')
cls.__truediv__ = cls.__rtruediv__ = make_invalid_op('__truediv__')
if not compat.PY3:
cls.__div__ = cls.__rdiv__ = _make_invalid_op('__div__')
cls.__neg__ = _make_invalid_op('__neg__')
cls.__pos__ = _make_invalid_op('__pos__')
cls.__abs__ = _make_invalid_op('__abs__')
cls.__inv__ = _make_invalid_op('__inv__')
cls.__div__ = cls.__rdiv__ = make_invalid_op('__div__')
cls.__neg__ = make_invalid_op('__neg__')
cls.__pos__ = make_invalid_op('__pos__')
cls.__abs__ = make_invalid_op('__abs__')
cls.__inv__ = make_invalid_op('__inv__')

def _maybe_update_attributes(self, attrs):
""" Update Index attributes (e.g. freq) depending on op """
Expand Down Expand Up @@ -4207,8 +4188,8 @@ def logical_func(self, *args, **kwargs):
@classmethod
def _add_logical_methods_disabled(cls):
""" add in logical methods to disable """
cls.all = _make_invalid_op('all')
cls.any = _make_invalid_op('any')
cls.all = make_invalid_op('all')
cls.any = make_invalid_op('any')


Index._add_numeric_methods_disabled()
Expand Down
102 changes: 89 additions & 13 deletions pandas/core/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,26 @@ def rxor(left, right):

# -----------------------------------------------------------------------------

def make_invalid_op(name):
"""
Return a binary method that always raises a TypeError.

Parameters
----------
name : str

Returns
-------
invalid_op : function
"""
def invalid_op(self, other=None):
raise TypeError("cannot perform {name} with this index type: "
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you might need to parameterize this on the work 'index' but not sure (IOW can it be a Series)? or maybe just remove the word index ?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ATM it is only used for indexes. I could add an assertion that self is an Index so that we know to change it if/when the wording ceases to be accurate.

"{typ}".format(name=name, typ=type(self).__name__))

invalid_op.__name__ = name
return invalid_op


def _gen_eval_kwargs(name):
"""
Find the keyword arguments to pass to numexpr for the given operation.
Expand Down Expand Up @@ -1047,8 +1067,8 @@ def flex_wrapper(self, other, level=None, fill_value=None, axis=0):
elif isinstance(other, (np.ndarray, list, tuple)):
if len(other) != len(self):
raise ValueError('Lengths must be equal')
return self._binop(self._constructor(other, self.index), op,
level=level, fill_value=fill_value)
other = self._constructor(other, self.index)
return self._binop(other, op, level=level, fill_value=fill_value)
else:
if fill_value is not None:
self = self.fillna(fill_value)
Expand All @@ -1071,6 +1091,51 @@ def flex_wrapper(self, other, level=None, fill_value=None, axis=0):
# -----------------------------------------------------------------------------
# DataFrame

def _combine_series_frame(self, other, func, fill_value=None, axis=None,
level=None, try_cast=True):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add a doc-string

"""
Apply binary operator `func` to self, other using alignment and fill
conventions determined by the fill_value, axis, level, and try_cast kwargs.

Parameters
----------
self : DataFrame
other : Series
func : binary operator
fill_value : object (default None)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

use the format
fill_value : object, default None

no parens

axis : {0, 1, 'columns', 'index', None} (default None)
level : int or None (default None)
try_cast : bool (default True)

Returns
-------
result : DataFrame
"""
if fill_value is not None:
raise NotImplementedError("fill_value {fill} not supported."
.format(fill=fill_value))

if axis is not None:
axis = self._get_axis_name(axis)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we normally use axis numbers internally (and not names). I agree its a bit easier on the eyes to do this, but we should be consistent (so revert here, ok to change it all over the base in another PR).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK. This isn't a change from the current implementation, just moving it from frame. Will change.

if axis == 'index':
return self._combine_match_index(other, func, level=level)
else:
return self._combine_match_columns(other, func, level=level,
try_cast=try_cast)
else:
if not len(other):
return self * np.nan

if not len(self):
# Ambiguous case, use _series so works with DataFrame
return self._constructor(data=self._series, index=self.index,
columns=self.columns)

# default axis is columns
return self._combine_match_columns(other, func, level=level,
try_cast=try_cast)


def _align_method_FRAME(left, right, axis):
""" convert rhs to meet lhs dims if input is list, tuple or np.ndarray """

Expand Down Expand Up @@ -1179,8 +1244,9 @@ def f(self, other, axis=default_axis, level=None, fill_value=None):
if isinstance(other, ABCDataFrame): # Another DataFrame
return self._combine_frame(other, na_op, fill_value, level)
elif isinstance(other, ABCSeries):
return self._combine_series(other, na_op, fill_value, axis, level,
try_cast=True)
return _combine_series_frame(self, other, na_op,
fill_value=fill_value, axis=axis,
level=level, try_cast=True)
else:
if fill_value is not None:
self = self.fillna(fill_value)
Expand Down Expand Up @@ -1209,13 +1275,17 @@ def f(self, other, axis=default_axis, level=None):

other = _align_method_FRAME(self, other, axis)

if isinstance(other, ABCDataFrame): # Another DataFrame
return self._flex_compare_frame(other, na_op, str_rep, level,
try_cast=False)
if isinstance(other, ABCDataFrame):
# Another DataFrame
if not self._indexed_same(other):
self, other = self.align(other, 'outer',
level=level, copy=False)
return self._compare_frame(other, na_op, str_rep, try_cast=False)

elif isinstance(other, ABCSeries):
return self._combine_series(other, na_op, None, axis, level,
try_cast=False)
return _combine_series_frame(self, other, na_op,
fill_value=None, axis=axis,
level=level, try_cast=False)
else:
return self._combine_const(other, na_op, try_cast=False)

Expand All @@ -1227,11 +1297,17 @@ def f(self, other, axis=default_axis, level=None):
def _comp_method_FRAME(func, name, str_rep):
@Appender('Wrapper for comparison method {name}'.format(name=name))
def f(self, other):
if isinstance(other, ABCDataFrame): # Another DataFrame
return self._compare_frame(other, func, str_rep)
if isinstance(other, ABCDataFrame):
# Another DataFrame
if not self._indexed_same(other):
raise ValueError('Can only compare identically-labeled '
'DataFrame objects')
return self._compare_frame(other, func, str_rep, try_cast=True)

elif isinstance(other, ABCSeries):
return self._combine_series(other, func,
axis=None, try_cast=False)
return _combine_series_frame(self, other, func,
fill_value=None, axis=None,
level=None, try_cast=False)
else:

# straight boolean comparisons we want to allow all columns
Expand Down
17 changes: 17 additions & 0 deletions pandas/tests/frame/test_arithmetic.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,23 @@ def test_tz_aware_scalar_comparison(self, timestamps):
# -------------------------------------------------------------------
# Arithmetic

class TestFrameFlexArithmetic(object):
def test_df_add_flex_filled_mixed_dtypes(self):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

does this represent a change in user facing API?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, just a case that is not currently tested. I made a mistake in an early commit in this PR; this test would have caught it.

# GH#19611
dti = pd.date_range('2016-01-01', periods=3)
ser = pd.Series(['1 Day', 'NaT', '2 Days'], dtype='timedelta64[ns]')
df = pd.DataFrame({'A': dti, 'B': ser})
other = pd.DataFrame({'A': ser, 'B': ser})
fill = pd.Timedelta(days=1).to_timedelta64()
result = df.add(other, fill_value=fill)

expected = pd.DataFrame(
{'A': pd.Series(['2016-01-02', '2016-01-03', '2016-01-05'],
dtype='datetime64[ns]'),
'B': ser * 2})
tm.assert_frame_equal(result, expected)


class TestFrameMulDiv(object):
"""Tests for DataFrame multiplication and division"""
# ------------------------------------------------------------------
Expand Down