Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

dispatch frame methods to series versions instead of re-implementing masking etc #19611

Merged
merged 13 commits into from
Feb 18, 2018
Merged
75 changes: 14 additions & 61 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3920,6 +3920,7 @@ def _combine_frame(self, other, func, fill_value=None, level=None):
new_index, new_columns = this.index, this.columns

def _arith_op(left, right):
# analogous to Series._binop
if fill_value is not None:
left_mask = isna(left)
right_mask = isna(right)
Expand All @@ -3934,67 +3935,31 @@ def _arith_op(left, right):
return func(left, right)

if this._is_mixed_type or other._is_mixed_type:

# unique
# iterate over columns
if this.columns.is_unique:

def f(col):
r = _arith_op(this[col].values, other[col].values)
return self._constructor_sliced(r, index=new_index,
dtype=r.dtype)

result = {col: f(col) for col in this}

# non-unique
# unique columns
result = {col: func(this[col], other[col]) for col in this}
result = self._constructor(result, index=new_index,
columns=new_columns, copy=False)
else:

def f(i):
r = _arith_op(this.iloc[:, i].values,
other.iloc[:, i].values)
return self._constructor_sliced(r, index=new_index,
dtype=r.dtype)

result = {i: f(i) for i, col in enumerate(this.columns)}
# non-unique columns
result = {i: func(this.iloc[:, i], other.iloc[:, i])
for i, col in enumerate(this.columns)}
result = self._constructor(result, index=new_index, copy=False)
result.columns = new_columns
return result
return result

else:
result = _arith_op(this.values, other.values)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd like to dispatch to Series here, but presumably doing the operation once instead of per-column has a perf benefit. One idea that didn't work on the first try was to ravel() this.values and other.values, wrap them in Series, then operate and re-shape.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is pretty tricky, yes that would work for most ufuncs, but is not generally applicable e.g. concat is a ufunc too.
so hold off on this as other low hanging fruit w/o opening up this box.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

more to the point is to push this down to the block manager for execution which handles the mixed dtypes case, this is just a special case of that.


return self._constructor(result, index=new_index, columns=new_columns,
copy=False)

def _combine_series(self, other, func, fill_value=None, axis=None,
level=None, try_cast=True):
if fill_value is not None:
raise NotImplementedError("fill_value {fill} not supported."
.format(fill=fill_value))

if axis is not None:
axis = self._get_axis_name(axis)
if axis == 'index':
return self._combine_match_index(other, func, level=level)
else:
return self._combine_match_columns(other, func, level=level,
try_cast=try_cast)
else:
if not len(other):
return self * np.nan

if not len(self):
# Ambiguous case, use _series so works with DataFrame
return self._constructor(data=self._series, index=self.index,
columns=self.columns)

# default axis is columns
return self._combine_match_columns(other, func, level=level,
try_cast=try_cast)

def _combine_match_index(self, other, func, level=None):
left, right = self.align(other, join='outer', axis=0, level=level,
copy=False)
return self._constructor(func(left.values.T, right.values).T,
new_data = func(left.values.T, right.values).T
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we can do better than operating on left.values.T and right.values. Especially with mixed-dtypes this seems like low-hanging fruit.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

most of these types of operations need to be pushed down to internals via block operations. this should dispatch to ._data.eval to do this (like the other match functions)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yah, but I'm not there yet. These particular cases are tricky because SparseDataFrame implements _combine_match_foo differently

return self._constructor(new_data,
index=left.index, columns=self.columns,
copy=False)

Expand All @@ -4013,7 +3978,8 @@ def _combine_const(self, other, func, errors='raise', try_cast=True):
try_cast=try_cast)
return self._constructor(new_data)

def _compare_frame_evaluate(self, other, func, str_rep, try_cast=True):
def _compare_frame(self, other, func, str_rep, try_cast=True):
# compare_frame assumes self._indexed_same(other)

import pandas.core.computation.expressions as expressions
# unique
Expand All @@ -4038,19 +4004,6 @@ def _compare(a, b):
result.columns = self.columns
return result

def _compare_frame(self, other, func, str_rep, try_cast=True):
if not self._indexed_same(other):
raise ValueError('Can only compare identically-labeled '
'DataFrame objects')
return self._compare_frame_evaluate(other, func, str_rep,
try_cast=try_cast)

def _flex_compare_frame(self, other, func, str_rep, level, try_cast=True):
if not self._indexed_same(other):
self, other = self.align(other, 'outer', level=level, copy=False)
return self._compare_frame_evaluate(other, func, str_rep,
try_cast=try_cast)

def combine(self, other, func, fill_value=None, overwrite=True):
"""
Add two DataFrame objects and do not propagate NaN values, so if for a
Expand Down
64 changes: 51 additions & 13 deletions pandas/core/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -974,8 +974,8 @@ def flex_wrapper(self, other, level=None, fill_value=None, axis=0):
elif isinstance(other, (np.ndarray, list, tuple)):
if len(other) != len(self):
raise ValueError('Lengths must be equal')
return self._binop(self._constructor(other, self.index), op,
level=level, fill_value=fill_value)
other = self._constructor(other, self.index)
return self._binop(other, op, level=level, fill_value=fill_value)
else:
if fill_value is not None:
self = self.fillna(fill_value)
Expand All @@ -998,6 +998,33 @@ def flex_wrapper(self, other, level=None, fill_value=None, axis=0):
# -----------------------------------------------------------------------------
# DataFrame

def _combine_series_frame(self, other, func, fill_value=None, axis=None,
level=None, try_cast=True):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add a doc-string

if fill_value is not None:
raise NotImplementedError("fill_value {fill} not supported."
.format(fill=fill_value))

if axis is not None:
axis = self._get_axis_name(axis)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we normally use axis numbers internally (and not names). I agree its a bit easier on the eyes to do this, but we should be consistent (so revert here, ok to change it all over the base in another PR).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK. This isn't a change from the current implementation, just moving it from frame. Will change.

if axis == 'index':
return self._combine_match_index(other, func, level=level)
else:
return self._combine_match_columns(other, func, level=level,
try_cast=try_cast)
else:
if not len(other):
return self * np.nan

if not len(self):
# Ambiguous case, use _series so works with DataFrame
return self._constructor(data=self._series, index=self.index,
columns=self.columns)

# default axis is columns
return self._combine_match_columns(other, func, level=level,
try_cast=try_cast)


def _align_method_FRAME(left, right, axis):
""" convert rhs to meet lhs dims if input is list, tuple or np.ndarray """

Expand Down Expand Up @@ -1106,8 +1133,9 @@ def f(self, other, axis=default_axis, level=None, fill_value=None):
if isinstance(other, ABCDataFrame): # Another DataFrame
return self._combine_frame(other, na_op, fill_value, level)
elif isinstance(other, ABCSeries):
return self._combine_series(other, na_op, fill_value, axis, level,
try_cast=True)
return _combine_series_frame(self, other, na_op,
fill_value=fill_value, axis=axis,
level=level, try_cast=True)
else:
if fill_value is not None:
self = self.fillna(fill_value)
Expand Down Expand Up @@ -1152,13 +1180,17 @@ def f(self, other, axis=default_axis, level=None):

other = _align_method_FRAME(self, other, axis)

if isinstance(other, ABCDataFrame): # Another DataFrame
return self._flex_compare_frame(other, na_op, str_rep, level,
try_cast=False)
if isinstance(other, ABCDataFrame):
# Another DataFrame
if not self._indexed_same(other):
self, other = self.align(other, 'outer',
level=level, copy=False)
return self._compare_frame(other, na_op, str_rep, try_cast=False)

elif isinstance(other, ABCSeries):
return self._combine_series(other, na_op, None, axis, level,
try_cast=False)
return _combine_series_frame(self, other, na_op,
fill_value=None, axis=axis,
level=level, try_cast=False)
else:
return self._combine_const(other, na_op, try_cast=False)

Expand All @@ -1170,11 +1202,17 @@ def f(self, other, axis=default_axis, level=None):
def _comp_method_FRAME(func, name, str_rep):
@Appender('Wrapper for comparison method {name}'.format(name=name))
def f(self, other):
if isinstance(other, ABCDataFrame): # Another DataFrame
return self._compare_frame(other, func, str_rep)
if isinstance(other, ABCDataFrame):
# Another DataFrame
if not self._indexed_same(other):
raise ValueError('Can only compare identically-labeled '
'DataFrame objects')
return self._compare_frame(other, func, str_rep, try_cast=True)

elif isinstance(other, ABCSeries):
return self._combine_series(other, func,
axis=None, try_cast=False)
return _combine_series_frame(self, other, func,
fill_value=None, axis=None,
level=None, try_cast=False)
else:

# straight boolean comparisons we want to allow all columns
Expand Down