-
-
Notifications
You must be signed in to change notification settings - Fork 18.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
dispatch frame methods to series versions instead of re-implementing masking etc #19611
Changes from 11 commits
6d59458
4540e82
0ffe430
92fc179
ff292f3
8a82a23
a221fa8
3473910
1a47d83
02eceab
956b9f9
7583e7d
118cd5d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3944,71 +3944,39 @@ def _combine_frame(self, other, func, fill_value=None, level=None): | |
new_index, new_columns = this.index, this.columns | ||
|
||
def _arith_op(left, right): | ||
# for the mixed_type case where we iterate over columns, | ||
# _arith_op(left, right) is equivalent to | ||
# left._binop(right, func, fill_value=fill_value) | ||
left, right = ops.fill_binop(left, right, fill_value) | ||
return func(left, right) | ||
|
||
if this._is_mixed_type or other._is_mixed_type: | ||
|
||
# unique | ||
# iterate over columns | ||
if this.columns.is_unique: | ||
|
||
def f(col): | ||
r = _arith_op(this[col].values, other[col].values) | ||
return self._constructor_sliced(r, index=new_index, | ||
dtype=r.dtype) | ||
|
||
result = {col: f(col) for col in this} | ||
|
||
# non-unique | ||
# unique columns | ||
result = {col: _arith_op(this[col], other[col]) | ||
for col in this} | ||
result = self._constructor(result, index=new_index, | ||
columns=new_columns, copy=False) | ||
else: | ||
|
||
def f(i): | ||
r = _arith_op(this.iloc[:, i].values, | ||
other.iloc[:, i].values) | ||
return self._constructor_sliced(r, index=new_index, | ||
dtype=r.dtype) | ||
|
||
result = {i: f(i) for i, col in enumerate(this.columns)} | ||
# non-unique columns | ||
result = {i: _arith_op(this.iloc[:, i], other.iloc[:, i]) | ||
for i, col in enumerate(this.columns)} | ||
result = self._constructor(result, index=new_index, copy=False) | ||
result.columns = new_columns | ||
return result | ||
return result | ||
|
||
else: | ||
result = _arith_op(this.values, other.values) | ||
|
||
return self._constructor(result, index=new_index, columns=new_columns, | ||
copy=False) | ||
|
||
def _combine_series(self, other, func, fill_value=None, axis=None, | ||
level=None, try_cast=True): | ||
if fill_value is not None: | ||
raise NotImplementedError("fill_value {fill} not supported." | ||
.format(fill=fill_value)) | ||
|
||
if axis is not None: | ||
axis = self._get_axis_name(axis) | ||
if axis == 'index': | ||
return self._combine_match_index(other, func, level=level) | ||
else: | ||
return self._combine_match_columns(other, func, level=level, | ||
try_cast=try_cast) | ||
else: | ||
if not len(other): | ||
return self * np.nan | ||
|
||
if not len(self): | ||
# Ambiguous case, use _series so works with DataFrame | ||
return self._constructor(data=self._series, index=self.index, | ||
columns=self.columns) | ||
|
||
# default axis is columns | ||
return self._combine_match_columns(other, func, level=level, | ||
try_cast=try_cast) | ||
|
||
def _combine_match_index(self, other, func, level=None): | ||
left, right = self.align(other, join='outer', axis=0, level=level, | ||
copy=False) | ||
return self._constructor(func(left.values.T, right.values).T, | ||
new_data = func(left.values.T, right.values).T | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we can do better than operating on There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. most of these types of operations need to be pushed down to internals via block operations. this should dispatch to ._data.eval to do this (like the other match functions) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yah, but I'm not there yet. These particular cases are tricky because SparseDataFrame implements |
||
return self._constructor(new_data, | ||
index=left.index, columns=self.columns, | ||
copy=False) | ||
|
||
|
@@ -4027,7 +3995,8 @@ def _combine_const(self, other, func, errors='raise', try_cast=True): | |
try_cast=try_cast) | ||
return self._constructor(new_data) | ||
|
||
def _compare_frame_evaluate(self, other, func, str_rep, try_cast=True): | ||
def _compare_frame(self, other, func, str_rep, try_cast=True): | ||
# compare_frame assumes self._indexed_same(other) | ||
|
||
import pandas.core.computation.expressions as expressions | ||
# unique | ||
|
@@ -4052,19 +4021,6 @@ def _compare(a, b): | |
result.columns = self.columns | ||
return result | ||
|
||
def _compare_frame(self, other, func, str_rep, try_cast=True): | ||
if not self._indexed_same(other): | ||
raise ValueError('Can only compare identically-labeled ' | ||
'DataFrame objects') | ||
return self._compare_frame_evaluate(other, func, str_rep, | ||
try_cast=try_cast) | ||
|
||
def _flex_compare_frame(self, other, func, str_rep, level, try_cast=True): | ||
if not self._indexed_same(other): | ||
self, other = self.align(other, 'outer', level=level, copy=False) | ||
return self._compare_frame_evaluate(other, func, str_rep, | ||
try_cast=try_cast) | ||
|
||
def combine(self, other, func, fill_value=None, overwrite=True): | ||
""" | ||
Add two DataFrame objects and do not propagate NaN values, so if for a | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -96,6 +96,26 @@ def rxor(left, right): | |
|
||
# ----------------------------------------------------------------------------- | ||
|
||
def make_invalid_op(name): | ||
""" | ||
Return a binary method that always raises a TypeError. | ||
|
||
Parameters | ||
---------- | ||
name : str | ||
|
||
Returns | ||
------- | ||
invalid_op : function | ||
""" | ||
def invalid_op(self, other=None): | ||
raise TypeError("cannot perform {name} with this index type: " | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you might need to parameterize this on the work 'index' but not sure (IOW can it be a Series)? or maybe just remove the word index ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ATM it is only used for indexes. I could add an assertion that self is an Index so that we know to change it if/when the wording ceases to be accurate. |
||
"{typ}".format(name=name, typ=type(self).__name__)) | ||
|
||
invalid_op.__name__ = name | ||
return invalid_op | ||
|
||
|
||
def _gen_eval_kwargs(name): | ||
""" | ||
Find the keyword arguments to pass to numexpr for the given operation. | ||
|
@@ -1047,8 +1067,8 @@ def flex_wrapper(self, other, level=None, fill_value=None, axis=0): | |
elif isinstance(other, (np.ndarray, list, tuple)): | ||
if len(other) != len(self): | ||
raise ValueError('Lengths must be equal') | ||
return self._binop(self._constructor(other, self.index), op, | ||
level=level, fill_value=fill_value) | ||
other = self._constructor(other, self.index) | ||
return self._binop(other, op, level=level, fill_value=fill_value) | ||
else: | ||
if fill_value is not None: | ||
self = self.fillna(fill_value) | ||
|
@@ -1071,6 +1091,51 @@ def flex_wrapper(self, other, level=None, fill_value=None, axis=0): | |
# ----------------------------------------------------------------------------- | ||
# DataFrame | ||
|
||
def _combine_series_frame(self, other, func, fill_value=None, axis=None, | ||
level=None, try_cast=True): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you add a doc-string |
||
""" | ||
Apply binary operator `func` to self, other using alignment and fill | ||
conventions determined by the fill_value, axis, level, and try_cast kwargs. | ||
|
||
Parameters | ||
---------- | ||
self : DataFrame | ||
other : Series | ||
func : binary operator | ||
fill_value : object (default None) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. use the format no parens |
||
axis : {0, 1, 'columns', 'index', None} (default None) | ||
level : int or None (default None) | ||
try_cast : bool (default True) | ||
|
||
Returns | ||
------- | ||
result : DataFrame | ||
""" | ||
if fill_value is not None: | ||
raise NotImplementedError("fill_value {fill} not supported." | ||
.format(fill=fill_value)) | ||
|
||
if axis is not None: | ||
axis = self._get_axis_name(axis) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we normally use axis numbers internally (and not names). I agree its a bit easier on the eyes to do this, but we should be consistent (so revert here, ok to change it all over the base in another PR). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK. This isn't a change from the current implementation, just moving it from frame. Will change. |
||
if axis == 'index': | ||
return self._combine_match_index(other, func, level=level) | ||
else: | ||
return self._combine_match_columns(other, func, level=level, | ||
try_cast=try_cast) | ||
else: | ||
if not len(other): | ||
return self * np.nan | ||
|
||
if not len(self): | ||
# Ambiguous case, use _series so works with DataFrame | ||
return self._constructor(data=self._series, index=self.index, | ||
columns=self.columns) | ||
|
||
# default axis is columns | ||
return self._combine_match_columns(other, func, level=level, | ||
try_cast=try_cast) | ||
|
||
|
||
def _align_method_FRAME(left, right, axis): | ||
""" convert rhs to meet lhs dims if input is list, tuple or np.ndarray """ | ||
|
||
|
@@ -1179,8 +1244,9 @@ def f(self, other, axis=default_axis, level=None, fill_value=None): | |
if isinstance(other, ABCDataFrame): # Another DataFrame | ||
return self._combine_frame(other, na_op, fill_value, level) | ||
elif isinstance(other, ABCSeries): | ||
return self._combine_series(other, na_op, fill_value, axis, level, | ||
try_cast=True) | ||
return _combine_series_frame(self, other, na_op, | ||
fill_value=fill_value, axis=axis, | ||
level=level, try_cast=True) | ||
else: | ||
if fill_value is not None: | ||
self = self.fillna(fill_value) | ||
|
@@ -1209,13 +1275,17 @@ def f(self, other, axis=default_axis, level=None): | |
|
||
other = _align_method_FRAME(self, other, axis) | ||
|
||
if isinstance(other, ABCDataFrame): # Another DataFrame | ||
return self._flex_compare_frame(other, na_op, str_rep, level, | ||
try_cast=False) | ||
if isinstance(other, ABCDataFrame): | ||
# Another DataFrame | ||
if not self._indexed_same(other): | ||
self, other = self.align(other, 'outer', | ||
level=level, copy=False) | ||
return self._compare_frame(other, na_op, str_rep, try_cast=False) | ||
|
||
elif isinstance(other, ABCSeries): | ||
return self._combine_series(other, na_op, None, axis, level, | ||
try_cast=False) | ||
return _combine_series_frame(self, other, na_op, | ||
fill_value=None, axis=axis, | ||
level=level, try_cast=False) | ||
else: | ||
return self._combine_const(other, na_op, try_cast=False) | ||
|
||
|
@@ -1227,11 +1297,17 @@ def f(self, other, axis=default_axis, level=None): | |
def _comp_method_FRAME(func, name, str_rep): | ||
@Appender('Wrapper for comparison method {name}'.format(name=name)) | ||
def f(self, other): | ||
if isinstance(other, ABCDataFrame): # Another DataFrame | ||
return self._compare_frame(other, func, str_rep) | ||
if isinstance(other, ABCDataFrame): | ||
# Another DataFrame | ||
if not self._indexed_same(other): | ||
raise ValueError('Can only compare identically-labeled ' | ||
'DataFrame objects') | ||
return self._compare_frame(other, func, str_rep, try_cast=True) | ||
|
||
elif isinstance(other, ABCSeries): | ||
return self._combine_series(other, func, | ||
axis=None, try_cast=False) | ||
return _combine_series_frame(self, other, func, | ||
fill_value=None, axis=None, | ||
level=None, try_cast=False) | ||
else: | ||
|
||
# straight boolean comparisons we want to allow all columns | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -72,6 +72,23 @@ def test_tz_aware_scalar_comparison(self, timestamps): | |
# ------------------------------------------------------------------- | ||
# Arithmetic | ||
|
||
class TestFrameFlexArithmetic(object): | ||
def test_df_add_flex_filled_mixed_dtypes(self): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. does this represent a change in user facing API? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No, just a case that is not currently tested. I made a mistake in an early commit in this PR; this test would have caught it. |
||
# GH#19611 | ||
dti = pd.date_range('2016-01-01', periods=3) | ||
ser = pd.Series(['1 Day', 'NaT', '2 Days'], dtype='timedelta64[ns]') | ||
df = pd.DataFrame({'A': dti, 'B': ser}) | ||
other = pd.DataFrame({'A': ser, 'B': ser}) | ||
fill = pd.Timedelta(days=1).to_timedelta64() | ||
result = df.add(other, fill_value=fill) | ||
|
||
expected = pd.DataFrame( | ||
{'A': pd.Series(['2016-01-02', '2016-01-03', '2016-01-05'], | ||
dtype='datetime64[ns]'), | ||
'B': ser * 2}) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
|
||
class TestFrameMulDiv(object): | ||
"""Tests for DataFrame multiplication and division""" | ||
# ------------------------------------------------------------------ | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'd like to dispatch to Series here, but presumably doing the operation once instead of per-column has a perf benefit. One idea that didn't work on the first try was to ravel()
this.values
andother.values
, wrap them inSeries
, then operate and re-shape.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this is pretty tricky, yes that would work for most ufuncs, but is not generally applicable e.g. concat is a ufunc too.
so hold off on this as other low hanging fruit w/o opening up this box.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
more to the point is to push this down to the block manager for execution which handles the mixed dtypes case, this is just a special case of that.