-
-
Notifications
You must be signed in to change notification settings - Fork 18.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Stop concat from attempting to sort mismatched columns by default #20613
Changes from 37 commits
913723b
5da763f
02b2db9
a497763
954a1b6
2a20377
35570c4
983d0c1
27d2d32
4960e3f
8bbbdd5
dcfa6d0
2eaeb1e
bc7dd48
b3f95dd
f37d7ef
e467f91
058fae5
04e5151
c864679
7e975c9
a8ba430
62b1e7b
0ace673
d5cafdf
ce8ff05
ce756d4
362e84d
0210d33
06772b4
95cdf67
d10f5bd
e47cbb9
0182c98
7e58998
5b58e75
074d03c
5e1b024
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -507,7 +507,7 @@ def is_any_frame(): | |
for r in compat.itervalues(result)) | ||
|
||
if isinstance(result, list): | ||
return concat(result, keys=keys, axis=1), True | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. thre are a bunch more concats in this same section. basically they control the resulting ordeing of the aggregation. I guess There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. or maybe these should be |
||
return concat(result, keys=keys, axis=1, sort=True), True | ||
|
||
elif is_any_frame(): | ||
# we have a dict of DataFrames | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6038,7 +6038,8 @@ def infer(x): | |
# ---------------------------------------------------------------------- | ||
# Merging / joining methods | ||
|
||
def append(self, other, ignore_index=False, verify_integrity=False): | ||
def append(self, other, ignore_index=False, | ||
verify_integrity=False, sort=None): | ||
""" | ||
Append rows of `other` to the end of this frame, returning a new | ||
object. Columns not in this frame are added as new columns. | ||
|
@@ -6051,6 +6052,14 @@ def append(self, other, ignore_index=False, verify_integrity=False): | |
If True, do not use the index labels. | ||
verify_integrity : boolean, default False | ||
If True, raise ValueError on creating index with duplicates. | ||
sort : boolean, default None | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. put before verify_integrity |
||
Sort columns if the columns of `self` and `other` are not aligned. | ||
The default sorting is deprecated and will change to not-sorting | ||
in a future version of pandas. Explicitly pass ``sort=True`` to | ||
silence the warning and sort. Explicitly pass ``sort=False`` to | ||
silence the warning and not sort. | ||
|
||
.. versionadded:: 0.23.0 | ||
|
||
Returns | ||
------- | ||
|
@@ -6162,7 +6171,8 @@ def append(self, other, ignore_index=False, verify_integrity=False): | |
else: | ||
to_concat = [self, other] | ||
return concat(to_concat, ignore_index=ignore_index, | ||
verify_integrity=verify_integrity) | ||
verify_integrity=verify_integrity, | ||
sort=sort) | ||
|
||
def join(self, other, on=None, how='left', lsuffix='', rsuffix='', | ||
sort=False): | ||
|
@@ -7481,7 +7491,7 @@ def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None): | |
from pandas.core.index import _get_objs_combined_axis | ||
|
||
if columns is None: | ||
columns = _get_objs_combined_axis(data) | ||
columns = _get_objs_combined_axis(data, sort=False) | ||
|
||
indexer_cache = {} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1098,7 +1098,8 @@ def reset_identity(values): | |
group_names = self.grouper.names | ||
|
||
result = concat(values, axis=self.axis, keys=group_keys, | ||
levels=group_levels, names=group_names) | ||
levels=group_levels, names=group_names, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same here, there are like 10 calls to concat. I think should be explicit about sort |
||
sort=False) | ||
else: | ||
|
||
# GH5610, returns a MI, with the first level being a | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,6 @@ | ||
import textwrap | ||
import warnings | ||
|
||
from pandas.core.indexes.base import (Index, | ||
_new_Index, | ||
_ensure_index, | ||
|
@@ -17,6 +20,16 @@ | |
from pandas._libs import lib | ||
from pandas._libs.tslib import NaT | ||
|
||
_sort_msg = textwrap.dedent("""\ | ||
Sorting because non-concatenation axis is not aligned. A future version | ||
of pandas will change to not sort by default. | ||
|
||
To accept the future behavior, pass 'sort=True'. | ||
|
||
To retain the current behavior and silence the warning, pass sort=False | ||
""") | ||
|
||
|
||
# TODO: there are many places that rely on these private methods existing in | ||
# pandas.core.index | ||
__all__ = ['Index', 'MultiIndex', 'NumericIndex', 'Float64Index', 'Int64Index', | ||
|
@@ -31,33 +44,40 @@ | |
'_all_indexes_same'] | ||
|
||
|
||
def _get_objs_combined_axis(objs, intersect=False, axis=0): | ||
def _get_objs_combined_axis(objs, intersect=False, axis=0, sort=True): | ||
# Extract combined index: return intersection or union (depending on the | ||
# value of "intersect") of indexes on given axis, or None if all objects | ||
# lack indexes (e.g. they are numpy arrays) | ||
obs_idxes = [obj._get_axis(axis) for obj in objs | ||
if hasattr(obj, '_get_axis')] | ||
if obs_idxes: | ||
return _get_combined_index(obs_idxes, intersect=intersect) | ||
return _get_combined_index(obs_idxes, intersect=intersect, sort=sort) | ||
|
||
|
||
def _get_combined_index(indexes, intersect=False): | ||
def _get_combined_index(indexes, intersect=False, sort=False): | ||
# TODO: handle index names! | ||
indexes = com._get_distinct_objs(indexes) | ||
if len(indexes) == 0: | ||
return Index([]) | ||
if len(indexes) == 1: | ||
return indexes[0] | ||
if intersect: | ||
index = Index([]) | ||
elif len(indexes) == 1: | ||
index = indexes[0] | ||
elif intersect: | ||
index = indexes[0] | ||
for other in indexes[1:]: | ||
index = index.intersection(other) | ||
return index | ||
union = _union_indexes(indexes) | ||
return _ensure_index(union) | ||
else: | ||
index = _union_indexes(indexes, sort=sort) | ||
index = _ensure_index(index) | ||
|
||
if sort: | ||
try: | ||
index = index.sort_values() | ||
except TypeError: | ||
pass | ||
return index | ||
|
||
|
||
def _union_indexes(indexes): | ||
def _union_indexes(indexes, sort=True): | ||
if len(indexes) == 0: | ||
raise AssertionError('Must have at least 1 Index to union') | ||
if len(indexes) == 1: | ||
|
@@ -74,7 +94,8 @@ def conv(i): | |
i = i.tolist() | ||
return i | ||
|
||
return Index(lib.fast_unique_multiple_list([conv(i) for i in inds])) | ||
return Index( | ||
lib.fast_unique_multiple_list([conv(i) for i in inds], sort=sort)) | ||
|
||
if kind == 'special': | ||
result = indexes[0] | ||
|
@@ -89,13 +110,19 @@ def conv(i): | |
index = indexes[0] | ||
for other in indexes[1:]: | ||
if not index.equals(other): | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. isn't this already passed in as There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What do you mean? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I mean this is deeply buried here, why is this showing a user facing warning? IOW what is the user call that triggers this? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Where else would it go? Before this point we don't know what the indexes to align are, and whether or not they're aligned. |
||
if sort is None: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would move |
||
# TODO: remove once pd.concat sort default changes | ||
warnings.warn(_sort_msg, FutureWarning, stacklevel=8) | ||
sort = True | ||
|
||
return _unique_indices(indexes) | ||
|
||
name = _get_consensus_names(indexes)[0] | ||
if name != index.name: | ||
index = index._shallow_copy(name=name) | ||
return index | ||
else: | ||
else: # kind='list' | ||
return _unique_indices(indexes) | ||
|
||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1499,8 +1499,11 @@ def _extract_axis(self, data, axis=0, intersect=False): | |
raw_lengths.append(v.shape[axis]) | ||
|
||
if have_frames: | ||
# we want the "old" behavior here, of sorting only | ||
# 1. we're doing a union (intersect=False) | ||
# 2. the indices are not aligned. | ||
index = _get_objs_combined_axis(data.values(), axis=axis, | ||
intersect=intersect) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. so this always warns? shouldn't this be |
||
intersect=intersect, sort=None) | ||
|
||
if have_raw_arrays: | ||
lengths = list(set(raw_lengths)) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,7 +20,7 @@ | |
|
||
def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, | ||
keys=None, levels=None, names=None, verify_integrity=False, | ||
copy=True): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. actually move before verify_integrity There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. pls do this There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why make this an API breaking change? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. because it more logical. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How so? I'm OK with breaking API when necessary, but this seems unnecessary. |
||
sort=None, copy=True): | ||
""" | ||
Concatenate pandas objects along a particular axis with optional set logic | ||
along the other axes. | ||
|
@@ -60,6 +60,19 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, | |
verify_integrity : boolean, default False | ||
Check whether the new concatenated axis contains duplicates. This can | ||
be very expensive relative to the actual data concatenation | ||
sort : boolean, default None | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same |
||
Sort non-concatenation axis if it is not already aligned when `join` | ||
is 'outer'. The current default of sorting is deprecated and will | ||
change to not-sorting in a future version of pandas. | ||
|
||
Explicitly pass ``sort=True`` to silence the warning and sort. | ||
Explicitly pass ``sort=False`` to silence the warning and not sort. | ||
|
||
This has no effect when ``join='inner'``, which already preserves | ||
the order of the non-concatenation axis. | ||
|
||
.. versionadded:: 0.23.0 | ||
|
||
copy : boolean, default True | ||
If False, do not copy data unnecessarily | ||
|
||
|
@@ -209,7 +222,7 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, | |
ignore_index=ignore_index, join=join, | ||
keys=keys, levels=levels, names=names, | ||
verify_integrity=verify_integrity, | ||
copy=copy) | ||
copy=copy, sort=sort) | ||
return op.get_result() | ||
|
||
|
||
|
@@ -220,7 +233,8 @@ class _Concatenator(object): | |
|
||
def __init__(self, objs, axis=0, join='outer', join_axes=None, | ||
keys=None, levels=None, names=None, | ||
ignore_index=False, verify_integrity=False, copy=True): | ||
ignore_index=False, verify_integrity=False, copy=True, | ||
sort=False): | ||
if isinstance(objs, (NDFrame, compat.string_types)): | ||
raise TypeError('first argument must be an iterable of pandas ' | ||
'objects, you passed an object of type ' | ||
|
@@ -355,6 +369,7 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None, | |
self.keys = keys | ||
self.names = names or getattr(keys, 'names', None) | ||
self.levels = levels | ||
self.sort = sort | ||
|
||
self.ignore_index = ignore_index | ||
self.verify_integrity = verify_integrity | ||
|
@@ -447,7 +462,8 @@ def _get_comb_axis(self, i): | |
data_axis = self.objs[0]._get_block_manager_axis(i) | ||
try: | ||
return _get_objs_combined_axis(self.objs, axis=data_axis, | ||
intersect=self.intersect) | ||
intersect=self.intersect, | ||
sort=self.sort) | ||
except IndexError: | ||
types = [type(x).__name__ for x in self.objs] | ||
raise TypeError("Cannot concatenate list of {types}" | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
need a ref here