From 28f274cb107ef0a252eb4cb954c1cfbe43968395 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 20 Sep 2023 10:04:23 -0700 Subject: [PATCH] BUG: IntervalIndex.get_indexer incorrectly matching ints to datetimes (#54964) * REF: separate out _nbins_to_bins * Cast x to Index early * BUG: IntervalIndex.get_indexer incorrectly matching ints to datetimes * GH ref --- doc/source/whatsnew/v2.2.0.rst | 3 + pandas/core/indexes/base.py | 6 +- pandas/core/reshape/tile.py | 168 +++++++----------- .../tests/indexes/interval/test_indexing.py | 2 +- pandas/tests/reshape/test_cut.py | 26 ++- 5 files changed, 92 insertions(+), 113 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 3ad28222cc57a..8eab623a2b5f7 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -288,6 +288,8 @@ Strings Interval ^^^^^^^^ - Bug in :class:`Interval` ``__repr__`` not displaying UTC offsets for :class:`Timestamp` bounds. Additionally the hour, minute and second components will now be shown. (:issue:`55015`) +- Bug in :meth:`IntervalIndex.get_indexer` with datetime or timedelta intervals incorrectly matching on integer targets (:issue:`47772`) +- Bug in :meth:`IntervalIndex.get_indexer` with timezone-aware datetime intervals incorrectly matching on a sequence of timezone-naive targets (:issue:`47772`) - Indexing @@ -349,6 +351,7 @@ Styler Other ^^^^^ +- Bug in :func:`cut` incorrectly allowing cutting of timezone-aware datetimes with timezone-naive bins (:issue:`54964`) .. ***DO NOT USE THIS SECTION*** diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 3adb4dfa227db..8703fef1e5940 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3938,12 +3938,8 @@ def _should_partial_index(self, target: Index) -> bool: if isinstance(self.dtype, IntervalDtype): if isinstance(target.dtype, IntervalDtype): return False - # See https://github.com/pandas-dev/pandas/issues/47772 the commented - # out code can be restored (instead of hardcoding `return True`) - # once that issue is fixed # "Index" has no attribute "left" - # return self.left._should_compare(target) # type: ignore[attr-defined] - return True + return self.left._should_compare(target) # type: ignore[attr-defined] return False @final diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 126f589f5df71..980e8aa41669f 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -17,10 +17,8 @@ Timestamp, lib, ) -from pandas._libs.lib import infer_dtype from pandas.core.dtypes.common import ( - DT64NS_DTYPE, ensure_platform_int, is_bool_dtype, is_integer, @@ -243,7 +241,7 @@ def cut( original = x x_idx = _preprocess_for_cut(x) - x_idx, dtype = _coerce_to_type(x_idx) + x_idx, _ = _coerce_to_type(x_idx) if not np.iterable(bins): bins = _nbins_to_bins(x_idx, bins, right) @@ -254,16 +252,8 @@ def cut( else: bins = Index(bins) - if isinstance(getattr(bins, "dtype", None), DatetimeTZDtype): - bins = np.asarray(bins, dtype=DT64NS_DTYPE) - else: - bins = np.asarray(bins) - bins = _convert_bin_to_numeric_type(bins, dtype) - - # GH 26045: cast to float64 to avoid an overflow - if (np.diff(bins.astype("float64")) < 0).any(): + if not bins.is_monotonic_increasing: raise ValueError("bins must increase monotonically.") - bins = Index(bins) fac, bins = _bins_to_cuts( x_idx, @@ -272,12 +262,11 @@ def cut( labels=labels, precision=precision, include_lowest=include_lowest, - dtype=dtype, duplicates=duplicates, ordered=ordered, ) - return _postprocess_for_cut(fac, bins, retbins, dtype, original) + return _postprocess_for_cut(fac, bins, retbins, original) def qcut( @@ -343,13 +332,11 @@ def qcut( """ original = x x_idx = _preprocess_for_cut(x) - x_idx, dtype = _coerce_to_type(x_idx) + x_idx, _ = _coerce_to_type(x_idx) quantiles = np.linspace(0, 1, q + 1) if is_integer(q) else q - x_np = np.asarray(x_idx) - x_np = x_np[~np.isnan(x_np)] - bins = np.quantile(x_np, quantiles) + bins = x_idx.to_series().dropna().quantile(quantiles) fac, bins = _bins_to_cuts( x_idx, @@ -357,11 +344,10 @@ def qcut( labels=labels, precision=precision, include_lowest=True, - dtype=dtype, duplicates=duplicates, ) - return _postprocess_for_cut(fac, bins, retbins, dtype, original) + return _postprocess_for_cut(fac, bins, retbins, original) def _nbins_to_bins(x_idx: Index, nbins: int, right: bool) -> Index: @@ -378,18 +364,41 @@ def _nbins_to_bins(x_idx: Index, nbins: int, right: bool) -> Index: rng = (x_idx.min(), x_idx.max()) mn, mx = rng - if np.isinf(mn) or np.isinf(mx): + is_dt_or_td = lib.is_np_dtype(x_idx.dtype, "mM") or isinstance( + x_idx.dtype, DatetimeTZDtype + ) + + if is_numeric_dtype(x_idx.dtype) and (np.isinf(mn) or np.isinf(mx)): # GH#24314 raise ValueError( "cannot specify integer `bins` when input data contains infinity" ) if mn == mx: # adjust end points before binning - mn -= 0.001 * abs(mn) if mn != 0 else 0.001 - mx += 0.001 * abs(mx) if mx != 0 else 0.001 - bins = np.linspace(mn, mx, nbins + 1, endpoint=True) + if is_dt_or_td: + # using seconds=1 is pretty arbitrary here + td = Timedelta(seconds=1) + # Use DatetimeArray/TimedeltaArray method instead of linspace + # error: Item "ExtensionArray" of "ExtensionArray | ndarray[Any, Any]" + # has no attribute "_generate_range" + bins = x_idx._values._generate_range( # type: ignore[union-attr] + start=mn - td, end=mx + td, periods=nbins + 1, freq=None + ) + else: + mn -= 0.001 * abs(mn) if mn != 0 else 0.001 + mx += 0.001 * abs(mx) if mx != 0 else 0.001 + + bins = np.linspace(mn, mx, nbins + 1, endpoint=True) else: # adjust end points after binning - bins = np.linspace(mn, mx, nbins + 1, endpoint=True) + if is_dt_or_td: + # Use DatetimeArray/TimedeltaArray method instead of linspace + # error: Item "ExtensionArray" of "ExtensionArray | ndarray[Any, Any]" + # has no attribute "_generate_range" + bins = x_idx._values._generate_range( # type: ignore[union-attr] + start=mn, end=mx, periods=nbins + 1, freq=None + ) + else: + bins = np.linspace(mn, mx, nbins + 1, endpoint=True) adj = (mx - mn) * 0.001 # 0.1% of the range if right: bins[0] -= adj @@ -400,13 +409,12 @@ def _nbins_to_bins(x_idx: Index, nbins: int, right: bool) -> Index: def _bins_to_cuts( - x: Index, + x_idx: Index, bins: Index, right: bool = True, labels=None, precision: int = 3, include_lowest: bool = False, - dtype: DtypeObj | None = None, duplicates: str = "raise", ordered: bool = True, ): @@ -422,7 +430,7 @@ def _bins_to_cuts( if isinstance(bins, IntervalIndex): # we have a fast-path here - ids = bins.get_indexer(x) + ids = bins.get_indexer(x_idx) cat_dtype = CategoricalDtype(bins, ordered=True) result = Categorical.from_codes(ids, dtype=cat_dtype, validate=False) return result, bins @@ -437,12 +445,29 @@ def _bins_to_cuts( bins = unique_bins side: Literal["left", "right"] = "left" if right else "right" - ids = ensure_platform_int(bins.searchsorted(x, side=side)) + + try: + ids = bins.searchsorted(x_idx, side=side) + except TypeError as err: + # e.g. test_datetime_nan_error if bins are DatetimeArray and x_idx + # is integers + if x_idx.dtype.kind == "m": + raise ValueError("bins must be of timedelta64 dtype") from err + elif x_idx.dtype.kind == bins.dtype.kind == "M": + raise ValueError( + "Cannot use timezone-naive bins with timezone-aware values, " + "or vice-versa" + ) from err + elif x_idx.dtype.kind == "M": + raise ValueError("bins must be of datetime64 dtype") from err + else: + raise + ids = ensure_platform_int(ids) if include_lowest: - ids[np.asarray(x) == bins[0]] = 1 + ids[x_idx == bins[0]] = 1 - na_mask = isna(x) | (ids == len(bins)) | (ids == 0) + na_mask = isna(x_idx) | (ids == len(bins)) | (ids == 0) has_nas = na_mask.any() if labels is not False: @@ -454,7 +479,7 @@ def _bins_to_cuts( if labels is None: labels = _format_labels( - bins, precision, right=right, include_lowest=include_lowest, dtype=dtype + bins, precision, right=right, include_lowest=include_lowest ) elif ordered and len(set(labels)) != len(labels): raise ValueError( @@ -513,68 +538,7 @@ def _coerce_to_type(x: Index) -> tuple[Index, DtypeObj | None]: x_arr = x.to_numpy(dtype=np.float64, na_value=np.nan) x = Index(x_arr) - if dtype is not None: - # GH 19768: force NaT to NaN during integer conversion - x_arr = np.where(x.notna(), x.view(np.int64), np.nan) - x = Index(x_arr) - - return x, dtype - - -def _convert_bin_to_numeric_type(bins, dtype: DtypeObj | None): - """ - if the passed bin is of datetime/timedelta type, - this method converts it to integer - - Parameters - ---------- - bins : list-like of bins - dtype : dtype of data - - Raises - ------ - ValueError if bins are not of a compat dtype to dtype - """ - bins_dtype = infer_dtype(bins, skipna=False) - if lib.is_np_dtype(dtype, "m"): - if bins_dtype in ["timedelta", "timedelta64"]: - bins = to_timedelta(bins).view(np.int64) - else: - raise ValueError("bins must be of timedelta64 dtype") - elif lib.is_np_dtype(dtype, "M") or isinstance(dtype, DatetimeTZDtype): - if bins_dtype in ["datetime", "datetime64"]: - bins = to_datetime(bins) - if lib.is_np_dtype(bins.dtype, "M"): - # As of 2.0, to_datetime may give non-nano, so we need to convert - # here until the rest of this file recognizes non-nano - bins = bins.astype("datetime64[ns]", copy=False) - bins = bins.view(np.int64) - else: - raise ValueError("bins must be of datetime64 dtype") - - return bins - - -def _convert_bin_to_datelike_type(bins, dtype: DtypeObj | None): - """ - Convert bins to a DatetimeIndex or TimedeltaIndex if the original dtype is - datelike - - Parameters - ---------- - bins : list-like of bins - dtype : dtype of data - - Returns - ------- - bins : Array-like of bins, DatetimeIndex or TimedeltaIndex if dtype is - datelike - """ - if isinstance(dtype, DatetimeTZDtype): - bins = to_datetime(bins.astype(np.int64), utc=True).tz_convert(dtype.tz) - elif lib.is_np_dtype(dtype, "mM"): - bins = Index(bins.astype(np.int64), dtype=dtype) - return bins + return Index(x), dtype def _format_labels( @@ -582,21 +546,20 @@ def _format_labels( precision: int, right: bool = True, include_lowest: bool = False, - dtype: DtypeObj | None = None, ): """based on the dtype, return our labels""" closed: IntervalLeftRight = "right" if right else "left" formatter: Callable[[Any], Timestamp] | Callable[[Any], Timedelta] - if isinstance(dtype, DatetimeTZDtype): - formatter = lambda x: Timestamp(x, tz=dtype.tz) + if isinstance(bins.dtype, DatetimeTZDtype): + formatter = lambda x: x adjust = lambda x: x - Timedelta("1ns") - elif lib.is_np_dtype(dtype, "M"): - formatter = Timestamp + elif lib.is_np_dtype(bins.dtype, "M"): + formatter = lambda x: x adjust = lambda x: x - Timedelta("1ns") - elif lib.is_np_dtype(dtype, "m"): - formatter = Timedelta + elif lib.is_np_dtype(bins.dtype, "m"): + formatter = lambda x: x adjust = lambda x: x - Timedelta("1ns") else: precision = _infer_precision(precision, bins) @@ -628,7 +591,7 @@ def _preprocess_for_cut(x) -> Index: return Index(x) -def _postprocess_for_cut(fac, bins, retbins: bool, dtype: DtypeObj | None, original): +def _postprocess_for_cut(fac, bins, retbins: bool, original): """ handles post processing for the cut method where we combine the index information if the originally passed @@ -640,7 +603,6 @@ def _postprocess_for_cut(fac, bins, retbins: bool, dtype: DtypeObj | None, origi if not retbins: return fac - bins = _convert_bin_to_datelike_type(bins, dtype) if isinstance(bins, Index) and is_numeric_dtype(bins.dtype): bins = bins._values diff --git a/pandas/tests/indexes/interval/test_indexing.py b/pandas/tests/indexes/interval/test_indexing.py index e65ae52e348c6..db8f697b95cd8 100644 --- a/pandas/tests/indexes/interval/test_indexing.py +++ b/pandas/tests/indexes/interval/test_indexing.py @@ -307,9 +307,9 @@ def test_get_indexer_datetime(self): result = ii.get_indexer(DatetimeIndex(["2018-01-02"]).astype(str)) tm.assert_numpy_array_equal(result, expected) - # TODO this should probably be deprecated? # https://github.com/pandas-dev/pandas/issues/47772 result = ii.get_indexer(DatetimeIndex(["2018-01-02"]).asi8) + expected = np.array([-1], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index 3a284f7732ac1..8c4c51289870b 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -495,15 +495,33 @@ def test_datetime_cut(data): tm.assert_series_equal(Series(result), expected) -@pytest.mark.parametrize( - "bins", - [ - 3, +@pytest.mark.parametrize("box", [list, np.array, Index, Series]) +def test_datetime_tz_cut_mismatched_tzawareness(box): + # GH#54964 + bins = box( [ Timestamp("2013-01-01 04:57:07.200000"), Timestamp("2013-01-01 21:00:00"), Timestamp("2013-01-02 13:00:00"), Timestamp("2013-01-03 05:00:00"), + ] + ) + ser = Series(date_range("20130101", periods=3, tz="US/Eastern")) + + msg = "Cannot use timezone-naive bins with timezone-aware values" + with pytest.raises(ValueError, match=msg): + cut(ser, bins) + + +@pytest.mark.parametrize( + "bins", + [ + 3, + [ + Timestamp("2013-01-01 04:57:07.200000", tz="UTC").tz_convert("US/Eastern"), + Timestamp("2013-01-01 21:00:00", tz="UTC").tz_convert("US/Eastern"), + Timestamp("2013-01-02 13:00:00", tz="UTC").tz_convert("US/Eastern"), + Timestamp("2013-01-03 05:00:00", tz="UTC").tz_convert("US/Eastern"), ], ], )