From 5a588ace39ecd38cfa9a6bff291602a306a12fff Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 19 Sep 2020 15:53:29 -0700 Subject: [PATCH 1/4] REF: de-duplicate IntervalArray._validate_foo (#36483) --- pandas/core/arrays/interval.py | 62 +++++++++---------- pandas/tests/arrays/interval/test_interval.py | 4 ++ .../tests/indexes/interval/test_interval.py | 7 ++- 3 files changed, 39 insertions(+), 34 deletions(-) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index f9f68004bcc23..ebabc7edcbf43 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -20,7 +20,6 @@ is_datetime64_any_dtype, is_float_dtype, is_integer_dtype, - is_interval, is_interval_dtype, is_list_like, is_object_dtype, @@ -813,7 +812,9 @@ def take(self, indices, allow_fill=False, fill_value=None, axis=None, **kwargs): fill_left = fill_right = fill_value if allow_fill: - fill_left, fill_right = self._validate_fill_value(fill_value) + if (np.asarray(indices) == -1).any(): + # We have excel tests that pass fill_value=True, xref GH#36466 + fill_left, fill_right = self._validate_fill_value(fill_value) left_take = take( self.left, indices, allow_fill=allow_fill, fill_value=fill_left @@ -824,20 +825,33 @@ def take(self, indices, allow_fill=False, fill_value=None, axis=None, **kwargs): return self._shallow_copy(left_take, right_take) - def _validate_fill_value(self, value): - if is_interval(value): - self._check_closed_matches(value, name="fill_value") - fill_left, fill_right = value.left, value.right - elif not is_scalar(value) and notna(value): - msg = ( - "'IntervalArray.fillna' only supports filling with a " - "'scalar pandas.Interval or NA'. " - f"Got a '{type(value).__name__}' instead." - ) - raise ValueError(msg) + def _validate_listlike(self, value): + # list-like of intervals + try: + array = IntervalArray(value) + # TODO: self._check_closed_matches(array, name="value") + value_left, value_right = array.left, array.right + except TypeError as err: + # wrong type: not interval or NA + msg = f"'value' should be an interval type, got {type(value)} instead." + raise TypeError(msg) from err + return value_left, value_right + + def _validate_scalar(self, value): + if isinstance(value, Interval): + self._check_closed_matches(value, name="value") + left, right = value.left, value.right + elif is_valid_nat_for_dtype(value, self.left.dtype): + # GH#18295 + left = right = value else: - fill_left = fill_right = self.left._na_value - return fill_left, fill_right + raise ValueError( + "can only insert Interval objects and NA into an IntervalArray" + ) + return left, right + + def _validate_fill_value(self, value): + return self._validate_scalar(value) def _validate_fillna_value(self, value): if not isinstance(value, Interval): @@ -851,26 +865,12 @@ def _validate_fillna_value(self, value): return value.left, value.right def _validate_insert_value(self, value): - if isinstance(value, Interval): - if value.closed != self.closed: - raise ValueError( - "inserted item must be closed on the same side as the index" - ) - left_insert = value.left - right_insert = value.right - elif is_valid_nat_for_dtype(value, self.left.dtype): - # GH#18295 - left_insert = right_insert = value - else: - raise ValueError( - "can only insert Interval objects and NA into an IntervalIndex" - ) - return left_insert, right_insert + return self._validate_scalar(value) def _validate_setitem_value(self, value): needs_float_conversion = False - if is_scalar(value) and isna(value): + if is_valid_nat_for_dtype(value, self.left.dtype): # na value: need special casing to set directly on numpy arrays if is_integer_dtype(self.dtype.subtype): # can't set NaN on a numpy integer array diff --git a/pandas/tests/arrays/interval/test_interval.py b/pandas/tests/arrays/interval/test_interval.py index 0176755b54dd1..e5ccb51ce36f5 100644 --- a/pandas/tests/arrays/interval/test_interval.py +++ b/pandas/tests/arrays/interval/test_interval.py @@ -105,6 +105,10 @@ def test_set_na(self, left_right_dtypes): left, right = left_right_dtypes result = IntervalArray.from_arrays(left, right) + if result.dtype.subtype.kind not in ["m", "M"]: + msg = "'value' should be an interval type, got <.*NaTType'> instead." + with pytest.raises(TypeError, match=msg): + result[0] = pd.NaT if result.dtype.subtype.kind in ["i", "u"]: msg = "Cannot set float NaN to integer-backed IntervalArray" with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index 734c98af3d058..b81f0f27e60ad 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -191,13 +191,14 @@ def test_insert(self, data): tm.assert_index_equal(result, expected) # invalid type - msg = "can only insert Interval objects and NA into an IntervalIndex" + msg = "can only insert Interval objects and NA into an IntervalArray" with pytest.raises(ValueError, match=msg): data.insert(1, "foo") # invalid closed - msg = "inserted item must be closed on the same side as the index" + msg = "'value.closed' is 'left', expected 'right'." for closed in {"left", "right", "both", "neither"} - {item.closed}: + msg = f"'value.closed' is '{closed}', expected '{item.closed}'." with pytest.raises(ValueError, match=msg): bad_item = Interval(item.left, item.right, closed=closed) data.insert(1, bad_item) @@ -211,7 +212,7 @@ def test_insert(self, data): if data.left.dtype.kind not in ["m", "M"]: # trying to insert pd.NaT into a numeric-dtyped Index should cast/raise - msg = "can only insert Interval objects and NA into an IntervalIndex" + msg = "can only insert Interval objects and NA into an IntervalArray" with pytest.raises(ValueError, match=msg): result = data.insert(1, pd.NaT) else: From ffb7b94346662b2bd636892188ea2ebefe42e664 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 19 Sep 2020 19:13:12 -0700 Subject: [PATCH 2/4] TYP: core.missing; PERF for needs_i8_conversion (#36485) --- pandas/core/arrays/datetimelike.py | 14 ++------------ pandas/core/dtypes/common.py | 4 ++++ pandas/core/missing.py | 31 +++++++++++++++--------------- 3 files changed, 21 insertions(+), 28 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 026aad5ad6eb7..45cabe8f0b498 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1005,19 +1005,9 @@ def fillna(self, value=None, method=None, limit=None): else: func = missing.backfill_1d - values = self._ndarray - if not is_period_dtype(self.dtype): - # For PeriodArray self._ndarray is i8, which gets copied - # by `func`. Otherwise we need to make a copy manually - # to avoid modifying `self` in-place. - values = values.copy() - + values = self.copy() new_values = func(values, limit=limit, mask=mask) - if is_datetime64tz_dtype(self.dtype): - # we need to pass int64 values to the constructor to avoid - # re-localizing incorrectly - new_values = new_values.view("i8") - new_values = type(self)(new_values, dtype=self.dtype) + new_values = self._from_backing_data(new_values) else: # fill with value new_values = self.copy() diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 5987fdabf78bb..acbdbfd7707e3 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1215,6 +1215,10 @@ def needs_i8_conversion(arr_or_dtype) -> bool: """ if arr_or_dtype is None: return False + if isinstance(arr_or_dtype, (np.dtype, ExtensionDtype)): + # fastpath + dtype = arr_or_dtype + return dtype.kind in ["m", "M"] or dtype.type is Period return ( is_datetime_or_timedelta_dtype(arr_or_dtype) or is_datetime64tz_dtype(arr_or_dtype) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index be66b19d10064..9b96c8f01153b 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -7,17 +7,15 @@ import numpy as np from pandas._libs import algos, lib +from pandas._typing import DtypeObj from pandas.compat._optional import import_optional_dependency from pandas.core.dtypes.cast import infer_dtype_from_array from pandas.core.dtypes.common import ( ensure_float64, - is_datetime64_dtype, - is_datetime64tz_dtype, is_integer_dtype, is_numeric_v_string_like, is_scalar, - is_timedelta64_dtype, needs_i8_conversion, ) from pandas.core.dtypes.missing import isna @@ -72,7 +70,7 @@ def mask_missing(arr, values_to_mask): return mask -def clean_fill_method(method, allow_nearest=False): +def clean_fill_method(method, allow_nearest: bool = False): # asfreq is compat for resampling if method in [None, "asfreq"]: return None @@ -543,7 +541,12 @@ def _cubicspline_interpolate(xi, yi, x, axis=0, bc_type="not-a-knot", extrapolat def interpolate_2d( - values, method="pad", axis=0, limit=None, fill_value=None, dtype=None + values, + method="pad", + axis=0, + limit=None, + fill_value=None, + dtype: Optional[DtypeObj] = None, ): """ Perform an actual interpolation of values, values will be make 2-d if @@ -584,18 +587,14 @@ def interpolate_2d( return values -def _cast_values_for_fillna(values, dtype): +def _cast_values_for_fillna(values, dtype: DtypeObj): """ Cast values to a dtype that algos.pad and algos.backfill can handle. """ # TODO: for int-dtypes we make a copy, but for everything else this # alters the values in-place. Is this intentional? - if ( - is_datetime64_dtype(dtype) - or is_datetime64tz_dtype(dtype) - or is_timedelta64_dtype(dtype) - ): + if needs_i8_conversion(dtype): values = values.view(np.int64) elif is_integer_dtype(values): @@ -605,7 +604,7 @@ def _cast_values_for_fillna(values, dtype): return values -def _fillna_prep(values, mask=None, dtype=None): +def _fillna_prep(values, mask=None, dtype: Optional[DtypeObj] = None): # boilerplate for pad_1d, backfill_1d, pad_2d, backfill_2d if dtype is None: dtype = values.dtype @@ -620,19 +619,19 @@ def _fillna_prep(values, mask=None, dtype=None): return values, mask -def pad_1d(values, limit=None, mask=None, dtype=None): +def pad_1d(values, limit=None, mask=None, dtype: Optional[DtypeObj] = None): values, mask = _fillna_prep(values, mask, dtype) algos.pad_inplace(values, mask, limit=limit) return values -def backfill_1d(values, limit=None, mask=None, dtype=None): +def backfill_1d(values, limit=None, mask=None, dtype: Optional[DtypeObj] = None): values, mask = _fillna_prep(values, mask, dtype) algos.backfill_inplace(values, mask, limit=limit) return values -def pad_2d(values, limit=None, mask=None, dtype=None): +def pad_2d(values, limit=None, mask=None, dtype: Optional[DtypeObj] = None): values, mask = _fillna_prep(values, mask, dtype) if np.all(values.shape): @@ -643,7 +642,7 @@ def pad_2d(values, limit=None, mask=None, dtype=None): return values -def backfill_2d(values, limit=None, mask=None, dtype=None): +def backfill_2d(values, limit=None, mask=None, dtype: Optional[DtypeObj] = None): values, mask = _fillna_prep(values, mask, dtype) if np.all(values.shape): From 742d06d42a1c15e54ebedd1f3b724b712b9ea24e Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Sat, 19 Sep 2020 21:13:54 -0500 Subject: [PATCH 3/4] Don't unlabel stale PR on update (#36487) --- .github/workflows/stale-pr.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/stale-pr.yml b/.github/workflows/stale-pr.yml index e3b8d9336a5a6..e77bf2b81fc86 100644 --- a/.github/workflows/stale-pr.yml +++ b/.github/workflows/stale-pr.yml @@ -17,5 +17,5 @@ jobs: exempt-pr-labels: "Needs Review,Blocked,Needs Discussion" days-before-stale: 30 days-before-close: -1 - remove-stale-when-updated: true + remove-stale-when-updated: false debug-only: false From 4679970b2d64d3c55150e0953d04a15d8ae23a48 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 20 Sep 2020 16:20:32 -0700 Subject: [PATCH 4/4] re-remove --- pandas/tests/scalar/test_na_scalar.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/pandas/tests/scalar/test_na_scalar.py b/pandas/tests/scalar/test_na_scalar.py index 10d366fe485da..5c4d7e191d1bb 100644 --- a/pandas/tests/scalar/test_na_scalar.py +++ b/pandas/tests/scalar/test_na_scalar.py @@ -305,11 +305,3 @@ def test_pickle_roundtrip_containers(as_frame, values, dtype): s = s.to_frame(name="A") result = tm.round_trip_pickle(s) tm.assert_equal(result, s) - - -@pytest.mark.parametrize("array", [np.array(["a"], dtype=object), ["a"]]) -def test_array_contains_na(array): - # GH 31922 - msg = "boolean value of NA is ambiguous" - with pytest.raises(TypeError, match=msg): - NA in array