diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index b280e82c73b89..2b70078c50059 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -90,6 +90,43 @@ be removed in the future, see :ref:`here ` for more about :class:`NumericIndex`. + +.. _whatsnew_140.enhancements.ExtensionIndex: + +Index can hold arbitrary ExtensionArrays +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Until now, passing a custom :class:`ExtensionArray` to ``pd.Index`` would cast the +array to ``object`` dtype. Now :class:`Index` can directly hold arbitrary ExtensionArrays (:issue:`43930`). + +*Previous behavior*: + +.. ipython:: python + + arr = pd.array([1, 2, pd.NA]) + idx = pd.Index(arr) + +In the old behavior, ``idx`` would be object-dtype: + +*Previous behavior*: + +.. code-block:: ipython + + In [1]: idx + Out[1]: Index([1, 2, ], dtype='object') + +With the new behavior, we keep the original dtype: + +*New behavior*: + +.. ipython:: python + + idx + +One exception to this is ``SparseArray``, which will continue to cast to numpy +dtype until pandas 2.0. At that point it will retain its dtype like other +ExtensionArrays. + .. _whatsnew_140.enhancements.styler: Styler diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 969da5aa53e3e..c3b86165e6d2c 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -33,7 +33,9 @@ from pandas._libs import ( hashtable as _hash, ) +from pandas._libs.lib cimport eq_NA_compat from pandas._libs.missing cimport ( + C_NA as NA, checknull, is_matching_na, ) @@ -62,7 +64,7 @@ cdef ndarray _get_bool_indexer(ndarray values, object val): if values.descr.type_num == cnp.NPY_OBJECT: # i.e. values.dtype == object if not checknull(val): - indexer = values == val + indexer = eq_NA_compat(values, val) else: # We need to check for _matching_ NA values diff --git a/pandas/_libs/lib.pxd b/pandas/_libs/lib.pxd index b3c72c30a74de..46a339f2e7cbb 100644 --- a/pandas/_libs/lib.pxd +++ b/pandas/_libs/lib.pxd @@ -1 +1,6 @@ +from numpy cimport ndarray + + cdef bint c_is_list_like(object, bint) except -1 + +cpdef ndarray eq_NA_compat(ndarray[object] arr, object key) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 8f9016e726f1e..950277ce608eb 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -3050,6 +3050,27 @@ def is_bool_list(obj: list) -> bool: return True +cpdef ndarray eq_NA_compat(ndarray[object] arr, object key): + """ + Check for `arr == key`, treating all values as not-equal to pd.NA. 
+ + key is assumed to have `not isna(key)` + """ + cdef: + ndarray[uint8_t, cast=True] result = np.empty(len(arr), dtype=bool) + Py_ssize_t i + object item + + for i in range(len(arr)): + item = arr[i] + if item is C_NA: + result[i] = False + else: + result[i] = item == key + + return result + + def dtypes_all_equal(list types not None) -> bool: """ Faster version for: diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index 77c477d3f9229..ea75af20bb0b6 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -404,9 +404,9 @@ def _get_ilevel_values(index, level): # skip exact index checking when `check_categorical` is False if check_exact and check_categorical: if not left.equals(right): - diff = ( - np.sum((left._values != right._values).astype(int)) * 100.0 / len(left) - ) + mismatch = left._values != right._values + + diff = np.sum(mismatch.astype(int)) * 100.0 / len(left) msg = f"{obj} values are different ({np.round(diff, 5)} %)" raise_assert_detail(obj, msg, left, right) else: diff --git a/pandas/conftest.py b/pandas/conftest.py index 7b8fa00f8aed6..9009484f8d386 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -67,6 +67,14 @@ MultiIndex, ) +try: + import pyarrow as pa +except ImportError: + has_pyarrow = False +else: + del pa + has_pyarrow = True + # Until https://github.com/numpy/numpy/issues/19078 is sorted out, just suppress suppress_npdev_promotion_warning = pytest.mark.filterwarnings( "ignore:Promotion of numbers and bools:FutureWarning" @@ -549,7 +557,15 @@ def _create_mi_with_dt64tz_level(): "mi-with-dt64tz-level": _create_mi_with_dt64tz_level(), "multi": _create_multiindex(), "repeats": Index([0, 0, 1, 1, 2, 2]), + "nullable_int": Index(np.arange(100), dtype="Int64"), + "nullable_uint": Index(np.arange(100), dtype="UInt16"), + "nullable_float": Index(np.arange(100), dtype="Float32"), + "nullable_bool": Index(np.arange(100).astype(bool), dtype="boolean"), + "string-python": Index(pd.array(tm.makeStringIndex(100), dtype="string[python]")), } +if has_pyarrow: + idx = Index(pd.array(tm.makeStringIndex(100), dtype="string[pyarrow]")) + indices_dict["string-pyarrow"] = idx @pytest.fixture(params=indices_dict.keys()) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 782fad435c1c5..b8e4331cfa3de 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -714,10 +714,7 @@ def value_counts(self, dropna: bool = True) -> Series: data = self._data[~self._mask] value_counts = Index(data).value_counts() - # TODO(ExtensionIndex) - # if we have allow Index to hold an ExtensionArray - # this is easier - index = value_counts.index._values.astype(object) + index = value_counts.index # if we want nans, count the mask if dropna: @@ -727,10 +724,9 @@ def value_counts(self, dropna: bool = True) -> Series: counts[:-1] = value_counts counts[-1] = self._mask.sum() - index = Index( - np.concatenate([index, np.array([self.dtype.na_value], dtype=object)]), - dtype=object, - ) + index = index.insert(len(index), self.dtype.na_value) + + index = index.astype(self.dtype) mask = np.zeros(len(counts), dtype="bool") counts = IntegerArray(counts, mask) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 2bf8452903302..6bad864ef776f 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -470,7 +470,9 @@ def max(self, axis=None, skipna: bool = True, **kwargs) -> Scalar: def value_counts(self, dropna: bool = True): from pandas import value_counts - return 
value_counts(self._ndarray, dropna=dropna).astype("Int64") + result = value_counts(self._ndarray, dropna=dropna).astype("Int64") + result.index = result.index.astype(self.dtype) + return result def memory_usage(self, deep: bool = False) -> int: result = self._ndarray.nbytes diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index fb16834e5b4b2..431568c3a3b9d 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -313,6 +313,13 @@ def __getitem__( elif isinstance(item, tuple): item = unpack_tuple_and_ellipses(item) + # error: Non-overlapping identity check (left operand type: + # "Union[Union[int, integer[Any]], Union[slice, List[int], + # ndarray[Any, Any]]]", right operand type: "ellipsis") + if item is Ellipsis: # type: ignore[comparison-overlap] + # TODO: should be handled by pyarrow? + item = slice(None) + if is_scalar(item) and not is_integer(item): # e.g. "foo" or 2.5 # exception message copied from numpy @@ -615,8 +622,7 @@ def value_counts(self, dropna: bool = True) -> Series: # No missing values so we can adhere to the interface and return a numpy array. counts = np.array(counts) - # Index cannot hold ExtensionArrays yet - index = Index(type(self)(values)).astype(object) + index = Index(type(self)(values)) return Series(counts, index=index).astype("Int64") diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 60c8426ff3c6c..3a7d8a3191ef3 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1325,12 +1325,8 @@ def is_bool_dtype(arr_or_dtype) -> bool: # now we use the special definition for Index if isinstance(arr_or_dtype, ABCIndex): - - # TODO(jreback) - # we don't have a boolean Index class - # so its object, we need to infer to - # guess this - return arr_or_dtype.is_object() and arr_or_dtype.inferred_type == "boolean" + # Allow Index[object] that is all-bools or Index["boolean"] + return arr_or_dtype.inferred_type == "boolean" elif isinstance(dtype, ExtensionDtype): return getattr(dtype, "_is_boolean", False) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index d5b1292435f04..f0b8f73bf2af1 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -23,6 +23,7 @@ from pandas._config import get_option from pandas._libs import ( + NaT, algos as libalgos, index as libindex, lib, @@ -34,7 +35,6 @@ ) from pandas._libs.tslibs import ( IncompatibleFrequency, - NaTType, OutOfBoundsDatetime, Timestamp, tz_compare, @@ -139,6 +139,7 @@ tz_to_dtype, validate_tz_from_dtype, ) +from pandas.core.arrays.masked import BaseMaskedArray from pandas.core.arrays.sparse import SparseDtype from pandas.core.base import ( IndexOpsMixin, @@ -360,7 +361,10 @@ def _outer_indexer( _typ: str = "index" _data: ExtensionArray | np.ndarray - _data_cls: type[np.ndarray] | type[ExtensionArray] = np.ndarray + _data_cls: type[ExtensionArray] | tuple[type[np.ndarray], type[ExtensionArray]] = ( + np.ndarray, + ExtensionArray, + ) _id: object | None = None _name: Hashable = None # MultiIndex.levels previously allowed setting the index name. We @@ -415,8 +419,9 @@ def __new__( validate_tz_from_dtype(dtype, tz) dtype = tz_to_dtype(tz) - if isinstance(data, PandasArray): - # ensure users don't accidentally put a PandasArray in an index. 
+ if type(data) is PandasArray: + # ensure users don't accidentally put a PandasArray in an index, + # but don't unpack StringArray data = data.to_numpy() if isinstance(dtype, PandasDtype): dtype = dtype.numpy_dtype @@ -438,7 +443,6 @@ def __new__( ea_cls = dtype.construct_array_type() data = ea_cls._from_sequence(data, dtype=dtype, copy=copy) - data = np.asarray(data, dtype=object) disallow_kwargs(kwargs) return Index._simple_new(data, name=name) @@ -451,8 +455,8 @@ def __new__( return result.astype(dtype, copy=False) return result - data = np.array(data, dtype=object, copy=copy) disallow_kwargs(kwargs) + data = extract_array(data, extract_numpy=True) return Index._simple_new(data, name=name) # index-like @@ -568,6 +572,14 @@ def _dtype_to_subclass(cls, dtype: DtypeObj): return PeriodIndex elif isinstance(dtype, SparseDtype): + warnings.warn( + "In a future version, passing a SparseArray to pd.Index " + "will store that array directly instead of converting to a " + "dense numpy ndarray. To retain the old behavior, use " + "pd.Index(arr.to_numpy()) instead", + FutureWarning, + stacklevel=find_stack_level(), + ) return cls._dtype_to_subclass(dtype.subtype) return Index @@ -830,9 +842,21 @@ def _cleanup(self) -> None: self._engine.clear_mapping() @cache_readonly - def _engine(self) -> libindex.IndexEngine: + def _engine( + self, + ) -> libindex.IndexEngine: # For base class (object dtype) we get ObjectEngine + if isinstance(self._values, BaseMaskedArray): + # TODO(ExtensionIndex): use libindex.NullableEngine(self._values) + return libindex.ObjectEngine(self._get_engine_target()) + elif ( + isinstance(self._values, ExtensionArray) + and self._engine_type is libindex.ObjectEngine + ): + # TODO(ExtensionIndex): use libindex.ExtensionEngine(self._values) + return libindex.ObjectEngine(self._get_engine_target()) + # to avoid a reference cycle, bind `target_values` to a local variable, so # `self` is not passed into the lambda. target_values = self._get_engine_target() @@ -957,6 +981,9 @@ def view(self, cls=None): return self._data.view(cls) idx_cls = self._dtype_to_subclass(dtype) + # NB: we only get here for subclasses that override + # _data_cls such that it is a type and not a tuple + # of types. 
arr_cls = idx_cls._data_cls arr = arr_cls(self._data.view("i8"), dtype=dtype) return idx_cls._simple_new(arr, name=self.name) @@ -1074,9 +1101,16 @@ def take( # Note: we discard fill_value and use self._na_value, only relevant # in the case where allow_fill is True and fill_value is not None - taken = algos.take( - self._values, indices, allow_fill=allow_fill, fill_value=self._na_value - ) + values = self._values + if isinstance(values, np.ndarray): + taken = algos.take( + values, indices, allow_fill=allow_fill, fill_value=self._na_value + ) + else: + # algos.take passes 'axis' keyword which not all EAs accept + taken = values.take( + indices, allow_fill=allow_fill, fill_value=self._na_value + ) # _constructor so RangeIndex->Int64Index return self._constructor._simple_new(taken, name=self.name) @@ -2581,8 +2615,15 @@ def __reduce__(self): # -------------------------------------------------------------------- # Null Handling Methods - _na_value: float | NaTType = np.nan - """The expected NA value to use with this index.""" + @cache_readonly + def _na_value(self): + """The expected NA value to use with this index.""" + dtype = self.dtype + if isinstance(dtype, np.dtype): + if dtype.kind in ["m", "M"]: + return NaT + return np.nan + return dtype.na_value @cache_readonly def _isnan(self) -> npt.NDArray[np.bool_]: @@ -3194,10 +3235,13 @@ def _wrap_setop_result(self, other: Index, result) -> Index: name = get_op_result_name(self, other) if isinstance(result, Index): if result.name != name: - return result.rename(name) - return result + result = result.rename(name) else: - return self._shallow_copy(result, name=name) + result = self._shallow_copy(result, name=name) + + # TODO(ExtensionIndex): revert this astype; it is a kludge to make + # it possible to split ExtensionEngine from ExtensionIndex PR. + return result.astype(self.dtype, copy=False) # TODO: standardize return type of non-union setops type(self vs other) @final @@ -3463,7 +3507,8 @@ def symmetric_difference(self, other, result_name=None, sort=None): res_values = concat_compat([left_diff, right_diff]) res_values = _maybe_try_sort(res_values, sort) - result = Index(res_values, name=result_name) + # pass dtype so we retain object dtype + result = Index(res_values, name=result_name, dtype=res_values.dtype) if self._is_multi: self = cast("MultiIndex", self) @@ -3487,7 +3532,13 @@ def _assert_can_do_setop(self, other) -> bool: def _convert_can_do_setop(self, other) -> tuple[Index, Hashable]: if not isinstance(other, Index): - other = Index(other, name=self.name) + # TODO(2.0): no need to special-case here once _with_infer + # deprecation is enforced + if hasattr(other, "dtype"): + other = Index(other, name=self.name, dtype=other.dtype) + else: + # e.g. 
list + other = Index(other, name=self.name) result_name = self.name else: result_name = get_op_result_name(self, other) @@ -4760,6 +4811,9 @@ def _get_engine_target(self) -> np.ndarray: """ # error: Incompatible return value type (got "Union[ExtensionArray, # ndarray]", expected "ndarray") + if type(self) is Index and isinstance(self._values, ExtensionArray): + # TODO(ExtensionIndex): remove special-case, just use self._values + return self._values.astype(object) return self._values # type: ignore[return-value] def _from_join_target(self, result: np.ndarray) -> ArrayLike: @@ -5843,9 +5897,6 @@ def _maybe_promote(self, other: Index) -> tuple[Index, Index]: elif self.inferred_type == "timedelta" and isinstance(other, ABCTimedeltaIndex): # TODO: we dont have tests that get here return type(other)(self), other - elif self.inferred_type == "boolean": - if not is_object_dtype(self.dtype): - return self.astype("object"), other.astype("object") elif self.dtype.kind == "u" and other.dtype.kind == "i": # GH#41873 diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 1cc15e9a5569f..4074f3e81e0a3 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -457,7 +457,7 @@ def reindex( else: # e.g. test_reindex_with_categoricalindex, test_reindex_duplicate_target new_target = np.asarray(new_target) - new_target = Index(new_target, name=self.name) + new_target = Index._with_infer(new_target, name=self.name) return new_target, indexer diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 731efdc3b17f0..589b92f392ca8 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -24,7 +24,6 @@ ) from pandas._libs.tslibs import ( BaseOffset, - NaTType, Resolution, Tick, parsing, @@ -154,9 +153,6 @@ def __contains__(self, key: Any) -> bool: _can_hold_na = True - _na_value: NaTType = NaT - """The expected NA value to use with this index.""" - def _convert_tolerance(self, tolerance, target): tolerance = np.asarray(to_timedelta(tolerance).to_numpy()) return super()._convert_tolerance(tolerance, target) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 13fbbd764c016..8f776d280afad 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1003,7 +1003,10 @@ def _validate_key(self, key, axis: int): # slice of labels (where start-end in labels) # slice of integers (only if in the labels) # boolean not in slice and with boolean index - if isinstance(key, bool) and not is_bool_dtype(self.obj._get_axis(axis)): + if isinstance(key, bool) and not ( + is_bool_dtype(self.obj._get_axis(axis)) + or self.obj._get_axis(axis).dtype.name == "boolean" + ): raise KeyError( f"{key}: boolean label can not be used without a boolean index" ) diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index 063e3c7e6dd19..c3e72f460caa6 100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ b/pandas/tests/arithmetic/test_numeric.py @@ -1373,18 +1373,14 @@ def test_integer_array_add_list_like( left = container + box_1d_array(data) right = box_1d_array(data) + container - if Series == box_pandas_1d_array: - expected = Series(expected_data, dtype="Int64") - elif Series == box_1d_array: - if box_pandas_1d_array is tm.to_array: - expected = Series(expected_data, dtype="Int64") - else: - expected = Series(expected_data, dtype="object") - elif Index in (box_pandas_1d_array, box_1d_array): - expected = Int64Index(expected_data) + if Series in 
[box_1d_array, box_pandas_1d_array]: + cls = Series + elif Index in [box_1d_array, box_pandas_1d_array]: + cls = Index else: - # box_pandas_1d_array is tm.to_array; preserves IntegerArray - expected = array(expected_data, dtype="Int64") + cls = array + + expected = cls(expected_data, dtype="Int64") tm.assert_equal(left, expected) tm.assert_equal(right, expected) diff --git a/pandas/tests/arrays/boolean/test_function.py b/pandas/tests/arrays/boolean/test_function.py index 78992f3124779..8e9112b531fad 100644 --- a/pandas/tests/arrays/boolean/test_function.py +++ b/pandas/tests/arrays/boolean/test_function.py @@ -92,18 +92,21 @@ def test_ufunc_reduce_raises(values): def test_value_counts_na(): arr = pd.array([True, False, pd.NA], dtype="boolean") result = arr.value_counts(dropna=False) - expected = pd.Series([1, 1, 1], index=[True, False, pd.NA], dtype="Int64") + expected = pd.Series([1, 1, 1], index=arr, dtype="Int64") + assert expected.index.dtype == arr.dtype tm.assert_series_equal(result, expected) result = arr.value_counts(dropna=True) - expected = pd.Series([1, 1], index=[True, False], dtype="Int64") + expected = pd.Series([1, 1], index=arr[:-1], dtype="Int64") + assert expected.index.dtype == arr.dtype tm.assert_series_equal(result, expected) def test_value_counts_with_normalize(): ser = pd.Series([True, False, pd.NA], dtype="boolean") result = ser.value_counts(normalize=True) - expected = pd.Series([1, 1], index=[True, False], dtype="Float64") / 2 + expected = pd.Series([1, 1], index=ser[:-1], dtype="Float64") / 2 + assert expected.index.dtype == "boolean" tm.assert_series_equal(result, expected) diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index fd7e040a471f4..96c66ac80a7e5 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -726,7 +726,8 @@ def test_categorical_extension_array_nullable(self, nulls_fixture): # GH: arr = pd.arrays.StringArray._from_sequence([nulls_fixture] * 2) result = Categorical(arr) - expected = Categorical(Series([pd.NA, pd.NA], dtype="object")) + assert arr.dtype == result.categories.dtype + expected = Categorical(Series([pd.NA, pd.NA], dtype=arr.dtype)) tm.assert_categorical_equal(result, expected) def test_from_sequence_copy(self): diff --git a/pandas/tests/arrays/floating/test_function.py b/pandas/tests/arrays/floating/test_function.py index 3fe869280dc2c..fbdf419811e24 100644 --- a/pandas/tests/arrays/floating/test_function.py +++ b/pandas/tests/arrays/floating/test_function.py @@ -98,18 +98,21 @@ def test_stat_method(pandasmethname, kwargs): def test_value_counts_na(): arr = pd.array([0.1, 0.2, 0.1, pd.NA], dtype="Float64") result = arr.value_counts(dropna=False) - expected = pd.Series([2, 1, 1], index=[0.1, 0.2, pd.NA], dtype="Int64") + idx = pd.Index([0.1, 0.2, pd.NA], dtype=arr.dtype) + assert idx.dtype == arr.dtype + expected = pd.Series([2, 1, 1], index=idx, dtype="Int64") tm.assert_series_equal(result, expected) result = arr.value_counts(dropna=True) - expected = pd.Series([2, 1], index=[0.1, 0.2], dtype="Int64") + expected = pd.Series([2, 1], index=idx[:-1], dtype="Int64") tm.assert_series_equal(result, expected) def test_value_counts_empty(): ser = pd.Series([], dtype="Float64") result = ser.value_counts() - idx = pd.Index([], dtype="object") + idx = pd.Index([], dtype="Float64") + assert idx.dtype == "Float64" expected = pd.Series([], index=idx, dtype="Int64") tm.assert_series_equal(result, 
expected) @@ -117,7 +120,8 @@ def test_value_counts_empty(): def test_value_counts_with_normalize(): ser = pd.Series([0.1, 0.2, 0.1, pd.NA], dtype="Float64") result = ser.value_counts(normalize=True) - expected = pd.Series([2, 1], index=[0.1, 0.2], dtype="Float64") / 3 + expected = pd.Series([2, 1], index=ser[:2], dtype="Float64") / 3 + assert expected.index.dtype == ser.dtype tm.assert_series_equal(result, expected) diff --git a/pandas/tests/arrays/integer/test_dtypes.py b/pandas/tests/arrays/integer/test_dtypes.py index 13591ef97c9d2..3911b7f9bad34 100644 --- a/pandas/tests/arrays/integer/test_dtypes.py +++ b/pandas/tests/arrays/integer/test_dtypes.py @@ -72,7 +72,8 @@ def test_construct_index(all_data, dropna): other = all_data result = pd.Index(pd.array(other, dtype=all_data.dtype)) - expected = pd.Index(other, dtype=object) + expected = pd.Index(other, dtype=all_data.dtype) + assert all_data.dtype == expected.dtype # dont coerce to object tm.assert_index_equal(result, expected) diff --git a/pandas/tests/arrays/integer/test_function.py b/pandas/tests/arrays/integer/test_function.py index 96fe1e77f6bc5..73c8d4e6b1aed 100644 --- a/pandas/tests/arrays/integer/test_function.py +++ b/pandas/tests/arrays/integer/test_function.py @@ -109,11 +109,14 @@ def test_stat_method(pandasmethname, kwargs): def test_value_counts_na(): arr = pd.array([1, 2, 1, pd.NA], dtype="Int64") result = arr.value_counts(dropna=False) - expected = pd.Series([2, 1, 1], index=[1, 2, pd.NA], dtype="Int64") + ex_index = pd.Index([1, 2, pd.NA], dtype="Int64") + assert ex_index.dtype == "Int64" + expected = pd.Series([2, 1, 1], index=ex_index, dtype="Int64") tm.assert_series_equal(result, expected) result = arr.value_counts(dropna=True) - expected = pd.Series([2, 1], index=[1, 2], dtype="Int64") + expected = pd.Series([2, 1], index=arr[:2], dtype="Int64") + assert expected.index.dtype == arr.dtype tm.assert_series_equal(result, expected) @@ -121,9 +124,8 @@ def test_value_counts_empty(): # https://github.com/pandas-dev/pandas/issues/33317 ser = pd.Series([], dtype="Int64") result = ser.value_counts() - # TODO(ExtensionIndex): The dtype of the index seems wrong - # (it's int64 for non-empty) - idx = pd.Index([], dtype="object") + idx = pd.Index([], dtype=ser.dtype) + assert idx.dtype == ser.dtype expected = pd.Series([], index=idx, dtype="Int64") tm.assert_series_equal(result, expected) @@ -132,7 +134,8 @@ def test_value_counts_with_normalize(): # GH 33172 ser = pd.Series([1, 2, 1, pd.NA], dtype="Int64") result = ser.value_counts(normalize=True) - expected = pd.Series([2, 1], index=[1, 2], dtype="Float64") / 3 + expected = pd.Series([2, 1], index=ser[:2], dtype="Float64") / 3 + assert expected.index.dtype == ser.dtype tm.assert_series_equal(result, expected) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 7c3a8c691b786..22fe7bb0de949 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -462,18 +462,18 @@ def test_arrow_load_from_zero_chunks(dtype, string_storage2): def test_value_counts_na(dtype): arr = pd.array(["a", "b", "a", pd.NA], dtype=dtype) result = arr.value_counts(dropna=False) - expected = pd.Series([2, 1, 1], index=["a", "b", pd.NA], dtype="Int64") + expected = pd.Series([2, 1, 1], index=arr[[0, 1, 3]], dtype="Int64") tm.assert_series_equal(result, expected) result = arr.value_counts(dropna=True) - expected = pd.Series([2, 1], index=["a", "b"], dtype="Int64") + expected = pd.Series([2, 1], 
index=arr[:2], dtype="Int64") tm.assert_series_equal(result, expected) def test_value_counts_with_normalize(dtype): ser = pd.Series(["a", "b", "a", pd.NA], dtype=dtype) result = ser.value_counts(normalize=True) - expected = pd.Series([2, 1], index=["a", "b"], dtype="Float64") / 3 + expected = pd.Series([2, 1], index=ser[:2], dtype="Float64") / 3 tm.assert_series_equal(result, expected) diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index ab4e91dc3f6e3..84e4992cce0e3 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -256,11 +256,14 @@ def test_numpy_array_all_dtypes(any_numpy_dtype): ) def test_array(arr, attr, index_or_series, request): box = index_or_series - if arr.dtype.name in ("Int64", "Sparse[int64, 0]") and box is pd.Index: - mark = pytest.mark.xfail(reason="Needs EA-Backed Index") + warn = None + if arr.dtype.name in ("Sparse[int64, 0]") and box is pd.Index: + mark = pytest.mark.xfail(reason="Index cannot yet store sparse dtype") request.node.add_marker(mark) + warn = FutureWarning - result = box(arr, copy=False).array + with tm.assert_produces_warning(warn): + result = box(arr, copy=False).array if attr: arr = getattr(arr, attr) @@ -330,7 +333,12 @@ def test_array_multiindex_raises(): ) def test_to_numpy(arr, expected, index_or_series_or_array, request): box = index_or_series_or_array - thing = box(arr) + + warn = None + if index_or_series_or_array is pd.Index and isinstance(arr, SparseArray): + warn = FutureWarning + with tm.assert_produces_warning(warn): + thing = box(arr) if arr.dtype.name == "int64" and box is pd.array: mark = pytest.mark.xfail(reason="thing is Int64 and to_numpy() returns object") diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py index e4fbbc07c688a..f3be4749fb3aa 100644 --- a/pandas/tests/base/test_misc.py +++ b/pandas/tests/base/test_misc.py @@ -10,6 +10,7 @@ from pandas.core.dtypes.common import ( is_categorical_dtype, + is_dtype_equal, is_object_dtype, ) @@ -90,12 +91,16 @@ def test_memory_usage(index_or_series_obj): res = obj.memory_usage() res_deep = obj.memory_usage(deep=True) + is_ser = isinstance(obj, Series) is_object = is_object_dtype(obj) or ( isinstance(obj, Series) and is_object_dtype(obj.index) ) is_categorical = is_categorical_dtype(obj.dtype) or ( isinstance(obj, Series) and is_categorical_dtype(obj.index.dtype) ) + is_object_string = is_dtype_equal(obj, "string[python]") or ( + is_ser and is_dtype_equal(obj.index.dtype, "string[python]") + ) if len(obj) == 0: if isinstance(obj, Index): @@ -103,7 +108,7 @@ def test_memory_usage(index_or_series_obj): else: expected = 108 if IS64 else 64 assert res_deep == res == expected - elif is_object or is_categorical: + elif is_object or is_categorical or is_object_string: # only deep will pick them up assert res_deep > res else: @@ -164,6 +169,8 @@ def test_access_by_position(index_flat): assert index[-1] == index[size - 1] msg = f"index {size} is out of bounds for axis 0 with size {size}" + if is_dtype_equal(index.dtype, "string[pyarrow]"): + msg = "index out of bounds" with pytest.raises(IndexError, match=msg): index[size] msg = "single positional indexer is out-of-bounds" diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py index 6130646bb52c5..13bf096cfe167 100644 --- a/pandas/tests/base/test_value_counts.py +++ b/pandas/tests/base/test_value_counts.py @@ -29,6 +29,10 @@ def test_value_counts(index_or_series_obj): if isinstance(obj, pd.MultiIndex): 
expected.index = Index(expected.index) + if not isinstance(result.dtype, np.dtype): + # i.e IntegerDtype + expected = expected.astype("Int64") + # TODO(GH#32514): Order of entries with the same count is inconsistent # on CI (gh-32449) if obj.duplicated().any(): @@ -68,6 +72,10 @@ def test_value_counts_null(null_obj, index_or_series_obj): # Order of entries with the same count is inconsistent on CI (gh-32449) expected = expected.sort_index() result = result.sort_index() + + if not isinstance(result.dtype, np.dtype): + # i.e IntegerDtype + expected = expected.astype("Int64") tm.assert_series_equal(result, expected) expected[null_obj] = 3 diff --git a/pandas/tests/extension/base/__init__.py b/pandas/tests/extension/base/__init__.py index da6844084c896..571ab3dca1efc 100644 --- a/pandas/tests/extension/base/__init__.py +++ b/pandas/tests/extension/base/__init__.py @@ -50,6 +50,7 @@ class TestMyDtype(BaseDtypeTests): from pandas.tests.extension.base.dtype import BaseDtypeTests # noqa from pandas.tests.extension.base.getitem import BaseGetitemTests # noqa from pandas.tests.extension.base.groupby import BaseGroupbyTests # noqa +from pandas.tests.extension.base.index import BaseIndexTests # noqa from pandas.tests.extension.base.interface import BaseInterfaceTests # noqa from pandas.tests.extension.base.io import BaseParsingTests # noqa from pandas.tests.extension.base.methods import BaseMethodsTests # noqa diff --git a/pandas/tests/extension/base/index.py b/pandas/tests/extension/base/index.py new file mode 100644 index 0000000000000..2539c38733a6c --- /dev/null +++ b/pandas/tests/extension/base/index.py @@ -0,0 +1,20 @@ +""" +Tests for Indexes backed by arbitrary ExtensionArrays. +""" +import pandas as pd +from pandas.tests.extension.base.base import BaseExtensionTests + + +class BaseIndexTests(BaseExtensionTests): + """Tests for Index object backed by an ExtensionArray""" + + def test_index_from_array(self, data): + idx = pd.Index(data) + assert data.dtype == idx.dtype + + def test_index_from_listlike_with_dtype(self, data): + idx = pd.Index(data, dtype=data.dtype) + assert idx.dtype == data.dtype + + idx = pd.Index(list(data), dtype=data.dtype) + assert idx.dtype == data.dtype diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 53416b6a3e9db..a00860f4d02da 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -101,6 +101,10 @@ def test_take_na_value_other_decimal(self): self.assert_extension_array_equal(result, expected) +class TestIndex(base.BaseIndexTests): + pass + + class TestMissing(base.BaseMissingTests): pass diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index befbf3b445d62..538ca83667ef5 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -33,6 +33,7 @@ from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike from pandas.core.dtypes.common import ( + is_bool_dtype, is_list_like, pandas_dtype, ) @@ -42,7 +43,6 @@ ExtensionArray, ExtensionDtype, ) -from pandas.api.types import is_bool_dtype from pandas.core.indexers import unpack_tuple_and_ellipses diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index f090396a70724..d530a75b74c8f 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -196,6 +196,10 @@ class TestGetitem(BaseJSON, base.BaseGetitemTests): pass +class 
TestIndex(BaseJSON, base.BaseIndexTests): + pass + + class TestMissing(BaseJSON, base.BaseMissingTests): @pytest.mark.skip(reason="Setting a dict as a scalar") def test_fillna_series(self): @@ -306,6 +310,20 @@ def test_groupby_extension_apply(self): we'll be able to dispatch unique. """ + @unhashable + def test_groupby_extension_agg(self): + """ + This fails when we get to tm.assert_series_equal when left.index + contains dictionaries, which are not hashable. + """ + + @unhashable + def test_groupby_extension_no_sort(self): + """ + This fails when we get to tm.assert_series_equal when left.index + contains dictionaries, which are not hashable. + """ + @pytest.mark.xfail(reason="GH#39098: Converts agg result to object") def test_groupby_agg_extension(self, data_for_grouping): super().test_groupby_agg_extension(data_for_grouping) diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index 0212610ec270f..1f44889cdd88a 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -95,6 +95,10 @@ class TestSetitem(base.BaseSetitemTests): pass +class TestIndex(base.BaseIndexTests): + pass + + class TestMissing(base.BaseMissingTests): pass diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 78af884827f63..d21110e078709 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -144,6 +144,10 @@ class TestSetitem(base.BaseSetitemTests): pass +class TestIndex(base.BaseIndexTests): + pass + + class TestMissing(base.BaseMissingTests): @pytest.mark.skip(reason="Not implemented") def test_fillna_limit_pad(self, data_missing): diff --git a/pandas/tests/extension/test_datetime.py b/pandas/tests/extension/test_datetime.py index 5acfa79cccec8..a64b42fad9415 100644 --- a/pandas/tests/extension/test_datetime.py +++ b/pandas/tests/extension/test_datetime.py @@ -107,6 +107,10 @@ class TestGetitem(BaseDatetimeTests, base.BaseGetitemTests): pass +class TestIndex(base.BaseIndexTests): + pass + + class TestMethods(BaseDatetimeTests, base.BaseMethodsTests): def test_combine_add(self, data_repeated): # Timestamp.__add__(Timestamp) not defined diff --git a/pandas/tests/extension/test_floating.py b/pandas/tests/extension/test_floating.py index 440190bd6c2f9..c8022210a6aeb 100644 --- a/pandas/tests/extension/test_floating.py +++ b/pandas/tests/extension/test_floating.py @@ -164,6 +164,10 @@ class TestSetitem(base.BaseSetitemTests): pass +class TestIndex(base.BaseIndexTests): + pass + + class TestMissing(base.BaseMissingTests): pass diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index 8c5aadd7a9bcb..f4ec68e7fed88 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -187,6 +187,10 @@ class TestSetitem(base.BaseSetitemTests): pass +class TestIndex(base.BaseIndexTests): + pass + + class TestMissing(base.BaseMissingTests): pass diff --git a/pandas/tests/extension/test_interval.py b/pandas/tests/extension/test_interval.py index 6297995043b25..e2f4d69c489ba 100644 --- a/pandas/tests/extension/test_interval.py +++ b/pandas/tests/extension/test_interval.py @@ -93,6 +93,10 @@ class TestGetitem(BaseInterval, base.BaseGetitemTests): pass +class TestIndex(base.BaseIndexTests): + pass + + class TestGrouping(BaseInterval, base.BaseGroupbyTests): pass diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 
d4bf4cb31d5db..2e1112ccf2205 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -12,6 +12,8 @@ classes (if they are relevant for the extension interface for all dtypes), or be added to the array-specific tests in `pandas/tests/arrays/`. +Note: we do not bother with base.BaseIndexTests because PandasArray +will never be held in an Index. """ import numpy as np import pytest diff --git a/pandas/tests/extension/test_period.py b/pandas/tests/extension/test_period.py index 5ae397566fa0d..bbb464cb7dfed 100644 --- a/pandas/tests/extension/test_period.py +++ b/pandas/tests/extension/test_period.py @@ -85,6 +85,10 @@ class TestGetitem(BasePeriodTests, base.BaseGetitemTests): pass +class TestIndex(base.BaseIndexTests): + pass + + class TestMethods(BasePeriodTests, base.BaseMethodsTests): def test_combine_add(self, data_repeated): # Period + Period is not defined. diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index f7809dc2e4217..2ff224c44af49 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -191,6 +191,26 @@ def test_reindex(self, data, na_value): # Skipping TestSetitem, since we don't implement it. +class TestIndex(base.BaseIndexTests): + def test_index_from_array(self, data): + msg = "will store that array directly" + with tm.assert_produces_warning(FutureWarning, match=msg): + idx = pd.Index(data) + + if data.dtype.subtype == "f": + assert idx.dtype == np.float64 + elif data.dtype.subtype == "i": + assert idx.dtype == np.int64 + else: + assert idx.dtype == data.dtype.subtype + + # TODO(ExtensionIndex) this is failing because it doesn't recognize + # the sparse dtype + @pytest.mark.xfail(reason="Index cannot yet store sparse dtype") + def test_index_from_listlike_with_dtype(self, data): + super().test_index_from_listlike_with_dtype(data) + + class TestMissing(BaseSparseTests, base.BaseMissingTests): def test_isna(self, data_missing): sarr = SparseArray(data_missing) @@ -252,6 +272,14 @@ def test_fillna_frame(self, data_missing): class TestMethods(BaseSparseTests, base.BaseMethodsTests): + @pytest.mark.parametrize("ascending", [True, False]) + def test_sort_values_frame(self, data_for_sorting, ascending): + msg = "will store that array directly" + with tm.assert_produces_warning( + FutureWarning, match=msg, check_stacklevel=False + ): + super().test_sort_values_frame(data_for_sorting, ascending) + def test_combine_le(self, data_repeated): # We return a Series[SparseArray].__le__ returns a # Series[Sparse[bool]] diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index d9351add0fe6d..4256142556894 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -18,6 +18,8 @@ import numpy as np import pytest +from pandas.compat import pa_version_under2p0 + import pandas as pd from pandas.core.arrays import ArrowStringArray from pandas.core.arrays.string_ import StringDtype @@ -134,6 +136,10 @@ def test_setitem_preserves_views(self, data, request): super().test_setitem_preserves_views(data) +class TestIndex(base.BaseIndexTests): + pass + + class TestMissing(base.BaseMissingTests): pass @@ -186,7 +192,12 @@ class TestPrinting(base.BasePrintingTests): class TestGroupBy(base.BaseGroupbyTests): - pass + def test_groupby_extension_transform(self, data_for_grouping, request): + if data_for_grouping.dtype.storage == "pyarrow" and pa_version_under2p0: + # failure observed in 1.0.1, not in 2.0 or later 
+ mark = pytest.mark.xfail(reason="pyarrow raises in self._data[item]") + request.node.add_marker(mark) + super().test_groupby_extension_transform(data_for_grouping) class Test2DCompat(base.Dim2CompatTests): diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 585176fc6a2d7..391a3df5233d1 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1069,7 +1069,7 @@ def test_idxmax_idxmin_convert_dtypes(self, op, expected_value): result = getattr(df, op)() expected = DataFrame( {"value": expected_value}, - index=Index([100, 200], name="ID"), + index=Index([100, 200], name="ID", dtype="Int64"), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 06f00634802b4..dbc38497d3bee 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1136,7 +1136,7 @@ def test_apply_to_nullable_integer_returns_float(values, function): # https://github.com/pandas-dev/pandas/issues/32219 output = 0.5 if function == "var" else 1.5 arr = np.array([output] * 3, dtype=float) - idx = Index([1, 2, 3], name="a") + idx = Index([1, 2, 3], name="a", dtype="Int64") expected = DataFrame({"b": arr}, index=idx).astype("Float64") groups = DataFrame(values, dtype="Int64").groupby("a") @@ -1156,7 +1156,7 @@ def test_groupby_sum_below_mincount_nullable_integer(): # https://github.com/pandas-dev/pandas/issues/32861 df = DataFrame({"a": [0, 1, 2], "b": [0, 1, 2], "c": [0, 1, 2]}, dtype="Int64") grouped = df.groupby("a") - idx = Index([0, 1, 2], name="a") + idx = Index([0, 1, 2], name="a", dtype="Int64") result = grouped["b"].sum(min_count=2) expected = Series([pd.NA] * 3, dtype="Int64", index=idx, name="b") diff --git a/pandas/tests/groupby/test_quantile.py b/pandas/tests/groupby/test_quantile.py index bcb2abeed75e4..1badc4aa7995a 100644 --- a/pandas/tests/groupby/test_quantile.py +++ b/pandas/tests/groupby/test_quantile.py @@ -251,14 +251,14 @@ def test_groupby_quantile_NA_float(any_float_dtype): # GH#42849 df = DataFrame({"x": [1, 1], "y": [0.2, np.nan]}, dtype=any_float_dtype) result = df.groupby("x")["y"].quantile(0.5) - expected = pd.Series([0.2], dtype=float, index=[1.0], name="y") - expected.index.name = "x" + exp_index = Index([1.0], dtype=any_float_dtype, name="x") + expected = pd.Series([0.2], dtype=float, index=exp_index, name="y") tm.assert_series_equal(expected, result) result = df.groupby("x")["y"].quantile([0.5, 0.75]) expected = pd.Series( [0.2] * 2, - index=pd.MultiIndex.from_product(([1.0], [0.5, 0.75]), names=["x", None]), + index=pd.MultiIndex.from_product((exp_index, [0.5, 0.75]), names=["x", None]), name="y", ) tm.assert_series_equal(result, expected) @@ -268,11 +268,13 @@ def test_groupby_quantile_NA_int(any_int_ea_dtype): # GH#42849 df = DataFrame({"x": [1, 1], "y": [2, 5]}, dtype=any_int_ea_dtype) result = df.groupby("x")["y"].quantile(0.5) - expected = pd.Series([3.5], dtype=float, index=Index([1], name="x"), name="y") + expected = pd.Series( + [3.5], dtype=float, index=Index([1], name="x", dtype=any_int_ea_dtype), name="y" + ) tm.assert_series_equal(expected, result) result = df.groupby("x").quantile(0.5) - expected = DataFrame({"y": 3.5}, index=Index([1], name="x")) + expected = DataFrame({"y": 3.5}, index=Index([1], name="x", dtype=any_int_ea_dtype)) tm.assert_frame_equal(result, expected) @@ -281,7 +283,9 @@ def test_groupby_quantile_allNA_column(dtype): # GH#42849 df = DataFrame({"x": [1, 1], 
"y": [pd.NA] * 2}, dtype=dtype) result = df.groupby("x")["y"].quantile(0.5) - expected = pd.Series([np.nan], dtype=float, index=[1.0], name="y") + expected = pd.Series( + [np.nan], dtype=float, index=Index([1.0], dtype=dtype), name="y" + ) expected.index.name = "x" tm.assert_series_equal(expected, result) diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 6b6caf1f8affd..311d93371a51f 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -34,6 +34,7 @@ Int64Index, UInt64Index, ) +from pandas.core.arrays import BaseMaskedArray class Base: @@ -232,6 +233,28 @@ def test_ensure_copied_data(self, index): elif isinstance(index, IntervalIndex): # checked in test_interval.py pass + elif type(index) is Index and not isinstance(index.dtype, np.dtype): + result = index_type(index.values, copy=False, **init_kwargs) + tm.assert_index_equal(result, index) + + if isinstance(index._values, BaseMaskedArray): + assert np.shares_memory(index._values._data, result._values._data) + tm.assert_numpy_array_equal( + index._values._data, result._values._data, check_same="same" + ) + assert np.shares_memory(index._values._mask, result._values._mask) + tm.assert_numpy_array_equal( + index._values._mask, result._values._mask, check_same="same" + ) + elif index.dtype == "string[python]": + assert np.shares_memory(index._values._ndarray, result._values._ndarray) + tm.assert_numpy_array_equal( + index._values._ndarray, result._values._ndarray, check_same="same" + ) + elif index.dtype == "string[pyarrow]": + assert tm.shares_memory(result._values, index._values) + else: + raise NotImplementedError(index.dtype) else: result = index_type(index.values, copy=False, **init_kwargs) tm.assert_numpy_array_equal(index.values, result.values, check_same="same") @@ -251,7 +274,10 @@ def test_memory_usage(self, index): # RangeIndex, IntervalIndex # don't have engines - if not isinstance(index, (RangeIndex, IntervalIndex)): + # Index[EA] has engine but it does not have a Hashtable .mapping + if not isinstance(index, (RangeIndex, IntervalIndex)) and not ( + type(index) is Index and not isinstance(index.dtype, np.dtype) + ): assert result2 > result if index.inferred_type == "object": @@ -398,7 +424,9 @@ def test_equals(self, index): assert index.equals(index) assert index.equals(index.copy()) - assert index.equals(index.astype(object)) + if not (type(index) is Index and not isinstance(index.dtype, np.dtype)): + # doesn't hold for e.g. 
IntegerDtype + assert index.equals(index.astype(object)) assert not index.equals(list(index)) assert not index.equals(np.array(index)) diff --git a/pandas/tests/indexes/datetimelike_/test_nat.py b/pandas/tests/indexes/datetimelike_/test_nat.py index b4a72ec65bd91..50cf29d016355 100644 --- a/pandas/tests/indexes/datetimelike_/test_nat.py +++ b/pandas/tests/indexes/datetimelike_/test_nat.py @@ -17,7 +17,6 @@ def test_nat(self, index_without_na): index_with_na = index_without_na.copy(deep=True) index_with_na._data[1] = NaT - assert type(index_without_na)._na_value is NaT assert empty_index._na_value is NaT assert index_with_na._na_value is NaT assert index_without_na._na_value is NaT diff --git a/pandas/tests/indexes/datetimes/test_constructors.py b/pandas/tests/indexes/datetimes/test_constructors.py index b27c5852cb97b..b1e764ceb7009 100644 --- a/pandas/tests/indexes/datetimes/test_constructors.py +++ b/pandas/tests/indexes/datetimes/test_constructors.py @@ -123,7 +123,9 @@ def test_constructor_from_sparse_array(self): Timestamp("2016-05-01T01:00:00.000000"), ] arr = pd.arrays.SparseArray(values) - result = Index(arr) + msg = "will store that array directly" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = Index(arr) expected = DatetimeIndex(values) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/test_any_index.py b/pandas/tests/indexes/test_any_index.py index 93dd0f3c0a770..860b940b2a350 100644 --- a/pandas/tests/indexes/test_any_index.py +++ b/pandas/tests/indexes/test_any_index.py @@ -138,9 +138,18 @@ def test_slice_keeps_name(self, index): # FutureWarning from non-tuple sequence of nd indexing @pytest.mark.filterwarnings("ignore::FutureWarning") def test_getitem_error(self, index, item): - msg = r"index 101 is out of bounds for axis 0 with size [\d]+|" + re.escape( - "only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) " - "and integer or boolean arrays are valid indices" + msg = "|".join( + [ + r"index 101 is out of bounds for axis 0 with size [\d]+", + re.escape( + "only integers, slices (`:`), ellipsis (`...`), " + "numpy.newaxis (`None`) and integer or boolean arrays " + "are valid indices" + ), + "index out of bounds", # string[pyarrow] + "Only integers, slices and integer or " + "boolean arrays are valid indices.", # string[pyarrow] + ] ) with pytest.raises(IndexError, match=msg): index[item] diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index c74a566cc573d..6cd540365d7be 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -530,21 +530,20 @@ def test_map_dictlike(self, index, mapper): # Cannot map duplicated index return + rng = np.arange(len(index), 0, -1) + if index.empty: # to match proper result coercion for uints expected = Index([]) elif index._is_backward_compat_public_numeric_index: - expected = index._constructor( - np.arange(len(index), 0, -1), dtype=index.dtype - ) + expected = index._constructor(rng, dtype=index.dtype) elif type(index) is Index and index.dtype != object: # i.e. EA-backed, for now just Nullable - expected = Index(np.arange(len(index), 0, -1), dtype=index.dtype) + expected = Index(rng, dtype=index.dtype) elif index.dtype.kind == "u": - # TODO: case where e.g. we cannot hold result in UInt8? 
- expected = Index(np.arange(len(index), 0, -1), dtype=index.dtype) + expected = Index(rng, dtype=index.dtype) else: - expected = Index(np.arange(len(index), 0, -1)) + expected = Index(rng) result = index.map(mapper(expected, index)) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index 50770f5bb38f2..ed57289e759cd 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -465,7 +465,9 @@ def test_sort_values_with_missing(index_with_missing, na_position): sorted_values = np.concatenate([[None] * missing_count, sorted_values]) else: sorted_values = np.concatenate([sorted_values, [None] * missing_count]) - expected = type(index_with_missing)(sorted_values) + + # Explicitly pass dtype needed for Index backed by EA e.g. IntegerArray + expected = type(index_with_missing)(sorted_values, dtype=index_with_missing.dtype) result = index_with_missing.sort_values(na_position=na_position) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/test_numpy_compat.py b/pandas/tests/indexes/test_numpy_compat.py index 573ee987ab4c8..03bdc13d1ea8b 100644 --- a/pandas/tests/indexes/test_numpy_compat.py +++ b/pandas/tests/indexes/test_numpy_compat.py @@ -12,6 +12,7 @@ ) import pandas._testing as tm from pandas.core.api import Float64Index +from pandas.core.arrays import BooleanArray from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin @@ -51,14 +52,21 @@ def test_numpy_ufuncs_basic(index, func): with tm.external_error_raised((TypeError, AttributeError)): with np.errstate(all="ignore"): func(index) - elif isinstance(index, NumericIndex): + elif isinstance(index, NumericIndex) or ( + not isinstance(index.dtype, np.dtype) and index.dtype._is_numeric + ): # coerces to float (e.g. np.sin) with np.errstate(all="ignore"): result = func(index) exp = Index(func(index.values), name=index.name) tm.assert_index_equal(result, exp) - assert isinstance(result, Float64Index) + if type(index) is not Index: + # i.e NumericIndex + assert isinstance(result, Float64Index) + else: + # e.g. np.exp with Int64 -> Float64 + assert type(result) is Index else: # raise AttributeError or TypeError if len(index) == 0: @@ -89,10 +97,16 @@ def test_numpy_ufuncs_other(index, func, request): with tm.external_error_raised(TypeError): func(index) - elif isinstance(index, NumericIndex): + elif isinstance(index, NumericIndex) or ( + not isinstance(index.dtype, np.dtype) and index.dtype._is_numeric + ): # Results in bool array result = func(index) - assert isinstance(result, np.ndarray) + if not isinstance(index.dtype, np.dtype): + # e.g. 
Int64 we expect to get BooleanArray back + assert isinstance(result, BooleanArray) + else: + assert isinstance(result, np.ndarray) assert not isinstance(result, Index) else: if len(index) == 0: @@ -103,11 +117,15 @@ def test_numpy_ufuncs_other(index, func, request): @pytest.mark.parametrize("func", [np.maximum, np.minimum]) -def test_numpy_ufuncs_reductions(index, func): +def test_numpy_ufuncs_reductions(index, func, request): # TODO: overlap with tests.series.test_ufunc.test_reductions if len(index) == 0: return + if repr(index.dtype) == "string[pyarrow]": + mark = pytest.mark.xfail(reason="ArrowStringArray has no min/max") + request.node.add_marker(mark) + if isinstance(index, CategoricalIndex) and index.dtype.ordered is False: with pytest.raises(TypeError, match="is not ordered for"): func.reduce(index) diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index 648b79bd288df..a73ac89994761 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -9,6 +9,7 @@ import pytest from pandas.core.dtypes.cast import find_common_type +from pandas.core.dtypes.common import is_dtype_equal from pandas import ( CategoricalIndex, @@ -46,12 +47,24 @@ def test_union_same_types(index): assert idx1.union(idx2).dtype == idx1.dtype -def test_union_different_types(index_flat, index_flat2): +def test_union_different_types(index_flat, index_flat2, request): # This test only considers combinations of indices # GH 23525 idx1 = index_flat idx2 = index_flat2 + if ( + not idx1.is_unique + and idx1.dtype.kind == "i" + and is_dtype_equal(idx2.dtype, "boolean") + ) or ( + not idx2.is_unique + and idx2.dtype.kind == "i" + and is_dtype_equal(idx1.dtype, "boolean") + ): + mark = pytest.mark.xfail(reason="GH#44000 True==1", raises=ValueError) + request.node.add_marker(mark) + common_dtype = find_common_type([idx1.dtype, idx2.dtype]) any_uint64 = idx1.dtype == np.uint64 or idx2.dtype == np.uint64 @@ -195,6 +208,7 @@ def test_union_base(self, index): first = index[3:] second = index[:5] everything = index + union = first.union(second) assert tm.equalContents(union, everything) diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 358689839d6af..36176bb8194d4 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -99,6 +99,13 @@ def test_getitem_ndarray_3d( msgs.append("Data must be 1-dimensional") if len(index) == 0 or isinstance(index, pd.MultiIndex): msgs.append("positional indexers are out-of-bounds") + if type(index) is Index and not isinstance(index._values, np.ndarray): + # e.g. 
Int64 + msgs.append("values must be a 1D array") + + # string[pyarrow] + msgs.append("only handle 1-dimensional arrays") + msg = "|".join(msgs) potential_errors = (IndexError, ValueError, NotImplementedError) diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 49488e823d662..70e739d1440d6 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -79,14 +79,10 @@ def test_ops(self, opname, obj): ("boolean", True), ], ) - def test_nanminmax(self, opname, dtype, val, index_or_series, request): + def test_nanminmax(self, opname, dtype, val, index_or_series): # GH#7261 klass = index_or_series - if dtype in ["Int64", "boolean"] and klass == Index: - mark = pytest.mark.xfail(reason="Need EA-backed Index") - request.node.add_marker(mark) - def check_missing(res): if dtype == "datetime64[ns]": return res is NaT diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index 0d5201849ea56..06b24de982eea 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -437,9 +437,13 @@ def test_astype_string_to_extension_dtype_roundtrip( ) request.node.add_marker(mark) # GH-40351 - s = Series(data, dtype=dtype) - result = s.astype(nullable_string_dtype).astype(dtype) - tm.assert_series_equal(result, s) + ser = Series(data, dtype=dtype) + + # Note: just passing .astype(dtype) fails for dtype="category" + # with bc ser.dtype.categories will be object dtype whereas + # result.dtype.categories will have string dtype + result = ser.astype(nullable_string_dtype).astype(ser.dtype) + tm.assert_series_equal(result, ser) class TestAstypeCategorical: diff --git a/pandas/tests/series/test_ufunc.py b/pandas/tests/series/test_ufunc.py index 44fd5ac493a8a..ed07a31c24768 100644 --- a/pandas/tests/series/test_ufunc.py +++ b/pandas/tests/series/test_ufunc.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas.core.dtypes.common import is_dtype_equal + import pandas as pd import pandas._testing as tm from pandas.arrays import SparseArray @@ -85,7 +87,10 @@ def test_binary_ufunc_with_index(flip, sparse, ufunc, arrays_for_binary_ufunc): name = "name" # op(pd.Series, array) preserves the name. 
series = pd.Series(a1, name=name) - other = pd.Index(a2, name=name).astype("int64") + + warn = None if not sparse else FutureWarning + with tm.assert_produces_warning(warn): + other = pd.Index(a2, name=name).astype("int64") array_args = (a1, a2) series_args = (series, other) # ufunc(series, array) @@ -275,7 +280,12 @@ def test_multiply(self, values_for_np_reduce, box_with_array, request): box = box_with_array values = values_for_np_reduce - obj = box(values) + warn = None + if is_dtype_equal(values.dtype, "Sparse[int]") and box is pd.Index: + warn = FutureWarning + msg = "passing a SparseArray to pd.Index" + with tm.assert_produces_warning(warn, match=msg): + obj = box(values) if isinstance(values, pd.core.arrays.SparseArray) and box is not pd.Index: mark = pytest.mark.xfail(reason="SparseArray has no 'mul'") @@ -309,7 +319,12 @@ def test_add(self, values_for_np_reduce, box_with_array): box = box_with_array values = values_for_np_reduce - obj = box(values) + warn = None + if is_dtype_equal(values.dtype, "Sparse[int]") and box is pd.Index: + warn = FutureWarning + msg = "passing a SparseArray to pd.Index" + with tm.assert_produces_warning(warn, match=msg): + obj = box(values) if values.dtype.kind in "miuf": result = np.add.reduce(obj) @@ -343,7 +358,12 @@ def test_max(self, values_for_np_reduce, box_with_array): # ATM Index casts to object, so we get python ints/floats same_type = False - obj = box(values) + warn = None + if is_dtype_equal(values.dtype, "Sparse[int]") and box is pd.Index: + warn = FutureWarning + msg = "passing a SparseArray to pd.Index" + with tm.assert_produces_warning(warn, match=msg): + obj = box(values) result = np.maximum.reduce(obj) if box is pd.DataFrame: @@ -366,7 +386,12 @@ def test_min(self, values_for_np_reduce, box_with_array): # ATM Index casts to object, so we get python ints/floats same_type = False - obj = box(values) + warn = None + if is_dtype_equal(values.dtype, "Sparse[int]") and box is pd.Index: + warn = FutureWarning + msg = "passing a SparseArray to pd.Index" + with tm.assert_produces_warning(warn, match=msg): + obj = box(values) result = np.minimum.reduce(obj) if box is pd.DataFrame: diff --git a/pandas/tests/strings/test_extract.py b/pandas/tests/strings/test_extract.py index 16ec4a8c6831c..0f4ffccd8ad7f 100644 --- a/pandas/tests/strings/test_extract.py +++ b/pandas/tests/strings/test_extract.py @@ -257,8 +257,7 @@ def test_extract_expand_True_single_capture_group(index_or_series, any_string_dt # single group renames series/index properly s_or_idx = index_or_series(["A1", "A2"], dtype=any_string_dtype) result = s_or_idx.str.extract(r"(?PA)\d", expand=True) - expected_dtype = "object" if index_or_series is Index else any_string_dtype - expected = DataFrame({"uno": ["A", "A"]}, dtype=expected_dtype) + expected = DataFrame({"uno": ["A", "A"]}, dtype=any_string_dtype) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index af6ffcb2a9379..b72dd111f3b25 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -363,9 +363,6 @@ def test_len_mixed(): def test_index( method, sub, start, end, index_or_series, any_string_dtype, expected, request ): - if index_or_series is Index and not any_string_dtype == "object": - mark = pytest.mark.xfail(reason="Need EA-backed Index") - request.node.add_marker(mark) obj = index_or_series( ["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF"], dtype=any_string_dtype diff --git a/pandas/tests/test_algos.py 
b/pandas/tests/test_algos.py index 0efe4a62c6152..94a20901b2f7a 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -2455,6 +2455,5 @@ def test_union_with_duplicates(op): result = algos.union_with_duplicates(lvals, rvals) tm.assert_numpy_array_equal(result, expected) else: - with tm.assert_produces_warning(RuntimeWarning): - result = algos.union_with_duplicates(lvals, rvals) + result = algos.union_with_duplicates(lvals, rvals) tm.assert_extension_array_equal(result, expected)
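
Illustration (a minimal sketch, not part of the patch above): the user-facing behavior described in the whatsnew entry at the top of this diff, assuming pandas 1.4 or later. On earlier versions the indexes below would be cast to ``object`` dtype; the dtype names and the ``value_counts`` index behavior shown here follow the tests added in this PR.

    import pandas as pd

    arr = pd.array([1, 2, pd.NA], dtype="Int64")

    # Index now holds the masked ExtensionArray directly instead of casting to object
    idx = pd.Index(arr)
    print(idx.dtype)  # Int64

    # value_counts on a nullable Series likewise keeps the original dtype in its index
    counts = pd.Series([1, 2, 1, pd.NA], dtype="Int64").value_counts(dropna=False)
    print(counts.index.dtype)  # Int64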