diff --git a/doc/source/whatsnew/v2.0.3.rst b/doc/source/whatsnew/v2.0.3.rst
index 146cc0d00fa3b..8e2b1dc315725 100644
--- a/doc/source/whatsnew/v2.0.3.rst
+++ b/doc/source/whatsnew/v2.0.3.rst
@@ -27,6 +27,7 @@ Bug fixes
 - Bug in :func:`read_csv` when defining ``dtype`` with ``bool[pyarrow]`` for the ``"c"`` and ``"python"`` engines (:issue:`53390`)
 - Bug in :meth:`Series.str.split` and :meth:`Series.str.rsplit` with ``expand=True`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`53532`)
 - Bug in indexing methods (e.g. :meth:`DataFrame.__getitem__`) where taking the entire :class:`DataFrame`/:class:`Series` would raise an ``OverflowError`` when Copy on Write was enabled and the length of the array was over the maximum size a 32-bit integer can hold (:issue:`53616`)
+- Bug when constructing a :class:`DataFrame` with columns of an :class:`ArrowDtype` with a ``pyarrow.dictionary`` type that reindexes the data (:issue:`53617`)
 - Bug when indexing a :class:`DataFrame` or :class:`Series` with an :class:`Index` with a timestamp :class:`ArrowDtype` would raise an ``AttributeError`` (:issue:`53644`)
 
 .. ---------------------------------------------------------------------------
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 0e2ef1d63655d..6df553fd57ebd 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -7212,11 +7212,19 @@ def _unpack_nested_dtype(other: Index) -> Index:
     -------
     Index
     """
+    from pandas.core.arrays.arrow import ArrowDtype
+
     dtype = other.dtype
     if isinstance(dtype, CategoricalDtype):
         # If there is ever a SparseIndex, this could get dispatched
         #  here too.
         return dtype.categories
+    elif isinstance(dtype, ArrowDtype):
+        # GH 53617: unpack a dictionary-encoded Arrow dtype to its value type
+        import pyarrow as pa
+
+        if pa.types.is_dictionary(dtype.pyarrow_dtype):
+            other = other.astype(ArrowDtype(dtype.pyarrow_dtype.value_type))
     return other
 
 
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
index 269e710825899..d4bb45b3ba11a 100644
--- a/pandas/tests/frame/test_constructors.py
+++ b/pandas/tests/frame/test_constructors.py
@@ -2696,6 +2696,24 @@ def test_frame_from_dict_with_mixed_tzaware_indexes(self):
         with pytest.raises(TypeError, match=msg):
             DataFrame({"D": ser1, "A": ser2, "B": ser3})
 
+    @pytest.mark.parametrize(
+        "key_val, col_vals, col_type",
+        [
+            ["3", ["3", "4"], "utf8"],
+            [3, [3, 4], "int8"],
+        ],
+    )
+    def test_dict_data_arrow_column_expansion(self, key_val, col_vals, col_type):
+        # GH 53617
+        pa = pytest.importorskip("pyarrow")
+        cols = pd.arrays.ArrowExtensionArray(
+            pa.array(col_vals, type=pa.dictionary(pa.int8(), getattr(pa, col_type)()))
+        )
+        result = DataFrame({key_val: [1, 2]}, columns=cols)
+        expected = DataFrame([[1, np.nan], [2, np.nan]], columns=cols)
+        expected.iloc[:, 1] = expected.iloc[:, 1].astype(object)
+        tm.assert_frame_equal(result, expected)
+
 
 class TestDataFrameConstructorWithDtypeCoercion:
     def test_floating_values_integer_dtype(self):
diff --git a/pandas/tests/indexes/numeric/test_indexing.py b/pandas/tests/indexes/numeric/test_indexing.py
index c99e912ce4c0f..cd28d519313ed 100644
--- a/pandas/tests/indexes/numeric/test_indexing.py
+++ b/pandas/tests/indexes/numeric/test_indexing.py
@@ -11,7 +11,10 @@
     Timestamp,
 )
 import pandas._testing as tm
-from pandas.core.arrays import FloatingArray
+from pandas.core.arrays import (
+    ArrowExtensionArray,
+    FloatingArray,
+)
 
 
 @pytest.fixture
@@ -389,6 +392,26 @@ def test_get_indexer_masked_na_boolean(self, dtype):
         result = idx.get_loc(NA)
         assert result == 2
 
+    def test_get_indexer_arrow_dictionary_target(self):
+        # Indexers are np.intp, which is not int64 on 32-bit platforms
+        pa = pytest.importorskip("pyarrow")
+        target = Index(
+            ArrowExtensionArray(
+                pa.array([1, 2], type=pa.dictionary(pa.int8(), pa.int8()))
+            )
+        )
+        idx = Index([1])
+
+        result = idx.get_indexer(target)
+        expected = np.array([0, -1], dtype=np.intp)
+        tm.assert_numpy_array_equal(result, expected)
+
+        result_1, result_2 = idx.get_indexer_non_unique(target)
+        expected_1 = np.array([0, -1], dtype=np.intp)
+        expected_2 = np.array([1], dtype=np.intp)
+        tm.assert_numpy_array_equal(result_1, expected_1)
+        tm.assert_numpy_array_equal(result_2, expected_2)
+
 
 class TestWhere:
     @pytest.mark.parametrize(