Skip to content

Commit

Permalink
BUG: DataFrame construction with dictionary ArrowDtype columns (panda…
Browse files Browse the repository at this point in the history
…s-dev#53654)

* BUG: DataFrame construction with dictionary ArrowDtype columns

* Add tests for get_indexer

* Windows
  • Loading branch information
mroeschke authored and root committed Jun 23, 2023
1 parent 5710d1a commit cf058cb
Show file tree
Hide file tree
Showing 4 changed files with 50 additions and 1 deletion.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.3.rst
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ Bug fixes
- Bug in :func:`read_csv` when defining ``dtype`` with ``bool[pyarrow]`` for the ``"c"`` and ``"python"`` engines (:issue:`53390`)
- Bug in :meth:`Series.str.split` and :meth:`Series.str.rsplit` with ``expand=True`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`53532`)
- Bug in indexing methods (e.g. :meth:`DataFrame.__getitem__`) where taking the entire :class:`DataFrame`/:class:`Series` would raise an ``OverflowError`` when Copy on Write was enabled and the length of the array was over the maximum size a 32-bit integer can hold (:issue:`53616`)
- Bug when constructing a :class:`DataFrame` whose ``columns`` have an :class:`ArrowDtype` with a ``pyarrow.dictionary`` type, in cases where the construction reindexes the data (:issue:`53617`)
- Bug when indexing a :class:`DataFrame` or :class:`Series` with an :class:`Index` with a timestamp :class:`ArrowDtype` would raise an ``AttributeError`` (:issue:`53644`)

.. ---------------------------------------------------------------------------
Expand Down
7 changes: 7 additions & 0 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@
)
from pandas.core.dtypes.concat import concat_compat
from pandas.core.dtypes.dtypes import (
ArrowDtype,
CategoricalDtype,
DatetimeTZDtype,
ExtensionDtype,
Expand Down Expand Up @@ -7549,6 +7550,12 @@ def _unpack_nested_dtype(other: Index) -> Index:
# If there is ever a SparseIndex, this could get dispatched
# here too.
return dtype.categories
elif isinstance(dtype, ArrowDtype):
# GH 53617
import pyarrow as pa

if pa.types.is_dictionary(dtype.pyarrow_dtype):
other = other.astype(ArrowDtype(dtype.pyarrow_dtype.value_type))
return other


Expand Down
18 changes: 18 additions & 0 deletions pandas/tests/frame/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -2714,6 +2714,24 @@ def test_frame_from_dict_with_mixed_tzaware_indexes(self):
with pytest.raises(TypeError, match=msg):
DataFrame({"D": ser1, "A": ser2, "B": ser3})

@pytest.mark.parametrize(
    "key_val, col_vals, col_type",
    [
        ("3", ["3", "4"], "utf8"),
        (3, [3, 4], "int8"),
    ],
)
def test_dict_data_arrow_column_expansion(self, key_val, col_vals, col_type):
    """Dict data is expanded onto dictionary-typed Arrow column labels.

    The dict supplies values for only one of the two requested column
    labels; the expected frame holds those values in the first column and
    NaN (cast to object) in the second.
    """
    # GH 53617
    pa = pytest.importorskip("pyarrow")

    # Column labels are dictionary-encoded: int8 indices over ``col_type``.
    value_type = getattr(pa, col_type)()
    dictionary_type = pa.dictionary(pa.int8(), value_type)
    cols = pd.arrays.ArrowExtensionArray(pa.array(col_vals, type=dictionary_type))

    result = DataFrame({key_val: [1, 2]}, columns=cols)

    expected = DataFrame([[1, np.nan], [2, np.nan]], columns=cols)
    # The column without any source data is compared as object dtype.
    unmatched_column = expected.iloc[:, 1].astype(object)
    expected.iloc[:, 1] = unmatched_column

    tm.assert_frame_equal(result, expected)


class TestDataFrameConstructorWithDtypeCoercion:
def test_floating_values_integer_dtype(self):
Expand Down
25 changes: 24 additions & 1 deletion pandas/tests/indexes/numeric/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,10 @@
Timestamp,
)
import pandas._testing as tm
from pandas.core.arrays import FloatingArray
from pandas.core.arrays import (
ArrowExtensionArray,
FloatingArray,
)


@pytest.fixture
Expand Down Expand Up @@ -389,6 +392,26 @@ def test_get_indexer_masked_na_boolean(self, dtype):
result = idx.get_loc(NA)
assert result == 2

def test_get_indexer_arrow_dictionary_target(self):
    """``get_indexer``/``get_indexer_non_unique`` accept a dictionary Arrow target.

    The target holds the values 1 and 2 as an int8-indexed dictionary of
    int8; the index being searched holds only 1.
    """
    pa = pytest.importorskip("pyarrow")

    dictionary_type = pa.dictionary(pa.int8(), pa.int8())
    arrow_values = ArrowExtensionArray(pa.array([1, 2], type=dictionary_type))
    target = Index(arrow_values)
    idx = Index([1])

    # Unique path: 1 is found at position 0, 2 is reported as -1.
    expected = np.array([0, -1], dtype=np.int64)
    result = idx.get_indexer(target)
    tm.assert_numpy_array_equal(result, expected)

    # Non-unique path: same indexer, plus the target position that is missing.
    expected_1 = np.array([0, -1], dtype=np.int64)
    expected_2 = np.array([1], dtype=np.int64)
    result_1, result_2 = idx.get_indexer_non_unique(target)
    tm.assert_numpy_array_equal(result_1, expected_1)
    tm.assert_numpy_array_equal(result_2, expected_2)


class TestWhere:
@pytest.mark.parametrize(
Expand Down

0 comments on commit cf058cb

Please sign in to comment.