diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index b2fd413fc7f9c..5f22dc3f209b8 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -111,6 +111,8 @@ def _handle_arrow_array_protocol(obj, type, mask, size): if not isinstance(res, (Array, ChunkedArray)): raise TypeError("The object's __arrow_array__ method does not " "return a pyarrow Array or ChunkedArray.") + if isinstance(res, ChunkedArray) and res.num_chunks==1: + res = res.chunk(0) return res diff --git a/python/pyarrow/tests/parquet/test_pandas.py b/python/pyarrow/tests/parquet/test_pandas.py index b6250e8fc2fbf..08ad59bb84246 100644 --- a/python/pyarrow/tests/parquet/test_pandas.py +++ b/python/pyarrow/tests/parquet/test_pandas.py @@ -26,6 +26,7 @@ from pyarrow.tests.parquet.common import ( parametrize_legacy_dataset, parametrize_legacy_dataset_not_supported) from pyarrow.util import guid +from pyarrow.vendored.version import Version try: import pyarrow.parquet as pq @@ -556,6 +557,30 @@ def test_pandas_categorical_roundtrip(use_legacy_dataset): tm.assert_frame_equal(result, df) +@pytest.mark.pandas +def test_categories_with_string_pyarrow_dtype(tempdir): + # gh-33727: writing to parquet should not fail + if Version(pd.__version__) < Version("1.3.0"): + pytest.skip("PyArrow backed string data type introduced in pandas 1.3.0") + + df1 = pd.DataFrame({"x": ["foo", "bar", "foo"]}, dtype="string[pyarrow]") + df1 = df1.astype("category") + + df2 = pd.DataFrame({"x": ["foo", "bar", "foo"]}) + df2 = df2.astype("category") + + # categories should be converted to pa.Array + assert pa.array(df1["x"]) == pa.array(df2["x"]) + assert pa.array(df1["x"].cat.categories.values) == pa.array( + df2["x"].cat.categories.values) + + path = str(tempdir / 'cat.parquet') + pq.write_table(pa.table(df1), path) + result = pq.read_table(path).to_pandas() + + tm.assert_frame_equal(result, df2) + + @pytest.mark.pandas @parametrize_legacy_dataset def test_write_to_dataset_pandas_preserve_extensiondtypes( diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index fa3de3590486f..8befd50fc1585 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -3283,6 +3283,7 @@ def __arrow_array__(self, type=None): pa.array(arr) # ARROW-7066 - allow ChunkedArray output + # GH-33727 - if num_chunks=1 return Array class MyArray2: def __init__(self, data): self.data = data @@ -3292,7 +3293,21 @@ def __arrow_array__(self, type=None): arr = MyArray2(np.array([1, 2, 3], dtype='int64')) result = pa.array(arr) - expected = pa.chunked_array([[1, 2, 3]], type=pa.int64()) + expected = pa.array([1, 2, 3], type=pa.int64()) + assert result.equals(expected) + + class MyArray3: + def __init__(self, data1, data2): + self.data1 = data1 + self.data2 = data2 + + def __arrow_array__(self, type=None): + return pa.chunked_array([self.data1, self.data2], type=type) + + np_arr = np.array([1, 2, 3], dtype='int64') + arr = MyArray3(np_arr, np_arr) + result = pa.array(arr) + expected = pa.chunked_array([[1, 2, 3], [1, 2, 3]], type=pa.int64()) assert result.equals(expected) diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 4d0ddf875474e..8754572639dfd 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -3924,6 +3924,22 @@ def test_dictionary_from_pandas_specified_type(): assert result.to_pylist() == ['a', 'b'] +def test_convert_categories_to_array_with_string_pyarrow_dtype(): + # gh-33727: categories should be converted to pa.Array + if Version(pd.__version__) < Version("1.3.0"): + pytest.skip("PyArrow backed string data type introduced in pandas 1.3.0") + + df = pd.DataFrame({"x": ["foo", "bar", "foo"]}, dtype="string[pyarrow]") + df = df.astype("category") + indices = pa.array(df['x'].cat.codes) + dictionary = pa.array(df["x"].cat.categories.values) + assert isinstance(dictionary, pa.Array) + + expected = pa.Array.from_pandas(df['x']) + result = pa.DictionaryArray.from_arrays(indices, dictionary) + assert result == expected + + # ---------------------------------------------------------------------- # Array protocol in pandas conversions tests