Skip to content

Commit

Permalink
String dtype: fix pyarrow-based IO + update tests (#59478)
Browse files Browse the repository at this point in the history
  • Loading branch information
jorisvandenbossche authored Aug 22, 2024
1 parent 328e79d commit 487c585
Show file tree
Hide file tree
Showing 6 changed files with 79 additions and 48 deletions.
2 changes: 2 additions & 0 deletions pandas/io/_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ def _arrow_dtype_mapping() -> dict:
pa.string(): pd.StringDtype(),
pa.float32(): pd.Float32Dtype(),
pa.float64(): pd.Float64Dtype(),
pa.string(): pd.StringDtype(),
pa.large_string(): pd.StringDtype(),
}


Expand Down
29 changes: 17 additions & 12 deletions pandas/tests/io/test_feather.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,15 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

import pandas as pd
import pandas._testing as tm

from pandas.io.feather_format import read_feather, to_feather # isort:skip

pytestmark = [
pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
),
pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
]
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)


pa = pytest.importorskip("pyarrow")

Expand Down Expand Up @@ -150,8 +146,8 @@ def test_path_pathlib(self):
def test_passthrough_keywords(self):
df = pd.DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=pd.Index(list("ABCD"), dtype=object),
index=pd.Index([f"i-{i}" for i in range(30)], dtype=object),
columns=pd.Index(list("ABCD")),
index=pd.Index([f"i-{i}" for i in range(30)]),
).reset_index()
self.check_round_trip(df, write_kwargs={"version": 1})

Expand All @@ -165,7 +161,9 @@ def test_http_path(self, feather_file, httpserver):
res = read_feather(httpserver.url)
tm.assert_frame_equal(expected, res)

def test_read_feather_dtype_backend(self, string_storage, dtype_backend):
def test_read_feather_dtype_backend(
self, string_storage, dtype_backend, using_infer_string
):
# GH#50765
df = pd.DataFrame(
{
Expand All @@ -187,7 +185,10 @@ def test_read_feather_dtype_backend(self, string_storage, dtype_backend):

if dtype_backend == "pyarrow":
pa = pytest.importorskip("pyarrow")
string_dtype = pd.ArrowDtype(pa.string())
if using_infer_string:
string_dtype = pd.ArrowDtype(pa.large_string())
else:
string_dtype = pd.ArrowDtype(pa.string())
else:
string_dtype = pd.StringDtype(string_storage)

Expand All @@ -214,6 +215,10 @@ def test_read_feather_dtype_backend(self, string_storage, dtype_backend):
}
)

if using_infer_string:
expected.columns = expected.columns.astype(
pd.StringDtype(string_storage, na_value=np.nan)
)
tm.assert_frame_equal(result, expected)

def test_int_columns_and_index(self):
Expand Down
6 changes: 3 additions & 3 deletions pandas/tests/io/test_fsspec.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ def test_excel_options(fsspectest):
assert fsspectest.test[0] == "read"


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet")
def test_to_parquet_new_file(cleared_fs, df1):
"""Regression test for writing to a not-yet-existent GCS Parquet file."""
pytest.importorskip("fastparquet")
Expand Down Expand Up @@ -205,7 +205,7 @@ def test_arrowparquet_options(fsspectest):
assert fsspectest.test[0] == "parquet_read"


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet")
def test_fastparquet_options(fsspectest):
"""Regression test for writing to a not-yet-existent GCS Parquet file."""
pytest.importorskip("fastparquet")
Expand Down Expand Up @@ -263,7 +263,7 @@ def test_s3_protocols(s3_public_bucket_with_data, tips_file, protocol, s3so):
)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet")
@pytest.mark.single_cpu
def test_s3_parquet(s3_public_bucket, s3so, df1):
pytest.importorskip("fastparquet")
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/io/test_gcs.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ def test_to_csv_compression_encoding_gcs(
tm.assert_frame_equal(df, read_df)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet")
def test_to_parquet_gcs_new_file(monkeypatch, tmpdir):
"""Regression test for writing to a not-yet-existent GCS Parquet file."""
pytest.importorskip("fastparquet")
Expand Down
25 changes: 14 additions & 11 deletions pandas/tests/io/test_orc.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

import pandas as pd
from pandas import read_orc
import pandas._testing as tm
Expand All @@ -20,20 +18,17 @@

import pyarrow as pa

pytestmark = [
pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
),
pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
]
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)


@pytest.fixture
def dirpath(datapath):
return datapath("io", "data", "orc")


def test_orc_reader_empty(dirpath):
def test_orc_reader_empty(dirpath, using_infer_string):
columns = [
"boolean1",
"byte1",
Expand All @@ -54,11 +49,12 @@ def test_orc_reader_empty(dirpath):
"float32",
"float64",
"object",
"object",
"str" if using_infer_string else "object",
]
expected = pd.DataFrame(index=pd.RangeIndex(0))
for colname, dtype in zip(columns, dtypes):
expected[colname] = pd.Series(dtype=dtype)
expected.columns = expected.columns.astype("str")

inputfile = os.path.join(dirpath, "TestOrcFile.emptyFile.orc")
got = read_orc(inputfile, columns=columns)
Expand Down Expand Up @@ -305,7 +301,7 @@ def test_orc_writer_dtypes_not_supported(orc_writer_dtypes_not_supported):
df.to_orc()


def test_orc_dtype_backend_pyarrow():
def test_orc_dtype_backend_pyarrow(using_infer_string):
pytest.importorskip("pyarrow")
df = pd.DataFrame(
{
Expand Down Expand Up @@ -338,6 +334,13 @@ def test_orc_dtype_backend_pyarrow():
for col in df.columns
}
)
if using_infer_string:
# ORC does not preserve distinction between string and large string
# -> the default large string comes back as string
string_dtype = pd.ArrowDtype(pa.string())
expected["string"] = expected["string"].astype(string_dtype)
expected["string_with_nan"] = expected["string_with_nan"].astype(string_dtype)
expected["string_with_none"] = expected["string_with_none"].astype(string_dtype)

tm.assert_frame_equal(result, expected)

Expand Down
63 changes: 42 additions & 21 deletions pandas/tests/io/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,6 @@
pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
),
pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
]


Expand All @@ -61,10 +60,17 @@
params=[
pytest.param(
"fastparquet",
marks=pytest.mark.skipif(
not _HAVE_FASTPARQUET,
reason="fastparquet is not installed",
),
marks=[
pytest.mark.skipif(
not _HAVE_FASTPARQUET,
reason="fastparquet is not installed",
),
pytest.mark.xfail(
using_string_dtype(),
reason="TODO(infer_string) fastparquet",
strict=False,
),
],
),
pytest.param(
"pyarrow",
Expand All @@ -86,15 +92,22 @@ def pa():


@pytest.fixture
def fp():
def fp(request):
if not _HAVE_FASTPARQUET:
pytest.skip("fastparquet is not installed")
if using_string_dtype():
request.applymarker(
pytest.mark.xfail(reason="TODO(infer_string) fastparquet", strict=False)
)
return "fastparquet"


@pytest.fixture
def df_compat():
return pd.DataFrame({"A": [1, 2, 3], "B": "foo"})
# TODO(infer_string) should this give str columns?
return pd.DataFrame(
{"A": [1, 2, 3], "B": "foo"}, columns=pd.Index(["A", "B"], dtype=object)
)


@pytest.fixture
Expand Down Expand Up @@ -366,16 +379,6 @@ def check_external_error_on_write(self, df, engine, exc):
with tm.external_error_raised(exc):
to_parquet(df, path, engine, compression=None)

@pytest.mark.network
@pytest.mark.single_cpu
def test_parquet_read_from_url(self, httpserver, datapath, df_compat, engine):
if engine != "auto":
pytest.importorskip(engine)
with open(datapath("io", "data", "parquet", "simple.parquet"), mode="rb") as f:
httpserver.serve_content(content=f.read())
df = read_parquet(httpserver.url)
tm.assert_frame_equal(df, df_compat)


class TestBasic(Base):
def test_error(self, engine):
Expand Down Expand Up @@ -673,6 +676,16 @@ def test_read_empty_array(self, pa, dtype):
df, pa, read_kwargs={"dtype_backend": "numpy_nullable"}, expected=expected
)

@pytest.mark.network
@pytest.mark.single_cpu
def test_parquet_read_from_url(self, httpserver, datapath, df_compat, engine):
if engine != "auto":
pytest.importorskip(engine)
with open(datapath("io", "data", "parquet", "simple.parquet"), mode="rb") as f:
httpserver.serve_content(content=f.read())
df = read_parquet(httpserver.url, engine=engine)
tm.assert_frame_equal(df, df_compat)


class TestParquetPyArrow(Base):
@pytest.mark.xfail(reason="datetime_with_nat unit doesn't round-trip")
Expand Down Expand Up @@ -906,7 +919,7 @@ def test_write_with_schema(self, pa):
out_df = df.astype(bool)
check_round_trip(df, pa, write_kwargs={"schema": schema}, expected=out_df)

def test_additional_extension_arrays(self, pa):
def test_additional_extension_arrays(self, pa, using_infer_string):
# test additional ExtensionArrays that are supported through the
# __arrow_array__ protocol
pytest.importorskip("pyarrow")
Expand All @@ -917,17 +930,25 @@ def test_additional_extension_arrays(self, pa):
"c": pd.Series(["a", None, "c"], dtype="string"),
}
)
check_round_trip(df, pa)
if using_infer_string:
check_round_trip(df, pa, expected=df.astype({"c": "str"}))
else:
check_round_trip(df, pa)

df = pd.DataFrame({"a": pd.Series([1, 2, 3, None], dtype="Int64")})
check_round_trip(df, pa)

def test_pyarrow_backed_string_array(self, pa, string_storage):
def test_pyarrow_backed_string_array(self, pa, string_storage, using_infer_string):
# test ArrowStringArray supported through the __arrow_array__ protocol
pytest.importorskip("pyarrow")
df = pd.DataFrame({"a": pd.Series(["a", None, "c"], dtype="string[pyarrow]")})
with pd.option_context("string_storage", string_storage):
check_round_trip(df, pa, expected=df.astype(f"string[{string_storage}]"))
if using_infer_string:
expected = df.astype("str")
expected.columns = expected.columns.astype("str")
else:
expected = df.astype(f"string[{string_storage}]")
check_round_trip(df, pa, expected=expected)

def test_additional_extension_types(self, pa):
# test additional ExtensionArrays that are supported through the
Expand Down

0 comments on commit 487c585

Please sign in to comment.