From 487c585e0db2b1dbf350d56b400ce857b4645eac Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Thu, 22 Aug 2024 11:43:57 +0200
Subject: [PATCH] String dtype: fix pyarrow-based IO + update tests (#59478)

---
 pandas/io/_util.py              |  2 ++
 pandas/tests/io/test_feather.py | 29 ++++++++-------
 pandas/tests/io/test_fsspec.py  |  6 ++--
 pandas/tests/io/test_gcs.py     |  2 +-
 pandas/tests/io/test_orc.py     | 25 +++++++------
 pandas/tests/io/test_parquet.py | 63 ++++++++++++++++++++++-----------
 6 files changed, 79 insertions(+), 48 deletions(-)

diff --git a/pandas/io/_util.py b/pandas/io/_util.py
index f502f827faa4e..a1c3318f04466 100644
--- a/pandas/io/_util.py
+++ b/pandas/io/_util.py
@@ -27,6 +27,8 @@ def _arrow_dtype_mapping() -> dict:
         pa.string(): pd.StringDtype(),
         pa.float32(): pd.Float32Dtype(),
         pa.float64(): pd.Float64Dtype(),
+        pa.string(): pd.StringDtype(),
+        pa.large_string(): pd.StringDtype(),
     }
 
 
diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py
index 6dd4368f09cc8..a1f3babb1ae3b 100644
--- a/pandas/tests/io/test_feather.py
+++ b/pandas/tests/io/test_feather.py
@@ -5,19 +5,15 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 import pandas as pd
 import pandas._testing as tm
 
 from pandas.io.feather_format import read_feather, to_feather  # isort:skip
 
-pytestmark = [
-    pytest.mark.filterwarnings(
-        "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
-    ),
-    pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
-]
+pytestmark = pytest.mark.filterwarnings(
+    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
+)
+
 
 pa = pytest.importorskip("pyarrow")
 
@@ -150,8 +146,8 @@ def test_path_pathlib(self):
     def test_passthrough_keywords(self):
         df = pd.DataFrame(
             1.1 * np.arange(120).reshape((30, 4)),
-            columns=pd.Index(list("ABCD"), dtype=object),
-            index=pd.Index([f"i-{i}" for i in range(30)], dtype=object),
+            columns=pd.Index(list("ABCD")),
+            index=pd.Index([f"i-{i}" for i in range(30)]),
         ).reset_index()
         self.check_round_trip(df, write_kwargs={"version": 1})
 
@@ -165,7 +161,9 @@ def test_http_path(self, feather_file, httpserver):
         res = read_feather(httpserver.url)
         tm.assert_frame_equal(expected, res)
 
-    def test_read_feather_dtype_backend(self, string_storage, dtype_backend):
+    def test_read_feather_dtype_backend(
+        self, string_storage, dtype_backend, using_infer_string
+    ):
         # GH#50765
         df = pd.DataFrame(
             {
@@ -187,7 +185,10 @@ def test_read_feather_dtype_backend(self, string_storage, dtype_backend):
 
         if dtype_backend == "pyarrow":
             pa = pytest.importorskip("pyarrow")
-            string_dtype = pd.ArrowDtype(pa.string())
+            if using_infer_string:
+                string_dtype = pd.ArrowDtype(pa.large_string())
+            else:
+                string_dtype = pd.ArrowDtype(pa.string())
         else:
             string_dtype = pd.StringDtype(string_storage)
 
@@ -214,6 +215,10 @@ def test_read_feather_dtype_backend(self, string_storage, dtype_backend):
                 }
             )
 
+        if using_infer_string:
+            expected.columns = expected.columns.astype(
+                pd.StringDtype(string_storage, na_value=np.nan)
+            )
         tm.assert_frame_equal(result, expected)
 
     def test_int_columns_and_index(self):
diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py
index 45e0cab2165a7..aa9c47ea0e63c 100644
--- a/pandas/tests/io/test_fsspec.py
+++ b/pandas/tests/io/test_fsspec.py
@@ -176,7 +176,7 @@ def test_excel_options(fsspectest):
     assert fsspectest.test[0] == "read"
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet")
reason="TODO(infer_string) fastparquet") def test_to_parquet_new_file(cleared_fs, df1): """Regression test for writing to a not-yet-existent GCS Parquet file.""" pytest.importorskip("fastparquet") @@ -205,7 +205,7 @@ def test_arrowparquet_options(fsspectest): assert fsspectest.test[0] == "parquet_read" -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet") def test_fastparquet_options(fsspectest): """Regression test for writing to a not-yet-existent GCS Parquet file.""" pytest.importorskip("fastparquet") @@ -263,7 +263,7 @@ def test_s3_protocols(s3_public_bucket_with_data, tips_file, protocol, s3so): ) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet") @pytest.mark.single_cpu def test_s3_parquet(s3_public_bucket, s3so, df1): pytest.importorskip("fastparquet") diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index bf56a5781f7cd..a9e7b2da03a4d 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -208,7 +208,7 @@ def test_to_csv_compression_encoding_gcs( tm.assert_frame_equal(df, read_df) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet") def test_to_parquet_gcs_new_file(monkeypatch, tmpdir): """Regression test for writing to a not-yet-existent GCS Parquet file.""" pytest.importorskip("fastparquet") diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index a189afbac070d..90133344fdfc9 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -9,8 +9,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import read_orc import pandas._testing as tm @@ -20,12 +18,9 @@ import pyarrow as pa -pytestmark = [ - pytest.mark.filterwarnings( - "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" - ), - pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), -] +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) @pytest.fixture @@ -33,7 +28,7 @@ def dirpath(datapath): return datapath("io", "data", "orc") -def test_orc_reader_empty(dirpath): +def test_orc_reader_empty(dirpath, using_infer_string): columns = [ "boolean1", "byte1", @@ -54,11 +49,12 @@ def test_orc_reader_empty(dirpath): "float32", "float64", "object", - "object", + "str" if using_infer_string else "object", ] expected = pd.DataFrame(index=pd.RangeIndex(0)) for colname, dtype in zip(columns, dtypes): expected[colname] = pd.Series(dtype=dtype) + expected.columns = expected.columns.astype("str") inputfile = os.path.join(dirpath, "TestOrcFile.emptyFile.orc") got = read_orc(inputfile, columns=columns) @@ -305,7 +301,7 @@ def test_orc_writer_dtypes_not_supported(orc_writer_dtypes_not_supported): df.to_orc() -def test_orc_dtype_backend_pyarrow(): +def test_orc_dtype_backend_pyarrow(using_infer_string): pytest.importorskip("pyarrow") df = pd.DataFrame( { @@ -338,6 +334,13 @@ def test_orc_dtype_backend_pyarrow(): for col in df.columns } ) + if using_infer_string: + # ORC does not preserve distinction between string and large string + # -> the default large string comes back as string + string_dtype = pd.ArrowDtype(pa.string()) + expected["string"] = expected["string"].astype(string_dtype) + 
expected["string_with_nan"] = expected["string_with_nan"].astype(string_dtype) + expected["string_with_none"] = expected["string_with_none"].astype(string_dtype) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index f4d64bf84b3f5..0d0eae25781f1 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -52,7 +52,6 @@ pytest.mark.filterwarnings( "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ), - pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), ] @@ -61,10 +60,17 @@ params=[ pytest.param( "fastparquet", - marks=pytest.mark.skipif( - not _HAVE_FASTPARQUET, - reason="fastparquet is not installed", - ), + marks=[ + pytest.mark.skipif( + not _HAVE_FASTPARQUET, + reason="fastparquet is not installed", + ), + pytest.mark.xfail( + using_string_dtype(), + reason="TODO(infer_string) fastparquet", + strict=False, + ), + ], ), pytest.param( "pyarrow", @@ -86,15 +92,22 @@ def pa(): @pytest.fixture -def fp(): +def fp(request): if not _HAVE_FASTPARQUET: pytest.skip("fastparquet is not installed") + if using_string_dtype(): + request.applymarker( + pytest.mark.xfail(reason="TODO(infer_string) fastparquet", strict=False) + ) return "fastparquet" @pytest.fixture def df_compat(): - return pd.DataFrame({"A": [1, 2, 3], "B": "foo"}) + # TODO(infer_string) should this give str columns? + return pd.DataFrame( + {"A": [1, 2, 3], "B": "foo"}, columns=pd.Index(["A", "B"], dtype=object) + ) @pytest.fixture @@ -366,16 +379,6 @@ def check_external_error_on_write(self, df, engine, exc): with tm.external_error_raised(exc): to_parquet(df, path, engine, compression=None) - @pytest.mark.network - @pytest.mark.single_cpu - def test_parquet_read_from_url(self, httpserver, datapath, df_compat, engine): - if engine != "auto": - pytest.importorskip(engine) - with open(datapath("io", "data", "parquet", "simple.parquet"), mode="rb") as f: - httpserver.serve_content(content=f.read()) - df = read_parquet(httpserver.url) - tm.assert_frame_equal(df, df_compat) - class TestBasic(Base): def test_error(self, engine): @@ -673,6 +676,16 @@ def test_read_empty_array(self, pa, dtype): df, pa, read_kwargs={"dtype_backend": "numpy_nullable"}, expected=expected ) + @pytest.mark.network + @pytest.mark.single_cpu + def test_parquet_read_from_url(self, httpserver, datapath, df_compat, engine): + if engine != "auto": + pytest.importorskip(engine) + with open(datapath("io", "data", "parquet", "simple.parquet"), mode="rb") as f: + httpserver.serve_content(content=f.read()) + df = read_parquet(httpserver.url, engine=engine) + tm.assert_frame_equal(df, df_compat) + class TestParquetPyArrow(Base): @pytest.mark.xfail(reason="datetime_with_nat unit doesn't round-trip") @@ -906,7 +919,7 @@ def test_write_with_schema(self, pa): out_df = df.astype(bool) check_round_trip(df, pa, write_kwargs={"schema": schema}, expected=out_df) - def test_additional_extension_arrays(self, pa): + def test_additional_extension_arrays(self, pa, using_infer_string): # test additional ExtensionArrays that are supported through the # __arrow_array__ protocol pytest.importorskip("pyarrow") @@ -917,17 +930,25 @@ def test_additional_extension_arrays(self, pa): "c": pd.Series(["a", None, "c"], dtype="string"), } ) - check_round_trip(df, pa) + if using_infer_string: + check_round_trip(df, pa, expected=df.astype({"c": "str"})) + else: + check_round_trip(df, pa) df = pd.DataFrame({"a": pd.Series([1, 2, 3, None], dtype="Int64")}) 
         check_round_trip(df, pa)
 
-    def test_pyarrow_backed_string_array(self, pa, string_storage):
+    def test_pyarrow_backed_string_array(self, pa, string_storage, using_infer_string):
         # test ArrowStringArray supported through the __arrow_array__ protocol
         pytest.importorskip("pyarrow")
         df = pd.DataFrame({"a": pd.Series(["a", None, "c"], dtype="string[pyarrow]")})
         with pd.option_context("string_storage", string_storage):
-            check_round_trip(df, pa, expected=df.astype(f"string[{string_storage}]"))
+            if using_infer_string:
+                expected = df.astype("str")
+                expected.columns = expected.columns.astype("str")
+            else:
+                expected = df.astype(f"string[{string_storage}]")
+            check_round_trip(df, pa, expected=expected)
 
     def test_additional_extension_types(self, pa):
         # test additional ExtensionArrays that are supported through the
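Not part of the patch: a minimal sketch of the behavior the `_arrow_dtype_mapping` addition targets, assuming pyarrow is installed and pandas includes the `large_string` -> `StringDtype` entry added above. Arrow `large_string` columns (the type pandas itself writes for its string dtype) should now come back as the nullable `StringDtype` when `dtype_backend="numpy_nullable"` is requested, instead of falling back to object.

```python
# Illustrative only -- not from the patch. Assumes pyarrow is installed and a
# pandas build that contains the large_string -> StringDtype mapping above.
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# Write a Parquet file whose column is explicitly typed as large_string.
table = pa.table({"col": pa.array(["a", None, "c"], type=pa.large_string())})
pq.write_table(table, "example.parquet")

# With the extended mapping, the nullable backend returns StringDtype for the
# large_string column rather than object dtype.
df = pd.read_parquet("example.parquet", dtype_backend="numpy_nullable")
print(df["col"].dtype)  # expected: string (pd.StringDtype())
```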