String dtype: still return nullable NA-variant in object inference (`…

…maybe_converts_object`) if requested (#59487) * String dtype: maybe_converts_object give precedence to nullable dtype * update datetimelike input validation * update tests and remove xfails * explicitly test pd.array() behaviour (remove xfail) * fixup allow_2d * undo changes related to datetimelike input validation * fix test for str on current main --------- Co-authored-by: Matthew Roeschke <[email protected]>
pandas-dev · Aug 21, 2024 · 851639d · 851639d
1 parent 320d613
commit 851639d
Show file tree

Hide file tree

Showing 7 changed files with 43 additions and 20 deletions.
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -2699,16 +2699,16 @@ def maybe_convert_objects(ndarray[object] objects,
         seen.object_ = True
 
     elif seen.str_:
-        if using_string_dtype() and is_string_array(objects, skipna=True):
+        if convert_to_nullable_dtype and is_string_array(objects, skipna=True):
             from pandas.core.arrays.string_ import StringDtype
 
-            dtype = StringDtype(na_value=np.nan)
+            dtype = StringDtype()
             return dtype.construct_array_type()._from_sequence(objects, dtype=dtype)
 
-        elif convert_to_nullable_dtype and is_string_array(objects, skipna=True):
+        elif using_string_dtype() and is_string_array(objects, skipna=True):
             from pandas.core.arrays.string_ import StringDtype
 
-            dtype = StringDtype()
+            dtype = StringDtype(na_value=np.nan)
             return dtype.construct_array_type()._from_sequence(objects, dtype=dtype)
 
         seen.object_ = True

diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py
@@ -36,9 +36,8 @@ def test_config(string_storage, using_infer_string):
         result = pd.array(["a", "b"])
         assert result.dtype.storage == string_storage
 
-    dtype = StringDtype(
-        string_storage, na_value=np.nan if using_infer_string else pd.NA
-    )
+    # pd.array(..) by default always returns the NA-variant
+    dtype = StringDtype(string_storage, na_value=pd.NA)
     expected = dtype.construct_array_type()._from_sequence(["a", "b"], dtype=dtype)
     tm.assert_equal(result, expected)
 

diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py
@@ -215,21 +215,45 @@ def test_dt64_array(dtype_unit):
             .construct_array_type()
             ._from_sequence(["a", None], dtype=pd.StringDtype()),
         ),
+        (
+            ["a", None],
+            "str",
+            pd.StringDtype(na_value=np.nan)
+            .construct_array_type()
+            ._from_sequence(["a", None], dtype=pd.StringDtype(na_value=np.nan))
+            if using_string_dtype()
+            else NumpyExtensionArray(np.array(["a", "None"])),
+        ),
         (
             ["a", None],
             pd.StringDtype(),
             pd.StringDtype()
             .construct_array_type()
             ._from_sequence(["a", None], dtype=pd.StringDtype()),
         ),
+        (
+            ["a", None],
+            pd.StringDtype(na_value=np.nan),
+            pd.StringDtype(na_value=np.nan)
+            .construct_array_type()
+            ._from_sequence(["a", None], dtype=pd.StringDtype(na_value=np.nan)),
+        ),
         (
             # numpy array with string dtype
             np.array(["a", "b"], dtype=str),
-            None,
+            pd.StringDtype(),
             pd.StringDtype()
             .construct_array_type()
             ._from_sequence(["a", "b"], dtype=pd.StringDtype()),
         ),
+        (
+            # numpy array with string dtype
+            np.array(["a", "b"], dtype=str),
+            pd.StringDtype(na_value=np.nan),
+            pd.StringDtype(na_value=np.nan)
+            .construct_array_type()
+            ._from_sequence(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)),
+        ),
         # Boolean
         (
             [True, None],
@@ -287,7 +311,6 @@ def test_array_copy():
     assert tm.shares_memory(a, b)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 @pytest.mark.parametrize(
     "data, expected",
     [
@@ -387,6 +410,13 @@ def test_array_copy():
             .construct_array_type()
             ._from_sequence(["a", None], dtype=pd.StringDtype()),
         ),
+        (
+            # numpy array with string dtype
+            np.array(["a", "b"], dtype=str),
+            pd.StringDtype()
+            .construct_array_type()
+            ._from_sequence(["a", "b"], dtype=pd.StringDtype()),
+        ),
         # Boolean
         ([True, False], BooleanArray._from_sequence([True, False], dtype="boolean")),
         ([True, None], BooleanArray._from_sequence([True, None], dtype="boolean")),

diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py
@@ -297,9 +297,7 @@ def test_searchsorted(self):
         assert result == 10
 
     @pytest.mark.parametrize("box", [None, "index", "series"])
-    def test_searchsorted_castable_strings(
-        self, arr1d, box, string_storage, using_infer_string
-    ):
+    def test_searchsorted_castable_strings(self, arr1d, box, string_storage):
         arr = arr1d
         if box is None:
             pass
@@ -335,8 +333,7 @@ def test_searchsorted_castable_strings(
                 TypeError,
                 match=re.escape(
                     f"value should be a '{arr1d._scalar_type.__name__}', 'NaT', "
-                    "or array of those. Got "
-                    f"{'str' if using_infer_string else 'string'} array instead."
+                    "or array of those. Got string array instead."
                 ),
             ):
                 arr.searchsorted([str(arr[1]), "baz"])

diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py
@@ -114,7 +114,7 @@ def test_value_counts_inferred(index_or_series, using_infer_string):
     else:
         exp = np.unique(np.array(s_values, dtype=np.object_))
         if using_infer_string:
-            exp = array(exp)
+            exp = array(exp, dtype="str")
         tm.assert_equal(s.unique(), exp)
 
     assert s.nunique() == 4
@@ -192,7 +192,7 @@ def test_value_counts_bins(index_or_series, using_infer_string):
     else:
         exp = np.array(["a", "b", np.nan, "d"], dtype=object)
         if using_infer_string:
-            exp = array(exp)
+            exp = array(exp, dtype="str")
         tm.assert_equal(s.unique(), exp)
     assert s.nunique() == 3
 

diff --git a/pandas/tests/dtypes/cast/test_construct_ndarray.py b/pandas/tests/dtypes/cast/test_construct_ndarray.py
@@ -21,7 +21,7 @@ def test_construct_1d_ndarray_preserving_na(
 ):
     result = sanitize_array(values, index=None, dtype=dtype)
     if using_infer_string and expected.dtype == object and dtype is None:
-        tm.assert_extension_array_equal(result, pd.array(expected))
+        tm.assert_extension_array_equal(result, pd.array(expected, dtype="str"))
     else:
         tm.assert_numpy_array_equal(result, expected)
 

diff --git a/pandas/tests/io/parser/usecols/test_usecols_basic.py b/pandas/tests/io/parser/usecols/test_usecols_basic.py
@@ -8,8 +8,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas.errors import ParserError
 
 from pandas import (
@@ -531,7 +529,6 @@ def test_usecols_additional_columns_integer_columns(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_usecols_dtype(all_parsers):
     parser = all_parsers
     data = """