Skip to content

Commit

Permalink
String dtype: still return nullable NA-variant in object inference (`…
Browse files Browse the repository at this point in the history
…maybe_convert_objects`) if requested (#59487)

* String dtype: maybe_convert_objects gives precedence to nullable dtype

* update datetimelike input validation

* update tests and remove xfails

* explicitly test pd.array() behaviour (remove xfail)

* fixup allow_2d

* undo changes related to datetimelike input validation

* fix test for str on current main

---------

Co-authored-by: Matthew Roeschke <[email protected]>
  • Loading branch information
jorisvandenbossche and mroeschke authored Aug 21, 2024
1 parent 320d613 commit 851639d
Show file tree
Hide file tree
Showing 7 changed files with 43 additions and 20 deletions.
8 changes: 4 additions & 4 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2699,16 +2699,16 @@ def maybe_convert_objects(ndarray[object] objects,
seen.object_ = True

elif seen.str_:
if using_string_dtype() and is_string_array(objects, skipna=True):
if convert_to_nullable_dtype and is_string_array(objects, skipna=True):
from pandas.core.arrays.string_ import StringDtype

dtype = StringDtype(na_value=np.nan)
dtype = StringDtype()
return dtype.construct_array_type()._from_sequence(objects, dtype=dtype)

elif convert_to_nullable_dtype and is_string_array(objects, skipna=True):
elif using_string_dtype() and is_string_array(objects, skipna=True):
from pandas.core.arrays.string_ import StringDtype

dtype = StringDtype()
dtype = StringDtype(na_value=np.nan)
return dtype.construct_array_type()._from_sequence(objects, dtype=dtype)

seen.object_ = True
Expand Down
5 changes: 2 additions & 3 deletions pandas/tests/arrays/string_/test_string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,8 @@ def test_config(string_storage, using_infer_string):
result = pd.array(["a", "b"])
assert result.dtype.storage == string_storage

dtype = StringDtype(
string_storage, na_value=np.nan if using_infer_string else pd.NA
)
# pd.array(..) by default always returns the NA-variant
dtype = StringDtype(string_storage, na_value=pd.NA)
expected = dtype.construct_array_type()._from_sequence(["a", "b"], dtype=dtype)
tm.assert_equal(result, expected)

Expand Down
34 changes: 32 additions & 2 deletions pandas/tests/arrays/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,21 +215,45 @@ def test_dt64_array(dtype_unit):
.construct_array_type()
._from_sequence(["a", None], dtype=pd.StringDtype()),
),
(
["a", None],
"str",
pd.StringDtype(na_value=np.nan)
.construct_array_type()
._from_sequence(["a", None], dtype=pd.StringDtype(na_value=np.nan))
if using_string_dtype()
else NumpyExtensionArray(np.array(["a", "None"])),
),
(
["a", None],
pd.StringDtype(),
pd.StringDtype()
.construct_array_type()
._from_sequence(["a", None], dtype=pd.StringDtype()),
),
(
["a", None],
pd.StringDtype(na_value=np.nan),
pd.StringDtype(na_value=np.nan)
.construct_array_type()
._from_sequence(["a", None], dtype=pd.StringDtype(na_value=np.nan)),
),
(
# numpy array with string dtype
np.array(["a", "b"], dtype=str),
None,
pd.StringDtype(),
pd.StringDtype()
.construct_array_type()
._from_sequence(["a", "b"], dtype=pd.StringDtype()),
),
(
# numpy array with string dtype
np.array(["a", "b"], dtype=str),
pd.StringDtype(na_value=np.nan),
pd.StringDtype(na_value=np.nan)
.construct_array_type()
._from_sequence(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)),
),
# Boolean
(
[True, None],
Expand Down Expand Up @@ -287,7 +311,6 @@ def test_array_copy():
assert tm.shares_memory(a, b)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.parametrize(
"data, expected",
[
Expand Down Expand Up @@ -387,6 +410,13 @@ def test_array_copy():
.construct_array_type()
._from_sequence(["a", None], dtype=pd.StringDtype()),
),
(
# numpy array with string dtype
np.array(["a", "b"], dtype=str),
pd.StringDtype()
.construct_array_type()
._from_sequence(["a", "b"], dtype=pd.StringDtype()),
),
# Boolean
([True, False], BooleanArray._from_sequence([True, False], dtype="boolean")),
([True, None], BooleanArray._from_sequence([True, None], dtype="boolean")),
Expand Down
7 changes: 2 additions & 5 deletions pandas/tests/arrays/test_datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,9 +297,7 @@ def test_searchsorted(self):
assert result == 10

@pytest.mark.parametrize("box", [None, "index", "series"])
def test_searchsorted_castable_strings(
self, arr1d, box, string_storage, using_infer_string
):
def test_searchsorted_castable_strings(self, arr1d, box, string_storage):
arr = arr1d
if box is None:
pass
Expand Down Expand Up @@ -335,8 +333,7 @@ def test_searchsorted_castable_strings(
TypeError,
match=re.escape(
f"value should be a '{arr1d._scalar_type.__name__}', 'NaT', "
"or array of those. Got "
f"{'str' if using_infer_string else 'string'} array instead."
"or array of those. Got string array instead."
),
):
arr.searchsorted([str(arr[1]), "baz"])
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/base/test_value_counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ def test_value_counts_inferred(index_or_series, using_infer_string):
else:
exp = np.unique(np.array(s_values, dtype=np.object_))
if using_infer_string:
exp = array(exp)
exp = array(exp, dtype="str")
tm.assert_equal(s.unique(), exp)

assert s.nunique() == 4
Expand Down Expand Up @@ -192,7 +192,7 @@ def test_value_counts_bins(index_or_series, using_infer_string):
else:
exp = np.array(["a", "b", np.nan, "d"], dtype=object)
if using_infer_string:
exp = array(exp)
exp = array(exp, dtype="str")
tm.assert_equal(s.unique(), exp)
assert s.nunique() == 3

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/dtypes/cast/test_construct_ndarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def test_construct_1d_ndarray_preserving_na(
):
result = sanitize_array(values, index=None, dtype=dtype)
if using_infer_string and expected.dtype == object and dtype is None:
tm.assert_extension_array_equal(result, pd.array(expected))
tm.assert_extension_array_equal(result, pd.array(expected, dtype="str"))
else:
tm.assert_numpy_array_equal(result, expected)

Expand Down
3 changes: 0 additions & 3 deletions pandas/tests/io/parser/usecols/test_usecols_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.errors import ParserError

from pandas import (
Expand Down Expand Up @@ -531,7 +529,6 @@ def test_usecols_additional_columns_integer_columns(all_parsers):
tm.assert_frame_equal(result, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_usecols_dtype(all_parsers):
parser = all_parsers
data = """
Expand Down

0 comments on commit 851639d

Please sign in to comment.