From 8b22858a243384af6d39f992b00f5689762c104a Mon Sep 17 00:00:00 2001 From: KevsterAmp Date: Wed, 7 Aug 2024 12:45:28 +0800 Subject: [PATCH 1/9] add merge type validation on pandas.merge --- pandas/core/reshape/merge.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 6364072fd215c..b03f4235f9c35 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -350,7 +350,10 @@ def merge( left_df = _validate_operand(left) left._check_copy_deprecation(copy) right_df = _validate_operand(right) - if how == "cross": + merge_type = ["left", "right", "inner", "outer", "cross"] + if how not in merge_type: + raise ValueError(f"'{how}' is not a valid Merge type ({merge_type})") + elif how == "cross": return _cross_merge( left_df, right_df, From 775b8b9a0c898d15d93f6c699d20174ee4525be3 Mon Sep 17 00:00:00 2001 From: KevsterAmp Date: Wed, 7 Aug 2024 21:43:44 +0800 Subject: [PATCH 2/9] add ENH to latest whatsnew doc --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 32c98fbf9d655..09bf7e64a67bc 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -30,6 +30,7 @@ Other enhancements ^^^^^^^^^^^^^^^^^^ - :class:`pandas.api.typing.FrozenList` is available for typing the outputs of :attr:`MultiIndex.names`, :attr:`MultiIndex.codes` and :attr:`MultiIndex.levels` (:issue:`58237`) - :class:`pandas.api.typing.SASReader` is available for typing the output of :func:`read_sas` (:issue:`55689`) +- :func:`pandas.merge` now validates the ``how`` parameter input (merge type) (:issue:`59435`) - :func:`DataFrame.to_excel` now raises an ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`) - :func:`read_stata` now returns ``datetime64`` resolutions better matching those natively stored in the stata format (:issue:`55642`) - :meth:`DataFrame.agg` called with ``axis=1`` and a ``func`` which relabels the result index now raises a ``NotImplementedError`` (:issue:`58807`). From 79e905539547499d110cc9d4956c52477fa06a4d Mon Sep 17 00:00:00 2001 From: KevsterAmp Date: Thu, 8 Aug 2024 16:31:07 +0800 Subject: [PATCH 3/9] move merge type validation to _MergeOperation class --- pandas/core/reshape/merge.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index b03f4235f9c35..dde56275b86fb 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -350,10 +350,7 @@ def merge( left_df = _validate_operand(left) left._check_copy_deprecation(copy) right_df = _validate_operand(right) - merge_type = ["left", "right", "inner", "outer", "cross"] - if how not in merge_type: - raise ValueError(f"'{how}' is not a valid Merge type ({merge_type})") - elif how == "cross": + if how == "cross": return _cross_merge( left_df, right_df, @@ -985,6 +982,11 @@ def __init__( ) raise MergeError(msg) + # GH 59435: raise when "how" is not a valid Merge type + merge_type = ("left", "right", "inner", "outer", "cross", "asof") + if how not in merge_type: + raise ValueError(f"'{how}' is not a valid Merge type {merge_type}") + self.left_on, self.right_on = self._validate_left_right_on(left_on, right_on) ( From e1b578144ed9d72064ca56f3f7f2ba7e7dbecd7e Mon Sep 17 00:00:00 2001 From: KevsterAmp Date: Thu, 8 Aug 2024 16:32:31 +0800 Subject: [PATCH 4/9] add tests on merge validation; add asof as valid mergetype --- pandas/tests/reshape/merge/test_merge.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 4a6228e47eba0..86d287e5f6519 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1456,6 +1456,21 @@ def test_merge_readonly(self): data1.merge(data2) # no error + def test_merge_how_validation(self): + # https://github.com/pandas-dev/pandas/issues/59422 + data1 = DataFrame( + np.arange(20).reshape((4, 5)) + 1, columns=["a", "b", "c", "d", "e"] + ) + data2 = DataFrame( + np.arange(20).reshape((5, 4)) + 1, columns=["a", "b", "x", "y"] + ) + msg = ( + "'full' is not a valid Merge type " + "('left', 'right', 'inner', 'outer', 'cross', 'asof')" + ) + with pytest.raises(ValueError, match=re.escape(msg)): + data1.merge(data2, how="full") + def _check_merge(x, y): for how in ["inner", "left", "outer"]: From 19d6085c7b45167f8e5d0da7b15d98b3b25689c8 Mon Sep 17 00:00:00 2001 From: KevsterAmp Date: Wed, 21 Aug 2024 20:36:31 +0800 Subject: [PATCH 5/9] change merge_type from tuple to dict --- pandas/core/reshape/merge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index dde56275b86fb..e79aea381c096 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -983,7 +983,7 @@ def __init__( raise MergeError(msg) # GH 59435: raise when "how" is not a valid Merge type - merge_type = ("left", "right", "inner", "outer", "cross", "asof") + merge_type = {"left", "right", "inner", "outer", "cross", "asof"} if how not in merge_type: raise ValueError(f"'{how}' is not a valid Merge type {merge_type}") From 2d201f995425cfeec5289555d0a85a72398d0f5c Mon Sep 17 00:00:00 2001 From: KevsterAmp Date: Wed, 21 Aug 2024 20:36:52 +0800 Subject: [PATCH 6/9] change ValueError message to updated change --- pandas/tests/frame/methods/test_join.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/tests/frame/methods/test_join.py b/pandas/tests/frame/methods/test_join.py index 7de87e633cfb1..07ac5e2c1d609 100644 --- a/pandas/tests/frame/methods/test_join.py +++ b/pandas/tests/frame/methods/test_join.py @@ -1,4 +1,5 @@ from datetime import datetime +import re import zoneinfo import numpy as np @@ -276,7 +277,11 @@ def test_join_index(float_frame): tm.assert_index_equal(joined.index, float_frame.index.sort_values()) tm.assert_index_equal(joined.columns, expected_columns) - with pytest.raises(ValueError, match="join method"): + join_msg = ( + "'foo' is not a valid Merge type " + "('left', 'right', 'inner', 'outer', 'cross', 'asof')" + ) + with pytest.raises(ValueError, match=re.escape(join_msg)): f.join(f2, how="foo") # corner case - overlapping columns From 42bfbda337c7b9db58fbee0d6142ca296a5514d1 Mon Sep 17 00:00:00 2001 From: KevsterAmp Date: Wed, 21 Aug 2024 21:12:13 +0800 Subject: [PATCH 7/9] change Error message on merge type --- pandas/core/reshape/merge.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index e79aea381c096..07e8fa4841c04 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -985,7 +985,10 @@ def __init__( # GH 59435: raise when "how" is not a valid Merge type merge_type = {"left", "right", "inner", "outer", "cross", "asof"} if how not in merge_type: - raise ValueError(f"'{how}' is not a valid Merge type {merge_type}") + raise ValueError( + f"'{how}' is not a valid Merge type: " + f"left, right, inner, outer, cross, asof" + ) self.left_on, self.right_on = self._validate_left_right_on(left_on, right_on) From 3e5b2394c5f9c302c61ea35a38ff27356d62225b Mon Sep 17 00:00:00 2001 From: KevsterAmp Date: Wed, 21 Aug 2024 21:12:52 +0800 Subject: [PATCH 8/9] update test error messages for merge type errors --- pandas/tests/frame/methods/test_join.py | 5 +---- pandas/tests/reshape/merge/test_merge.py | 5 +---- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/pandas/tests/frame/methods/test_join.py b/pandas/tests/frame/methods/test_join.py index 07ac5e2c1d609..479ea7d7ba692 100644 --- a/pandas/tests/frame/methods/test_join.py +++ b/pandas/tests/frame/methods/test_join.py @@ -277,10 +277,7 @@ def test_join_index(float_frame): tm.assert_index_equal(joined.index, float_frame.index.sort_values()) tm.assert_index_equal(joined.columns, expected_columns) - join_msg = ( - "'foo' is not a valid Merge type " - "('left', 'right', 'inner', 'outer', 'cross', 'asof')" - ) + join_msg = "'foo' is not a valid Merge type: left, right, inner, outer, cross, asof" with pytest.raises(ValueError, match=re.escape(join_msg)): f.join(f2, how="foo") diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 5251b721b685f..d4766242b8460 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1464,10 +1464,7 @@ def test_merge_how_validation(self): data2 = DataFrame( np.arange(20).reshape((5, 4)) + 1, columns=["a", "b", "x", "y"] ) - msg = ( - "'full' is not a valid Merge type " - "('left', 'right', 'inner', 'outer', 'cross', 'asof')" - ) + msg = "'full' is not a valid Merge type: left, right, inner, outer, cross, asof" with pytest.raises(ValueError, match=re.escape(msg)): data1.merge(data2, how="full") From 84e77c961ff23df3cf1c042ae66e5b408c4f3f8b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 21 Aug 2024 23:07:17 +0000 Subject: [PATCH 9/9] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 1e3b22212ac29..6ec219240922f 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -30,8 +30,8 @@ Other enhancements ^^^^^^^^^^^^^^^^^^ - :class:`pandas.api.typing.FrozenList` is available for typing the outputs of :attr:`MultiIndex.names`, :attr:`MultiIndex.codes` and :attr:`MultiIndex.levels` (:issue:`58237`) - :class:`pandas.api.typing.SASReader` is available for typing the output of :func:`read_sas` (:issue:`55689`) -- :func:`pandas.merge` now validates the ``how`` parameter input (merge type) (:issue:`59435`) - :func:`DataFrame.to_excel` now raises an ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`) +- :func:`pandas.merge` now validates the ``how`` parameter input (merge type) (:issue:`59435`) - :func:`read_stata` now returns ``datetime64`` resolutions better matching those natively stored in the stata format (:issue:`55642`) - :meth:`DataFrame.agg` called with ``axis=1`` and a ``func`` which relabels the result index now raises a ``NotImplementedError`` (:issue:`58807`). - :meth:`Index.get_loc` now accepts also subclasses of ``tuple`` as keys (:issue:`57922`)