From 4e7a1ee50e27f31679662cb71e06219d11d82d61 Mon Sep 17 00:00:00 2001 From: Zhengbo Wang <77875500+luke396@users.noreply.github.com> Date: Mon, 15 Jan 2024 14:11:39 +0800 Subject: [PATCH 01/13] CI: Improve error message format in validate_docstrings.py (#56827) --- scripts/validate_docstrings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py index 53c67b7df928b..682d64244bc1f 100755 --- a/scripts/validate_docstrings.py +++ b/scripts/validate_docstrings.py @@ -392,7 +392,7 @@ def header(title, width=80, char="#") -> str: if result["errors"]: sys.stderr.write(f'{len(result["errors"])} Errors found for `{func_name}`:\n') for err_code, err_desc in result["errors"]: - sys.stderr.write(f"\t{err_desc}\n") + sys.stderr.write(f"\t{err_code}\t{err_desc}\n") else: sys.stderr.write(f'Docstring for "{func_name}" correct. :)\n') From d9849dfb9013fe5987611e6dfdaa18411b2d1808 Mon Sep 17 00:00:00 2001 From: Xiao Yuan Date: Mon, 15 Jan 2024 14:34:27 +0800 Subject: [PATCH 02/13] DOC: fix EX03 in `pandas.errors` (#56867) --- ci/code_checks.sh | 4 ---- pandas/errors/__init__.py | 10 +++++----- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 8658715b8bf3e..afb76c29133f9 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -76,13 +76,9 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then pandas.errors.DatabaseError \ pandas.errors.IndexingError \ pandas.errors.InvalidColumnName \ - pandas.errors.PossibleDataLossError \ - pandas.errors.PossiblePrecisionLoss \ - pandas.errors.SettingWithCopyError \ pandas.errors.SettingWithCopyWarning \ pandas.errors.SpecificationError \ pandas.errors.UndefinedVariableError \ - pandas.errors.ValueLabelTypeMismatch \ pandas.Timestamp.ceil \ pandas.Timestamp.floor \ pandas.Timestamp.round \ diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index 9faa17f6e5f15..9d39b8d92fec9 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -425,7 +425,7 @@ class SettingWithCopyError(ValueError): -------- >>> pd.options.mode.chained_assignment = 'raise' >>> df = pd.DataFrame({'A': [1, 1, 1, 2, 2]}, columns=['A']) - >>> df.loc[0:3]['A'] = 'a' # doctest: +SKIP + >>> df.loc[0:3]['A'] = 'a' # doctest: +SKIP ... # SettingWithCopyError: A value is trying to be set on a copy of a... """ @@ -665,8 +665,8 @@ class PossibleDataLossError(Exception): Examples -------- - >>> store = pd.HDFStore('my-store', 'a') # doctest: +SKIP - >>> store.open("w") # doctest: +SKIP + >>> store = pd.HDFStore('my-store', 'a') # doctest: +SKIP + >>> store.open("w") # doctest: +SKIP ... # PossibleDataLossError: Re-opening the file [my-store] with mode [a]... """ @@ -734,7 +734,7 @@ class PossiblePrecisionLoss(Warning): Examples -------- >>> df = pd.DataFrame({"s": pd.Series([1, 2**53], dtype=np.int64)}) - >>> df.to_stata('test') # doctest: +SKIP + >>> df.to_stata('test') # doctest: +SKIP ... # PossiblePrecisionLoss: Column converted from int64 to float64... """ @@ -746,7 +746,7 @@ class ValueLabelTypeMismatch(Warning): Examples -------- >>> df = pd.DataFrame({"categories": pd.Series(["a", 2], dtype="category")}) - >>> df.to_stata('test') # doctest: +SKIP + >>> df.to_stata('test') # doctest: +SKIP ... # ValueLabelTypeMismatch: Stata value labels (pandas categories) must be str... 
""" From a0634c90e8ffe5cd23914270c0f992b6a4078c14 Mon Sep 17 00:00:00 2001 From: Jordan Murphy <35613487+jordan-d-murphy@users.noreply.github.com> Date: Sun, 14 Jan 2024 23:37:48 -0700 Subject: [PATCH 03/13] DOC: fix EX03 errors in docstrings - pandas.core.resample.Resampler.interpolate, pandas.pivot, pandas.merge_asof, pandas.wide_to_long (#56868) --- ci/code_checks.sh | 4 --- pandas/core/reshape/melt.py | 20 +++++++-------- pandas/core/reshape/merge.py | 48 ++++++++++++++++++------------------ 3 files changed, 34 insertions(+), 38 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index afb76c29133f9..50310a3af9661 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -89,11 +89,7 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then pandas.read_parquet \ pandas.DataFrame.to_sql \ pandas.read_stata \ - pandas.core.resample.Resampler.interpolate \ pandas.plotting.scatter_matrix \ - pandas.pivot \ - pandas.merge_asof \ - pandas.wide_to_long \ pandas.Index.rename \ pandas.Index.droplevel \ pandas.Index.isin \ diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index bb1cd0d738dac..1ae7000f56bc9 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -295,7 +295,7 @@ def wide_to_long( ... "B1970" : {0 : 2.5, 1 : 1.2, 2 : .7}, ... "B1980" : {0 : 3.2, 1 : 1.3, 2 : .1}, ... "X" : dict(zip(range(3), np.random.randn(3))) - ... }) + ... }) >>> df["id"] = df.index >>> df A1970 A1980 B1970 B1980 X id @@ -332,8 +332,8 @@ def wide_to_long( 6 3 1 2.2 3.3 7 3 2 2.3 3.4 8 3 3 2.1 2.9 - >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age') - >>> l + >>> long_format = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age') + >>> long_format ... # doctest: +NORMALIZE_WHITESPACE ht famid birth age @@ -358,9 +358,9 @@ def wide_to_long( Going from long back to wide just takes some creative use of `unstack` - >>> w = l.unstack() - >>> w.columns = w.columns.map('{0[0]}{0[1]}'.format) - >>> w.reset_index() + >>> wide_format = long_format.unstack() + >>> wide_format.columns = wide_format.columns.map('{0[0]}{0[1]}'.format) + >>> wide_format.reset_index() famid birth ht1 ht2 0 1 1 2.8 3.4 1 1 2 2.9 3.8 @@ -381,7 +381,7 @@ def wide_to_long( ... 'B(weekly)-2011': np.random.rand(3), ... 'X' : np.random.randint(3, size=3)}) >>> df['id'] = df.index - >>> df # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS + >>> df # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS A(weekly)-2010 A(weekly)-2011 B(weekly)-2010 B(weekly)-2011 X id 0 0.548814 0.544883 0.437587 0.383442 0 0 1 0.715189 0.423655 0.891773 0.791725 1 1 @@ -430,9 +430,9 @@ def wide_to_long( 7 3 2 2.3 3.4 8 3 3 2.1 2.9 - >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age', - ... sep='_', suffix=r'\w+') - >>> l + >>> long_format = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age', + ... sep='_', suffix=r'\w+') + >>> long_format ... # doctest: +NORMALIZE_WHITESPACE ht famid birth age diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 804144931bcfd..6ca403bdb439a 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -601,17 +601,17 @@ def merge_asof( ... pd.Timestamp("2016-05-25 13:30:00.075") ... ], ... "ticker": [ - ... "GOOG", - ... "MSFT", - ... "MSFT", - ... "MSFT", - ... "GOOG", - ... "AAPL", - ... "GOOG", - ... "MSFT" - ... ], - ... "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01], - ... "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03] + ... "GOOG", + ... 
"MSFT", + ... "MSFT", + ... "MSFT", + ... "GOOG", + ... "AAPL", + ... "GOOG", + ... "MSFT" + ... ], + ... "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01], + ... "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03] ... } ... ) >>> quotes @@ -626,19 +626,19 @@ def merge_asof( 7 2016-05-25 13:30:00.075 MSFT 52.01 52.03 >>> trades = pd.DataFrame( - ... { - ... "time": [ - ... pd.Timestamp("2016-05-25 13:30:00.023"), - ... pd.Timestamp("2016-05-25 13:30:00.038"), - ... pd.Timestamp("2016-05-25 13:30:00.048"), - ... pd.Timestamp("2016-05-25 13:30:00.048"), - ... pd.Timestamp("2016-05-25 13:30:00.048") - ... ], - ... "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"], - ... "price": [51.95, 51.95, 720.77, 720.92, 98.0], - ... "quantity": [75, 155, 100, 100, 100] - ... } - ... ) + ... { + ... "time": [ + ... pd.Timestamp("2016-05-25 13:30:00.023"), + ... pd.Timestamp("2016-05-25 13:30:00.038"), + ... pd.Timestamp("2016-05-25 13:30:00.048"), + ... pd.Timestamp("2016-05-25 13:30:00.048"), + ... pd.Timestamp("2016-05-25 13:30:00.048") + ... ], + ... "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"], + ... "price": [51.95, 51.95, 720.77, 720.92, 98.0], + ... "quantity": [75, 155, 100, 100, 100] + ... } + ... ) >>> trades time ticker price quantity 0 2016-05-25 13:30:00.023 MSFT 51.95 75 From 321df559d631fd4d6a13b3ccb8ed8cfcbb9ee456 Mon Sep 17 00:00:00 2001 From: Jordan Murphy <35613487+jordan-d-murphy@users.noreply.github.com> Date: Sun, 14 Jan 2024 23:50:43 -0700 Subject: [PATCH 04/13] DOC: fix EX03 errors in docstrings - pandas.io.formats.style.Styler: format_index, relabel_index, hide, set_td_classes (#56881) --- ci/code_checks.sh | 7 ------- pandas/io/formats/style.py | 14 +++++++------- pandas/io/formats/style_render.py | 30 ++++++++++++++++-------------- 3 files changed, 23 insertions(+), 28 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 50310a3af9661..1eeb39d428ee3 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -101,10 +101,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then pandas.io.formats.style.Styler.apply_index \ pandas.io.formats.style.Styler.map_index \ pandas.io.formats.style.Styler.format \ - pandas.io.formats.style.Styler.format_index \ - pandas.io.formats.style.Styler.relabel_index \ - pandas.io.formats.style.Styler.hide \ - pandas.io.formats.style.Styler.set_td_classes \ pandas.io.formats.style.Styler.set_tooltips \ pandas.io.formats.style.Styler.set_uuid \ pandas.io.formats.style.Styler.pipe \ @@ -114,9 +110,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then pandas.io.formats.style.Styler.text_gradient \ pandas.DataFrame.values \ pandas.DataFrame.groupby \ - pandas.DataFrame.idxmax \ - pandas.DataFrame.idxmin \ - pandas.DataFrame.pivot \ pandas.DataFrame.sort_values \ pandas.DataFrame.plot.hexbin \ pandas.DataFrame.plot.line \ diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 26349dc129361..0fbfae22f4663 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -1496,10 +1496,10 @@ def set_td_classes(self, classes: DataFrame) -> Styler: Using `MultiIndex` columns and a `classes` `DataFrame` as a subset of the underlying, - >>> df = pd.DataFrame([[1,2],[3,4]], index=["a", "b"], - ... columns=[["level0", "level0"], ["level1a", "level1b"]]) + >>> df = pd.DataFrame([[1, 2], [3, 4]], index=["a", "b"], + ... columns=[["level0", "level0"], ["level1a", "level1b"]]) >>> classes = pd.DataFrame(["min-val"], index=["a"], - ... columns=[["level0"],["level1a"]]) + ... 
columns=[["level0"], ["level1a"]]) >>> df.style.set_td_classes(classes) # doctest: +SKIP Form of the output with new additional css classes, @@ -2717,7 +2717,7 @@ def hide( -------- Simple application hiding specific rows: - >>> df = pd.DataFrame([[1,2], [3,4], [5,6]], index=["a", "b", "c"]) + >>> df = pd.DataFrame([[1, 2], [3, 4], [5, 6]], index=["a", "b", "c"]) >>> df.style.hide(["a", "b"]) # doctest: +SKIP 0 1 c 5 6 @@ -2725,7 +2725,7 @@ def hide( Hide the index and retain the data values: >>> midx = pd.MultiIndex.from_product([["x", "y"], ["a", "b", "c"]]) - >>> df = pd.DataFrame(np.random.randn(6,6), index=midx, columns=midx) + >>> df = pd.DataFrame(np.random.randn(6, 6), index=midx, columns=midx) >>> df.style.format("{:.1f}").hide() # doctest: +SKIP x y a b c a b c @@ -2739,7 +2739,7 @@ def hide( Hide specific rows in a MultiIndex but retain the index: >>> df.style.format("{:.1f}").hide(subset=(slice(None), ["a", "c"])) - ... # doctest: +SKIP + ... # doctest: +SKIP x y a b c a b c x b 0.7 1.0 1.3 1.5 -0.0 -0.2 @@ -2748,7 +2748,7 @@ def hide( Hide specific rows and the index through chaining: >>> df.style.format("{:.1f}").hide(subset=(slice(None), ["a", "c"])).hide() - ... # doctest: +SKIP + ... # doctest: +SKIP x y a b c a b c 0.7 1.0 1.3 1.5 -0.0 -0.2 diff --git a/pandas/io/formats/style_render.py b/pandas/io/formats/style_render.py index 55541e5262719..622e047b9f99b 100644 --- a/pandas/io/formats/style_render.py +++ b/pandas/io/formats/style_render.py @@ -1318,9 +1318,10 @@ def format_index( Using the default ``formatter`` for unspecified levels >>> df = pd.DataFrame([[1, 2, 3]], - ... columns=pd.MultiIndex.from_arrays([["a", "a", "b"],[2, np.nan, 4]])) + ... columns=pd.MultiIndex.from_arrays( + ... [["a", "a", "b"], [2, np.nan, 4]])) >>> df.style.format_index({0: lambda v: v.upper()}, axis=1, precision=1) - ... # doctest: +SKIP + ... # doctest: +SKIP A B 2.0 nan 4.0 0 1 2 3 @@ -1329,7 +1330,7 @@ def format_index( >>> func = lambda s: 'STRING' if isinstance(s, str) else 'FLOAT' >>> df.style.format_index(func, axis=1, na_rep='MISS') - ... # doctest: +SKIP + ... # doctest: +SKIP STRING STRING FLOAT MISS FLOAT 0 1 2 3 @@ -1338,7 +1339,7 @@ def format_index( >>> df = pd.DataFrame([[1, 2, 3]], columns=['"A"', 'A&B', None]) >>> s = df.style.format_index('$ {0}', axis=1, escape="html", na_rep="NA") - ... # doctest: +SKIP + ... # doctest: +SKIP $ "A" $ A&B NA @@ -1348,7 +1349,7 @@ def format_index( >>> df = pd.DataFrame([[1, 2, 3]], columns=["123", "~", "$%#"]) >>> df.style.format_index("\\textbf{{{}}}", escape="latex", axis=1).to_latex() - ... # doctest: +SKIP + ... # doctest: +SKIP \begin{tabular}{lrrr} {} & {\textbf{123}} & {\textbf{\textasciitilde }} & {\textbf{\$\%\#}} \\ 0 & 1 & 2 & 3 \\ @@ -1475,7 +1476,7 @@ def relabel_index( Chaining with pre-hidden elements - >>> df.style.hide([0,1]).relabel_index(["C"]) # doctest: +SKIP + >>> df.style.hide([0, 1]).relabel_index(["C"]) # doctest: +SKIP col C c @@ -1493,9 +1494,10 @@ def relabel_index( 1 5 1 0 6 1 7 - >>> styler.hide((midx.get_level_values(0)==0)|(midx.get_level_values(1)==0)) - ... # doctest: +SKIP - >>> styler.hide(level=[0,1]) # doctest: +SKIP + >>> styler.hide((midx.get_level_values(0) == 0) | + ... (midx.get_level_values(1) == 0)) + ... 
# doctest: +SKIP + >>> styler.hide(level=[0, 1]) # doctest: +SKIP >>> styler.relabel_index(["binary6", "binary7"]) # doctest: +SKIP col binary6 6 @@ -1503,9 +1505,9 @@ def relabel_index( We can also achieve the above by indexing first and then re-labeling - >>> styler = df.loc[[(1,1,0), (1,1,1)]].style - >>> styler.hide(level=[0,1]).relabel_index(["binary6", "binary7"]) - ... # doctest: +SKIP + >>> styler = df.loc[[(1, 1, 0), (1, 1, 1)]].style + >>> styler.hide(level=[0, 1]).relabel_index(["binary6", "binary7"]) + ... # doctest: +SKIP col binary6 6 binary7 7 @@ -1516,9 +1518,9 @@ def relabel_index( brackets if the string if pre-formatted), >>> df = pd.DataFrame({"samples": np.random.rand(10)}) - >>> styler = df.loc[np.random.randint(0,10,3)].style + >>> styler = df.loc[np.random.randint(0, 10, 3)].style >>> styler.relabel_index([f"sample{i+1} ({{}})" for i in range(3)]) - ... # doctest: +SKIP + ... # doctest: +SKIP samples sample1 (5) 0.315811 sample2 (0) 0.495941 From cc0ae35f08010b519e971b0e448cfcbabba61a01 Mon Sep 17 00:00:00 2001 From: Xiao Yuan Date: Mon, 15 Jan 2024 14:51:56 +0800 Subject: [PATCH 05/13] DOC: fix EX03 in pandas.ExcelWriter (#56884) --- ci/code_checks.sh | 1 - pandas/io/excel/_base.py | 7 ++++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 1eeb39d428ee3..6178d0b2a5307 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -82,7 +82,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then pandas.Timestamp.ceil \ pandas.Timestamp.floor \ pandas.Timestamp.round \ - pandas.ExcelWriter \ pandas.read_json \ pandas.io.json.build_table_schema \ pandas.io.formats.style.Styler.to_latex \ diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 786f719337b84..2189f54263dec 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -935,7 +935,7 @@ class ExcelWriter(Generic[_WorkbookT]): is installed otherwise `openpyxl `__ * `odswriter `__ for ods files - See ``DataFrame.to_excel`` for typical usage. + See :meth:`DataFrame.to_excel` for typical usage. The writer should be used as a context manager. Otherwise, call `close()` to save and close any opened file handles. @@ -1031,7 +1031,7 @@ class ExcelWriter(Generic[_WorkbookT]): Here, the `if_sheet_exists` parameter can be set to replace a sheet if it already exists: - >>> with ExcelWriter( + >>> with pd.ExcelWriter( ... "path_to_file.xlsx", ... mode="a", ... engine="openpyxl", @@ -1042,7 +1042,8 @@ class ExcelWriter(Generic[_WorkbookT]): You can also write multiple DataFrames to a single sheet. Note that the ``if_sheet_exists`` parameter needs to be set to ``overlay``: - >>> with ExcelWriter("path_to_file.xlsx", + >>> with pd.ExcelWriter( + ... "path_to_file.xlsx", ... mode="a", ... engine="openpyxl", ... 
if_sheet_exists="overlay", From 82449b9d3c2aaf363b03fdaaa6c8c552b9fd72e6 Mon Sep 17 00:00:00 2001 From: Jordan Murphy <35613487+jordan-d-murphy@users.noreply.github.com> Date: Mon, 15 Jan 2024 00:51:10 -0700 Subject: [PATCH 06/13] DOC: fix EX03 errors in docstrings - pandas.Index.rename, pandas.Index.isin, pandas.IndexSlice (#56870) Co-authored-by: Marc Garcia --- ci/code_checks.sh | 3 --- pandas/core/indexes/base.py | 6 +++--- pandas/core/indexing.py | 2 +- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 6178d0b2a5307..f27dbeaf35915 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -89,12 +89,9 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then pandas.DataFrame.to_sql \ pandas.read_stata \ pandas.plotting.scatter_matrix \ - pandas.Index.rename \ pandas.Index.droplevel \ - pandas.Index.isin \ pandas.MultiIndex.names \ pandas.MultiIndex.droplevel \ - pandas.IndexSlice \ pandas.Grouper \ pandas.io.formats.style.Styler.map \ pandas.io.formats.style.Styler.apply_index \ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index c3775961cedb8..bdd6392387ae8 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1953,7 +1953,7 @@ def rename(self, name, inplace: bool = False) -> Self | None: >>> idx = pd.MultiIndex.from_product([['python', 'cobra'], ... [2018, 2019]], - ... names=['kind', 'year']) + ... names=['kind', 'year']) >>> idx MultiIndex([('python', 2018), ('python', 2019), @@ -6575,7 +6575,7 @@ def isin(self, values, level=None) -> npt.NDArray[np.bool_]: Examples -------- - >>> idx = pd.Index([1,2,3]) + >>> idx = pd.Index([1, 2, 3]) >>> idx Index([1, 2, 3], dtype='int64') @@ -6584,7 +6584,7 @@ def isin(self, values, level=None) -> npt.NDArray[np.bool_]: >>> idx.isin([1, 4]) array([ True, False, False]) - >>> midx = pd.MultiIndex.from_arrays([[1,2,3], + >>> midx = pd.MultiIndex.from_arrays([[1, 2, 3], ... ['red', 'blue', 'green']], ... names=('number', 'color')) >>> midx diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 0f892d4924933..2e7a237406ca5 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -121,7 +121,7 @@ class _IndexSlice: Examples -------- - >>> midx = pd.MultiIndex.from_product([['A0','A1'], ['B0','B1','B2','B3']]) + >>> midx = pd.MultiIndex.from_product([['A0', 'A1'], ['B0', 'B1', 'B2', 'B3']]) >>> columns = ['foo', 'bar'] >>> dfmi = pd.DataFrame(np.arange(16).reshape((len(midx), len(columns))), ... index=midx, columns=columns) From 116832d91b40384dfa1fab0fc38fc40d54e2b0a8 Mon Sep 17 00:00:00 2001 From: "Paulo S. 
Costa" Date: Mon, 15 Jan 2024 10:01:43 -0600 Subject: [PATCH 07/13] TYP: Persist typing information for pipe args and kwargs (#56760) * Type generic pipe with function params * Type common pipe with function params * Type resample pipe with function params * Type groupby pipe with function params * Type style pipe function params and tuple func --- pandas/_typing.py | 13 +++++++++++- pandas/core/common.py | 36 +++++++++++++++++++++++++++++++--- pandas/core/generic.py | 26 +++++++++++++++++++++--- pandas/core/groupby/groupby.py | 34 +++++++++++++++++++++++++++----- pandas/core/resample.py | 29 ++++++++++++++++++++++++--- pandas/io/formats/style.py | 31 +++++++++++++++++++++++++++-- 6 files changed, 152 insertions(+), 17 deletions(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index a80f9603493a7..fa9dc14bb4bd7 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -90,18 +90,29 @@ from typing import SupportsIndex if sys.version_info >= (3, 10): + from typing import Concatenate # pyright: ignore[reportUnusedImport] + from typing import ParamSpec from typing import TypeGuard # pyright: ignore[reportUnusedImport] else: - from typing_extensions import TypeGuard # pyright: ignore[reportUnusedImport] + from typing_extensions import ( # pyright: ignore[reportUnusedImport] + Concatenate, + ParamSpec, + TypeGuard, + ) + + P = ParamSpec("P") if sys.version_info >= (3, 11): from typing import Self # pyright: ignore[reportUnusedImport] else: from typing_extensions import Self # pyright: ignore[reportUnusedImport] + else: npt: Any = None + ParamSpec: Any = None Self: Any = None TypeGuard: Any = None + Concatenate: Any = None HashableT = TypeVar("HashableT", bound=Hashable) MutableMappingT = TypeVar("MutableMappingT", bound=MutableMapping) diff --git a/pandas/core/common.py b/pandas/core/common.py index 7d864e02be54e..69b602feee3ea 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -24,6 +24,7 @@ TYPE_CHECKING, Any, Callable, + TypeVar, cast, overload, ) @@ -51,7 +52,9 @@ from pandas._typing import ( AnyArrayLike, ArrayLike, + Concatenate, NpDtype, + P, RandomState, T, ) @@ -463,8 +466,34 @@ def random_state(state: RandomState | None = None): ) +_T = TypeVar("_T") # Secondary TypeVar for use in pipe's type hints + + +@overload +def pipe( + obj: _T, + func: Callable[Concatenate[_T, P], T], + *args: P.args, + **kwargs: P.kwargs, +) -> T: + ... + + +@overload +def pipe( + obj: Any, + func: tuple[Callable[..., T], str], + *args: Any, + **kwargs: Any, +) -> T: + ... + + def pipe( - obj, func: Callable[..., T] | tuple[Callable[..., T], str], *args, **kwargs + obj: _T, + func: Callable[Concatenate[_T, P], T] | tuple[Callable[..., T], str], + *args: Any, + **kwargs: Any, ) -> T: """ Apply a function ``func`` to object ``obj`` either by passing obj as the @@ -490,12 +519,13 @@ def pipe( object : the return type of ``func``. 
""" if isinstance(func, tuple): - func, target = func + # Assigning to func_ so pyright understands that it's a callable + func_, target = func if target in kwargs: msg = f"{target} is both the pipe target and a keyword argument" raise ValueError(msg) kwargs[target] = obj - return func(*args, **kwargs) + return func_(*args, **kwargs) else: return func(obj, *args, **kwargs) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index a61148a09be18..caac11b6ab4f6 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -50,6 +50,7 @@ Axis, AxisInt, CompressionOptions, + Concatenate, DtypeArg, DtypeBackend, DtypeObj, @@ -213,6 +214,7 @@ ) from pandas._libs.tslibs import BaseOffset + from pandas._typing import P from pandas import ( DataFrame, @@ -6118,13 +6120,31 @@ def sample( return result + @overload + def pipe( + self, + func: Callable[Concatenate[Self, P], T], + *args: P.args, + **kwargs: P.kwargs, + ) -> T: + ... + + @overload + def pipe( + self, + func: tuple[Callable[..., T], str], + *args: Any, + **kwargs: Any, + ) -> T: + ... + @final @doc(klass=_shared_doc_kwargs["klass"]) def pipe( self, - func: Callable[..., T] | tuple[Callable[..., T], str], - *args, - **kwargs, + func: Callable[Concatenate[Self, P], T] | tuple[Callable[..., T], str], + *args: Any, + **kwargs: Any, ) -> T: r""" Apply chainable functions that expect Series or DataFrames. diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index f36297a59498d..ab22d4e3dc200 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -29,6 +29,7 @@ class providing the base-class of operations. Union, cast, final, + overload, ) import warnings @@ -55,7 +56,6 @@ class providing the base-class of operations. PositionalIndexer, RandomState, Scalar, - T, npt, ) from pandas.compat.numpy import function as nv @@ -147,7 +147,13 @@ class providing the base-class of operations. ) if TYPE_CHECKING: - from typing import Any + from pandas._typing import ( + Any, + Concatenate, + P, + Self, + T, + ) from pandas.core.resample import Resampler from pandas.core.window import ( @@ -989,6 +995,24 @@ def _selected_obj(self): def _dir_additions(self) -> set[str]: return self.obj._dir_additions() + @overload + def pipe( + self, + func: Callable[Concatenate[Self, P], T], + *args: P.args, + **kwargs: P.kwargs, + ) -> T: + ... + + @overload + def pipe( + self, + func: tuple[Callable[..., T], str], + *args: Any, + **kwargs: Any, + ) -> T: + ... 
+ @Substitution( klass="GroupBy", examples=dedent( @@ -1014,9 +1038,9 @@ def _dir_additions(self) -> set[str]: @Appender(_pipe_template) def pipe( self, - func: Callable[..., T] | tuple[Callable[..., T], str], - *args, - **kwargs, + func: Callable[Concatenate[Self, P], T] | tuple[Callable[..., T], str], + *args: Any, + **kwargs: Any, ) -> T: return com.pipe(self, func, *args, **kwargs) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 31309777c154d..924f9e6d49040 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -9,6 +9,7 @@ cast, final, no_type_check, + overload, ) import warnings @@ -97,12 +98,16 @@ from collections.abc import Hashable from pandas._typing import ( + Any, AnyArrayLike, Axis, AxisInt, + Concatenate, Frequency, IndexLabel, InterpolateOptions, + P, + Self, T, TimedeltaConvertibleTypes, TimeGrouperOrigin, @@ -254,6 +259,24 @@ def _get_binner(self): bin_grouper = BinGrouper(bins, binlabels, indexer=self._indexer) return binner, bin_grouper + @overload + def pipe( + self, + func: Callable[Concatenate[Self, P], T], + *args: P.args, + **kwargs: P.kwargs, + ) -> T: + ... + + @overload + def pipe( + self, + func: tuple[Callable[..., T], str], + *args: Any, + **kwargs: Any, + ) -> T: + ... + @final @Substitution( klass="Resampler", @@ -278,9 +301,9 @@ def _get_binner(self): @Appender(_pipe_template) def pipe( self, - func: Callable[..., T] | tuple[Callable[..., T], str], - *args, - **kwargs, + func: Callable[Concatenate[Self, P], T] | tuple[Callable[..., T], str], + *args: Any, + **kwargs: Any, ) -> T: return super().pipe(func, *args, **kwargs) diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 0fbfae22f4663..5289a21adfbb4 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -9,7 +9,6 @@ import operator from typing import ( TYPE_CHECKING, - Any, Callable, overload, ) @@ -66,15 +65,20 @@ from matplotlib.colors import Colormap from pandas._typing import ( + Any, Axis, AxisInt, + Concatenate, FilePath, IndexLabel, IntervalClosedType, Level, + P, QuantileInterpolation, Scalar, + Self, StorageOptions, + T, WriteBuffer, WriteExcelBuffer, ) @@ -3614,7 +3618,30 @@ class MyStyler(cls): # type: ignore[valid-type,misc] return MyStyler - def pipe(self, func: Callable, *args, **kwargs): + @overload + def pipe( + self, + func: Callable[Concatenate[Self, P], T], + *args: P.args, + **kwargs: P.kwargs, + ) -> T: + ... + + @overload + def pipe( + self, + func: tuple[Callable[..., T], str], + *args: Any, + **kwargs: Any, + ) -> T: + ... + + def pipe( + self, + func: Callable[Concatenate[Self, P], T] | tuple[Callable[..., T], str], + *args: Any, + **kwargs: Any, + ) -> T: """ Apply ``func(self, *args, **kwargs)``, and return the result. From 32d29ff3bd4c21e7e58ebcda33aa4d25605fb388 Mon Sep 17 00:00:00 2001 From: Saadha Salim Date: Mon, 15 Jan 2024 21:59:18 +0530 Subject: [PATCH 08/13] DOC: Additions/updates to documentation : grammar changes to documentation (#56886) --- doc/source/development/contributing.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index 82af8122a6bbd..78d22c768b865 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -19,7 +19,7 @@ Bug reports and enhancement requests ==================================== Bug reports and enhancement requests are an important part of making pandas more stable and -are curated though Github issues. 
When reporting and issue or request, please select the `appropriate +are curated though Github issues. When reporting an issue or request, please select the `appropriate category and fill out the issue form fully `_ to ensure others and the core development team can fully understand the scope of the issue. From 5c67c969acb19810377bf7163bc902ea9d8dcd79 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 15 Jan 2024 18:52:24 +0100 Subject: [PATCH 09/13] TST: extension tests use its own fixtures (#56889) TST: ensure extension tests use its own fixtures --- pandas/tests/extension/base/accumulate.py | 1 + pandas/tests/extension/base/dtype.py | 1 + pandas/tests/extension/base/groupby.py | 1 + pandas/tests/extension/base/methods.py | 6 ++++++ pandas/tests/extension/base/reduce.py | 3 +++ pandas/tests/extension/base/setitem.py | 5 +++-- pandas/tests/extension/decimal/test_decimal.py | 1 + pandas/tests/extension/test_arrow.py | 7 +++++++ pandas/tests/extension/test_categorical.py | 2 ++ pandas/tests/extension/test_datetime.py | 2 ++ pandas/tests/extension/test_masked.py | 1 + pandas/tests/extension/test_numpy.py | 1 + pandas/tests/extension/test_period.py | 1 + pandas/tests/extension/test_sparse.py | 4 ++++ 14 files changed, 34 insertions(+), 2 deletions(-) diff --git a/pandas/tests/extension/base/accumulate.py b/pandas/tests/extension/base/accumulate.py index 9189ef7ec9aa5..9a41a3a582c4a 100644 --- a/pandas/tests/extension/base/accumulate.py +++ b/pandas/tests/extension/base/accumulate.py @@ -26,6 +26,7 @@ def check_accumulate(self, ser: pd.Series, op_name: str, skipna: bool): expected = getattr(alt, op_name)(skipna=skipna) tm.assert_series_equal(result, expected, check_dtype=False) + @pytest.mark.parametrize("skipna", [True, False]) def test_accumulate_series(self, data, all_numeric_accumulations, skipna): op_name = all_numeric_accumulations ser = pd.Series(data) diff --git a/pandas/tests/extension/base/dtype.py b/pandas/tests/extension/base/dtype.py index 3fb116430861a..c7b768f6e3c88 100644 --- a/pandas/tests/extension/base/dtype.py +++ b/pandas/tests/extension/base/dtype.py @@ -114,6 +114,7 @@ def test_get_common_dtype(self, dtype): # only case we can test in general) assert dtype._get_common_dtype([dtype]) == dtype + @pytest.mark.parametrize("skipna", [True, False]) def test_infer_dtype(self, data, data_missing, skipna): # only testing that this works without raising an error res = infer_dtype(data, skipna=skipna) diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py index 9f38246d1a317..75628ea177fc2 100644 --- a/pandas/tests/extension/base/groupby.py +++ b/pandas/tests/extension/base/groupby.py @@ -34,6 +34,7 @@ def test_grouping_grouper(self, data_for_grouping): tm.assert_numpy_array_equal(gr1.grouping_vector, df.A.values) tm.assert_extension_array_equal(gr2.grouping_vector, data_for_grouping) + @pytest.mark.parametrize("as_index", [True, False]) def test_groupby_extension_agg(self, as_index, data_for_grouping): df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index ba247f51e5f1b..c803a8113b4a4 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -37,6 +37,7 @@ def test_value_counts_default_dropna(self, data): kwarg = sig.parameters["dropna"] assert kwarg.default is True + @pytest.mark.parametrize("dropna", [True, False]) def test_value_counts(self, all_data, dropna): all_data 
= all_data[:10] if dropna: @@ -96,6 +97,7 @@ def test_apply_simple_series(self, data): result = pd.Series(data).apply(id) assert isinstance(result, pd.Series) + @pytest.mark.parametrize("na_action", [None, "ignore"]) def test_map(self, data_missing, na_action): result = data_missing.map(lambda x: x, na_action=na_action) expected = data_missing.to_numpy() @@ -211,6 +213,7 @@ def test_nargsort(self, data_missing_for_sorting, na_position, expected): result = nargsort(data_missing_for_sorting, na_position=na_position) tm.assert_numpy_array_equal(result, expected) + @pytest.mark.parametrize("ascending", [True, False]) def test_sort_values(self, data_for_sorting, ascending, sort_by_key): ser = pd.Series(data_for_sorting) result = ser.sort_values(ascending=ascending, key=sort_by_key) @@ -224,6 +227,7 @@ def test_sort_values(self, data_for_sorting, ascending, sort_by_key): tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("ascending", [True, False]) def test_sort_values_missing( self, data_missing_for_sorting, ascending, sort_by_key ): @@ -235,6 +239,7 @@ def test_sort_values_missing( expected = ser.iloc[[0, 2, 1]] tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("ascending", [True, False]) def test_sort_values_frame(self, data_for_sorting, ascending): df = pd.DataFrame({"A": [1, 2, 1], "B": data_for_sorting}) result = df.sort_values(["A", "B"]) @@ -243,6 +248,7 @@ def test_sort_values_frame(self, data_for_sorting, ascending): ) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("keep", ["first", "last", False]) def test_duplicated(self, data, keep): arr = data.take([0, 1, 0, 1]) result = arr.duplicated(keep=keep) diff --git a/pandas/tests/extension/base/reduce.py b/pandas/tests/extension/base/reduce.py index 2a443901fa41a..6ea1b3a6fbe9d 100644 --- a/pandas/tests/extension/base/reduce.py +++ b/pandas/tests/extension/base/reduce.py @@ -77,6 +77,7 @@ def check_reduce_frame(self, ser: pd.Series, op_name: str, skipna: bool): tm.assert_extension_array_equal(result1, expected) + @pytest.mark.parametrize("skipna", [True, False]) def test_reduce_series_boolean(self, data, all_boolean_reductions, skipna): op_name = all_boolean_reductions ser = pd.Series(data) @@ -95,6 +96,7 @@ def test_reduce_series_boolean(self, data, all_boolean_reductions, skipna): self.check_reduce(ser, op_name, skipna) @pytest.mark.filterwarnings("ignore::RuntimeWarning") + @pytest.mark.parametrize("skipna", [True, False]) def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): op_name = all_numeric_reductions ser = pd.Series(data) @@ -113,6 +115,7 @@ def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): # min/max with empty produce numpy warnings self.check_reduce(ser, op_name, skipna) + @pytest.mark.parametrize("skipna", [True, False]) def test_reduce_frame(self, data, all_numeric_reductions, skipna): op_name = all_numeric_reductions ser = pd.Series(data) diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index ba756b471eb8b..7ee2c23c5b23a 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -105,9 +105,10 @@ def test_setitem_sequence_broadcasts(self, data, box_in_series): assert data[0] == data[2] assert data[1] == data[2] - def test_setitem_scalar(self, data, indexer_li): + @pytest.mark.parametrize("setter", ["loc", "iloc"]) + def test_setitem_scalar(self, data, setter): arr = pd.Series(data) - setter = indexer_li(arr) + setter = getattr(arr, setter) 
setter[0] = data[1] assert arr[0] == data[1] diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 64b897d27a835..69958b51c9e47 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -257,6 +257,7 @@ def test_fillna_copy_series(self, data_missing, using_copy_on_write): with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False): super().test_fillna_copy_series(data_missing) + @pytest.mark.parametrize("dropna", [True, False]) def test_value_counts(self, all_data, dropna): all_data = all_data[:10] if dropna: diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 8d0bb85b2a01f..05a112e464677 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -271,6 +271,7 @@ def test_compare_scalar(self, data, comparison_op): ser = pd.Series(data) self._compare_other(ser, data, comparison_op, data[0]) + @pytest.mark.parametrize("na_action", [None, "ignore"]) def test_map(self, data_missing, na_action): if data_missing.dtype.kind in "mM": result = data_missing.map(lambda x: x, na_action=na_action) @@ -423,6 +424,7 @@ def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool: return False return True + @pytest.mark.parametrize("skipna", [True, False]) def test_accumulate_series(self, data, all_numeric_accumulations, skipna, request): pa_type = data.dtype.pyarrow_dtype op_name = all_numeric_accumulations @@ -524,6 +526,7 @@ def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool): expected = getattr(alt, op_name)(skipna=skipna) tm.assert_almost_equal(result, expected) + @pytest.mark.parametrize("skipna", [True, False]) def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna, request): dtype = data.dtype pa_dtype = dtype.pyarrow_dtype @@ -549,6 +552,7 @@ def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna, reque request.applymarker(xfail_mark) super().test_reduce_series_numeric(data, all_numeric_reductions, skipna) + @pytest.mark.parametrize("skipna", [True, False]) def test_reduce_series_boolean( self, data, all_boolean_reductions, skipna, na_value, request ): @@ -585,6 +589,7 @@ def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool): }[arr.dtype.kind] return cmp_dtype + @pytest.mark.parametrize("skipna", [True, False]) def test_reduce_frame(self, data, all_numeric_reductions, skipna, request): op_name = all_numeric_reductions if op_name == "skew": @@ -2325,6 +2330,7 @@ def test_str_extract_expand(): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize("unit", ["ns", "us", "ms", "s"]) def test_duration_from_strings_with_nat(unit): # GH51175 strings = ["1000", "NaT"] @@ -2827,6 +2833,7 @@ def test_dt_components(): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("skipna", [True, False]) def test_boolean_reduce_series_all_null(all_boolean_reductions, skipna): # GH51624 ser = pd.Series([None], dtype="float64[pyarrow]") diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index edf560dda36e7..bd4ab5077c6e8 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -134,6 +134,7 @@ def test_combine_add(self, data_repeated): expected = pd.Series([a + val for a in list(orig_data1)]) tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("na_action", [None, "ignore"]) def test_map(self, 
data, na_action): result = data.map(lambda x: x, na_action=na_action) tm.assert_extension_array_equal(result, data) @@ -174,6 +175,7 @@ def test_array_repr(self, data, size): super().test_array_repr(data, size) @pytest.mark.xfail(reason="TBD") + @pytest.mark.parametrize("as_index", [True, False]) def test_groupby_extension_agg(self, as_index, data_for_grouping): super().test_groupby_extension_agg(as_index, data_for_grouping) diff --git a/pandas/tests/extension/test_datetime.py b/pandas/tests/extension/test_datetime.py index 4b25b2768849e..6352bf76f96bb 100644 --- a/pandas/tests/extension/test_datetime.py +++ b/pandas/tests/extension/test_datetime.py @@ -100,6 +100,7 @@ def _supports_accumulation(self, ser, op_name: str) -> bool: def _supports_reduction(self, obj, op_name: str) -> bool: return op_name in ["min", "max", "median", "mean", "std", "any", "all"] + @pytest.mark.parametrize("skipna", [True, False]) def test_reduce_series_boolean(self, data, all_boolean_reductions, skipna): meth = all_boolean_reductions msg = f"'{meth}' with datetime64 dtypes is deprecated and will raise in" @@ -113,6 +114,7 @@ def test_series_constructor(self, data): data = data._with_freq(None) super().test_series_constructor(data) + @pytest.mark.parametrize("na_action", [None, "ignore"]) def test_map(self, data, na_action): result = data.map(lambda x: x, na_action=na_action) tm.assert_extension_array_equal(result, data) diff --git a/pandas/tests/extension/test_masked.py b/pandas/tests/extension/test_masked.py index 0e19c4078b471..3efc561d6a125 100644 --- a/pandas/tests/extension/test_masked.py +++ b/pandas/tests/extension/test_masked.py @@ -169,6 +169,7 @@ def data_for_grouping(dtype): class TestMaskedArrays(base.ExtensionTests): + @pytest.mark.parametrize("na_action", [None, "ignore"]) def test_map(self, data_missing, na_action): result = data_missing.map(lambda x: x, na_action=na_action) if data_missing.dtype == Float32Dtype(): diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 0893c6231197e..3f54f6cbbba69 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -313,6 +313,7 @@ def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool): tm.assert_almost_equal(result, expected) @pytest.mark.skip("TODO: tests not written yet") + @pytest.mark.parametrize("skipna", [True, False]) def test_reduce_frame(self, data, all_numeric_reductions, skipna): pass diff --git a/pandas/tests/extension/test_period.py b/pandas/tests/extension/test_period.py index 4fe9c160d66af..2d1d213322bac 100644 --- a/pandas/tests/extension/test_period.py +++ b/pandas/tests/extension/test_period.py @@ -109,6 +109,7 @@ def test_diff(self, data, periods): else: super().test_diff(data, periods) + @pytest.mark.parametrize("na_action", [None, "ignore"]) def test_map(self, data, na_action): result = data.map(lambda x: x, na_action=na_action) tm.assert_extension_array_equal(result, data) diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 2efcc192aa15b..d8f14383ef114 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -102,6 +102,7 @@ class TestSparseArray(base.ExtensionTests): def _supports_reduction(self, obj, op_name: str) -> bool: return True + @pytest.mark.parametrize("skipna", [True, False]) def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna, request): if all_numeric_reductions in [ "prod", @@ -126,6 +127,7 @@ def 
test_reduce_series_numeric(self, data, all_numeric_reductions, skipna, reque super().test_reduce_series_numeric(data, all_numeric_reductions, skipna) + @pytest.mark.parametrize("skipna", [True, False]) def test_reduce_frame(self, data, all_numeric_reductions, skipna, request): if all_numeric_reductions in [ "prod", @@ -366,6 +368,7 @@ def test_map(self, func, na_action, expected): result = data.map(func, na_action=na_action) tm.assert_extension_array_equal(result, expected) + @pytest.mark.parametrize("na_action", [None, "ignore"]) def test_map_raises(self, data, na_action): # GH52096 msg = "fill value in the sparse values not supported" @@ -486,6 +489,7 @@ def test_array_repr(self, data, size): super().test_array_repr(data, size) @pytest.mark.xfail(reason="result does not match expected") + @pytest.mark.parametrize("as_index", [True, False]) def test_groupby_extension_agg(self, as_index, data_for_grouping): super().test_groupby_extension_agg(as_index, data_for_grouping) From de723d28ff2d0da093a94b38934e57a20aac4a67 Mon Sep 17 00:00:00 2001 From: Jordan Murphy <35613487+jordan-d-murphy@users.noreply.github.com> Date: Mon, 15 Jan 2024 10:56:27 -0700 Subject: [PATCH 10/13] DOC: fix EX03 errors in docstrings - pandas.Timestamp - ceil, floor, round (#56879) Co-authored-by: Marc Garcia --- ci/code_checks.sh | 3 --- pandas/_libs/tslibs/nattype.pyx | 24 ++++++++++++------------ pandas/_libs/tslibs/timestamps.pyx | 24 ++++++++++++------------ 3 files changed, 24 insertions(+), 27 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index f27dbeaf35915..78bf63fa3a3ba 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -79,9 +79,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then pandas.errors.SettingWithCopyWarning \ pandas.errors.SpecificationError \ pandas.errors.UndefinedVariableError \ - pandas.Timestamp.ceil \ - pandas.Timestamp.floor \ - pandas.Timestamp.round \ pandas.read_json \ pandas.io.json.build_table_schema \ pandas.io.formats.style.Styler.to_latex \ diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index 65db0e05f859c..cd5e6e521b79f 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -973,16 +973,16 @@ timedelta}, default 'raise' A timestamp can be rounded using multiple frequency units: - >>> ts.round(freq='h') # hour + >>> ts.round(freq='h') # hour Timestamp('2020-03-14 16:00:00') - >>> ts.round(freq='min') # minute + >>> ts.round(freq='min') # minute Timestamp('2020-03-14 15:33:00') - >>> ts.round(freq='s') # seconds + >>> ts.round(freq='s') # seconds Timestamp('2020-03-14 15:32:52') - >>> ts.round(freq='ms') # milliseconds + >>> ts.round(freq='ms') # milliseconds Timestamp('2020-03-14 15:32:52.193000') ``freq`` can also be a multiple of a single unit, like '5min' (i.e. 5 minutes): @@ -1062,16 +1062,16 @@ timedelta}, default 'raise' A timestamp can be floored using multiple frequency units: - >>> ts.floor(freq='h') # hour + >>> ts.floor(freq='h') # hour Timestamp('2020-03-14 15:00:00') - >>> ts.floor(freq='min') # minute + >>> ts.floor(freq='min') # minute Timestamp('2020-03-14 15:32:00') - >>> ts.floor(freq='s') # seconds + >>> ts.floor(freq='s') # seconds Timestamp('2020-03-14 15:32:52') - >>> ts.floor(freq='ns') # nanoseconds + >>> ts.floor(freq='ns') # nanoseconds Timestamp('2020-03-14 15:32:52.192548651') ``freq`` can also be a multiple of a single unit, like '5min' (i.e. 
5 minutes): @@ -1151,16 +1151,16 @@ timedelta}, default 'raise' A timestamp can be ceiled using multiple frequency units: - >>> ts.ceil(freq='h') # hour + >>> ts.ceil(freq='h') # hour Timestamp('2020-03-14 16:00:00') - >>> ts.ceil(freq='min') # minute + >>> ts.ceil(freq='min') # minute Timestamp('2020-03-14 15:33:00') - >>> ts.ceil(freq='s') # seconds + >>> ts.ceil(freq='s') # seconds Timestamp('2020-03-14 15:32:53') - >>> ts.ceil(freq='us') # microseconds + >>> ts.ceil(freq='us') # microseconds Timestamp('2020-03-14 15:32:52.192549') ``freq`` can also be a multiple of a single unit, like '5min' (i.e. 5 minutes): diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 1dae2403706e8..d4cd90613ca5b 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -1973,16 +1973,16 @@ timedelta}, default 'raise' A timestamp can be rounded using multiple frequency units: - >>> ts.round(freq='h') # hour + >>> ts.round(freq='h') # hour Timestamp('2020-03-14 16:00:00') - >>> ts.round(freq='min') # minute + >>> ts.round(freq='min') # minute Timestamp('2020-03-14 15:33:00') - >>> ts.round(freq='s') # seconds + >>> ts.round(freq='s') # seconds Timestamp('2020-03-14 15:32:52') - >>> ts.round(freq='ms') # milliseconds + >>> ts.round(freq='ms') # milliseconds Timestamp('2020-03-14 15:32:52.193000') ``freq`` can also be a multiple of a single unit, like '5min' (i.e. 5 minutes): @@ -2064,16 +2064,16 @@ timedelta}, default 'raise' A timestamp can be floored using multiple frequency units: - >>> ts.floor(freq='h') # hour + >>> ts.floor(freq='h') # hour Timestamp('2020-03-14 15:00:00') - >>> ts.floor(freq='min') # minute + >>> ts.floor(freq='min') # minute Timestamp('2020-03-14 15:32:00') - >>> ts.floor(freq='s') # seconds + >>> ts.floor(freq='s') # seconds Timestamp('2020-03-14 15:32:52') - >>> ts.floor(freq='ns') # nanoseconds + >>> ts.floor(freq='ns') # nanoseconds Timestamp('2020-03-14 15:32:52.192548651') ``freq`` can also be a multiple of a single unit, like '5min' (i.e. 5 minutes): @@ -2153,16 +2153,16 @@ timedelta}, default 'raise' A timestamp can be ceiled using multiple frequency units: - >>> ts.ceil(freq='h') # hour + >>> ts.ceil(freq='h') # hour Timestamp('2020-03-14 16:00:00') - >>> ts.ceil(freq='min') # minute + >>> ts.ceil(freq='min') # minute Timestamp('2020-03-14 15:33:00') - >>> ts.ceil(freq='s') # seconds + >>> ts.ceil(freq='s') # seconds Timestamp('2020-03-14 15:32:53') - >>> ts.ceil(freq='us') # microseconds + >>> ts.ceil(freq='us') # microseconds Timestamp('2020-03-14 15:32:52.192549') ``freq`` can also be a multiple of a single unit, like '5min' (i.e. 
5 minutes): From 178a352d918238f3d21064a341f2c6f7c498f678 Mon Sep 17 00:00:00 2001 From: Jordan Murphy <35613487+jordan-d-murphy@users.noreply.github.com> Date: Mon, 15 Jan 2024 10:57:47 -0700 Subject: [PATCH 11/13] DOC: fix EX03 errors in docstrings - pandas.io.json.build_table_schema, pandas.read_stata, pandas.plotting.scatter_matrix, pandas.Index.droplevel , pandas.Grouper (#56880) Co-authored-by: Marc Garcia --- ci/code_checks.sh | 7 ------- pandas/core/groupby/grouper.py | 4 ++-- pandas/core/indexes/base.py | 2 +- pandas/io/json/_table_schema.py | 2 +- pandas/io/stata.py | 4 ++-- pandas/plotting/_misc.py | 2 +- 6 files changed, 7 insertions(+), 14 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 78bf63fa3a3ba..e013a329869dd 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -80,16 +80,9 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then pandas.errors.SpecificationError \ pandas.errors.UndefinedVariableError \ pandas.read_json \ - pandas.io.json.build_table_schema \ pandas.io.formats.style.Styler.to_latex \ pandas.read_parquet \ pandas.DataFrame.to_sql \ - pandas.read_stata \ - pandas.plotting.scatter_matrix \ - pandas.Index.droplevel \ - pandas.MultiIndex.names \ - pandas.MultiIndex.droplevel \ - pandas.Grouper \ pandas.io.formats.style.Styler.map \ pandas.io.formats.style.Styler.apply_index \ pandas.io.formats.style.Styler.map_index \ diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index e68c393f8f707..a93cf33590c3e 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -151,8 +151,8 @@ class Grouper: Specify a resample operation on the column 'Publish date' >>> df = pd.DataFrame( - ... { - ... "Publish date": [ + ... { + ... "Publish date": [ ... pd.Timestamp("2000-01-02"), ... pd.Timestamp("2000-01-02"), ... pd.Timestamp("2000-01-09"), diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index bdd6392387ae8..a2666cd6cb229 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2124,7 +2124,7 @@ def droplevel(self, level: IndexLabel = 0): Examples -------- >>> mi = pd.MultiIndex.from_arrays( - ... [[1, 2], [3, 4], [5, 6]], names=['x', 'y', 'z']) + ... [[1, 2], [3, 4], [5, 6]], names=['x', 'y', 'z']) >>> mi MultiIndex([(1, 3, 5), (2, 4, 6)], diff --git a/pandas/io/json/_table_schema.py b/pandas/io/json/_table_schema.py index 4d9fba72cf173..c279eeea78c6b 100644 --- a/pandas/io/json/_table_schema.py +++ b/pandas/io/json/_table_schema.py @@ -277,7 +277,7 @@ def build_table_schema( ... {'A': [1, 2, 3], ... 'B': ['a', 'b', 'c'], ... 'C': pd.date_range('2016-01-01', freq='d', periods=3), - ... }, index=pd.Index(range(3), name='idx')) + ... }, index=pd.Index(range(3), name='idx')) >>> build_table_schema(df) {'fields': \ [{'name': 'idx', 'type': 'integer'}, \ diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 4abf9af185a01..576e27f202524 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -176,7 +176,7 @@ Creating a dummy stata for this example >>> df = pd.DataFrame({{'animal': ['falcon', 'parrot', 'falcon', 'parrot'], -... 'speed': [350, 18, 361, 15]}}) # doctest: +SKIP +... 
'speed': [350, 18, 361, 15]}}) # doctest: +SKIP >>> df.to_stata('animals.dta') # doctest: +SKIP Read a Stata dta file: @@ -189,7 +189,7 @@ >>> df = pd.DataFrame(values, columns=["i"]) # doctest: +SKIP >>> df.to_stata('filename.dta') # doctest: +SKIP ->>> with pd.read_stata('filename.dta', chunksize=10000) as itr: # doctest: +SKIP +>>> with pd.read_stata('filename.dta', chunksize=10000) as itr: # doctest: +SKIP >>> for chunk in itr: ... # Operate on a single chunk, e.g., chunk.mean() ... pass # doctest: +SKIP diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index 18db460d388a4..c8c8f68f5289e 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -204,7 +204,7 @@ def scatter_matrix( .. plot:: :context: close-figs - >>> df = pd.DataFrame(np.random.randn(1000, 4), columns=['A','B','C','D']) + >>> df = pd.DataFrame(np.random.randn(1000, 4), columns=['A', 'B', 'C', 'D']) >>> pd.plotting.scatter_matrix(df, alpha=0.2) array([[, , , ], From f14893d507104aba4ec1dd82c80d1fc4824caf9b Mon Sep 17 00:00:00 2001 From: Tiffany Xiao Date: Mon, 15 Jan 2024 12:59:10 -0500 Subject: [PATCH 12/13] DOC: fixed Ex03 errors in docstrings: (#56878) * Addressing docstring errors * Resolving merge conflicts * Removing extra formatting in See Also --- ci/code_checks.sh | 3 --- pandas/errors/__init__.py | 37 +++++++++++++++++++++++++++++-------- 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index e013a329869dd..145be3e52f2c0 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -73,9 +73,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then $BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=EX03 --ignore_functions \ pandas.Series.plot.line \ pandas.Series.to_sql \ - pandas.errors.DatabaseError \ - pandas.errors.IndexingError \ - pandas.errors.InvalidColumnName \ pandas.errors.SettingWithCopyWarning \ pandas.errors.SpecificationError \ pandas.errors.UndefinedVariableError \ diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index 9d39b8d92fec9..3cda1273d4ae7 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -599,19 +599,30 @@ class IndexingError(Exception): """ Exception is raised when trying to index and there is a mismatch in dimensions. + Raised by properties like :attr:`.pandas.DataFrame.iloc` when + an indexer is out of bounds or :attr:`.pandas.DataFrame.loc` when its index is + unalignable to the frame index. + + See Also + -------- + DataFrame.iloc : Purely integer-location based indexing for \ + selection by position. + DataFrame.loc : Access a group of rows and columns by label(s) \ + or a boolean array. + Examples -------- >>> df = pd.DataFrame({'A': [1, 1, 1]}) - >>> df.loc[..., ..., 'A'] # doctest: +SKIP + >>> df.loc[..., ..., 'A'] # doctest: +SKIP ... # IndexingError: indexer may only contain one '...' entry >>> df = pd.DataFrame({'A': [1, 1, 1]}) - >>> df.loc[1, ..., ...] # doctest: +SKIP + >>> df.loc[1, ..., ...] # doctest: +SKIP ... # IndexingError: Too many indexers - >>> df[pd.Series([True], dtype=bool)] # doctest: +SKIP + >>> df[pd.Series([True], dtype=bool)] # doctest: +SKIP ... # IndexingError: Unalignable boolean Series provided as indexer... >>> s = pd.Series(range(2), - ... index = pd.MultiIndex.from_product([["a", "b"], ["c"]])) - >>> s.loc["a", "c", "d"] # doctest: +SKIP + ... index=pd.MultiIndex.from_product([["a", "b"], ["c"]])) + >>> s.loc["a", "c", "d"] # doctest: +SKIP ... 
# IndexingError: Too many indexers """ @@ -713,13 +724,19 @@ class AttributeConflictWarning(Warning): class DatabaseError(OSError): """ - Error is raised when executing sql with bad syntax or sql that throws an error. + Error is raised when executing SQL with bad syntax or SQL that throws an error. + + Raised by :func:`.pandas.read_sql` when a bad SQL statement is passed in. + + See Also + -------- + read_sql : Read SQL query or database table into a DataFrame. Examples -------- >>> from sqlite3 import connect >>> conn = connect(':memory:') - >>> pd.read_sql('select * test', conn) # doctest: +SKIP + >>> pd.read_sql('select * test', conn) # doctest: +SKIP ... # DatabaseError: Execution failed on sql 'test': near "test": syntax error """ @@ -758,10 +775,14 @@ class InvalidColumnName(Warning): Because the column name is an invalid Stata variable, the name needs to be converted. + See Also + -------- + DataFrame.to_stata : Export DataFrame object to Stata dta format. + Examples -------- >>> df = pd.DataFrame({"0categories": pd.Series([2, 2])}) - >>> df.to_stata('test') # doctest: +SKIP + >>> df.to_stata('test') # doctest: +SKIP ... # InvalidColumnName: Not all pandas column names were valid Stata variable... """ From 1af1030506efb6178db3b97ec4a2eb68290472bc Mon Sep 17 00:00:00 2001 From: JackCollins91 <112877841+JackCollins91@users.noreply.github.com> Date: Mon, 15 Jan 2024 19:15:49 +0100 Subject: [PATCH 13/13] DOC: Avoid requesting data from s3 buckets from our docs (#56762) * Update io.rst Make consistent with other s3 bucket URL examples and avoid doc build error when problem with s3 url. * Update io.rst Make example consistent with other code block examples * Update v2.3.0.rst * immitating interactive mode For each S3 bucket code block, ideally we show what the output would be, but without making an actual call. Unfortunately, for several of the S3 buckets, there are issues with the code, which we must fix in another commit or PR. For now, the two S3 examples that do work, we edit to make the code block show what the output would have been if it had run successfully. Find details on issues in conversation on PR #56592 * Update io.rst Code still doesn't run, but at least unmatched } is no longer the issue. * Update v2.3.0.rst avoids unnecessary file change in PR * Update io.rst Rollback changes to one of the examples (out of scope) * Update io.rst * Update io.rst --------- Co-authored-by: JackCollins1991 <55454098+JackCollins1991@users.noreply.github.com> --- doc/source/user_guide/io.rst | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index b3ad23e0d4104..bb5b4e056d527 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -1704,7 +1704,7 @@ option parameter: .. code-block:: python - storage_options = {"client_kwargs": {"endpoint_url": "http://127.0.0.1:5555"}}} + storage_options = {"client_kwargs": {"endpoint_url": "http://127.0.0.1:5555"}} df = pd.read_json("s3://pandas-test/test-1", storage_options=storage_options) More sample configurations and documentation can be found at `S3Fs documentation @@ -3015,14 +3015,15 @@ Read in the content of the "books.xml" as instance of ``StringIO`` or Even read XML from AWS S3 buckets such as NIH NCBI PMC Article Datasets providing Biomedical and Life Science Jorurnals: -.. ipython:: python - :okwarning: +.. 
code-block:: python
 
+   >>> df = pd.read_xml(
+   ...     "s3://pmc-oa-opendata/oa_comm/xml/all/PMC1236943.xml",
+   ...     xpath=".//journal-meta",
+   ...)
+   >>> df
+                 journal-id              journal-title       issn publisher
+   0  Cardiovasc Ultrasound  Cardiovascular Ultrasound  1476-7120       NaN
 
 With `lxml`_ as default ``parser``, you access the full-featured XML library
 that extends Python's ElementTree API. One powerful tool is ability to query