From 55441d313c0d5c8e23558734bc20681c1a31378a Mon Sep 17 00:00:00 2001 From: wenchen-cai Date: Tue, 27 Aug 2024 00:23:26 +0800 Subject: [PATCH 1/4] DOCS: fix docstring validation errors for pandas.Series.str (#59597) --- ci/code_checks.sh | 2 -- pandas/core/strings/accessor.py | 8 ++++++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 2d260c78a8f33..916720e5a01e3 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -142,8 +142,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.sparse.sp_values SA01" \ -i "pandas.Series.sparse.to_coo PR07,RT03,SA01" \ -i "pandas.Series.std PR01,RT03,SA01" \ - -i "pandas.Series.str.wrap RT03,SA01" \ - -i "pandas.Series.str.zfill RT03" \ -i "pandas.Timedelta.asm8 SA01" \ -i "pandas.Timedelta.ceil SA01" \ -i "pandas.Timedelta.components SA01" \ diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index c88270b2a2f16..bdb88e981bcda 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -1853,6 +1853,7 @@ def zfill(self, width: int): Returns ------- Series/Index of objects. + A Series or Index where the strings are prepended with '0' characters. See Also -------- @@ -2385,6 +2386,13 @@ def wrap( Returns ------- Series or Index + A Series or Index where the strings are wrapped at the specified line width. + + See Also + -------- + Series.str.strip : Remove leading and trailing characters in Series/Index. + Series.str.lstrip : Remove leading characters in Series/Index. + Series.str.rstrip : Remove trailing characters in Series/Index. 
Notes ----- From 6fa4eb43fbf01d558c9e8cd0fdde6fa5359c9d19 Mon Sep 17 00:00:00 2001 From: Abhinav Reddy Date: Mon, 26 Aug 2024 12:25:02 -0400 Subject: [PATCH 2/4] DOC: Fix Numpy Docstring errors in pandas.api.extensions.ExtensionArray (#59605) * fix duplicated * fix fillna * fix insert * fix isin * fix tolist * fix unique * fix view --------- Co-authored-by: Abhinav Thimma --- ci/code_checks.sh | 7 ----- pandas/core/arrays/base.py | 52 +++++++++++++++++++++++++++++++++++++- 2 files changed, 51 insertions(+), 8 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 916720e5a01e3..4ddc429f2a51c 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -173,14 +173,7 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timestamp.tzinfo GL08" \ -i "pandas.Timestamp.value GL08" \ -i "pandas.Timestamp.year GL08" \ - -i "pandas.api.extensions.ExtensionArray.duplicated RT03,SA01" \ - -i "pandas.api.extensions.ExtensionArray.fillna SA01" \ - -i "pandas.api.extensions.ExtensionArray.insert PR07,RT03,SA01" \ -i "pandas.api.extensions.ExtensionArray.interpolate PR01,SA01" \ - -i "pandas.api.extensions.ExtensionArray.isin PR07,RT03,SA01" \ - -i "pandas.api.extensions.ExtensionArray.tolist RT03,SA01" \ - -i "pandas.api.extensions.ExtensionArray.unique RT03,SA01" \ - -i "pandas.api.extensions.ExtensionArray.view SA01" \ -i "pandas.api.interchange.from_dataframe RT03,SA01" \ -i "pandas.api.types.is_bool PR01,SA01" \ -i "pandas.api.types.is_categorical_dtype SA01" \ diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index f05d1ae18c604..2124f86b03b9c 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1137,6 +1137,13 @@ def fillna( ExtensionArray With NA/NaN filled. + See Also + -------- + api.extensions.ExtensionArray.dropna : Return ExtensionArray without + NA values. + api.extensions.ExtensionArray.isna : A 1-D array indicating if + each value is missing. 
+ Examples -------- >>> arr = pd.array([np.nan, np.nan, 2, 3, np.nan, np.nan]) @@ -1220,6 +1227,15 @@ def duplicated( Returns ------- ndarray[bool] + With True at indices where elements are duplicated and False otherwise. + + See Also + -------- + DataFrame.duplicated : Return boolean Series denoting + duplicate rows. + Series.duplicated : Indicate duplicate Series values. + api.extensions.ExtensionArray.unique : Compute the ExtensionArray + of unique values. Examples -------- @@ -1303,6 +1319,13 @@ def unique(self) -> Self: Returns ------- pandas.api.extensions.ExtensionArray + With unique values from the input array. + + See Also + -------- + Index.unique: Return unique values in the index. + Series.unique: Return unique values of Series object. + unique: Return unique values based on a hash table. Examples -------- @@ -1436,10 +1459,18 @@ def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: Parameters ---------- values : np.ndarray or ExtensionArray + Values to compare every element in the array against. Returns ------- np.ndarray[bool] + With True at indices where the value is in `values`. + + See Also + -------- + DataFrame.isin: Whether each element in the DataFrame is contained in values. + Index.isin: Return a boolean array where the index values are in values. + Series.isin: Whether elements in Series are contained in values. Examples -------- @@ -1743,6 +1774,12 @@ def view(self, dtype: Dtype | None = None) -> ArrayLike: ExtensionArray or np.ndarray A view on the :class:`ExtensionArray`'s data. + See Also + -------- + api.extensions.ExtensionArray.ravel: Return a flattened view on input array. + Index.view: Equivalent function for Index. + ndarray.view: New view of array with the same data. + Examples -------- This gives view on the underlying data of an ``ExtensionArray`` and is not a @@ -2201,6 +2238,12 @@ def tolist(self) -> list: Returns ------- list + Python list of values in array.
+ + See Also + -------- + Index.to_list: Return a list of the values in the Index. + Series.to_list: Return a list of the values in the Series. Examples -------- @@ -2223,11 +2266,18 @@ def insert(self, loc: int, item) -> Self: Parameters ---------- loc : int + Index where the `item` needs to be inserted. item : scalar-like + Value to be inserted. Returns ------- - same type as self + ExtensionArray + With `item` inserted at `loc`. + + See Also + -------- + Index.insert: Make new Index inserting new item at location. Notes ----- From d31aa834cef5a433938933f75ca20f0268a4ea83 Mon Sep 17 00:00:00 2001 From: ktseng4096 <32848825+ktseng4096@users.noreply.github.com> Date: Mon, 26 Aug 2024 11:33:43 -0700 Subject: [PATCH 3/4] DOC: add See Also section to groupby.DataFrameGroupBy.prod (#59599) * Update Groupby.prod * update code_check list * remove extra spaces * fix errors * ruff formatting --- ci/code_checks.sh | 2 - pandas/core/groupby/groupby.py | 77 ++++++++++++++++------------------ 2 files changed, 37 insertions(+), 42 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 4ddc429f2a51c..76cc02652ec24 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -226,7 +226,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.core.groupby.DataFrameGroupBy.nunique SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.ohlc SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.plot PR02" \ - -i "pandas.core.groupby.DataFrameGroupBy.prod SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.sem SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.sum SA01" \ -i "pandas.core.groupby.SeriesGroupBy.__iter__ RT03,SA01" \ @@ -243,7 +242,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.core.groupby.SeriesGroupBy.nth PR02" \ -i "pandas.core.groupby.SeriesGroupBy.ohlc SA01" \ -i "pandas.core.groupby.SeriesGroupBy.plot PR02" \ - -i "pandas.core.groupby.SeriesGroupBy.prod SA01" \ -i "pandas.core.groupby.SeriesGroupBy.sem SA01" \ -i 
"pandas.core.groupby.SeriesGroupBy.sum SA01" \ -i "pandas.core.resample.Resampler.__iter__ RT03,SA01" \ diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index b288dad63179f..8c9c92594ebe7 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -164,32 +164,6 @@ class providing the base-class of operations. to each row or column of a DataFrame. """ -_groupby_agg_method_template = """ -Compute {fname} of group values. - -Parameters ----------- -numeric_only : bool, default {no} - Include only float, int, boolean columns. - - .. versionchanged:: 2.0.0 - - numeric_only no longer accepts ``None``. - -min_count : int, default {mc} - The required number of valid values to perform the operation. If fewer - than ``min_count`` non-NA values are present the result will be NA. - -Returns -------- -Series or DataFrame - Computed {fname} of values within each group. - -Examples --------- -{example} -""" - _groupby_agg_method_engine_template = """ Compute {fname} of group values. @@ -3029,16 +3003,38 @@ def sum( return result @final - @doc( - _groupby_agg_method_template, - fname="prod", - no=False, - mc=0, - example=dedent( - """\ + def prod(self, numeric_only: bool = False, min_count: int = 0) -> NDFrameT: + """ + Compute prod of group values. + + Parameters + ---------- + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionchanged:: 2.0.0 + + numeric_only no longer accepts ``None``. + + min_count : int, default 0 + The required number of valid values to perform the operation. If fewer + than ``min_count`` non-NA values are present the result will be NA. + + Returns + ------- + Series or DataFrame + Computed prod of values within each group. + + See Also + -------- + Series.prod : Return the product of the values over the requested axis. + DataFrame.prod : Return the product of the values over the requested axis. 
+ + Examples + -------- For SeriesGroupBy: - >>> lst = ['a', 'a', 'b', 'b'] + >>> lst = ["a", "a", "b", "b"] >>> ser = pd.Series([1, 2, 3, 4], index=lst) >>> ser a 1 @@ -3054,8 +3050,11 @@ def sum( For DataFrameGroupBy: >>> data = [[1, 8, 2], [1, 2, 5], [2, 5, 8], [2, 6, 9]] - >>> df = pd.DataFrame(data, columns=["a", "b", "c"], - ... index=["tiger", "leopard", "cheetah", "lion"]) + >>> df = pd.DataFrame( + ... data, + ... columns=["a", "b", "c"], + ... index=["tiger", "leopard", "cheetah", "lion"], + ... ) >>> df a b c tiger 1 8 2 @@ -3066,10 +3065,8 @@ def sum( b c a 1 16 10 - 2 30 72""" - ), - ) - def prod(self, numeric_only: bool = False, min_count: int = 0) -> NDFrameT: + 2 30 72 + """ return self._agg_general( numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod ) From bb4ab4f2c0c2806f367679b7131fb98f718a3480 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Mon, 26 Aug 2024 20:36:12 +0200 Subject: [PATCH 4/4] ENH: support Arrow PyCapsule Interface on Series for export (#59587) * ENH: support Arrow PyCapsule Interface on Series for export * simplify * simplify --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/series.py | 27 +++++++++++++++++++++ pandas/tests/series/test_arrow_interface.py | 23 ++++++++++++++++++ 3 files changed, 51 insertions(+) create mode 100644 pandas/tests/series/test_arrow_interface.py diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 1533f9267ce39..eaf9ce899f03a 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -43,6 +43,7 @@ Other enhancements - Users can globally disable any ``PerformanceWarning`` by setting the option ``mode.performance_warnings`` to ``False`` (:issue:`56920`) - :meth:`Styler.format_index_names` can now be used to format the index and column names (:issue:`48936` and :issue:`47489`) - :class:`.errors.DtypeWarning` improved to include column names when mixed data types are detected (:issue:`58174`) +- :class:`Series` 
now supports the Arrow PyCapsule Interface for export (:issue:`59518`) - :func:`DataFrame.to_excel` argument ``merge_cells`` now accepts a value of ``"columns"`` to only merge :class:`MultiIndex` column header header cells (:issue:`35384`) - :meth:`DataFrame.corrwith` now accepts ``min_periods`` as optional arguments, as in :meth:`DataFrame.corr` and :meth:`Series.corr` (:issue:`9490`) - :meth:`DataFrame.cummin`, :meth:`DataFrame.cummax`, :meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods now have a ``numeric_only`` parameter (:issue:`53072`) diff --git a/pandas/core/series.py b/pandas/core/series.py index 17494f948876a..4f79e30f48f3c 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -34,6 +34,7 @@ from pandas._libs.lib import is_range_indexer from pandas.compat import PYPY from pandas.compat._constants import REF_COUNT +from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import function as nv from pandas.errors import ( ChainedAssignmentError, @@ -558,6 +559,32 @@ def _init_dict( # ---------------------------------------------------------------------- + def __arrow_c_stream__(self, requested_schema=None): + """ + Export the pandas Series as an Arrow C stream PyCapsule. + + This relies on pyarrow to convert the pandas Series to the Arrow + format (and follows the default behaviour of ``pyarrow.Array.from_pandas`` + in its handling of the index, i.e. to ignore it). + This conversion is not necessarily zero-copy. + + Parameters + ---------- + requested_schema : PyCapsule, default None + The schema to which the Series should be cast, passed as a + PyCapsule containing a C ArrowSchema representation of the + requested schema.
+ + Returns + ------- + PyCapsule + """ + pa = import_optional_dependency("pyarrow", min_version="16.0.0") + ca = pa.chunked_array([pa.Array.from_pandas(self)]) + return ca.__arrow_c_stream__(requested_schema) + + # ---------------------------------------------------------------------- + @property def _constructor(self) -> type[Series]: return Series diff --git a/pandas/tests/series/test_arrow_interface.py b/pandas/tests/series/test_arrow_interface.py new file mode 100644 index 0000000000000..34a2a638e4185 --- /dev/null +++ b/pandas/tests/series/test_arrow_interface.py @@ -0,0 +1,23 @@ +import ctypes + +import pytest + +import pandas as pd + +pa = pytest.importorskip("pyarrow", minversion="16.0") + + +def test_series_arrow_interface(): + s = pd.Series([1, 4, 2]) + + capsule = s.__arrow_c_stream__() + assert ( + ctypes.pythonapi.PyCapsule_IsValid( + ctypes.py_object(capsule), b"arrow_array_stream" + ) + == 1 + ) + + ca = pa.chunked_array(s) + expected = pa.chunked_array([[1, 4, 2]]) + assert ca.equals(expected)