diff --git a/CHANGELOG.rst b/CHANGELOG.rst index c56cb45..7337961 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -3,7 +3,19 @@ Changelog ========= -* Update :func:`~core.projectlevel` to raise ``KeyError`` for wrong level names. +v0.5.0 (2024-04-09) +------------------------------------------------------------ +* **BREAKING**: Change :func:`~core.extractlevel` to drop split levels by default and + accordingly rename the governing argument from ``drop=False`` to ``keep=False`` + :pull:`53`. +* Add ``regex=True`` argument to :func:`~core.extractlevel` to use templates as + manual extraction regex, f.ex. + ``df.pix.extract(variable=r"Emissions\|(?P<gas>.*?)(?:\|(?P<sector>.*?))?", + regex=True)`` will also split ``Emissions|CO2`` to ``gas = "CO2"`` and + ``sector = NaN``, while ``df.pix.extract(variable="Emissions|{gas}|{sector}")`` would + have dropped it. +* Update :func:`~core.projectlevel` to raise ``KeyError`` for wrong level names + :pull:`52`. v0.4.2 (2024-04-03) ------------------------------------------------------------ diff --git a/src/pandas_indexing/accessors.py b/src/pandas_indexing/accessors.py index 884d6ff..492dd75 100644 --- a/src/pandas_indexing/accessors.py +++ b/src/pandas_indexing/accessors.py @@ -10,6 +10,7 @@ >>> df.pix.multiply(other, how="left") """ +import warnings from typing import Any, Callable, Dict, Literal, Mapping, Optional, Sequence, Union import pandas as pd @@ -71,9 +72,31 @@ def assign( @doc(extractlevel, index_or_data="") def extract( - self, template: Optional[str] = None, *, axis: Axis = 0, **templates: str + self, + template: Optional[str] = None, + *, + keep: bool = False, + dropna: bool = True, + regex: bool = False, + axis: Axis = 0, + drop: Optional[bool] = None, + **templates: str, ) -> Union[DataFrame, Series, Index]: - return extractlevel(self._obj, template, axis=axis, **templates) + if drop is not None: + warnings.warn( + "Argument `drop` is deprecated (use `keep` instead)", DeprecationWarning + ) + keep = not drop + + return 
extractlevel( + self._obj, + template, + keep=keep, + dropna=dropna, + regex=regex, + axis=axis, + **templates, + ) @doc(formatlevel, index_or_data="") def format( diff --git a/src/pandas_indexing/core.py b/src/pandas_indexing/core.py index 500ab32..f255168 100644 --- a/src/pandas_indexing/core.py +++ b/src/pandas_indexing/core.py @@ -3,6 +3,7 @@ """ import re +import warnings from functools import reduce from itertools import chain, product from operator import and_, or_ @@ -671,7 +672,11 @@ def antijoin( def _extractlevel( - index: Index, template: Optional[str] = None, drop: bool = False, **templates: str + index: Index, + template: Optional[str] = None, + keep: bool = False, + regex: bool = False, + **templates: str, ) -> Tuple[Index, List[str]]: index = ensure_multiindex(index) all_identifiers = set() @@ -682,8 +687,6 @@ def _extractlevel( templates[index.names[0]] = template for dim, template in templates.items(): - identifiers = re.findall(r"\{([a-zA-Z_]+)\}", template) - all_identifiers.update(identifiers) if dim not in index.names: raise ValueError(f"{dim} not a dimension of index: {index.names}") @@ -691,18 +694,26 @@ def _extractlevel( labels = index.levels[levelnum] codes = index.codes[levelnum] - regex_pattern = reduce( - lambda s, ident: s.replace(rf"\{{{ident}\}}", rf"(?P<{ident}>.*?)"), - identifiers, - re.escape(template), - ) - components = labels.str.extract(f"^{regex_pattern}$", expand=True) + if regex: + regex_pattern = re.compile(f"^{template}$") + identifiers = list(regex_pattern.groupindex) + else: + identifiers = re.findall(r"\{([a-zA-Z_]+)\}", template) + regex_pattern = reduce( + lambda s, ident: s.replace(rf"\{{{ident}\}}", rf"(?P<{ident}>.*?)"), + identifiers, + re.escape(template), + ) + regex_pattern = re.compile(f"^{regex_pattern}$") + + components = labels.str.extract(regex_pattern, expand=True) + all_identifiers.update(identifiers) index = assignlevel( index, **{ident: components[ident].values[codes] for ident in identifiers} ) - 
if drop: + if not keep: index = index.droplevel(list(set(templates) - all_identifiers)) return index, list(all_identifiers) @@ -718,8 +729,10 @@ def extractlevel( index_or_data: T, template: Optional[str] = None, *, - drop: bool = False, + keep: bool = False, dropna: bool = True, + regex: bool = False, + drop: Optional[bool] = None, axis: Axis = 0, **templates: str, ) -> T: @@ -736,12 +749,17 @@ def extractlevel( {index_or_data} template : str, optional Extraction template for a single level - drop : bool, default False + keep : bool, default False Whether to keep the split dimension dropna : bool, default True Whether to drop the non-matching levels + regex : bool, default False + Whether templates are given as regular expressions + (regexes must use named captures) axis : {{0, 1, "index", "columns"}}, default 0 Axis of DataFrame to extract from + drop : bool, optional + Deprecated argument, use keep instead **templates : str Templates for splitting one or multiple levels @@ -759,9 +777,12 @@ def extractlevel( Examples -------- >>> s = Series( - ... range(3), + ... range(4), ... MultiIndex.from_arrays( - ... [["SE|Elec|Bio", "SE|Elec|Coal", "PE|Coal"], ["GWh", "GWh", "EJ"]], + ... [ + ... ["SE|Elec|Bio", "SE|Elec|Coal", "PE|Coal", "SE|Elec"], + ... ["GWh", "GWh", "EJ", "GWh"], + ... ], ... names=["variable", "unit"], ... ), ... 
) @@ -770,22 +791,37 @@ def extractlevel( SE|Elec|Bio GWh 0 SE|Elec|Coal GWh 1 PE|Coal EJ 2 + SE|Elec GWh 3 dtype: int64 - >>> extractlevel(s, variable="SE|{{type}}|{{fuel}}") + >>> extractlevel(s, variable="SE|{{type}}|{{fuel}}", keep=True) variable unit type fuel SE|Elec|Bio GWh Elec Bio 0 SE|Elec|Coal GWh Elec Coal 1 dtype: int64 - >>> extractlevel(s, variable="SE|{{type}}|{{fuel}}", dropna=False) + >>> extractlevel(s, variable="SE|{{type}}|{{fuel}}") + unit type fuel + GWh Elec Bio 0 + GWh Elec Coal 1 + dtype: int64 + + >>> extractlevel(s, variable="SE|{{type}}|{{fuel}}", keep=True, dropna=False) variable unit type fuel SE|Elec|Bio GWh Elec Bio 0 SE|Elec|Coal GWh Elec Coal 1 PE|Coal EJ NaN NaN 2 + SE|Elec GWh NaN NaN 3 + dtype: int64 + + >>> extractlevel(s, variable=r"SE\\|(?P<type>.*?)(?:\\|(?P<fuel>.*?))?", regex=True) + unit type fuel + GWh Elec Bio 0 + GWh Elec Coal 1 + GWh Elec NaN 3 dtype: int64 >>> s = Series(range(3), ["SE|Elec|Bio", "SE|Elec|Coal", "PE|Coal"]) - >>> extractlevel(s, "SE|{{type}}|{{fuel}}", drop=True) + >>> extractlevel(s, "SE|{{type}}|{{fuel}}") type fuel Elec Bio 0 Coal 1 @@ -794,19 +830,31 @@ def extractlevel( See also -------- formatlevel + + .. versionchanged:: 0.5.0 + *drop* replaced by *keep* and default changed to not keep. + *regex* added. 
""" + if drop is not None: + warnings.warn( + "Argument `drop` is deprecated (use `keep` instead)", DeprecationWarning + ) + keep = not drop + if isinstance(index_or_data, Index): index_or_data, identifiers = _extractlevel( - index_or_data, template, drop, **templates + index_or_data, template, keep=keep, regex=regex, **templates ) else: index, identifiers = _extractlevel( - get_axis(index_or_data, axis), template, drop, **templates + get_axis(index_or_data, axis), template, keep=keep, regex=regex, **templates ) index_or_data = index_or_data.set_axis(index, axis=axis) if dropna: - index_or_data = dropnalevel(index_or_data, subset=identifiers, axis=axis) + index_or_data = dropnalevel( + index_or_data, subset=identifiers, how="all", axis=axis + ) return index_or_data diff --git a/tests/test_accessors.py b/tests/test_accessors.py index 0c9e98d..72515aa 100644 --- a/tests/test_accessors.py +++ b/tests/test_accessors.py @@ -142,6 +142,30 @@ def test_aggregate(mdf): ) +def test_extract(): + midx = MultiIndex.from_arrays( + [["e|foo", "e|bar", "bar"], [1, 2, 3]], names=["var", "num"] + ) + + assert_index_equal( + midx.pix.extract(var="{e}|{typ}"), + MultiIndex.from_arrays( + [[1, 2], ["e", "e"], ["foo", "bar"]], + names=["num", "e", "typ"], + ), + ) + + # drop=False + with pytest.warns(DeprecationWarning): + assert_index_equal( + midx.pix.extract(var="{e}|{typ}", drop=False), + MultiIndex.from_arrays( + [["e|foo", "e|bar"], [1, 2], ["e", "e"], ["foo", "bar"]], + names=["var", "num", "e", "typ"], + ), + ) + + def test_add_zeros_like(mdf): reference = MultiIndex.from_arrays( [["foo", "foo", "bar", "baz"], [1, 2, 3, 4], ["a", "b", "c", "d"]], diff --git a/tests/test_core.py b/tests/test_core.py index 1f7f270..cf7251a 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -226,8 +226,8 @@ def test_extractlevel(mdf, mseries, midx): mseries = mseries.set_axis(midx) expected_idx = MultiIndex.from_arrays( - [["e|foo", "e|bar"], [1, 2], ["e", "e"], ["foo", "bar"]], - 
names=["var", "num", "e", "typ"], + [[1, 2], ["e", "e"], ["foo", "bar"]], + names=["num", "e", "typ"], ) assert_index_equal(extractlevel(midx, var="{e}|{typ}"), expected_idx) @@ -248,12 +248,12 @@ def test_extractlevel_options(mdf): ) mdf_t = mdf.T.set_axis(midx, axis=1) - # drop=True + # keep=True assert_index_equal( - extractlevel(midx, var="{e}|{typ}", drop=True), + extractlevel(midx, var="{e}|{typ}", keep=True), MultiIndex.from_arrays( - [[1, 2], ["e", "e"], ["foo", "bar"]], - names=["num", "e", "typ"], + [["e|foo", "e|bar"], [1, 2], ["e", "e"], ["foo", "bar"]], + names=["var", "num", "e", "typ"], ), ) @@ -261,19 +261,14 @@ def test_extractlevel_options(mdf): assert_index_equal( extractlevel(midx, var="{e}|{typ}", dropna=False), MultiIndex.from_arrays( - [ - ["e|foo", "e|bar", "bar"], - [1, 2, 3], - ["e", "e", nan], - ["foo", "bar", nan], - ], - names=["var", "num", "e", "typ"], + [[1, 2, 3], ["e", "e", nan], ["foo", "bar", nan]], + names=["num", "e", "typ"], ), ) # axis=1 assert_frame_equal( - extractlevel(mdf_t, var="{e}|{typ}", drop=True, axis=1), + extractlevel(mdf_t, var="{e}|{typ}", axis=1), mdf_t.iloc[:, [0, 1]].set_axis( MultiIndex.from_arrays( [[1, 2], ["e", "e"], ["foo", "bar"]], @@ -283,6 +278,25 @@ def test_extractlevel_options(mdf): ), ) + # regex + assert_index_equal( + extractlevel(midx, var=r"((?P<e>.*?)\|)?(?P<typ>.*?)", regex=True), + MultiIndex.from_arrays( + [[1, 2, 3], ["e", "e", nan], ["foo", "bar", "bar"]], + names=["num", "e", "typ"], + ), + ) + + # drop=False + with pytest.warns(DeprecationWarning): + assert_index_equal( + extractlevel(midx, var="{e}|{typ}", drop=False), + MultiIndex.from_arrays( + [["e|foo", "e|bar"], [1, 2], ["e", "e"], ["foo", "bar"]], + names=["var", "num", "e", "typ"], + ), + ) + with pytest.raises(ValueError): # mdf does not have the var level extractlevel(mdf, var="{e}|{typ}")