From 035d57febdaacf31c21d8c48dac57c108bfbf36f Mon Sep 17 00:00:00 2001 From: Felix Claessen Date: Fri, 14 Aug 2020 18:00:58 +0200 Subject: [PATCH 01/15] Call finalize in all constructors to inherit metadata. Prepare tests for upcoming pandas==1.1.1 functionality: inherit metadata when resampling and grouping. --- requirements.txt | 0 timely_beliefs/beliefs/__init__.py | 5 ++- timely_beliefs/beliefs/classes.py | 21 +++++++----- timely_beliefs/tests/test_belief_io.py | 44 ++++++++++++++++++++++++++ 4 files changed, 61 insertions(+), 9 deletions(-) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..e69de29b diff --git a/timely_beliefs/beliefs/__init__.py b/timely_beliefs/beliefs/__init__.py index 54310b32..469a6f08 100644 --- a/timely_beliefs/beliefs/__init__.py +++ b/timely_beliefs/beliefs/__init__.py @@ -8,6 +8,7 @@ from datetime import datetime, timedelta from typing import List +import pandas as pd from pandas.api.extensions import register_dataframe_accessor @@ -120,7 +121,9 @@ def number_of_sources(self): @property def number_of_probabilistic_beliefs(self) -> int: """Return the number of beliefs in the BeliefsDataFrame that are probabilistic (more than 1 unique value).""" - df = self._obj.for_each_belief(df=self._obj).nunique(dropna=True) + df = self._obj.for_each_belief( + df=self._obj, fnc=pd.DataFrame.nunique, dropna=True + ) return len(df[df > 1].max(axis=1).dropna()) @property diff --git a/timely_beliefs/beliefs/classes.py b/timely_beliefs/beliefs/classes.py index bc706217..94a9e5dc 100644 --- a/timely_beliefs/beliefs/classes.py +++ b/timely_beliefs/beliefs/classes.py @@ -375,7 +375,11 @@ class BeliefsSeries(pd.Series): @property def _constructor(self): - return BeliefsSeries + def f(*args, **kwargs): + """ Call __finalize__() after construction to inherit metadata. """ + return BeliefsSeries(*args, **kwargs).__finalize__(self, method="inherit") + + return f @property def _constructor_expanddim(self): @@ -436,7 +440,13 @@ class BeliefsDataFrame(pd.DataFrame): @property def _constructor(self): - return BeliefsDataFrame + def f(*args, **kwargs): + """ Call __finalize__() after construction to inherit metadata. """ + return BeliefsDataFrame(*args, **kwargs).__finalize__( + self, method="inherit" + ) + + return f @property def _constructor_sliced(self): @@ -1017,12 +1027,7 @@ def resample_events( "cumulative_probability": "prod", # assume independent variables } ) - # make a new BeliefsDataFrame, because agg() doesn't behave nicely for subclassed DataFrames - df = BeliefsDataFrame( - df.reset_index(), - sensor=self.sensor, - event_resolution=event_resolution, - ) + df.event_resolution = event_resolution else: # upsample new_index = pd.date_range( diff --git a/timely_beliefs/tests/test_belief_io.py b/timely_beliefs/tests/test_belief_io.py index a275aff2..1925af71 100644 --- a/timely_beliefs/tests/test_belief_io.py +++ b/timely_beliefs/tests/test_belief_io.py @@ -385,6 +385,50 @@ def test_slicing_retains_metadata(drop_level): metadata = {md: getattr(example_df, md) for md in METADATA} df = df.xs("2000-01-03 10:00:00+00:00", level="event_start", drop_level=drop_level) print(df) + assert isinstance(df, tb.BeliefsDataFrame) + for md in metadata: + assert getattr(df, md) == metadata[md] + + +@pytest.mark.parametrize("resolution", [timedelta(minutes=30), timedelta(hours=2)]) +def test_agg_resampling_retains_metadata(resolution): + """ + Test whether aggregate resampling retains the metadata. + + Fails with pandas==1.0.0 + Succeeds with pandas==1.1.0 + """ + df = example_df + metadata = {md: getattr(example_df, md) for md in METADATA} + df = df.resample(resolution, level="event_start").mean() + print(df) + assert isinstance(df, tb.BeliefsDataFrame) + for md in metadata: + # if md == "event_resolution": + # assert df.event_resolution == resolution + # else: # todo: the event_resolution metadata is only updated when resampling using df.resample_events(). A reason to override the original resample method, or otherwise something to document. + assert getattr(df, md) == metadata[md] + + +def test_groupby_retains_metadata(): + """ Test whether grouping by index level retains the metadata. + + Succeeds with pandas==1.0.0 + Fails with pandas==1.1.0 + Should be fixed with https://github.com/pandas-dev/pandas/pull/35688 + """ + df = example_df + metadata = {md: getattr(example_df, md) for md in METADATA} + + def assert_function(x): + print(x) + assert isinstance(x, tb.BeliefsDataFrame) + for md in metadata: + assert getattr(x, md) == metadata[md] + return x + + df = df.groupby(level="event_start").apply(lambda x: assert_function(x)) + assert isinstance(df, tb.BeliefsDataFrame) for md in metadata: assert getattr(df, md) == metadata[md] From c56d7cccb45155f3bcce1b22fe28cf2d846e89bd Mon Sep 17 00:00:00 2001 From: "F.N. Claessen" Date: Thu, 26 Nov 2020 16:22:56 +0100 Subject: [PATCH 02/15] Update reference in test. Add pandas test. --- timely_beliefs/tests/test_belief_io.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/timely_beliefs/tests/test_belief_io.py b/timely_beliefs/tests/test_belief_io.py index 1925af71..6a8f611b 100644 --- a/timely_beliefs/tests/test_belief_io.py +++ b/timely_beliefs/tests/test_belief_io.py @@ -411,11 +411,11 @@ def test_agg_resampling_retains_metadata(resolution): def test_groupby_retains_metadata(): - """ Test whether grouping by index level retains the metadata. + """Test whether grouping by index level retains the metadata. Succeeds with pandas==1.0.0 Fails with pandas==1.1.0 - Should be fixed with https://github.com/pandas-dev/pandas/pull/35688 + Should be fixed with https://github.com/pandas-dev/pandas/pull/37461 """ df = example_df metadata = {md: getattr(example_df, md) for md in METADATA} @@ -477,3 +477,13 @@ def test_init_from_beliefs_series(): bdf, df_copy ) # new bdf retains altered column of original bdf pd.testing.assert_series_equal(s, s_copy) # input BeliefsSeries was not altered + + +def test_groupby_retains_attribute(): + df = pd.DataFrame([[1, 2], [3, 4]], columns=["x", "y"]) + df.a = "b" + assert df.a == "b" + df = df.groupby("x").apply(lambda x: x) + assert df.a == "b" + df = df.groupby("x").sum() + assert df.a == "b" From 5b24bec904c0a59188b2f6e01e51314ffd715314 Mon Sep 17 00:00:00 2001 From: "F.N. Claessen" Date: Mon, 7 Dec 2020 17:21:37 +0100 Subject: [PATCH 03/15] Finish rebase. --- requirements.txt | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index e69de29b..00000000 From 96163ee1f90b0652a4f7aa835399028ae15f824c Mon Sep 17 00:00:00 2001 From: "F.N. Claessen" Date: Mon, 7 Dec 2020 20:35:13 +0100 Subject: [PATCH 04/15] Complete the ordering of BeliefSources by name. --- timely_beliefs/sources/classes.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/timely_beliefs/sources/classes.py b/timely_beliefs/sources/classes.py index fc7d609b..feae8d89 100644 --- a/timely_beliefs/sources/classes.py +++ b/timely_beliefs/sources/classes.py @@ -1,3 +1,4 @@ +from functools import total_ordering from typing import Union from sqlalchemy import Column, Integer, String @@ -5,10 +6,11 @@ from timely_beliefs.db_base import Base +@total_ordering class BeliefSource(object): """ - A belief source is any data-creating entitiy such as a user, a ML model or a script. + A belief source is any data-creating entity such as a user, a ML model or a script. """ name: str From 939e023173e34e19b73b9fa2fa345e47b3ca753c Mon Sep 17 00:00:00 2001 From: "F.N. Claessen" Date: Wed, 9 Dec 2020 17:16:41 +0100 Subject: [PATCH 05/15] Fix computation order. --- timely_beliefs/beliefs/classes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/timely_beliefs/beliefs/classes.py b/timely_beliefs/beliefs/classes.py index 94a9e5dc..952954f4 100644 --- a/timely_beliefs/beliefs/classes.py +++ b/timely_beliefs/beliefs/classes.py @@ -1032,7 +1032,7 @@ def resample_events( # upsample new_index = pd.date_range( start=df.index[0], - periods=len(df) * self.event_resolution // event_resolution, + periods=len(df) * (self.event_resolution // event_resolution), freq=event_resolution, name="event_start", ) From 1007af85c189465f6842b23685f3f71e312a0222 Mon Sep 17 00:00:00 2001 From: "F.N. Claessen" Date: Wed, 9 Dec 2020 17:18:29 +0100 Subject: [PATCH 06/15] Workaround for aggregation function when resampling. --- timely_beliefs/beliefs/classes.py | 54 ++++++++++++++++++++++--------- 1 file changed, 39 insertions(+), 15 deletions(-) diff --git a/timely_beliefs/beliefs/classes.py b/timely_beliefs/beliefs/classes.py index 952954f4..872a554f 100644 --- a/timely_beliefs/beliefs/classes.py +++ b/timely_beliefs/beliefs/classes.py @@ -1,9 +1,8 @@ import math from datetime import datetime, timedelta -from typing import Any, Callable, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import altair as alt -import numpy as np import pandas as pd from pandas.core.groupby import DataFrameGroupBy from sqlalchemy import Column, DateTime, Float, ForeignKey, Integer, Interval @@ -1012,24 +1011,23 @@ def resample_events( and keep_only_most_recent_belief and df.lineage.number_of_sources == 1 ): - df = df.reset_index( - level=[belief_timing_col, "source", "cumulative_probability"] - ) if event_resolution > self.event_resolution: # downsample - df = df.resample(event_resolution).agg( - { - "event_value": np.nanmean, - "source": "first", # keep the only source - belief_timing_col: "max" - if belief_timing_col == "belief_time" - else "min", # keep only most recent belief - "cumulative_probability": "prod", # assume independent variables - } - ) + column_functions = { + "event_value": "mean", + "source": "first", # keep the only source + belief_timing_col: "max" + if belief_timing_col == "belief_time" + else "min", # keep only most recent belief + "cumulative_probability": "prod", # assume independent variables + } + df = resample_beliefs_data_frame(df, event_resolution, column_functions) df.event_resolution = event_resolution else: # upsample + df = df.reset_index( + level=[belief_timing_col, "source", "cumulative_probability"] + ) new_index = pd.date_range( start=df.index[0], periods=len(df) * (self.event_resolution // event_resolution), @@ -1472,3 +1470,29 @@ def assign_sensor_and_event_resolution(df, sensor, event_resolution): if sensor else None ) + + +def resample_beliefs_data_frame( + df: BeliefsDataFrame, event_resolution: timedelta, col_att_dict: Dict[str, str] +) -> BeliefsDataFrame: + """Because df.resample().agg() doesn't behave nicely for subclassed DataFrames, + we aggregate each index level and column separately against the resampled event_start level, + and then recombine them afterwards. + """ + belief_timing_col = ( + "belief_time" if "belief_time" in df.index.names else "belief_horizon" + ) + event_timing_col = "event_start" if "event_start" in df.index.names else "event_end" + return pd.concat( + [ + getattr( + df.reset_index() + .set_index(event_timing_col)[col] + .to_frame() + .resample(event_resolution), + att, + )() + for col, att in col_att_dict.items() + ], + axis=1, + ).set_index([belief_timing_col, "source", "cumulative_probability"], append=True) From e72ef14bd115c619b4aea8cf201f0bf9a7631ac7 Mon Sep 17 00:00:00 2001 From: "F.N. Claessen" Date: Wed, 9 Dec 2020 17:21:17 +0100 Subject: [PATCH 07/15] Fix tests: - Separate tests for mean resampling and aggregate resampling. - Separate tests for temporary attributes and subclass attributes. - Test metadata propagation for groupby of subclassed DataFrames. --- timely_beliefs/tests/test_belief_io.py | 112 +++++++++++++++++++++++-- 1 file changed, 106 insertions(+), 6 deletions(-) diff --git a/timely_beliefs/tests/test_belief_io.py b/timely_beliefs/tests/test_belief_io.py index 6a8f611b..236f4312 100644 --- a/timely_beliefs/tests/test_belief_io.py +++ b/timely_beliefs/tests/test_belief_io.py @@ -1,6 +1,7 @@ import os from datetime import datetime, timedelta +import numpy as np import pandas as pd import pytest import pytz @@ -391,9 +392,9 @@ def test_slicing_retains_metadata(drop_level): @pytest.mark.parametrize("resolution", [timedelta(minutes=30), timedelta(hours=2)]) -def test_agg_resampling_retains_metadata(resolution): +def test_mean_resampling_retains_metadata(resolution): """ - Test whether aggregate resampling retains the metadata. + Test whether mean resampling retains the metadata. Fails with pandas==1.0.0 Succeeds with pandas==1.1.0 @@ -410,6 +411,34 @@ def test_agg_resampling_retains_metadata(resolution): assert getattr(df, md) == metadata[md] +@pytest.mark.parametrize("resolution", [timedelta(minutes=30), timedelta(hours=2)]) +def _test_agg_resampling_retains_metadata(resolution): + """ + Test whether aggregate resampling retains the metadata. + + Fails with pandas==1.1.5 + """ + df = example_df + metadata = {md: getattr(example_df, md) for md in METADATA} + df = df.reset_index(level=["belief_time", "source", "cumulative_probability"]) + df = df.resample(resolution).agg( + { + "event_value": np.nanmean, + "source": "first", # keep the only source + "belief_time": "max", # keep the latest belief + "cumulative_probability": "prod", # assume independent variables + } + ) + df = df.set_index(["belief_time", "source", "cumulative_probability"], append=True) + print(df) + assert isinstance(df, tb.BeliefsDataFrame) + for md in metadata: + # if md == "event_resolution": + # assert df.event_resolution == resolution + # else: # todo: the event_resolution metadata is only updated when resampling using df.resample_events(). A reason to override the original resample method, or otherwise something to document. + assert getattr(df, md) == metadata[md] + + def test_groupby_retains_metadata(): """Test whether grouping by index level retains the metadata. @@ -479,11 +508,82 @@ def test_init_from_beliefs_series(): pd.testing.assert_series_equal(s, s_copy) # input BeliefsSeries was not altered -def test_groupby_retains_attribute(): +def test_groupby_does_not_retain_temporary_attribute(): df = pd.DataFrame([[1, 2], [3, 4]], columns=["x", "y"]) df.a = "b" assert df.a == "b" - df = df.groupby("x").apply(lambda x: x) - assert df.a == "b" - df = df.groupby("x").sum() + df2 = df.groupby("x").apply(lambda x: x) + assert not hasattr(df2, "a") + df3 = df.groupby("x").sum() + assert not hasattr(df3, "a") + + +@pytest.mark.parametrize( + "att, args", + [ + # ("all", []), + # ("any", []), + # ("count", []), + ("first", []), + ("last", []), + ("max", []), + # ("mean", []), + # ("median", []), + ("min", []), + ("prod", []), + # ("sem", []), + # ("size", []), + # ("std", []), + ("sum", []), + # ("var", []), + # ("apply", [lambda x: x]), + # ("apply", [np.max]) + # ("apply", [np.min]) + # ("apply", [np.nanmean]), + ("agg", ["first"]), + ("agg", ["max"]), + # ("agg", ["mean"]), + ("agg", ["min"]), + ("agg", ["sum"]), + # ("agg", [{"y": "min"}]), + # ("agg", [{"x": "min", "y": "max"}]), + ], +) +def test_groupby_retains_subclass_attribute(att, args): + """Checks on metadata propagation for subclassed DataFrames under groupby operations. + + Commented-out parameter combinations fail with pandas==1.1.5 + """ + + METADATA = ["a"] + + class SubclassedSeries(pd.Series): + + _metadata = METADATA + + @property + def _constructor(self): + return SubclassedSeries + + @property + def _constructor_expanddim(self): + return SubclassedDataFrame + + class SubclassedDataFrame(pd.DataFrame): + + _metadata = METADATA + + @property + def _constructor(self): + return SubclassedDataFrame + + @property + def _constructor_sliced(self): + return SubclassedSeries + + df = SubclassedDataFrame([[1, 2], [3, 4]], columns=["x", "y"]) + df.a = "b" assert df.a == "b" + df2 = getattr(df.groupby("x"), att)(*args) + print(df2) + assert df2.a == "b" From 74379967e38ff01ee6c9f331773da9e8cfa0b39b Mon Sep 17 00:00:00 2001 From: "F.N. Claessen" Date: Thu, 10 Dec 2020 11:44:19 +0100 Subject: [PATCH 08/15] Update pandas dependency. Drop support for pandas<1.1.5. Bump timely-beliefs version with major release. --- README.md | 4 +++- setup.py | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index bfe6eb71..def0322c 100644 --- a/README.md +++ b/README.md @@ -148,6 +148,8 @@ This will create an interactive Vega-Lite chart like the one in the screenshot a ## Development -We welcome other contributions to timely_beliefs. +The `timely_beliefs` package runs on `pandas>=1.1.5`. +Contact us if you need support for older versions. +We welcome other contributions to `timely_beliefs`. [See our developer docs for details.](dev/dev.md) diff --git a/setup.py b/setup.py index a22d0af5..c830522e 100644 --- a/setup.py +++ b/setup.py @@ -14,10 +14,10 @@ "uncertainty", "lineage", ], - version="0.1.3", + version="1.0.0", install_requires=[ "pytz", - "pandas>=0.24,<1.1", # test_groupby_preserves_metadata fails on 1.1 + "pandas>=1.1.5", "numpy", "pyerf", "SQLAlchemy", From 5dce22dde55e148c1e6a44aacfcf13e01728b4fd Mon Sep 17 00:00:00 2001 From: "F.N. Claessen" Date: Thu, 10 Dec 2020 16:23:20 +0100 Subject: [PATCH 09/15] Prepare dtype test for empty frames. --- timely_beliefs/beliefs/classes.py | 2 +- timely_beliefs/tests/test_belief_query.py | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/timely_beliefs/beliefs/classes.py b/timely_beliefs/beliefs/classes.py index 872a554f..43748dcb 100644 --- a/timely_beliefs/beliefs/classes.py +++ b/timely_beliefs/beliefs/classes.py @@ -1457,7 +1457,7 @@ def set_columns_and_indices_for_empty_frame(df, columns, indices, default_types) elif default_types[col] in (int, float): df[col] = pd.to_numeric(df[col]) - df.set_index(indices, inplace=True) + df.set_index(indices, inplace=True) # todo: pandas GH30517 def assign_sensor_and_event_resolution(df, sensor, event_resolution): diff --git a/timely_beliefs/tests/test_belief_query.py b/timely_beliefs/tests/test_belief_query.py index 5242d631..22c7a94e 100644 --- a/timely_beliefs/tests/test_belief_query.py +++ b/timely_beliefs/tests/test_belief_query.py @@ -263,3 +263,18 @@ def test_upsample(time_slot_sensor, rolling_day_ahead_beliefs_about_time_slot_ev belief_df = belief_df.resample_events(new_resolution) assert belief_df.sensor.event_resolution == timedelta(minutes=15) assert belief_df.event_resolution == new_resolution + + +def _test_empty_frame(time_slot_sensor): + """ pandas GH30517 """ + bdf = DBTimedBelief.query( + session=session, + sensor=time_slot_sensor, + belief_before=datetime(1900, 1, 1, 13, tzinfo=utc), + ) + assert len(bdf) == 0 # no data expected + assert pd.api.types.is_datetime64_dtype(bdf.index.get_level_values("belief_time")) + bdf = bdf.convert_index_from_belief_time_to_horizon() + assert pd.api.types.is_timedelta64_dtype( + bdf.index.get_level_values("belief_horizon") + ) # dtype of belief_horizon is timedelta64[ns], so the minimum horizon on an empty BeliefsDataFrame is NaT instead of NaN From 4987ff55c67b7300a14f047db47a1e534c390dd3 Mon Sep 17 00:00:00 2001 From: "F.N. Claessen" Date: Thu, 17 Dec 2020 18:04:01 +0100 Subject: [PATCH 10/15] Add comment about which pandas version fixed test_groupby_retains_metadata. --- timely_beliefs/tests/test_belief_io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/timely_beliefs/tests/test_belief_io.py b/timely_beliefs/tests/test_belief_io.py index 236f4312..29459e4b 100644 --- a/timely_beliefs/tests/test_belief_io.py +++ b/timely_beliefs/tests/test_belief_io.py @@ -444,7 +444,7 @@ def test_groupby_retains_metadata(): Succeeds with pandas==1.0.0 Fails with pandas==1.1.0 - Should be fixed with https://github.com/pandas-dev/pandas/pull/37461 + Fixed with pandas==1.1.5 """ df = example_df metadata = {md: getattr(example_df, md) for md in METADATA} From 6b7907c1b96fc45a071ee00bbd091e268f6f591f Mon Sep 17 00:00:00 2001 From: "F.N. Claessen" Date: Thu, 17 Dec 2020 18:06:11 +0100 Subject: [PATCH 11/15] Rename downsampling function. --- timely_beliefs/beliefs/classes.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/timely_beliefs/beliefs/classes.py b/timely_beliefs/beliefs/classes.py index 43748dcb..d4aa066d 100644 --- a/timely_beliefs/beliefs/classes.py +++ b/timely_beliefs/beliefs/classes.py @@ -1021,7 +1021,9 @@ def resample_events( else "min", # keep only most recent belief "cumulative_probability": "prod", # assume independent variables } - df = resample_beliefs_data_frame(df, event_resolution, column_functions) + df = downsample_beliefs_data_frame( + df, event_resolution, column_functions + ) df.event_resolution = event_resolution else: # upsample @@ -1472,7 +1474,7 @@ def assign_sensor_and_event_resolution(df, sensor, event_resolution): ) -def resample_beliefs_data_frame( +def downsample_beliefs_data_frame( df: BeliefsDataFrame, event_resolution: timedelta, col_att_dict: Dict[str, str] ) -> BeliefsDataFrame: """Because df.resample().agg() doesn't behave nicely for subclassed DataFrames, From 5f7abf14c8e8f2efbe56adc4b094fe77a1c4339c Mon Sep 17 00:00:00 2001 From: "F.N. Claessen" Date: Thu, 17 Dec 2020 20:00:06 +0100 Subject: [PATCH 12/15] Simplify calls to for_each_belief. --- timely_beliefs/beliefs/__init__.py | 6 ++---- timely_beliefs/visualization/utils.py | 8 +++----- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/timely_beliefs/beliefs/__init__.py b/timely_beliefs/beliefs/__init__.py index 469a6f08..ee31aca2 100644 --- a/timely_beliefs/beliefs/__init__.py +++ b/timely_beliefs/beliefs/__init__.py @@ -105,7 +105,7 @@ def number_of_belief_times(self): def number_of_beliefs(self) -> int: """Return the total number of beliefs in the BeliefsDataFrame, including both deterministic beliefs (which require a single row) and probabilistic beliefs (which require multiple rows).""" - return len(self._obj.for_each_belief(df=self._obj)) + return len(self._obj.for_each_belief()) @property def sources(self) -> List[int]: @@ -121,9 +121,7 @@ def number_of_sources(self): @property def number_of_probabilistic_beliefs(self) -> int: """Return the number of beliefs in the BeliefsDataFrame that are probabilistic (more than 1 unique value).""" - df = self._obj.for_each_belief( - df=self._obj, fnc=pd.DataFrame.nunique, dropna=True - ) + df = self._obj.for_each_belief(fnc=pd.DataFrame.nunique, dropna=True) return len(df[df > 1].max(axis=1).dropna()) @property diff --git a/timely_beliefs/visualization/utils.py b/timely_beliefs/visualization/utils.py index 5f5898c6..5e6ca3a1 100644 --- a/timely_beliefs/visualization/utils.py +++ b/timely_beliefs/visualization/utils.py @@ -250,20 +250,18 @@ def prepare_df_for_plotting( df["lower_value"] = df["event_value"] else: df_ci0 = ( - df.for_each_belief(get_nth_percentile_belief, n=(1 - ci) * 100 / 2, df=df) + df.for_each_belief(get_nth_percentile_belief, n=(1 - ci) * 100 / 2) .rename(columns={"event_value": "lower_value"}) .droplevel("cumulative_probability") .drop("belief_horizon", axis=1) ) df_exp = ( - df.for_each_belief(get_nth_percentile_belief, n=50, df=df) + df.for_each_belief(get_nth_percentile_belief, n=50) .rename(columns={"event_value": "expected_value"}) .droplevel("cumulative_probability") ) df_ci1 = ( - df.for_each_belief( - get_nth_percentile_belief, n=100 - (1 - ci) * 100 / 2, df=df - ) + df.for_each_belief(get_nth_percentile_belief, n=100 - (1 - ci) * 100 / 2) .rename(columns={"event_value": "upper_value"}) .droplevel("cumulative_probability") .drop("belief_horizon", axis=1) From a1acd4877cd1a528480d1bf2f6e3b32d0519ee87 Mon Sep 17 00:00:00 2001 From: "F.N. Claessen" Date: Thu, 17 Dec 2020 20:01:14 +0100 Subject: [PATCH 13/15] Fix plotting integer values (int64 is not JSON serializable). --- timely_beliefs/visualization/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/timely_beliefs/visualization/utils.py b/timely_beliefs/visualization/utils.py index 5e6ca3a1..b275e1d4 100644 --- a/timely_beliefs/visualization/utils.py +++ b/timely_beliefs/visualization/utils.py @@ -40,6 +40,7 @@ def plot( # Set up data source bdf = bdf.copy() + bdf["event_value"] = bdf["event_value"].astype(float) sensor_name = bdf.sensor.name sensor_unit = bdf.sensor.unit if bdf.sensor.unit != "" else "a.u." # arbitrary unit plottable_df, belief_horizon_unit = prepare_df_for_plotting( From 0db3886f03f57a18c431d28d3f632c0cfd43a184 Mon Sep 17 00:00:00 2001 From: "F.N. Claessen" Date: Mon, 21 Dec 2020 11:46:57 +0100 Subject: [PATCH 14/15] Add reference in test docs to relevant pandas issue. --- timely_beliefs/tests/test_belief_io.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/timely_beliefs/tests/test_belief_io.py b/timely_beliefs/tests/test_belief_io.py index 29459e4b..b968ab45 100644 --- a/timely_beliefs/tests/test_belief_io.py +++ b/timely_beliefs/tests/test_belief_io.py @@ -553,6 +553,8 @@ def test_groupby_retains_subclass_attribute(att, args): """Checks on metadata propagation for subclassed DataFrames under groupby operations. Commented-out parameter combinations fail with pandas==1.1.5 + The relevant issue has to do with calling finalize after operations: + see https://github.com/pandas-dev/pandas/issues/28283 """ METADATA = ["a"] From 41a06abd3c0bfdc2692dd012cad90fb6800a4b87 Mon Sep 17 00:00:00 2001 From: Felix Claessen <30658763+Flix6x@users.noreply.github.com> Date: Mon, 21 Dec 2020 11:51:13 +0100 Subject: [PATCH 15/15] Issue 35 metadata lost on multiplication (#43) This PR adds no additional logic, only a test to check whether Issue #35 is successfully resolved (plus some refactoring of test util functions). Note that this is a merge into resample-while-keeping-metadata, and that branch (with PR #23) actually contains the logic that resolves this issue (as a side effect, because I was actually addressing deeper issues there: #22 and #26). * Refactor metadata propagation checks to util function. * Add test for metadata propagation upon multiplication (GH 35). Co-authored-by: F.N. Claessen --- timely_beliefs/tests/test_belief_io.py | 67 +++++++++++++------------- timely_beliefs/tests/utils.py | 32 +++++++++++- 2 files changed, 64 insertions(+), 35 deletions(-) diff --git a/timely_beliefs/tests/test_belief_io.py b/timely_beliefs/tests/test_belief_io.py index b968ab45..298d1e05 100644 --- a/timely_beliefs/tests/test_belief_io.py +++ b/timely_beliefs/tests/test_belief_io.py @@ -7,8 +7,8 @@ import pytz import timely_beliefs as tb -from timely_beliefs.beliefs.classes import METADATA from timely_beliefs.examples import example_df +from timely_beliefs.tests.utils import assert_metadata_is_retained @pytest.fixture(scope="module") @@ -360,21 +360,16 @@ def test_converting_between_data_frame_and_series_retains_metadata(): Test whether expanding dimensions of a BeliefsSeries into a BeliefsDataFrame retains the metadata. """ df = example_df - metadata = {md: getattr(example_df, md) for md in METADATA} series = df["event_value"] - for md in metadata: - assert getattr(series, md) == metadata[md] + assert_metadata_is_retained(series, original_df=example_df, is_series=True) df = series.to_frame() - for md in metadata: - assert getattr(df, md) == metadata[md] + assert_metadata_is_retained(df, original_df=example_df) def test_dropping_index_levels_retains_metadata(): df = example_df.copy() - metadata = {md: getattr(example_df, md) for md in METADATA} df.index = df.index.get_level_values("event_start") # drop all other index levels - for md in metadata: - assert getattr(df, md) == metadata[md] + assert_metadata_is_retained(df, original_df=example_df) @pytest.mark.parametrize("drop_level", [True, False]) @@ -383,12 +378,9 @@ def test_slicing_retains_metadata(drop_level): Test whether slicing the index of a BeliefsDataFrame retains the metadata. """ df = example_df - metadata = {md: getattr(example_df, md) for md in METADATA} df = df.xs("2000-01-03 10:00:00+00:00", level="event_start", drop_level=drop_level) print(df) - assert isinstance(df, tb.BeliefsDataFrame) - for md in metadata: - assert getattr(df, md) == metadata[md] + assert_metadata_is_retained(df, original_df=example_df) @pytest.mark.parametrize("resolution", [timedelta(minutes=30), timedelta(hours=2)]) @@ -400,15 +392,13 @@ def test_mean_resampling_retains_metadata(resolution): Succeeds with pandas==1.1.0 """ df = example_df - metadata = {md: getattr(example_df, md) for md in METADATA} df = df.resample(resolution, level="event_start").mean() print(df) - assert isinstance(df, tb.BeliefsDataFrame) - for md in metadata: - # if md == "event_resolution": - # assert df.event_resolution == resolution - # else: # todo: the event_resolution metadata is only updated when resampling using df.resample_events(). A reason to override the original resample method, or otherwise something to document. - assert getattr(df, md) == metadata[md] + assert_metadata_is_retained( + df, + original_df=example_df, + event_resolution=example_df.event_resolution, + ) # todo: the event_resolution metadata is only updated when resampling using df.resample_events(). A reason to override the original resample method, or otherwise something to document. @pytest.mark.parametrize("resolution", [timedelta(minutes=30), timedelta(hours=2)]) @@ -419,7 +409,6 @@ def _test_agg_resampling_retains_metadata(resolution): Fails with pandas==1.1.5 """ df = example_df - metadata = {md: getattr(example_df, md) for md in METADATA} df = df.reset_index(level=["belief_time", "source", "cumulative_probability"]) df = df.resample(resolution).agg( { @@ -431,12 +420,11 @@ def _test_agg_resampling_retains_metadata(resolution): ) df = df.set_index(["belief_time", "source", "cumulative_probability"], append=True) print(df) - assert isinstance(df, tb.BeliefsDataFrame) - for md in metadata: - # if md == "event_resolution": - # assert df.event_resolution == resolution - # else: # todo: the event_resolution metadata is only updated when resampling using df.resample_events(). A reason to override the original resample method, or otherwise something to document. - assert getattr(df, md) == metadata[md] + assert_metadata_is_retained( + df, + original_df=example_df, + event_resolution=example_df.event_resolution, + ) # todo: the event_resolution metadata is only updated when resampling using df.resample_events(). A reason to override the original resample method, or otherwise something to document. def test_groupby_retains_metadata(): @@ -447,19 +435,14 @@ def test_groupby_retains_metadata(): Fixed with pandas==1.1.5 """ df = example_df - metadata = {md: getattr(example_df, md) for md in METADATA} def assert_function(x): print(x) - assert isinstance(x, tb.BeliefsDataFrame) - for md in metadata: - assert getattr(x, md) == metadata[md] + assert_metadata_is_retained(x, original_df=example_df) return x df = df.groupby(level="event_start").apply(lambda x: assert_function(x)) - assert isinstance(df, tb.BeliefsDataFrame) - for md in metadata: - assert getattr(df, md) == metadata[md] + assert_metadata_is_retained(df, original_df=example_df) def test_copy_series_retains_name_and_metadata(): @@ -589,3 +572,19 @@ def _constructor_sliced(self): df2 = getattr(df.groupby("x"), att)(*args) print(df2) assert df2.a == "b" + + +@pytest.mark.parametrize("constant", [1, -1, 3.14, timedelta(hours=1), ["TiledString"]]) +def test_multiplication_with_constant_retains_metadata(constant): + """ Check whether the metadata is still there after multiplication. """ + # GH 35 + df = example_df * constant + assert_metadata_is_retained(df, original_df=example_df) + + # Also check suggested workarounds from GH 35 + if constant == -1: + df = -example_df + assert_metadata_is_retained(df, original_df=example_df) + + df = example_df.abs() + assert_metadata_is_retained(df, original_df=example_df) diff --git a/timely_beliefs/tests/utils.py b/timely_beliefs/tests/utils.py index 05fc7d9c..3e368c8c 100644 --- a/timely_beliefs/tests/utils.py +++ b/timely_beliefs/tests/utils.py @@ -1,7 +1,37 @@ -from typing import Union +from datetime import timedelta +from typing import Optional, Union import numpy as np +import timely_beliefs as tb +from timely_beliefs.beliefs.classes import METADATA + def equal_lists(list_a: Union[list, np.ndarray], list_b: Union[list, np.ndarray]): return all(np.isclose(a, b) for a, b in zip(list_a, list_b)) + + +def assert_metadata_is_retained( + result_df: Union[tb.BeliefsDataFrame, tb.BeliefsSeries], + original_df: tb.BeliefsDataFrame, + is_series: bool = False, + event_resolution: Optional[timedelta] = None, +): + """Fail if result_df is not a BeliefsDataFrame with the same metadata as the original BeliefsDataFrame. + + Can also be used to check for a BeliefsSeries (using is_series=True). + + :param result_df: BeliefsDataFrame or BeliefsSeries to be checked for metadata propagation + :param original_df: BeliefsDataFrame containing the original metadata + :param is_series: if True, we check that the result is a BeliefsSeries rather than a BeliefsDataFrame + :param event_resolution: optional timedelta in case we expect a different event_resolution than the original + """ + metadata = {md: getattr(original_df, md) for md in METADATA} + assert isinstance( + result_df, tb.BeliefsDataFrame if not is_series else tb.BeliefsSeries + ) + for md in metadata: + if md == "event_resolution" and event_resolution is not None: + assert result_df.event_resolution == event_resolution + else: + assert getattr(result_df, md) == metadata[md]