Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Inherit metadata when resampling and grouping #23

Merged
merged 15 commits into from
Dec 21, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,8 @@ This will create an interactive Vega-Lite chart like the one in the screenshot a

## Development

We welcome other contributions to timely_beliefs.
The `timely_beliefs` package runs on `pandas>=1.1.5`.
Contact us if you need support for older versions.
We welcome other contributions to `timely_beliefs`.

[See our developer docs for details.](dev/dev.md)
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,10 @@
"uncertainty",
"lineage",
],
version="0.1.3",
version="1.0.0",
install_requires=[
"pytz",
"pandas>=0.24,<1.1", # test_groupby_preserves_metadata fails on 1.1
"pandas>=1.1.5",
"numpy",
"pyerf",
"SQLAlchemy",
Expand Down
5 changes: 3 additions & 2 deletions timely_beliefs/beliefs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from datetime import datetime, timedelta
from typing import List

import pandas as pd
from pandas.api.extensions import register_dataframe_accessor


Expand Down Expand Up @@ -104,7 +105,7 @@ def number_of_belief_times(self):
def number_of_beliefs(self) -> int:
"""Return the total number of beliefs in the BeliefsDataFrame, including both deterministic beliefs (which
require a single row) and probabilistic beliefs (which require multiple rows)."""
return len(self._obj.for_each_belief(df=self._obj))
return len(self._obj.for_each_belief())

@property
def sources(self) -> List[int]:
Expand All @@ -120,7 +121,7 @@ def number_of_sources(self):
@property
def number_of_probabilistic_beliefs(self) -> int:
"""Return the number of beliefs in the BeliefsDataFrame that are probabilistic (more than 1 unique value)."""
df = self._obj.for_each_belief(df=self._obj).nunique(dropna=True)
df = self._obj.for_each_belief(fnc=pd.DataFrame.nunique, dropna=True)
return len(df[df > 1].max(axis=1).dropna())

@property
Expand Down
79 changes: 55 additions & 24 deletions timely_beliefs/beliefs/classes.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
import math
from datetime import datetime, timedelta
from typing import Any, Callable, List, Optional, Tuple, Union
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

import altair as alt
import numpy as np
import pandas as pd
from pandas.core.groupby import DataFrameGroupBy
from sqlalchemy import Column, DateTime, Float, ForeignKey, Integer, Interval
Expand Down Expand Up @@ -375,7 +374,11 @@ class BeliefsSeries(pd.Series):

@property
def _constructor(self):
return BeliefsSeries
def f(*args, **kwargs):
""" Call __finalize__() after construction to inherit metadata. """
return BeliefsSeries(*args, **kwargs).__finalize__(self, method="inherit")

return f

@property
def _constructor_expanddim(self):
Expand Down Expand Up @@ -436,7 +439,13 @@ class BeliefsDataFrame(pd.DataFrame):

@property
def _constructor(self):
return BeliefsDataFrame
def f(*args, **kwargs):
""" Call __finalize__() after construction to inherit metadata. """
return BeliefsDataFrame(*args, **kwargs).__finalize__(
self, method="inherit"
)

return f

@property
def _constructor_sliced(self):
Expand Down Expand Up @@ -1002,32 +1011,28 @@ def resample_events(
and keep_only_most_recent_belief
and df.lineage.number_of_sources == 1
):
df = df.reset_index(
level=[belief_timing_col, "source", "cumulative_probability"]
)
if event_resolution > self.event_resolution:
# downsample
df = df.resample(event_resolution).agg(
{
"event_value": np.nanmean,
"source": "first", # keep the only source
belief_timing_col: "max"
if belief_timing_col == "belief_time"
else "min", # keep only most recent belief
"cumulative_probability": "prod", # assume independent variables
}
)
# make a new BeliefsDataFrame, because agg() doesn't behave nicely for subclassed DataFrames
df = BeliefsDataFrame(
df.reset_index(),
sensor=self.sensor,
event_resolution=event_resolution,
column_functions = {
"event_value": "mean",
"source": "first", # keep the only source
belief_timing_col: "max"
if belief_timing_col == "belief_time"
else "min", # keep only most recent belief
"cumulative_probability": "prod", # assume independent variables
}
df = downsample_beliefs_data_frame(
df, event_resolution, column_functions
)
df.event_resolution = event_resolution
else:
# upsample
df = df.reset_index(
level=[belief_timing_col, "source", "cumulative_probability"]
)
new_index = pd.date_range(
start=df.index[0],
periods=len(df) * self.event_resolution // event_resolution,
periods=len(df) * (self.event_resolution // event_resolution),
freq=event_resolution,
name="event_start",
)
Expand Down Expand Up @@ -1454,7 +1459,7 @@ def set_columns_and_indices_for_empty_frame(df, columns, indices, default_types)
elif default_types[col] in (int, float):
df[col] = pd.to_numeric(df[col])

df.set_index(indices, inplace=True)
df.set_index(indices, inplace=True) # todo: pandas GH30517


def assign_sensor_and_event_resolution(df, sensor, event_resolution):
Expand All @@ -1467,3 +1472,29 @@ def assign_sensor_and_event_resolution(df, sensor, event_resolution):
if sensor
else None
)


def downsample_beliefs_data_frame(
    df: BeliefsDataFrame, event_resolution: timedelta, col_att_dict: Dict[str, str]
) -> BeliefsDataFrame:
    """Downsample a BeliefsDataFrame to a coarser event resolution.

    Because df.resample().agg() doesn't behave nicely for subclassed DataFrames,
    we aggregate each index level and column separately against the resampled
    event timing level, and then recombine them afterwards.

    :param df: frame whose index contains an event timing level ("event_start",
        otherwise "event_end" is used), a belief timing level ("belief_time",
        otherwise "belief_horizon" is used), "source" and "cumulative_probability".
    :param event_resolution: bin width passed on to resample().
    :param col_att_dict: maps the name of each column or index level to the name
        of the resampler method used to aggregate it (e.g. {"event_value": "mean"}).
        It must include the belief timing level, "source" and
        "cumulative_probability", since these are moved back into the index
        after recombining.
    :returns: the recombined frame, indexed by the event timing level followed by
        the belief timing level, "source" and "cumulative_probability".
    """
    belief_timing_col = (
        "belief_time" if "belief_time" in df.index.names else "belief_horizon"
    )
    event_timing_col = "event_start" if "event_start" in df.index.names else "event_end"

    # Flatten the index once (instead of once per aggregated column), leaving only
    # the event timing level as index so that it can drive the resampling.
    flat_df = df.reset_index().set_index(event_timing_col)

    aggregated_columns = []
    for col, att in col_att_dict.items():
        # Resample a single-column frame and look up the aggregation by method name
        resampler = flat_df[col].to_frame().resample(event_resolution)
        aggregated_columns.append(getattr(resampler, att)())

    # Recombine the separately aggregated columns side by side and restore the
    # remaining index levels behind the event timing level.
    return pd.concat(aggregated_columns, axis=1).set_index(
        [belief_timing_col, "source", "cumulative_probability"], append=True
    )
4 changes: 3 additions & 1 deletion timely_beliefs/sources/classes.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
from functools import total_ordering
from typing import Union

from sqlalchemy import Column, Integer, String

from timely_beliefs.db_base import Base


@total_ordering
Flix6x marked this conversation as resolved.
Show resolved Hide resolved
class BeliefSource(object):

"""
A belief source is any data-creating entitiy such as a user, a ML model or a script.
A belief source is any data-creating entity such as a user, a ML model or a script.
"""

name: str
Expand Down
Loading