-
Notifications
You must be signed in to change notification settings - Fork 6
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Resample beliefs about instantaneous sensors #118
Changes from 27 commits
55341e8
7f13512
53a78dc
4f5de41
13f1fbc
66ee209
a904765
f3feabc
167f007
3537784
707ea63
e262dd8
15f6429
d1bbb5b
cc00933
9330d16
b951c78
7bf2217
5395a6d
3bd3d35
1d98721
b3a70b9
bb703f9
5621506
9dec4da
ee00e40
23353ea
ed18518
f943d06
a4fca7e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -73,7 +73,7 @@ class TimedBelief(object): | |
- a cumulative probability (the likelihood of the value being equal or lower than stated)* | ||
|
||
* The default assumption is that the mean value is given (cp=0.5), but if no beliefs about possible other outcomes | ||
are given, then this will be treated as a deterministic belief (cp=1). As an alternative to specifying an cumulative | ||
are given, then this will be treated as a deterministic belief (cp=1). As an alternative to specifying a cumulative | ||
probability explicitly, you can specify an integer number of standard deviations which is translated | ||
into a cumulative probability assuming a normal distribution (e.g. sigma=-1 becomes cp=0.1587). | ||
""" | ||
|
@@ -697,6 +697,15 @@ def __repr__(self): | |
"""Add the sensor and event resolution to the string representation of the BeliefsSeries.""" | ||
return super().__repr__() + "\n" + meta_repr(self) | ||
|
||
@property
def event_frequency(self) -> Optional[timedelta]:
    """Duration between observations of events.

    The frequency is inferred from the unique values of the "event_start" index level.

    :returns: a timedelta for regularly spaced observations
              None for irregularly spaced observations, which includes the cases in which
              no frequency can be inferred (fewer than 3 unique event starts)
              or in which the inferred frequency has no fixed duration
    """
    event_starts = self.index.unique("event_start")
    if len(event_starts) < 3:
        # pd.infer_freq needs at least 3 timestamps and raises a ValueError otherwise
        return None
    inferred_freq = pd.infer_freq(event_starts)
    if inferred_freq is None:
        # pd.Timedelta(None) would yield NaT rather than the documented None
        return None
    try:
        # infer_freq leaves out a multiple of 1 (it returns e.g. "H" rather than "1H"),
        # which pd.Timedelta refuses to parse, so convert via a DateOffset instead
        return pd.Timedelta(pd.tseries.frequencies.to_offset(inferred_freq))
    except (ValueError, TypeError):
        # Offsets that pd.Timedelta cannot convert (e.g. month-based ones) have no fixed duration
        return None
|
||
|
||
class BeliefsDataFrame(pd.DataFrame): | ||
"""Beliefs about a sensor. | ||
|
@@ -1053,6 +1062,15 @@ def set_event_value_from_source( | |
) | ||
) | ||
|
||
@property
def event_frequency(self) -> Optional[timedelta]:
    """Duration between observations of events.

    The frequency is inferred from the unique values of the "event_start" index level.

    :returns: a timedelta for regularly spaced observations
              None for irregularly spaced observations, which includes the cases in which
              no frequency can be inferred (fewer than 3 unique event starts)
              or in which the inferred frequency has no fixed duration
    """
    unique_event_starts = self.index.unique("event_start")
    # pd.infer_freq raises a ValueError when given fewer than 3 timestamps
    if len(unique_event_starts) < 3:
        return None
    freq_str = pd.infer_freq(unique_event_starts)
    # Guard against pd.Timedelta(None), which yields NaT rather than the documented None
    if freq_str is None:
        return None
    try:
        # A frequency string without a leading number (e.g. "H" or "D") cannot be parsed
        # by pd.Timedelta directly, so it is first turned into a DateOffset
        return pd.Timedelta(pd.tseries.frequencies.to_offset(freq_str))
    except (ValueError, TypeError):
        # Offsets that pd.Timedelta cannot convert (e.g. month-based ones) have no fixed duration
        return None
|
||
@property | ||
def knowledge_times(self) -> pd.DatetimeIndex: | ||
return pd.DatetimeIndex( | ||
|
@@ -1416,6 +1434,13 @@ def resample_events( | |
return self | ||
df = self | ||
|
||
# Resample instantaneous sensors | ||
# The event resolution stays zero, but the event frequency is updated | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Actually, the event resolution doesn't stay zero in all cases, now, right? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It does here, because the resampling method defaults to `asfreq`. |
||
if df.event_resolution == timedelta(0): | ||
if df.lineage.number_of_events != len(df): | ||
raise NotImplementedError("Please file a GitHub ticket.") | ||
return belief_utils.resample_instantaneous_events(df, event_resolution) | ||
|
||
belief_timing_col = ( | ||
"belief_time" if "belief_time" in df.index.names else "belief_horizon" | ||
) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,9 +1,12 @@ | ||
from __future__ import annotations | ||
|
||
import warnings | ||
from datetime import datetime, timedelta | ||
from typing import List, Optional, Union | ||
|
||
import numpy as np | ||
import pandas as pd | ||
import pytz | ||
from packaging import version | ||
from pandas.core.groupby import DataFrameGroupBy | ||
|
||
|
@@ -283,7 +286,9 @@ def join_beliefs( | |
if output_resolution > input_resolution: | ||
|
||
# Create new BeliefsDataFrame with downsampled event_start | ||
if output_resolution % input_resolution != timedelta(0): | ||
if input_resolution == timedelta( | ||
0 | ||
) or output_resolution % input_resolution != timedelta(0): | ||
raise NotImplementedError( | ||
"Cannot downsample from resolution %s to %s." | ||
% (input_resolution, output_resolution) | ||
|
@@ -778,6 +783,111 @@ def extreme_timedeltas_not_equal( | |
return td_a != td_b | ||
|
||
|
||
def resample_instantaneous_events(
    df: pd.DataFrame | "classes.BeliefsDataFrame",
    resolution: timedelta,
    method: str | None = None,
    dropna: bool = True,
) -> pd.DataFrame | "classes.BeliefsDataFrame":
    """Resample data representing instantaneous events.

    Updates the event frequency of the resulting data frame, and possibly also its event resolution.
    The event resolution is only updated if the resampling method computes a characteristic of a period of events,
    like 'mean' or 'first'.

    Note that, for resolutions over 1 hour, the data frequency may not turn out to be constant per se.
    This is due to DST transitions:
    - The duration between events is typically longer for the fall DST transition.
    - The duration between events is typically shorter for the spring DST transition.
    This is done to keep the data frequency in step with midnight in the sensor's timezone.

    NB This function does not check that the data actually describes instantaneous events,
    nor that the "event_start" level is a timezone-aware DatetimeIndex (the code calls tz_convert on it).
    Callers are expected to guarantee both (a follow-up was opened as #124 during review).

    :param df:         frame with an "event_start" index level (possibly among other index levels)
    :param resolution: the period with which to resample
    :param method:     name of a pandas resampler method; defaults to "asfreq"
    :param dropna:     if True (the default), rows containing NaN values are dropped from the result
    :returns:          the resampled frame, with the original index levels restored
    :raises NotImplementedError: if the resampling method is not (yet) supported
    :raises ValueError:          if the original timezone cannot be determined
    """
    from datetime import timezone  # local import, to leave the module-level imports untouched

    # These methods derive properties of a period of events.
    # Therefore, the event resolution is updated.
    # The methods are typically used for downsampling.
    aggregating_methods = (
        "mean",
        "max",
        "min",
        "median",
        "count",
        "nunique",
        "first",
        "last",
        "ohlc",
        "prod",
        "size",
        "sem",
        "std",
        "sum",
        "var",
        "quantile",
    )
    # These methods derive intermediate events.
    # Therefore, the event resolution is unaffected.
    # The methods are typically used for upsampling.
    interpolating_methods = (
        "asfreq",
        "interpolate",
        "ffill",
        "bfill",
        "pad",
        "backfill",
        "nearest",
    )

    # Default resampling method for instantaneous sensors
    if method is None:
        method = "asfreq"

    # Fail fast on unsupported methods, before doing any resampling work
    if method not in aggregating_methods and method not in interpolating_methods:
        raise NotImplementedError(
            f"Please file a GitHub ticket for timely-beliefs to support the '{method}' method."
        )

    # Use event_start as the only index level
    index_names = df.index.names
    df = df.reset_index().set_index("event_start")

    # Determine the timezone to convert back to (this does not depend on the offset being processed)
    if isinstance(df.index, pd.DatetimeIndex) and df.index.tz is not None:
        original_timezone = df.index.tz
    elif isinstance(df, classes.BeliefsDataFrame):
        # As a backup, use the original timezone from the BeliefsDataFrame's sensor
        original_timezone = df.sensor.timezone
    else:
        raise ValueError("Missing original timezone.")

    # Resample the data in each unique fixed timezone offset that belongs to the given IANA timezone, then recombine
    unique_offsets = df.index.map(lambda x: x.utcoffset()).unique()
    resampled_df_offsets = []
    for offset in unique_offsets:
        df_offset = df.copy()
        # Convert all the data to given timezone offset
        # (total_seconds keeps the sign of offsets west of UTC, unlike the seconds attribute,
        # which is always non-negative because negative timedeltas are normalized to days=-1)
        df_offset.index = df.index.tz_convert(
            timezone(timedelta(seconds=offset.total_seconds()))
        )
        # Resample all the data in the given timezone offset, using the given method
        resampled_df_offset = getattr(df_offset.resample(resolution), method)()
        # Convert back to the original timezone
        resampled_df_timezone = resampled_df_offset.tz_convert(original_timezone)
        # See which resampled rows still fall in the given offset, in this timezone
        resampled_df_timezone = resampled_df_timezone[
            resampled_df_timezone.index.map(lambda x: x.utcoffset()) == offset
        ]
        resampled_df_offsets.append(resampled_df_timezone)
    resampled_df = pd.concat(resampled_df_offsets).sort_index()

    # If possible, infer missing frequency
    if resampled_df.index.freq is None and len(resampled_df) > 2:
        resampled_df.index.freq = pd.infer_freq(resampled_df.index)

    # Restore the original index levels
    resampled_df = resampled_df.reset_index().set_index(index_names)

    # Only methods that summarize a period of events change the event resolution
    if method in aggregating_methods:
        resampled_df.event_resolution = resolution

    if dropna:
        return resampled_df.dropna()
    return resampled_df
|
||
|
||
def meta_repr( | ||
tb_structure: Union["classes.BeliefsDataFrame", "classes.BeliefsSeries"] | ||
) -> str: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I can't explain why your new function only applies to instantaneous sensors. Can you add that?
And why is that the one case in which we take so much care of DST transitions? (This `resample_events` function here has two other cases.)
Note: The NB about `keep_only_most_recent_belief=True` applies to only one case; the reader might assume it's about all of them.