diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
index 6c4d65f0d8..29516aee22 100644
--- a/docs/source/release_notes.rst
+++ b/docs/source/release_notes.rst
@@ -2,6 +2,8 @@ Release Notes
 -------------
 **Future Releases**
     * Enhancements
+        * Improved decomposer ``determine_periodicity`` functionality for better period guesses :pr:`3912`
+        * Added ``dates_needed_for_prediction`` for time series pipelines :pr:`3906`
         * Added ``RFClassifierRFESelector`` and ``RFRegressorRFESelector`` components for feature selection using recursive feature elimination :pr:`3934`
     * Fixes
         * Fixed `set_period()` not updating decomposer parameters :pr:`3932`
@@ -25,7 +27,6 @@ Release Notes
         * Added the ability to retrieve prediction intervals for estimators that support time series regression :pr:`3876`
         * Added utils to handle the logic for threshold tuning objective and resplitting data :pr:`3888`
         * Integrated ``OrdinalEncoder`` into AutoMLSearch :pr:`3765`
-        * Added ``dates_needed_for_prediction`` for time series pipelines :pr:`3906`
     * Fixes
         * Fixed ARIMA not accounting for gap in prediction from end of training data :pr:`3884`
         * Fixed ``DefaultAlgorithm`` adding an extra ``OneHotEncoder`` when a categorical column is not selected :pr:`3914`
diff --git a/docs/source/user_guide/timeseries.ipynb b/docs/source/user_guide/timeseries.ipynb
index b390aedb61..ec23ddbd8b 100644
--- a/docs/source/user_guide/timeseries.ipynb
+++ b/docs/source/user_guide/timeseries.ipynb
@@ -187,7 +187,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "We can use the `TimeSeriesRegularizer` component to restore the missing and NaN DateTime values we've created in our data. This component is designed to infer the proper frequency using [Woodwork's \"infer_frequency\"](https://woodwork.alteryx.com/en/stable/generated/woodwork.statistics_utils.infer_frequency.html) function and generate a new DataFrame that follows it. In order to maintain as much original information from the input data as possible, all rows with completely correct times are ported over into this new DataFrame. If there are any rows that have the same timestamp as another, these will be dropped. The first occurrence of a date or time maintains priority. If there are any values that don’t quite line up with the inferred frequency they will be shifted to any closely missing datetimes, or dropped if there are none nearby."
+    "We can use the `TimeSeriesRegularizer` component to restore the missing and NaN DateTime values we've created in our data. This component is designed to infer the proper frequency using [Woodwork's \"infer_frequency\"](https://woodwork.alteryx.com/en/stable/generated/woodwork.statistics_utils.infer_frequency.html) function and generate a new DataFrame that follows it. In order to maintain as much original information from the input data as possible, all rows with completely correct times are ported over into this new DataFrame. If there are any rows that have the same timestamp as another, these will be dropped. The first occurrence of a date or time maintains priority. If there are any values that don't quite line up with the inferred frequency they will be shifted to any closely missing datetimes, or dropped if there are none nearby."
    ]
   },
   {
@@ -229,7 +229,7 @@
    "source": [
     "### Time Series Imputer\n",
     "\n",
-    "We could easily use the `Imputer` and `TargetImputer` components to fill in the missing gaps in our `X` and `y` data. However, these tools are not built for time series problems. Their supported imputation strategies of \"mean\", \"most_frequent\", or similar are all static. They don’t account for the passing of time, and how neighboring data points may have more predictive power than simply taking the average. The `TimeSeriesImputer` solves this problem by offering three different imputation strategies:\n",
+    "We could easily use the `Imputer` and `TargetImputer` components to fill in the missing gaps in our `X` and `y` data. However, these tools are not built for time series problems. Their supported imputation strategies of \"mean\", \"most_frequent\", or similar are all static. They don't account for the passing of time, and how neighboring data points may have more predictive power than simply taking the average. The `TimeSeriesImputer` solves this problem by offering three different imputation strategies:\n",
     "- \"forwards_fill\": fills in any NaN values with the same value as found in the previous non-NaN cell.\n",
     "- \"backwards_fill\": fills in any NaN values with the same value as found in the next non-NaN cell.\n",
     "- \"interpolate\": (numeric columns only) fills in any NaN values by linearly interpolating the values of the previous and next non-NaN cells."
@@ -386,7 +386,7 @@
    "pdc.fit(X_train_time, y_train_time)\n",
    "assert pdc.period == 7\n",
    "pdc.set_period(X_train_time, y_train_time)\n",
-   "assert 363 < pdc.period < 368"
+   "assert 350 < pdc.period < 370"
   ]
  },
 {
diff --git a/evalml/pipelines/components/transformers/preprocessing/decomposer.py b/evalml/pipelines/components/transformers/preprocessing/decomposer.py
index 8466d01bc3..19c96b0119 100644
--- a/evalml/pipelines/components/transformers/preprocessing/decomposer.py
+++ b/evalml/pipelines/components/transformers/preprocessing/decomposer.py
@@ -26,6 +26,9 @@ class Decomposer(Transformer):
         seasonal_smoother (int): The seasonal smoothing parameter for STLDecomposer, not used for PolynomialDecomposer.
         time_index (str) : The column name of the feature matrix (X) that the datetime information
             should be pulled from.
+        acf_threshold (float) : The threshold for the autocorrelation function to determine the period. Any values below
+            the threshold are set to 0 and will not be considered for the period. Defaults to 0.01.
+        rel_max_order (int) : The order of the relative maximum to determine the period, i.e. the number of points on each side compared. Defaults to 5.
     """
 
     name = "Decomposer"
@@ -43,6 +46,8 @@ def __init__(
         period: int = -1,
         seasonal_smoother: int = 7,
         time_index: str = None,
+        acf_threshold: float = 0.01,
+        rel_max_order: int = 5,
         **kwargs,
     ):
         degree = self._raise_typeerror_if_not_int("degree", degree)
@@ -57,6 +62,8 @@ def __init__(
             "period": period,
             "seasonal_smoother": self.seasonal_smoother,
             "time_index": time_index,
+            "acf_threshold": acf_threshold,
+            "rel_max_order": rel_max_order,
         }
         parameters.update(kwargs)
         super().__init__(
@@ -126,75 +133,62 @@ def determine_periodicity(
         self,
         X: pd.DataFrame,
         y: pd.Series,
-        method: str = "autocorrelation",
     ):
-        """Function that uses autocorrelative methods to determine the first, signficant period of the seasonal signal.
+        """Function that uses autocorrelative methods to determine the most significant period of the seasonal signal.
 
         Args:
             X (pandas.DataFrame): The feature data of the time series problem.
             y (pandas.Series): The target data of a time series problem.
-            method (str): Either "autocorrelation" or "partial-autocorrelation". The method by which to determine the
-                first period of the seasonal part of the target signal. "partial-autocorrelation" should currently not
"partial-autocorrelation" should currently not - be used. Defaults to "autocorrelation". Returns: - (list[int]): The integer numbers of entries in time series data over which the seasonal part of the target data + int: The integer number of entries in time series data over which the seasonal part of the target data repeats. If the time series data is in days, then this is the number of days that it takes the target's seasonal signal to repeat. Note: the target data can contain multiple seasonal signals. This function - will only return the first, and thus, shortest period. E.g. if the target has both weekly and yearly - seasonality, the function will only return "7" and not return "365". If no period is detected, returns [None]. + will only return the stronger. E.g. if the target has both weekly and yearly seasonality, the function + may return either "7" or "365", depending on which seasonality is more strongly autocorrelated. If no + period is detected, returns None. """ def _get_rel_max_from_acf(y): """Determines the relative maxima of the target's autocorrelation.""" + acf_threshold = self.parameters.get("acf_threshold", 0.01) + rel_max_order = self.parameters.get("rel_max_order", 5) + acf = sm.tsa.acf(y, nlags=np.maximum(400, len(y))) - filter_acf = [acf[i] if (acf[i] > 0) else 0 for i in range(len(acf))] + # Filter out small values to avoid picking up noise + filter_acf = [ + acf[i] if (acf[i] > acf_threshold) else 0 for i in range(len(acf)) + ] rel_max = argrelextrema( np.array(filter_acf), np.greater, - order=5, # considers 5 points on either side to determine rel max + order=rel_max_order, # considers `order` points on either side to determine rel max )[0] + if len(rel_max) == 0: + return None max_acfs = [acf[i] for i in rel_max] - if len(max_acfs) > 0: - rel_max = np.array([filter_acf.index(max(max_acfs))]) - else: - rel_max = [] - return rel_max - - def _get_rel_max_from_pacf(y): - """Determines the relative maxima of the target's partial autocorrelation.""" - pacf = sm.tsa.pacf(y) - return argrelextrema(pacf, np.greater)[0] + return rel_max[np.argmax(max_acfs)] def _detrend_on_fly(X, y): - """Uses the underlying decomposer to determine the target's trend and remove it.""" - self.fit(X, y) - res = self.get_trend_dataframe(X, y) - y_time_index = self._set_time_index(X, y) - y_detrended = y_time_index - res[0]["trend"] - return y_detrended - - if method == "autocorrelation": - _get_rel_max = _get_rel_max_from_acf - elif method == "partial-autocorrelation": - self.logger.warning( - "Partial autocorrelations are not currently guaranteed to be accurate due to the need for continuing " - "algorithmic work and should not be used at this time.", - ) - _get_rel_max = _get_rel_max_from_pacf + """Uses a moving average to determine the target's trend and remove it.""" + # A larger moving average will be less likely to remove the seasonal signal + # but we need to make sure we're not passing in a window that's larger than the data + moving_avg = min(51, len(y) // 3) + y_trend_estimate = y.rolling(moving_avg).mean().dropna() + y_detrended = y - y_trend_estimate + return round( + y_detrended.dropna(), + 10, + ) # round to 10 decimal places to avoid floating point errors # Make the data more stationary by detrending y_detrended = _detrend_on_fly(X, y) - relative_maxima = _get_rel_max(y_detrended) - self.logger.info( - f"Decomposer discovered {len(relative_maxima)} possible periods.", - ) + relative_maxima = _get_rel_max_from_acf(y_detrended) - if len(relative_maxima) == 0: + if relative_maxima 
             self.logger.warning("No periodic signal could be detected in target data.")
-            relative_maxima = [None]
-        return relative_maxima[0]
+        return relative_maxima
 
     def set_period(self, X: pd.DataFrame, y: pd.Series):
         """Function to set the component's seasonal period based on the target's seasonality.
diff --git a/evalml/tests/component_tests/decomposer_tests/test_decomposer.py b/evalml/tests/component_tests/decomposer_tests/test_decomposer.py
index e8884e69e8..d47b3ae688 100644
--- a/evalml/tests/component_tests/decomposer_tests/test_decomposer.py
+++ b/evalml/tests/component_tests/decomposer_tests/test_decomposer.py
@@ -443,27 +443,19 @@ def test_decomposer_set_period(decomposer_child_class, period, generate_seasonal
     "decomposer_child_class",
     decomposer_list,
 )
-@pytest.mark.parametrize(
-    "periodicity_determination_method",
-    [
-        "autocorrelation",
-        pytest.param(
-            "partial-autocorrelation",
-            marks=pytest.mark.xfail(reason="Partial Autocorrelation not working yet."),
-        ),
-    ],
-)
 @pytest.mark.parametrize("decomposer_picked_correct_degree", [True, False])
 @pytest.mark.parametrize(
     "synthetic_data,trend_degree,period",
-    [*itertools.product(["synthetic"], [1, 2, 3], [7, 30, 365]), ("real", 1, 365)],
+    [
+        *itertools.product(["synthetic"], [1, 2, 3], [None, 7, 30, 365]),
+        ("real", 1, 365),
+    ],
 )
 def test_decomposer_determine_periodicity(
     decomposer_child_class,
     period,
     trend_degree,
     decomposer_picked_correct_degree,
-    periodicity_determination_method,
     synthetic_data,
     generate_seasonal_data,
 ):
@@ -478,16 +470,11 @@ def test_decomposer_determine_periodicity(
         trend_degree = 1 if trend_degree in [2, 3] else 2
 
     dec = decomposer_child_class(degree=trend_degree, period=period)
-    ac = dec.determine_periodicity(X, y, method=periodicity_determination_method)
+    ac = dec.determine_periodicity(X, y)
 
-    # There's one flaky test case, but only in GitHub CI.
-    # Will file an issue to investigate why it's different in CI.
-    if (
-        synthetic_data != "synthetic"
-        and trend_degree != 3
-        and period != 365
-        and not isinstance(decomposer_child_class, STLDecomposer)
-    ):
+    if period is None:
+        assert ac is None
+    else:
         assert 0.95 * period <= ac <= 1.05 * period
diff --git a/evalml/tests/component_tests/decomposer_tests/test_polynomial_decomposer.py b/evalml/tests/component_tests/decomposer_tests/test_polynomial_decomposer.py
index a0fccf7423..c069479649 100644
--- a/evalml/tests/component_tests/decomposer_tests/test_polynomial_decomposer.py
+++ b/evalml/tests/component_tests/decomposer_tests/test_polynomial_decomposer.py
@@ -18,6 +18,8 @@ def test_polynomial_decomposer_init():
         "period": -1,
         "seasonal_smoother": 7,
         "time_index": "dates",
+        "acf_threshold": 0.01,
+        "rel_max_order": 5,
     }
diff --git a/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py b/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py
index ea91fbe315..9c9a32fce0 100644
--- a/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py
+++ b/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py
@@ -18,6 +18,8 @@ def test_stl_decomposer_init():
         "period": None,
         "seasonal_smoother": 7,
         "time_index": "dates",
+        "acf_threshold": 0.01,
+        "rel_max_order": 5,
     }
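
As a companion to the patch above, here is a minimal, self-contained sketch of the period-detection approach it implements: detrend with a moving average, zero out autocorrelation values at or below a threshold, find the relative maxima of the filtered ACF, and keep the lag with the strongest autocorrelation. Note that `find_period` is a hypothetical helper written for illustration, not part of evalml's API, and it caps `nlags` at the series length rather than reproducing the patch's `np.maximum(400, len(y))` call.

import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy.signal import argrelextrema


def find_period(y, acf_threshold=0.01, rel_max_order=5):
    """Estimate the dominant seasonal period of `y`, or return None."""
    # Remove the trend with a moving average so the ACF reflects seasonality,
    # capping the window so it never exceeds a third of the series.
    window = min(51, len(y) // 3)
    y_detrended = (y - y.rolling(window).mean()).dropna()

    # Autocorrelation at each lag; values at or below the threshold are
    # zeroed out so noise does not produce spurious relative maxima.
    nlags = min(400, len(y_detrended) - 1)
    acf = sm.tsa.acf(y_detrended, nlags=nlags)
    filtered = np.where(acf > acf_threshold, acf, 0)

    # Relative maxima of the filtered ACF (comparing `rel_max_order` points
    # on each side) are the candidate periods.
    candidates = argrelextrema(filtered, np.greater, order=rel_max_order)[0]
    if len(candidates) == 0:
        return None
    # Keep the candidate lag with the strongest autocorrelation.
    return int(candidates[np.argmax(acf[candidates])])


# Usage on a synthetic daily series with weekly seasonality and a linear trend:
y = pd.Series(np.sin(np.arange(500) * 2 * np.pi / 7) + 0.05 * np.arange(500))
print(find_period(y))  # expect a value at or near 7

Taking the argmax of the ACF over all candidate lags, rather than the first relative maximum, is what lets a strong yearly signal win over a weaker weekly one. That is the behavior change described in the updated `determine_periodicity` docstring and exercised by the widened notebook assertion (350 < period < 370).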