From 97ff8a58d67d9e979387da8315611f6975890c16 Mon Sep 17 00:00:00 2001
From: Ihssan
Date: Wed, 26 Dec 2018 21:06:40 +0200
Subject: [PATCH 1/5] wrote methods for dynamic error thresholding

---
 mlprimitives/timeseries_errors.py | 244 ++++++++++++++++++++++++++++++
 1 file changed, 244 insertions(+)
 create mode 100644 mlprimitives/timeseries_errors.py

diff --git a/mlprimitives/timeseries_errors.py b/mlprimitives/timeseries_errors.py
new file mode 100644
index 00000000..b79e2e22
--- /dev/null
+++ b/mlprimitives/timeseries_errors.py
@@ -0,0 +1,244 @@
+"""
+Methods to do dynamic error thresholding on timeseries data.
+Implementation inspired by: https://arxiv.org/pdf/1802.04431.pdf
+"""
+
+import numpy as np
+import more_itertools as mit
+
+def get_forecast_error(y_hat, y_true, window_size=5, batch_size=30, smoothing_percent=0.05, smoothed=True):
+    """
+    Calculates the forecasting error for two arrays of data. If smoothed errors are desired,
+    applies a moving average to the residuals.
+    Args:
+        y_hat (list): forecasted values. len(y_hat)==len(y_true).
+        y_true (list): true values. len(y_hat)==len(y_true).
+        window_size (int): number of batches per window; controls the smoothing span.
+        batch_size (int): number of values per batch.
+        smoothing_percent (float): fraction of a window used as the smoothing half-width.
+        smoothed (bool): whether the returned errors should be smoothed.
+    Returns:
+        (list): error residuals. Smoothed if specified by user.
+    """
+    errors = [abs(y_h - y_t) for y_h, y_t in zip(y_hat, y_true)]
+
+    if not smoothed:
+        return errors
+
+    historical_error_window = int(window_size * batch_size * smoothing_percent)
+    moving_avg = []
+    for i in range(len(errors)):
+        left_window = i - historical_error_window
+        right_window = i + historical_error_window + 1
+        if left_window < 0:
+            left_window = 0
+        if right_window > len(errors):
+            right_window = len(errors)
+        moving_avg.append(np.mean(errors[left_window:right_window]))
+
+    return moving_avg
+
+def extract_anomalies(y_true, smoothed_errors, window_size, batch_size, error_buffer):
+    """
+    Extracts anomalies from the errors.
+    Args:
+        y_true (list): true values.
+        smoothed_errors (list): smoothed error residuals.
+        window_size (int): number of batches per window.
+        batch_size (int): number of values per batch.
+        error_buffer (int): number of indices around an anomaly that are also flagged.
+    Returns: (tuple) anomaly sequences and a score for each sequence.
+    """
+    if len(y_true) <= batch_size * window_size:
+        raise ValueError("Window size (%s) larger than y_true (len=%s)." % (window_size * batch_size, len(y_true)))
+    num_windows = (len(y_true) - (batch_size * window_size)) // batch_size
+
+    anomalies_indices = []
+
+
+    for i in range(num_windows + 1):
+        prev_index = i * batch_size
+        curr_index = (window_size * batch_size) + (i * batch_size)
+
+        if i == num_windows:
+            curr_index = len(y_true)
+
+
+        window_smoothed_errors = smoothed_errors[prev_index:curr_index]
+        window_y_true = y_true[prev_index:curr_index]
+
+        epsilon, sd_threshold = compute_threshold(window_smoothed_errors, error_buffer)
+        window_anom_indices = get_anomalies(window_smoothed_errors, window_y_true, sd_threshold, i, anomalies_indices, len(y_true))
+
+        # get anomalies from inverse of smoothed errors
+        # This was done in the implementation of NASA paper but
+        # wasn't referenced in the paper
+
+        # we get the inverse by flipping around the mean
+        smoothed_errors_inv = [mu + (mu-e) for e in window_smoothed_errors]
+        epsilon_inv, sd_inv = compute_threshold(smoothed_errors_inv, error_buffer)
+        inv_anom_indices = get_anomalies(smoothed_errors_inv, window_y_true, sd_inv, i, anomalies_indices, len(y_true))
+
+        anomalies_indices = list(set(anomalies_indices + inv_anom_indices))
+
+        anomalies_indices.extend([i_a + i*batch_size for i_a in window_anom_indices])
+
+    # group anomalies
+    anomalies_indices = sorted(list(set(anomalies_indices)))
+    anomalies_groups = [list(group) for group in mit.consecutive_groups(anomalies_indices)]
+    anomaly_sequences = [(g[0], g[-1]) for g in anomalies_groups if not g[0] == g[-1]]
+
+    # generate "scores" for anomalies based on the max distance from epsilon for each sequence
+    anomalies_scores = []
+    for e_seq in anomaly_sequences:
+        denominator = np.mean(smoothed_errors) + np.std(smoothed_errors)
+        score = max([abs(smoothed_errors[x] - epsilon) / denominator
+                    for x in range(e_seq[0], e_seq[1])])
+        anomalies_scores.append(score)
+
+    return anomaly_sequences, anomalies_scores
+
+def compute_threshold(smoothed_errors, error_buffer, sd_limit=12.0):
+    """
+    Helper method for extract_anomalies().
+    Calculates the epsilon (threshold) for anomalies.
+    """
+    mu = np.mean(smoothed_errors)
+    sigma = np.std(smoothed_errors)
+
+    max_epsilon = 0
+    sigma_threshold = sd_limit
+
+    # The threshold is determined dynamically by testing multiple Zs.
+    # z is drawn from an ordered set of positive values representing the
+    # number of standard deviations above mean(smoothed_errors)
+
+    # here we iterate in increments of 0.5 on the range that the NASA paper found to be good
+    for z in np.arange(2.5, sd_limit, 0.5):
+        epsilon = mu + (sigma * z)
+        below_epsilon, below_indices, above_epsilon = [], [], []
+
+        for i in range(len(smoothed_errors)):
+            e = smoothed_errors[i]
+            if e < epsilon:
+                # save to compute delta mean and delta std
+                # these are important for epsilon calculation
+                below_epsilon.append(e)
+                below_indices.append(i)
+            if e > epsilon:
+                # above_epsilon values are anomalies
+                for j in range(0, error_buffer):
+                    if (i + j) not in above_epsilon and (i + j) < len(smoothed_errors):
+                        above_epsilon.append(i + j)
+                    if (i - j) not in above_epsilon and (i - j) >= 0:
+                        above_epsilon.append(i - j)
+
+        if len(above_epsilon) == 0:
+            continue
+
+        # generate sequences
+        above_epsilon = sorted(list(set(above_epsilon)))
+        groups = [list(group) for group in mit.consecutive_groups(above_epsilon)]
+        above_sequences = [(g[0], g[-1]) for g in groups if not g[0] == g[-1]]
+
+        mean_perc_decrease = (mu - np.mean(pruned_errors)) / mu
+        sd_perc_decrease = (sigma - np.std(pruned_errors)) / sd
+        epsilon = (mean_perc_decrease + sd_perc_decrease) / (len(above_sequences)**2 + len(anomaly_indices))
+
+        # update the largest epsilon we've seen so far
+        if epsilon > max_epsilon:
+            sd_threshold = z
+            max_epsilon = epsilon
+
+    # sd_threshold can be multiplied by sigma to get epsilon
+    return max_epsilon, sd_threshold
+
+
+def get_anomalies(smoothed_errors, y_true, z):
+    """
+    Helper method to get anomalies.
+    """
+
+    mu = np.mean(smoothed_errors)
+    sigma = np.std(smoothed_errors)
+
+    epsilon = mu + z*sigma
+
+    # compare to epsilon
+    errors_seq, anomaly_indices, max_error_below_e = group_consecutive_anomalies(smoothed_errors, epsilon, y_true, error_buffer, window, all_anomalies)
+
+    if len(errors_seq) > 0:
+        anomaly_indices = prune_anomalies(errors_seq, smoothed_errors, non_anomaly_max, anomaly_indices)
+    else:
+        print("No anomalies found.")
+
+    return anomaly_indices
+
+def group_consecutive_anomalies(smoothed_errors, epsilon, y_true, error_buffer, window, all_anomalies, batch_size=30):
+    channel_std = np.std(y_true)
+    upper_percentile, lower_percentile = np.percentile(y_true, [95, 5]
+    accepted_range = upper_percentile - lower_percentile
+
+    anomaly_indices = []
+    max_error_below_e = 0
+
+    for i in range(len(smoothed_errors)):
+        if smoothed_errors[i] <= epsilon or smoothed_errors[i] <= 0.05*accepted_range:
+            # not an anomaly
+            continue
+        for j in range(error_buffer):
+            if (i + j) < len(smoothed_errors) and (i + j) not in anomaly_indices:
+                anomaly_indices.append(i + j)
+            if (i - j) >= 0 and (i - j) not in anomaly_indices:
+                anomaly_indices.append(i - j)
+
+    # get all the errors that are below epsilon and which weren't identified as anomalies to process them
+    for i in range(len(smoothed_errors)):
+        adjusted_index = i + (window - 1) * batch_size
+        if smoothed_errors[i] > max_error_below_e and adjusted_index not in all_anomalies and i not in anomaly_indices:
+            max_error_below_e = smoothed_errors[i]
+
+    # group anomalies into continuous sequences
+    anomaly_indices = sorted(list(set(anomaly_indices)))
+    groups = [list(group) for group in mit.consecutive_groups(anomaly_indices)]
+    E_seq = [(g[0], g[-1]) for g in group if g[0] != g[-1]]
+
+    return E_seq, anomaly_indices, max_error_below_e
+
+def prune_anomalies(E_seq, smoothed_errors, max_error_below_e, anomaly_indices):
+    """ Helper method that prunes anomalous sequences whose max error does not
+    show a minimum percent decrease from the next largest error.
+    """
+    # min accepted perc decrease btwn max errors in anomalous sequences
+    MIN_PERCENT_DECREASE = 0.05
+    E_seq_max, smoothed_errors_max = [], []
+
+    for error_seq in E_seq:
+        sliced_errors = smoothed_errors[error_seq[0]:error_seq[1]]
+        E_seq_max.append(max(sliced_errors))
+        smoothed_errors_max.append(max(sliced_errors))
+
+    smoothed_errors_max.sort(reverse=True)
+
+    if max_error_below_e > 0:
+        smoothed_errors_max.append(max_error_below_e)
+    indices_remove = []
+
+    for i in range(len(smoothed_errors_max)):
+        if i < len(smoothed_errors_max) - 1:
+            delta = smoothed_errors_max[i] - smoothed_errors_max[i + 1]
+            perc_change = delta / smoothed_errors_max[i]
+            if perc_change < MIN_PERCENT_DECREASE:
+                indices_remove.append(E_seq_max.index(smoothed_errors_max[i]))
+
+    for index in sorted(indices_remove, reverse = True):
+        del E_seq[index]
+
+    pruned_indices = []
+
+    for i in anomaly_indices:
+        for error_seq in E_seq:
+            if i >= error_seq[0] and i <= error_seq[1]:
+                pruned_indices.append(i)
+
+    return pruned_indices

From 715a4cbd11742c070849777c5539f110498902e5 Mon Sep 17 00:00:00 2001
From: Ihssan
Date: Wed, 26 Dec 2018 22:20:54 +0200
Subject: [PATCH 2/5] Issue 48: created json primitive files

---
 ...s.timeseries_errors.extract_anomalies.json | 62 +++++++++++++++++
 ....timeseries_errors.get_forecast_error.json | 66 +++++++++++++++++++
 mlprimitives/timeseries_errors.py             | 66 ++++++++++---------
 3 files changed, 163 insertions(+), 31 deletions(-)
 create mode 100644 mlblocks_primitives/mlprimitives.timeseries_errors.extract_anomalies.json
 create mode 100644 mlblocks_primitives/mlprimitives.timeseries_errors.get_forecast_error.json

diff --git a/mlblocks_primitives/mlprimitives.timeseries_errors.extract_anomalies.json b/mlblocks_primitives/mlprimitives.timeseries_errors.extract_anomalies.json
new file mode 100644
index 00000000..6848c154
--- /dev/null
+++ b/mlblocks_primitives/mlprimitives.timeseries_errors.extract_anomalies.json
@@ -0,0 +1,62 @@
+{
+    "name": "mlprimitives.timeseries_errors.extract_anomalies",
+    "author": "Ihssan Tinawi",
+    "description": "mlprimitives.timeseries_errors.extract_anomalies",
+    "classifiers": {
+        "type": "preprocessor",
+        "subtype": "feature_extractor"
+    },
+    "modalities": ["timeseries"],
+    "primitive": "mlprimitives.timeseries_errors.extract_anomalies",
+    "produce": {
+        "args": [
+            {
+                "name": "y_true",
+                "type": "np.array"
+            },
+            {
+                "name": "smoothed_errors",
+                "type": "list"
+            },
+            {
+                "name": "window_size",
+                "type": "int"
+            },
+            {
+                "name": "batch_size",
+                "type": "int"
+            },
+            {
+                "name": "error_buffer",
+                "type": "int"
+            }
+        ],
+        "output": [
+            {
+                "name": "anomaly_sequences",
+                "type": "list"
+            },
+            {
+                "name": "anomalies_scores",
+                "type": "list"
+            }
+        ]
+    },
+    "hyperparameters": {
+        "fixed": {
+            "window_size": {
+                "type": "int",
+                "default": 50
+            },
+            "batch_size": {
+                "type": "int",
+                "default": 200
+            },
+            "error_buffer": {
+                "type": "int",
+                "default": 10
+            }
+        },
+        "tunable": {}
+    }
+}
diff --git a/mlblocks_primitives/mlprimitives.timeseries_errors.get_forecast_error.json b/mlblocks_primitives/mlprimitives.timeseries_errors.get_forecast_error.json
new file mode 100644
index 00000000..eb176fbf
--- /dev/null
+++ b/mlblocks_primitives/mlprimitives.timeseries_errors.get_forecast_error.json
@@ -0,0 +1,66 @@
+{
+    "name": "mlprimitives.timeseries_errors.get_forecast_errors",
+    "author": "Ihssan Tinawi",
+    "description": "mlprimitives.timeseries_errors.get_forecast_errors",
+    "classifiers": {
+        "type": "preprocessor",
+        "subtype": "feature_extractor"
+    },
+    "modalities": ["timeseries"],
+    "primitive": "mlprimitives.timeseries_errors.get_forecast_errors",
+    "produce": {
+        "args": [
+            {
+                "name": "y_hat",
+                "type": "np.array"
+            },
+            {
+                "name": "y_true",
+                "type": "np.array"
+            },
+            {
+                "name": "window_size",
+                "type": "int"
+            },
+            {
+                "name": "batch_size",
+                "type": "int"
+            },
+            {
+                "name": "smoothing_percent",
+                "type": "float"
+            },
+            {
+                "name": "smoothed",
+                "type": "bool"
+            }
+        ],
+        "output": [
+            {
+                "name": "moving_avg",
+                "type": "list"
+            }
+        ]
+    },
+    "hyperparameters": {
+        "fixed": {
+            "window_size": {
+                "type": "int",
+                "default": 5
+            },
+            "batch_size": {
+                "type": "int",
+                "default": 30
+            },
+            "smoothing_percent": {
+                "type": "float",
+                "default": 0.05
+            },
+            "smoothed": {
+                "type": "bool",
+                "default": True
+            }
+        },
+        "tunable": {}
+    }
+}
diff --git a/mlprimitives/timeseries_errors.py b/mlprimitives/timeseries_errors.py
index b79e2e22..52b76dfe 100644
--- a/mlprimitives/timeseries_errors.py
+++ b/mlprimitives/timeseries_errors.py
@@ -1,10 +1,9 @@
-"""
-Methods to do dynamic error thresholding on timeseries data.
-Implementation inspired by: https://arxiv.org/pdf/1802.04431.pdf
-"""
-
-import numpy as np
 import more_itertools as mit
+import numpy as np
+
+# Methods to do dynamic error thresholding on timeseries data
+# Implementation inspired by: https://arxiv.org/pdf/1802.04431.pdf
+

 def get_forecast_error(y_hat, y_true, window_size=5, batch_size=30, smoothing_percent=0.05, smoothed=True):
     """
@@ -38,6 +37,7 @@ def get_forecast_error(y_hat, y_true, window_size=5, batch_size=30, smoothing_pe

     return moving_avg

+
 def extract_anomalies(y_true, smoothed_errors, window_size, batch_size, error_buffer):
     """
     Extracts anomalies from the errors.
@@ -55,7 +55,6 @@ def extract_anomalies(y_true, smoothed_errors, window_size, batch_size, error_bu

     anomalies_indices = []

-
     for i in range(num_windows + 1):
         prev_index = i * batch_size
         curr_index = (window_size * batch_size) + (i * batch_size)
@@ -63,25 +62,26 @@ def extract_anomalies(y_true, smoothed_errors, window_size, batch_size, error_bu
         if i == num_windows:
             curr_index = len(y_true)

-
         window_smoothed_errors = smoothed_errors[prev_index:curr_index]
         window_y_true = y_true[prev_index:curr_index]

         epsilon, sd_threshold = compute_threshold(window_smoothed_errors, error_buffer)
-        window_anom_indices = get_anomalies(window_smoothed_errors, window_y_true, sd_threshold, i, anomalies_indices, len(y_true))
+
+        window_anom_indices = get_anomalies(window_smoothed_errors, window_y_true, sd_threshold, i, anomalies_indices, error_buffer)

         # get anomalies from inverse of smoothed errors
         # This was done in the implementation of NASA paper but
         # wasn't referenced in the paper
-
+
         # we get the inverse by flipping around the mean
-        smoothed_errors_inv = [mu + (mu-e) for e in window_smoothed_errors]
+        mu = np.mean(window_smoothed_errors)
+        smoothed_errors_inv = [mu + (mu - e) for e in window_smoothed_errors]
         epsilon_inv, sd_inv = compute_threshold(smoothed_errors_inv, error_buffer)
         inv_anom_indices = get_anomalies(smoothed_errors_inv, window_y_true, sd_inv, i, anomalies_indices, len(y_true))

         anomalies_indices = list(set(anomalies_indices + inv_anom_indices))

-        anomalies_indices.extend([i_a + i*batch_size for i_a in window_anom_indices])
+        anomalies_indices.extend([i_a + i * batch_size for i_a in window_anom_indices])

     # group anomalies
     anomalies_indices = sorted(list(set(anomalies_indices)))
@@ -98,6 +98,7 @@ def extract_anomalies(y_true, smoothed_errors, window_size, batch_size, error_bu

     return anomaly_sequences, anomalies_scores

+
 def compute_threshold(smoothed_errors, error_buffer, sd_limit=12.0):
     """
     Helper method for extract_anomalies().
@@ -107,10 +108,10 @@ def compute_threshold(smoothed_errors, error_buffer, sd_limit=12.0):
     sigma = np.std(smoothed_errors)

     max_epsilon = 0
-    sigma_threshold = sd_limit
+    sd_threshold = sd_limit

     # The threshold is determined dynamically by testing multiple Zs.
-    # z is drawn from an ordered set of positive values representing the 
+    # z is drawn from an ordered set of positive values representing the
     # number of standard deviations above mean(smoothed_errors)

     # here we iterate in increments of 0.5 on the range that the NASA paper found to be good
@@ -140,10 +141,10 @@ def compute_threshold(smoothed_errors, error_buffer, sd_limit=12.0):
         above_epsilon = sorted(list(set(above_epsilon)))
         groups = [list(group) for group in mit.consecutive_groups(above_epsilon)]
         above_sequences = [(g[0], g[-1]) for g in groups if not g[0] == g[-1]]
-
-        mean_perc_decrease = (mu - np.mean(pruned_errors)) / mu
-        sd_perc_decrease = (sigma - np.std(pruned_errors)) / sd
-        epsilon = (mean_perc_decrease + sd_perc_decrease) / (len(above_sequences)**2 + len(anomaly_indices))
+
+        mean_perc_decrease = (mu - np.mean(below_epsilon)) / mu
+        sd_perc_decrease = (sigma - np.std(below_epsilon)) / sigma
+        epsilon = (mean_perc_decrease + sd_perc_decrease) / (len(above_sequences)**2 + len(above_epsilon))

         # update the largest epsilon we've seen so far
         if epsilon > max_epsilon:
@@ -154,7 +155,7 @@ def compute_threshold(smoothed_errors, error_buffer, sd_limit=12.0):
     return max_epsilon, sd_threshold


-def get_anomalies(smoothed_errors, y_true, z):
+def get_anomalies(smoothed_errors, y_true, z, window, all_anomalies, error_buffer):
     """
     Helper method to get anomalies.
     """
@@ -162,28 +163,28 @@ def get_anomalies(smoothed_errors, y_true, z):
     mu = np.mean(smoothed_errors)
     sigma = np.std(smoothed_errors)

-    epsilon = mu + z*sigma
+    epsilon = mu + (z * sigma)

     # compare to epsilon
     errors_seq, anomaly_indices, max_error_below_e = group_consecutive_anomalies(smoothed_errors, epsilon, y_true, error_buffer, window, all_anomalies)

     if len(errors_seq) > 0:
-        anomaly_indices = prune_anomalies(errors_seq, smoothed_errors, non_anomaly_max, anomaly_indices)
+        anomaly_indices = prune_anomalies(errors_seq, smoothed_errors, max_error_below_e, anomaly_indices)
     else:
         print("No anomalies found.")

     return anomaly_indices

+
 def group_consecutive_anomalies(smoothed_errors, epsilon, y_true, error_buffer, window, all_anomalies, batch_size=30):
-    channel_std = np.std(y_true)
-    upper_percentile, lower_percentile = np.percentile(y_true, [95, 5]
+    upper_percentile, lower_percentile = np.percentile(y_true, [95, 5])
     accepted_range = upper_percentile - lower_percentile

     anomaly_indices = []
     max_error_below_e = 0
-
+
     for i in range(len(smoothed_errors)):
-        if smoothed_errors[i] <= epsilon or smoothed_errors[i] <= 0.05*accepted_range:
+        if smoothed_errors[i] <= epsilon or smoothed_errors[i] <= 0.05 * accepted_range:
             # not an anomaly
             continue
         for j in range(error_buffer):
             if (i + j) < len(smoothed_errors) and (i + j) not in anomaly_indices:
                 anomaly_indices.append(i + j)
@@ -192,19 +193,22 @@ def group_consecutive_anomalies(smoothed_errors, epsilon, y_true, error_buffer,
             if (i - j) >= 0 and (i - j) not in anomaly_indices:
                 anomaly_indices.append(i - j)

-    # get all the errors that are below epsilon and which weren't identified as anomalies to process them
+    # get all the errors that are below epsilon and which
+    # weren't identified as anomalies to process them
    for i in range(len(smoothed_errors)):
         adjusted_index = i + (window - 1) * batch_size
-        if smoothed_errors[i] > max_error_below_e and adjusted_index not in all_anomalies and i not in anomaly_indices:
-            max_error_below_e = smoothed_errors[i]
+        if smoothed_errors[i] > max_error_below_e and adjusted_index not in all_anomalies:
+            if i not in anomaly_indices:
+                max_error_below_e = smoothed_errors[i]

     # group anomalies into continuous sequences
     anomaly_indices = sorted(list(set(anomaly_indices)))
     groups = [list(group) for group in mit.consecutive_groups(anomaly_indices)]
-    E_seq = [(g[0], g[-1]) for g in group if g[0] != g[-1]]
+    E_seq = [(g[0], g[-1]) for g in groups if g[0] != g[-1]]

     return E_seq, anomaly_indices, max_error_below_e

+
 def prune_anomalies(E_seq, smoothed_errors, max_error_below_e, anomaly_indices):
     """ Helper method that prunes anomalous sequences whose max error does not
     show a minimum percent decrease from the next largest error.
@@ -231,7 +235,7 @@ def prune_anomalies(E_seq, smoothed_errors, max_error_below_e, anomaly_indices):
         if perc_change < MIN_PERCENT_DECREASE:
             indices_remove.append(E_seq_max.index(smoothed_errors_max[i]))

-    for index in sorted(indices_remove, reverse = True):
+    for index in sorted(indices_remove, reverse=True):
         del E_seq[index]

     pruned_indices = []
@@ -239,6 +243,6 @@ def prune_anomalies(E_seq, smoothed_errors, max_error_below_e, anomaly_indices):
     for i in anomaly_indices:
         for error_seq in E_seq:
             if i >= error_seq[0] and i <= error_seq[1]:
-                pruned_indices.append(i) 
+                pruned_indices.append(i)

     return pruned_indices

From 0aa74a5ef921f70d87571107410a1339e76ada8b Mon Sep 17 00:00:00 2001
From: Ihssan
Date: Thu, 3 Jan 2019 20:12:26 +0200
Subject: [PATCH 3/5] fixed lint errors, removed args that should be
 hyperparameters

---
 ...s.timeseries_errors.extract_anomalies.json | 16 +---
 ....timeseries_errors.get_forecast_error.json | 27 ++----
 mlprimitives/timeseries_errors.py             | 92 ++++++++++++++-----
 3 files changed, 78 insertions(+), 57 deletions(-)

diff --git a/mlblocks_primitives/mlprimitives.timeseries_errors.extract_anomalies.json b/mlblocks_primitives/mlprimitives.timeseries_errors.extract_anomalies.json
index 6848c154..7655396a 100644
--- a/mlblocks_primitives/mlprimitives.timeseries_errors.extract_anomalies.json
+++ b/mlblocks_primitives/mlprimitives.timeseries_errors.extract_anomalies.json
@@ -17,18 +17,6 @@
             {
                 "name": "smoothed_errors",
                 "type": "list"
-            },
-            {
-                "name": "window_size",
-                "type": "int"
-            },
-            {
-                "name": "batch_size",
-                "type": "int"
-            },
-            {
-                "name": "error_buffer",
-                "type": "int"
             }
         ],
         "output": [
@@ -43,7 +31,7 @@
         ]
     },
     "hyperparameters": {
-        "fixed": {
+        "tunable": {
            "window_size": {
                "type": "int",
                "default": 50
@@ -57,6 +45,6 @@
                "default": 10
            }
        },
-        "tunable": {}
+        "fixed": {}
    }
}
diff --git a/mlblocks_primitives/mlprimitives.timeseries_errors.get_forecast_error.json b/mlblocks_primitives/mlprimitives.timeseries_errors.get_forecast_error.json
index eb176fbf..ec3b9289 100644
--- a/mlblocks_primitives/mlprimitives.timeseries_errors.get_forecast_error.json
+++ b/mlblocks_primitives/mlprimitives.timeseries_errors.get_forecast_error.json
@@ -17,22 +17,6 @@
             {
                 "name": "y_true",
                 "type": "np.array"
-            },
-            {
-                "name": "window_size",
-                "type": "int"
-            },
-            {
-                "name": "batch_size",
-                "type": "int"
-            },
-            {
-                "name": "smoothing_percent",
-                "type": "float"
-            },
-            {
-                "name": "smoothed",
-                "type": "bool"
             }
         ],
         "output": [
@@ -43,7 +27,7 @@
         ]
     },
     "hyperparameters": {
-        "fixed": {
+        "tunable": {
            "window_size": {
                "type": "int",
                "default": 5
@@ -55,12 +39,13 @@
            "smoothing_percent": {
                "type": "float",
                "default": 0.05
-            },
+            }
+        },
+        "fixed": {
            "smoothed": {
                "type": "bool",
-                "default": True
+                "default": true
            }
-        },
-        "tunable": {}
+        }
    }
}
diff --git a/mlprimitives/timeseries_errors.py b/mlprimitives/timeseries_errors.py
index 52b76dfe..1d296bf3 100644
--- a/mlprimitives/timeseries_errors.py
+++ b/mlprimitives/timeseries_errors.py
@@ -5,7 +5,12 @@
 # Implementation inspired by: https://arxiv.org/pdf/1802.04431.pdf


-def get_forecast_error(y_hat, y_true, window_size=5, batch_size=30, smoothing_percent=0.05, smoothed=True):
+def get_forecast_error(y_hat,
+                       y_true,
+                       window_size=5,
+                       batch_size=30,
+                       smoothing_percent=0.05,
+                       smoothed=True):
     """
     Calculates the forecasting error for two arrays of data. If smoothed errors are desired,
     applies a moving average to the residuals.
@@ -31,8 +36,10 @@ def get_forecast_error(y_hat, y_true, window_size=5, batch_size=30, smoothing_pe
         right_window = i + historical_error_window + 1
         if left_window < 0:
             left_window = 0
+
         if right_window > len(errors):
             right_window = len(errors)
+
         moving_avg.append(np.mean(errors[left_window:right_window]))

     return moving_avg
@@ -50,7 +57,9 @@ def extract_anomalies(y_true, smoothed_errors, window_size, batch_size, error_bu
     Returns: (tuple) anomaly sequences and a score for each sequence.
     """
     if len(y_true) <= batch_size * window_size:
-        raise ValueError("Window size (%s) larger than y_true (len=%s)." % (window_size * batch_size, len(y_true)))
+        raise ValueError("Window size (%s) larger than y_true (len=%s)."
+                         % (window_size * batch_size, len(y_true)))
+
     num_windows = (len(y_true) - (batch_size * window_size)) // batch_size

     anomalies_indices = []
@@ -67,7 +76,14 @@ def extract_anomalies(y_true, smoothed_errors, window_size, batch_size, error_bu

         epsilon, sd_threshold = compute_threshold(window_smoothed_errors, error_buffer)

-        window_anom_indices = get_anomalies(window_smoothed_errors, window_y_true, sd_threshold, i, anomalies_indices, error_buffer)
+        window_anom_indices = get_anomalies(
+            window_smoothed_errors,
+            window_y_true,
+            sd_threshold,
+            i,
+            anomalies_indices,
+            error_buffer
+        )

         # get anomalies from inverse of smoothed errors
         # This was done in the implementation of NASA paper but
@@ -77,7 +93,14 @@ def extract_anomalies(y_true, smoothed_errors, window_size, batch_size, error_bu
         mu = np.mean(window_smoothed_errors)
         smoothed_errors_inv = [mu + (mu - e) for e in window_smoothed_errors]
         epsilon_inv, sd_inv = compute_threshold(smoothed_errors_inv, error_buffer)
-        inv_anom_indices = get_anomalies(smoothed_errors_inv, window_y_true, sd_inv, i, anomalies_indices, len(y_true))
+        inv_anom_indices = get_anomalies(
+            smoothed_errors_inv,
+            window_y_true,
+            sd_inv,
+            i,
+            anomalies_indices,
+            error_buffer
+        )

         anomalies_indices = list(set(anomalies_indices + inv_anom_indices))
@@ -92,17 +115,19 @@ def extract_anomalies(y_true, smoothed_errors, window_size, batch_size, error_bu
     anomalies_scores = []
     for e_seq in anomaly_sequences:
         denominator = np.mean(smoothed_errors) + np.std(smoothed_errors)
-        score = max([abs(smoothed_errors[x] - epsilon) / denominator
-                    for x in range(e_seq[0], e_seq[1])])
+        score = max([
+            abs(smoothed_errors[x] - epsilon) / denominator
+            for x in range(e_seq[0], e_seq[1])
+        ])
+
         anomalies_scores.append(score)

     return anomaly_sequences, anomalies_scores


 def compute_threshold(smoothed_errors, error_buffer, sd_limit=12.0):
-    """
-    Helper method for extract_anomalies().
-    Calculates the epsilon (threshold) for anomalies.
+    """Helper method for `extract_anomalies`.
+    Calculates the epsilon (threshold) for anomalies.
     """
     mu = np.mean(smoothed_errors)
     sigma = np.std(smoothed_errors)
@@ -126,11 +151,13 @@ def compute_threshold(smoothed_errors, error_buffer, sd_limit=12.0):
                 # these are important for epsilon calculation
                 below_epsilon.append(e)
                 below_indices.append(i)
+
             if e > epsilon:
                 # above_epsilon values are anomalies
                 for j in range(0, error_buffer):
                     if (i + j) not in above_epsilon and (i + j) < len(smoothed_errors):
                         above_epsilon.append(i + j)
+
                     if (i - j) not in above_epsilon and (i - j) >= 0:
                         above_epsilon.append(i - j)

@@ -144,7 +171,8 @@ def compute_threshold(smoothed_errors, error_buffer, sd_limit=12.0):

         mean_perc_decrease = (mu - np.mean(below_epsilon)) / mu
         sd_perc_decrease = (sigma - np.std(below_epsilon)) / sigma
-        epsilon = (mean_perc_decrease + sd_perc_decrease) / (len(above_sequences)**2 + len(above_epsilon))
+        epsilon = (mean_perc_decrease + sd_perc_decrease) /\
+            (len(above_sequences)**2 + len(above_epsilon))

         # update the largest epsilon we've seen so far
         if epsilon > max_epsilon:
@@ -166,17 +194,35 @@ def get_anomalies(smoothed_errors, y_true, z, window, all_anomalies, error_buffe
     epsilon = mu + (z * sigma)

     # compare to epsilon
-    errors_seq, anomaly_indices, max_error_below_e = group_consecutive_anomalies(smoothed_errors, epsilon, y_true, error_buffer, window, all_anomalies)
+    errors_seq, anomaly_indices, max_error_below_e = group_consecutive_anomalies(
+        smoothed_errors,
+        epsilon,
+        y_true,
+        error_buffer,
+        window,
+        all_anomalies
+    )

     if len(errors_seq) > 0:
-        anomaly_indices = prune_anomalies(errors_seq, smoothed_errors, max_error_below_e, anomaly_indices)
+        anomaly_indices = prune_anomalies(
+            errors_seq,
+            smoothed_errors,
+            max_error_below_e,
+            anomaly_indices
+        )
     else:
         print("No anomalies found.")

     return anomaly_indices


-def group_consecutive_anomalies(smoothed_errors, epsilon, y_true, error_buffer, window, all_anomalies, batch_size=30):
+def group_consecutive_anomalies(smoothed_errors,
+                                epsilon,
+                                y_true,
+                                error_buffer,
+                                window,
+                                all_anomalies,
+                                batch_size=30):
     upper_percentile, lower_percentile = np.percentile(y_true, [95, 5])
     accepted_range = upper_percentile - lower_percentile

@@ -187,9 +233,11 @@ def group_consecutive_anomalies(smoothed_errors, epsilon, y_true, error_buffer,
         if smoothed_errors[i] <= epsilon or smoothed_errors[i] <= 0.05 * accepted_range:
             # not an anomaly
             continue
+
         for j in range(error_buffer):
             if (i + j) < len(smoothed_errors) and (i + j) not in anomaly_indices:
                 anomaly_indices.append(i + j)
+
             if (i - j) >= 0 and (i - j) not in anomaly_indices:
                 anomaly_indices.append(i - j)

@@ -204,22 +252,22 @@ def group_consecutive_anomalies(smoothed_errors, epsilon, y_true, error_buffer,
     # group anomalies into continuous sequences
     anomaly_indices = sorted(list(set(anomaly_indices)))
     groups = [list(group) for group in mit.consecutive_groups(anomaly_indices)]
-    E_seq = [(g[0], g[-1]) for g in groups if g[0] != g[-1]]
+    e_seq = [(g[0], g[-1]) for g in groups if g[0] != g[-1]]

-    return E_seq, anomaly_indices, max_error_below_e
+    return e_seq, anomaly_indices, max_error_below_e


-def prune_anomalies(E_seq, smoothed_errors, max_error_below_e, anomaly_indices):
+def prune_anomalies(e_seq, smoothed_errors, max_error_below_e, anomaly_indices):
     """ Helper method that prunes anomalous sequences whose max error does not
     show a minimum percent decrease from the next largest error.
""" # min accepted perc decrease btwn max errors in anomalous sequences MIN_PERCENT_DECREASE = 0.05 - E_seq_max, smoothed_errors_max = [], [] + e_seq_max, smoothed_errors_max = [], [] - for error_seq in E_seq: + for error_seq in e_seq: sliced_errors = smoothed_errors[error_seq[0]:error_seq[1]] - E_seq_max.append(max(sliced_errors)) + e_seq_max.append(max(sliced_errors)) smoothed_errors_max.append(max(sliced_errors)) smoothed_errors_max.sort(reverse=True) @@ -233,15 +281,15 @@ def prune_anomalies(E_seq, smoothed_errors, max_error_below_e, anomaly_indices): delta = smoothed_errors_max[i] - smoothed_errors_max[i + 1] perc_change = delta / smoothed_errors_max[i] if perc_change < MIN_PERCENT_DECREASE: - indices_remove.append(E_seq_max.index(smoothed_errors_max[i])) + indices_remove.append(e_seq_max.index(smoothed_errors_max[i])) for index in sorted(indices_remove, reverse=True): - del E_seq[index] + del e_seq[index] pruned_indices = [] for i in anomaly_indices: - for error_seq in E_seq: + for error_seq in e_seq: if i >= error_seq[0] and i <= error_seq[1]: pruned_indices.append(i) From b8aa84b3f781b31da98d70a3f80854933a124478 Mon Sep 17 00:00:00 2001 From: Ihssan Date: Thu, 3 Jan 2019 20:24:25 +0200 Subject: [PATCH 4/5] changed names of files to avoid failing tests --- ... => mlprimitives.timeseries_errors.get_forecast_errors.json} | 0 mlprimitives/timeseries_errors.py | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename mlblocks_primitives/{mlprimitives.timeseries_errors.get_forecast_error.json => mlprimitives.timeseries_errors.get_forecast_errors.json} (100%) diff --git a/mlblocks_primitives/mlprimitives.timeseries_errors.get_forecast_error.json b/mlblocks_primitives/mlprimitives.timeseries_errors.get_forecast_errors.json similarity index 100% rename from mlblocks_primitives/mlprimitives.timeseries_errors.get_forecast_error.json rename to mlblocks_primitives/mlprimitives.timeseries_errors.get_forecast_errors.json diff --git a/mlprimitives/timeseries_errors.py b/mlprimitives/timeseries_errors.py index 1d296bf3..3ba0498a 100644 --- a/mlprimitives/timeseries_errors.py +++ b/mlprimitives/timeseries_errors.py @@ -5,7 +5,7 @@ # Implementation inspired by: https://arxiv.org/pdf/1802.04431.pdf -def get_forecast_error(y_hat, +def get_forecast_errors(y_hat, y_true, window_size=5, batch_size=30, From b3d08e701abc30de8c696a14969c1850974f4488 Mon Sep 17 00:00:00 2001 From: Ihssan Date: Thu, 3 Jan 2019 21:04:47 +0200 Subject: [PATCH 5/5] fixed lint error: under indented lines --- mlprimitives/timeseries_errors.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mlprimitives/timeseries_errors.py b/mlprimitives/timeseries_errors.py index 3ba0498a..19eb2ae7 100644 --- a/mlprimitives/timeseries_errors.py +++ b/mlprimitives/timeseries_errors.py @@ -6,11 +6,11 @@ def get_forecast_errors(y_hat, - y_true, - window_size=5, - batch_size=30, - smoothing_percent=0.05, - smoothed=True): + y_true, + window_size=5, + batch_size=30, + smoothing_percent=0.05, + smoothed=True): """ Calculates the forecasting error for two arrays of data. If smoothed errors desired, runs EWMA.