Commit
Merge pull request #320 from arnaudvl/components
outlier detection component
Showing 18 changed files with 4,018 additions and 3 deletions.
@@ -0,0 +1,4 @@
MODEL_NAME=OutlierVAE
API_TYPE=REST
SERVICE_TYPE=MODEL
PERSISTENCE=0
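The file above looks like configuration for the Python serving wrapper (an inference from the variable names; the file path is not shown in the diff): MODEL_NAME names the class to load, API_TYPE selects REST over gRPC, SERVICE_TYPE marks the component as a model, and PERSISTENCE=0 disables state persistence. A purely illustrative sketch of how a wrapper could consume these settings:

    import importlib
    import os

    # hypothetical loader logic; only the variable names come from the file above
    model_name = os.environ.get('MODEL_NAME', 'OutlierVAE')   # module and class to load
    api_type = os.environ.get('API_TYPE', 'REST')             # REST or GRPC interface
    persistence = os.environ.get('PERSISTENCE', '0') == '1'   # persist state between restarts?

    user_class = getattr(importlib.import_module(model_name), model_name)
    user_model = user_class()                                 # instance served behind the API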
@@ -0,0 +1,176 @@
import numpy as np
import pickle
import random

from model import model
from utils import flatten, performance, outlier_stats


class OutlierVAE(object):
    """ Outlier detection using variational autoencoders (VAE).

    Arguments:
        - threshold (float): reconstruction error (MSE) threshold used to classify outliers
        - reservoir_size (int): number of observations kept in memory via reservoir sampling,
          used to estimate the feature means and standard deviations

    Functions:
        - reservoir_sampling: applies reservoir sampling to incoming data
        - predict: detect and return outliers
        - send_feedback: add target labels as part of the feedback loop
        - metrics: return custom metrics
    """
    def __init__(self, threshold=10, reservoir_size=50000, load_path='./models/'):

        self.threshold = threshold
        self.reservoir_size = reservoir_size
        self.batch = []
        self.N = 0  # total sample count so far, used by reservoir sampling

        # load model architecture parameters
        with open(load_path + 'model.pickle', 'rb') as f:
            n_features, hidden_layers, latent_dim, hidden_dim = pickle.load(f)

        # instantiate model and load pretrained weights
        self.vae = model(n_features, hidden_layers=hidden_layers,
                         latent_dim=latent_dim, hidden_dim=hidden_dim)
        self.vae.load_weights(load_path + 'vae_weights.h5')
        self.vae._make_predict_function()  # build the predict function up front so inference is thread-safe

        # load mu and sigma vectors for each feature
        with open(load_path + 'mu_sigma.pickle', 'rb') as f:
            self.mu, self.sigma = pickle.load(f)

        self._predictions = []
        self._labels = []
        self._mse = []
        self.roll_window = 100
        self.metric = [float('nan') for _ in range(18)]

    def reservoir_sampling(self, X, update_stand=False):
        """ Keep a batch of data in memory using reservoir sampling. """
        for item in X:
            self.N += 1
            if len(self.batch) < self.reservoir_size:
                self.batch.append(item)
            else:
                # replace a random reservoir entry with probability reservoir_size / N
                s = int(random.random() * self.N)
                if s < self.reservoir_size:
                    self.batch[s] = item

        if update_stand:  # update mu and sigma
            self.mu = np.mean(self.batch, axis=0)
            self.sigma = np.std(self.batch, axis=0)
        return
    def predict(self, X, feature_names):
        """ Detect outliers from the MSE using the threshold.

        Arguments:
            - X: input data
            - feature_names
        """
        # only update the feature means and stdevs once the reservoir is full
        update_stand = self.N >= self.reservoir_size
        self.reservoir_sampling(X, update_stand=update_stand)

        X_scaled = (X - self.mu) / (self.sigma + 1e-10)  # standardize input variables

        # sample latent variables and calculate reconstruction errors
        N = 10
        mse = np.zeros([X.shape[0], N])
        for i in range(N):
            preds = self.vae.predict(X_scaled)
            mse[:, i] = np.mean(np.power(X_scaled - preds, 2), axis=1)
        self.mse = np.mean(mse, axis=1)
        self._mse.append(self.mse)
        self._mse = flatten(self._mse)

        # flag observations whose reconstruction error exceeds the threshold
        self.prediction = np.array([1 if e > self.threshold else 0 for e in self.mse]).astype(int)
        self._predictions.append(self.prediction)
        self._predictions = flatten(self._predictions)

        return self.prediction
    def send_feedback(self, X, feature_names, reward, truth):
        """ Return outlier labels as part of the feedback loop.

        Arguments:
            - X: input data
            - feature_names
            - reward
            - truth: outlier labels
        """
        self.label = truth
        self._labels.append(self.label)
        self._labels = flatten(self._labels)

        scores = performance(self._labels, self._predictions, roll_window=self.roll_window)
        stats = outlier_stats(self._labels, self._predictions, roll_window=self.roll_window)

        # convert from numpy to native python types so the values can be jsonified
        convert = flatten([scores, stats])
        metric = []
        for c in convert:
            metric.append(np.asarray(c).item())
        self.metric = metric

        return
    def metrics(self):
        """ Return custom metrics.
        Printed with a delay of 1 prediction because the labels are returned in the feedback step.
        """
        if self.mse.shape[0] > 1:
            raise ValueError('Metrics can only handle single observations.')

        if self.N == 1:
            pred = float('nan')
            err = float('nan')
            y_true = float('nan')
        else:
            pred = int(self._predictions[-2])
            err = self._mse[-2]
            y_true = int(self.label[0])

        is_outlier = {"type": "GAUGE", "key": "is_outlier", "value": pred}
        mse = {"type": "GAUGE", "key": "mse", "value": err}
        obs = {"type": "GAUGE", "key": "observation", "value": self.N - 1}
        threshold = {"type": "GAUGE", "key": "threshold", "value": self.threshold}

        label = {"type": "GAUGE", "key": "label", "value": y_true}

        accuracy_tot = {"type": "GAUGE", "key": "accuracy_tot", "value": self.metric[4]}
        precision_tot = {"type": "GAUGE", "key": "precision_tot", "value": self.metric[5]}
        recall_tot = {"type": "GAUGE", "key": "recall_tot", "value": self.metric[6]}
        f1_score_tot = {"type": "GAUGE", "key": "f1_tot", "value": self.metric[7]}
        f2_score_tot = {"type": "GAUGE", "key": "f2_tot", "value": self.metric[8]}

        accuracy_roll = {"type": "GAUGE", "key": "accuracy_roll", "value": self.metric[9]}
        precision_roll = {"type": "GAUGE", "key": "precision_roll", "value": self.metric[10]}
        recall_roll = {"type": "GAUGE", "key": "recall_roll", "value": self.metric[11]}
        f1_score_roll = {"type": "GAUGE", "key": "f1_roll", "value": self.metric[12]}
        f2_score_roll = {"type": "GAUGE", "key": "f2_roll", "value": self.metric[13]}

        true_negative = {"type": "GAUGE", "key": "true_negative", "value": self.metric[0]}
        false_positive = {"type": "GAUGE", "key": "false_positive", "value": self.metric[1]}
        false_negative = {"type": "GAUGE", "key": "false_negative", "value": self.metric[2]}
        true_positive = {"type": "GAUGE", "key": "true_positive", "value": self.metric[3]}

        nb_outliers_roll = {"type": "GAUGE", "key": "nb_outliers_roll", "value": self.metric[14]}
        nb_labels_roll = {"type": "GAUGE", "key": "nb_labels_roll", "value": self.metric[15]}
        nb_outliers_tot = {"type": "GAUGE", "key": "nb_outliers_tot", "value": self.metric[16]}
        nb_labels_tot = {"type": "GAUGE", "key": "nb_labels_tot", "value": self.metric[17]}

        return [is_outlier, mse, obs, threshold, label,
                accuracy_tot, precision_tot, recall_tot, f1_score_tot, f2_score_tot,
                accuracy_roll, precision_roll, recall_roll, f1_score_roll, f2_score_roll,
                true_negative, false_positive, false_negative, true_positive,
                nb_outliers_roll, nb_labels_roll, nb_outliers_tot, nb_labels_tot]
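For orientation, here is a minimal sketch of how the class above could be exercised locally. It assumes the pickled architecture parameters, weights and mu/sigma vectors exist under ./models/ (the paths __init__ uses) and that utils provides flatten, performance and outlier_stats; the data itself is hypothetical:

    import numpy as np

    detector = OutlierVAE(threshold=10)               # loads the ./models/ artifacts
    X = np.random.randn(1, detector.mu.shape[0])      # one hypothetical observation
    pred = detector.predict(X, feature_names=[])      # 1 = outlier, 0 = inlier
    detector.send_feedback(X, [], reward=None, truth=[0])
    print(detector.metrics())                         # list of GAUGE dicts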
Empty file.
@@ -0,0 +1,89 @@
from keras.layers import Lambda, Input, Dense
from keras.models import Model
from keras.losses import mse
from keras import backend as K
from keras.optimizers import Adam
import numpy as np


def sampling(args):
    """ Reparameterization trick by sampling from an isotropic unit Gaussian.

    Arguments:
        - args (tensor): mean and log of variance of Q(z|X)

    Returns:
        - z (tensor): sampled latent vector
    """
    z_mean, z_log_var = args
    batch = K.shape(z_mean)[0]
    dim = K.int_shape(z_mean)[1]
    epsilon = K.random_normal(shape=(batch, dim))  # by default, random_normal has mean=0 and std=1.0
    return z_mean + K.exp(0.5 * z_log_var) * epsilon  # mean + stdev * eps
def model(n_features, hidden_layers=1, latent_dim=2, hidden_dim=None):
    """ Build VAE model.

    Arguments:
        - n_features (int): number of features in the data
        - hidden_layers (int): number of hidden layers used in encoder/decoder
        - latent_dim (int): dimension of latent variable
        - hidden_dim (list): list with the dimension of each hidden layer
    """
    # set dimensions of the hidden layers, halving the dimension at each layer;
    # default to None rather than [] to avoid the mutable default argument pitfall
    if not hidden_dim:
        hidden_dim = []
        i = 0
        dim = n_features
        while i < hidden_layers:
            hidden_dim.append(int(np.max([dim / 2, 2])))
            dim /= 2
            i += 1

    # VAE = encoder + decoder
    # encoder
    inputs = Input(shape=(n_features,), name='encoder_input')
    # define hidden layers
    enc_hidden = Dense(hidden_dim[0], activation='relu', name='encoder_hidden_0')(inputs)
    i = 1
    while i < hidden_layers:
        enc_hidden = Dense(hidden_dim[i], activation='relu', name='encoder_hidden_' + str(i))(enc_hidden)
        i += 1

    z_mean = Dense(latent_dim, name='z_mean')(enc_hidden)
    z_log_var = Dense(latent_dim, name='z_log_var')(enc_hidden)
    # reparametrization trick to sample z
    z = Lambda(sampling, output_shape=(latent_dim,), name='z')([z_mean, z_log_var])
    # instantiate encoder model
    encoder = Model(inputs, [z_mean, z_log_var, z], name='encoder')

    # decoder
    latent_inputs = Input(shape=(latent_dim,), name='z_sampling')
    # define hidden layers, mirroring the encoder dimensions in reverse
    dec_hidden = Dense(hidden_dim[-1], activation='relu', name='decoder_hidden_0')(latent_inputs)

    i = 2
    while i < hidden_layers + 1:
        dec_hidden = Dense(hidden_dim[-i], activation='relu', name='decoder_hidden_' + str(i - 1))(dec_hidden)
        i += 1

    outputs = Dense(n_features, activation='sigmoid', name='decoder_output')(dec_hidden)
    # instantiate decoder model
    decoder = Model(latent_inputs, outputs, name='decoder')

    # instantiate VAE model
    outputs = decoder(encoder(inputs)[2])
    vae = Model(inputs, outputs, name='vae')

    # define VAE loss (reconstruction + KL divergence), optimizer and compile model
    reconstruction_loss = mse(inputs, outputs)
    reconstruction_loss *= n_features
    kl_loss = 1 + z_log_var - K.square(z_mean) - K.exp(z_log_var)
    kl_loss = K.sum(kl_loss, axis=-1)
    kl_loss *= -0.5
    vae_loss = K.mean(reconstruction_loss + kl_loss)
    vae.add_loss(vae_loss)

    optimizer = Adam(lr=.001)
    vae.compile(optimizer=optimizer)

    return vae
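The training side is not part of this diff, but because the loss is attached with add_loss, the VAE can be fit on the (standardized) features directly, with no separate target argument. A rough sketch under those assumptions; X_train, the layer sizes and the epoch count are hypothetical, while the artifact names match what OutlierVAE.__init__ loads:

    import pickle
    import numpy as np

    # hypothetical training script; X_train is assumed to be a standardized
    # numpy array of shape (n_samples, n_features)
    n_features = X_train.shape[1]
    vae = model(n_features, hidden_layers=2, latent_dim=2)
    vae.fit(X_train, epochs=5, batch_size=32)  # loss already attached via add_loss

    # persist the artifacts OutlierVAE.__init__ expects under ./models/
    vae.save_weights('./models/vae_weights.h5')
    with open('./models/model.pickle', 'wb') as f:
        # (n_features, hidden_layers, latent_dim, hidden_dim); an empty
        # hidden_dim makes model() rebuild the same layer sizes at load time
        pickle.dump((n_features, 2, 2, []), f)
    with open('./models/mu_sigma.pickle', 'wb') as f:
        pickle.dump((np.mean(X_train, axis=0), np.std(X_train, axis=0)), f)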
Binary file not shown.
Binary file not shown.
Binary file not shown.