diff --git a/imxgboost/__init__.py b/imxgboost/__init__.py
index 3ec72c6..34eb8d4 100644
--- a/imxgboost/__init__.py
+++ b/imxgboost/__init__.py
@@ -1,8 +1,2 @@
 import sys
-sys.path.append("..")
-import imxgboost.weighted_loss
-from imxgboost.weighted_loss import Weight_Binary_Cross_Entropy
-import imxgboost.focal_loss
-from imxgboost.focal_loss import Focal_Binary_Loss
-import imxgboost.imbalance_xgb
-from imxgboost.imbalance_xgb import imbalance_xgboost
\ No newline at end of file
+sys.path.append("..")
\ No newline at end of file
diff --git a/imxgboost/focal_loss.py b/imxgboost/focal_loss.py
index 20d507e..592575a 100644
--- a/imxgboost/focal_loss.py
+++ b/imxgboost/focal_loss.py
@@ -31,13 +31,14 @@ def focal_binary_object(self, pred, dtrain):
         g4 = 1 - label - ((-1) ** label) * sigmoid_pred
         g5 = label + ((-1) ** label) * sigmoid_pred
         # combine the gradient
-        grad = gamma_indct * g3 * self.robust_pow(g2, gamma_indct) * np.log(g4 + 1e-9) + ((-1) ** label) * self.robust_pow(g5, (
-            gamma_indct + 1))
+        grad = gamma_indct * g3 * self.robust_pow(g2, gamma_indct) * np.log(g4 + 1e-9) + \
+               ((-1) ** label) * self.robust_pow(g5, (gamma_indct + 1))
         # combine the gradient parts to get hessian components
-        hess_1 = self.robust_pow(g2, gamma_indct) + gamma_indct * ((-1) ** label) * g3 * self.robust_pow(g2, (gamma_indct - 1))
+        hess_1 = self.robust_pow(g2, gamma_indct) + \
+                 gamma_indct * ((-1) ** label) * g3 * self.robust_pow(g2, (gamma_indct - 1))
         hess_2 = ((-1) ** label) * g3 * self.robust_pow(g2, gamma_indct) / g4
         # get the final 2nd order derivative
-        hess = ((hess_1 * np.log(g4 + 1e-9) - hess_2) * gamma_indct + (gamma_indct + 1) * self.robust_pow(g5,
-                                                                                                          gamma_indct)) * g1
+        hess = ((hess_1 * np.log(g4 + 1e-9) - hess_2) * gamma_indct +
+                (gamma_indct + 1) * self.robust_pow(g5, gamma_indct)) * g1
 
         return grad, hess
\ No newline at end of file
diff --git a/imxgboost/imbalance_xgb.py b/imxgboost/imbalance_xgb.py
index da28900..797e35b 100644
--- a/imxgboost/imbalance_xgb.py
+++ b/imxgboost/imbalance_xgb.py
@@ -1,5 +1,3 @@
-import sys
-sys.path.append("..")
 import numpy as np
 import xgboost as xgb
 from imxgboost.weighted_loss import Weight_Binary_Cross_Entropy
@@ -7,33 +5,39 @@
 from sklearn.base import BaseEstimator, ClassifierMixin
 from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
 
+
 def evalerror(preds, dtrain):
     labels = dtrain.get_label()
     # return a pair metric_name, result
     # since preds are margin(before logistic transformation, cutoff at 0)
-    
+
     return 'error', float(sum(labels != (preds > 0.0))) / len(labels)
 
+
 def two_class_encoding(flat_prediction):
-    if len(np.shape(flat_prediction))==2:
+    if len(np.shape(flat_prediction)) == 2:
         return flat_prediction
     else:
         # class 1 probability
         class_one_prob = 1.0 / (1.0 + np.exp(-flat_prediction))
-        class_one_prob = np.reshape(class_one_prob,[-1,1])
+        class_one_prob = np.reshape(class_one_prob, [-1, 1])
         # class 0 probability
         class_zero_prob = 1 - class_one_prob
-        class_zero_prob = np.reshape(class_zero_prob,[-1,1])
+        class_zero_prob = np.reshape(class_zero_prob, [-1, 1])
         # concatenate the probabilities to get the final prediction
-        sigmoid_two_class_pred = np.concatenate((class_zero_prob,class_one_prob),axis=1)
-    
+        sigmoid_two_class_pred = np.concatenate((class_zero_prob, class_one_prob), axis=1)
+
     return sigmoid_two_class_pred
 
-class imbalance_xgboost(BaseEstimator,ClassifierMixin):
+
+class imbalance_xgboost(BaseEstimator, ClassifierMixin):
     """Data in the form of [nData * nDim], where nDim stands for the number of features.
     This wrapper would provide a Xgboost interface with sklearn estimiator structure, which could be stacked in other Sk pipelines
     """
-    def __init__(self,num_round=10,max_depth=10,eta=0.3,silent_mode=True,objective_func='binary:logitraw',eval_metric='logloss',booster='gbtree',special_objective=None, imbalance_alpha=None, focal_gamma=None):
+
+    def __init__(self, num_round=10, max_depth=10, eta=0.3, silent_mode=True, objective_func='binary:logitraw',
+                 eval_metric='logloss', booster='gbtree', special_objective=None, imbalance_alpha=None,
+                 focal_gamma=None):
         """
         Parameters to initialize a Xgboost estimator
         :param num_round. The rounds we would like to iterate to train the model
@@ -60,35 +64,34 @@ def __init__(self,num_round=10,max_depth=10,eta=0.3,silent_mode=True,objective_f
         self.imbalance_alpha = imbalance_alpha
         self.focal_gamma = focal_gamma
 
-
-    def fit(self,data_x,data_y):
+    def fit(self, data_x, data_y):
         if self.special_objective is None:
             # get the parameter list
             self.para_dict = {'max_depth': self.max_depth,
-                             'eta': self.eta,
-                             'silent': self.silent_mode,
+                              'eta': self.eta,
+                              'silent': self.silent_mode,
                               'objective': self.objective_func,
                               'eval_metric': self.eval_metric,
                               'booster': self.booster}
         else:
             # get the parameter list, without stating the objective function
             self.para_dict = {'max_depth': self.max_depth,
-                             'eta': self.eta,
+                              'eta': self.eta,
                               'silent': self.silent_mode,
                               'eval_metric': self.eval_metric,
                               'booster': self.booster}
         # make sure data is in [nData * nSample] format
-        assert len(data_x.shape)==2
+        assert len(data_x.shape) == 2
         # check if data length is the same
-        if data_x.shape[0]!=data_y.shape[0]:
+        if data_x.shape[0] != data_y.shape[0]:
             raise ValueError('The numbner of instances for x and y data should be the same!')
         # data_x is in [nData*nDim]
         nData = data_x.shape[0]
         nDim = data_x.shape[1]
         # split the data into train and validation
         holistic_ind = np.random.permutation(nData)
-        train_ind = holistic_ind[0:nData*3//4]
-        valid_ind = holistic_ind[nData*3//4:nData]
+        train_ind = holistic_ind[0:nData * 3 // 4]
+        valid_ind = holistic_ind[nData * 3 // 4:nData]
         # indexing and get the data
         train_data = data_x[train_ind]
         train_label = data_y[train_ind]
@@ -108,7 +111,9 @@ def fit(self,data_x,data_y):
             # construct the object with imbalanced alpha value
             weighted_loss_obj = Weight_Binary_Cross_Entropy(imbalance_alpha=self.imbalance_alpha)
             # fit the classfifier
-            self.boosting_model = xgb.train(self.para_dict, dtrain, self.num_round, self.eval_list, obj=weighted_loss_obj.weighted_binary_cross_entropy, feval=evalerror, verbose_eval=False)
+            self.boosting_model = xgb.train(self.para_dict, dtrain, self.num_round, self.eval_list,
+                                            obj=weighted_loss_obj.weighted_binary_cross_entropy, feval=evalerror,
+                                            verbose_eval=False)
         elif self.special_objective == 'focal':
             # if the gamma value is None then raise an error
             if self.focal_gamma is None:
@@ -116,38 +121,40 @@ def fit(self,data_x,data_y):
             # construct the object with focal gamma value
             focal_loss_obj = Focal_Binary_Loss(gamma_indct=self.focal_gamma)
             # fit the classfifier
-            self.boosting_model = xgb.train(self.para_dict, dtrain, self.num_round, self.eval_list, obj=focal_loss_obj.focal_binary_object, feval=evalerror, verbose_eval=False)
+            self.boosting_model = xgb.train(self.para_dict, dtrain, self.num_round, self.eval_list,
+                                            obj=focal_loss_obj.focal_binary_object, feval=evalerror, verbose_eval=False)
         else:
-            raise ValueError('The input special objective mode not recognized! Could only be \'weighted\' or \'focal\', but got '+str(self.special_objective))
+            raise ValueError(
+                'The input special objective mode not recognized! Could only be \'weighted\' or \'focal\', but got ' + str(
+                    self.special_objective))
 
-
-    def predict(self,data_x,y=None):
+    def predict(self, data_x, y=None):
         # matrixilize
         if y is not None:
             try:
-                dtest = xgb.DMatrix(data_x,label=y)
+                dtest = xgb.DMatrix(data_x, label=y)
             except:
                 raise ValueError('Test data invalid!')
         else:
             dtest = xgb.DMatrix(data_x)
-        
+
         prediction_output = self.boosting_model.predict(dtest)
-        
+
         return prediction_output
 
-    def predict_sigmoid(self,data_x, y=None):
+    def predict_sigmoid(self, data_x, y=None):
         # sigmoid output, for the prob = 1
         raw_output = self.predict(data_x, y)
         sigmoid_output = 1. / (1. + np.exp(-raw_output))
         return sigmoid_output
-
-    def predict_determine(self,data_x,y=None):
+
+    def predict_determine(self, data_x, y=None):
         # deterministic output
         sigmoid_output = self.predict_sigmoid(data_x, y)
         prediction_output = np.round(sigmoid_output)
-        
+
         return prediction_output
 
     def predict_two_class(self, data_x, y=None):
@@ -161,11 +168,11 @@ def score(self, X, y, sample_weight=None):
         score_pred = accuracy_score(y_true=y, y_pred=label_pred)
 
         return score_pred
-    
+
     def score_eval_func(self, y_true, y_pred, mode='accuracy'):
         prob_pred = two_class_encoding(y_pred)
         label_pred = np.argmax(prob_pred, axis=1)
-        if mode=='accuracy':
+        if mode == 'accuracy':
             score_pred = accuracy_score(y_true=y_true, y_pred=label_pred)
         elif mode == 'precision':
             score_pred = precision_score(y_true=y_true, y_pred=label_pred)
@@ -178,5 +185,25 @@ def score_eval_func(self, y_true, y_pred, mode='accuracy'):
         else:
             raise ValueError('Score function mode unrecognized! Must from one in the list '
                              '[\'accuracy\', \'precision\',\'recall\',\'f1\',\'MCC\']')
-    
-        return score_pred
\ No newline at end of file
+
+        return score_pred
+
+    def correct_eval_func(self, y_true, y_pred, mode='TP'):
+        # get the predictions first
+        prob_pred = two_class_encoding(y_pred)
+        label_pred = np.argmax(prob_pred, axis=1)
+        # logical-not of the labels and predictions, used for the negative (TN/FP/FN) cases
+        y_true_negative = np.logical_not(y_true)
+        y_pred_negative = np.logical_not(label_pred)
+        # return values based on cases
+        if mode == 'TP':
+            return np.sum(np.logical_and(y_true, label_pred))
+        elif mode == 'TN':
+            return np.sum(np.logical_and(y_true_negative, y_pred_negative))
+        elif mode == 'FP':
+            return np.sum(np.logical_and(y_true_negative, label_pred))
+        elif mode == 'FN':
+            return np.sum(np.logical_and(y_true, y_pred_negative))
+        else:
+            raise ValueError('Correctness evaluation mode not recognized! '
+                             'Must be one of \'TP\', \'TN\', \'FP\', or \'FN\'')
diff --git a/imxgboost/weighted_loss.py b/imxgboost/weighted_loss.py
index c5afcca..a26493e 100644
--- a/imxgboost/weighted_loss.py
+++ b/imxgboost/weighted_loss.py
@@ -1,5 +1,6 @@
 import numpy as np
 
+
 class Weight_Binary_Cross_Entropy:
     '''
     The class of binary cross entropy loss, allows the users to change the weight parameter
@@ -22,4 +23,4 @@ def weighted_binary_cross_entropy(self, pred, dtrain):
         grad = -(imbalance_alpha ** label) * (label - sigmoid_pred)
         hess = (imbalance_alpha ** label) * sigmoid_pred * (1.0 - sigmoid_pred)
 
-        return grad, hess
\ No newline at end of file
+        return grad, hess
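
Reviewer note: the snippet below is a quick usage sketch of the patched API, not part of the diff itself. It assumes the package is importable as `imxgboost` with `xgboost` and `scikit-learn` installed; the synthetic data, split sizes, and hyper-parameter values are illustrative assumptions, not values prescribed by the library.

```python
import numpy as np
from imxgboost.imbalance_xgb import imbalance_xgboost

rng = np.random.RandomState(0)
# toy imbalanced binary task: 500 samples, 8 features, ~10% positive labels
data_x = rng.randn(500, 8)
data_y = (rng.rand(500) < 0.1).astype(float)
x_train, x_test = data_x[:400], data_x[400:]
y_train, y_test = data_y[:400], data_y[400:]

# focal-loss booster: focal_gamma is required when special_objective='focal';
# special_objective='weighted' with imbalance_alpha works the same way
clf = imbalance_xgboost(special_objective='focal', focal_gamma=2.0, num_round=20)
clf.fit(x_train, y_train)

# the evaluation helpers consume the raw margin output of predict()
raw_margin = clf.predict(x_test)
f1 = clf.score_eval_func(y_test, raw_margin, mode='f1')

# the new correct_eval_func counts confusion-matrix cells from the same margins
tp = clf.correct_eval_func(y_test, raw_margin, mode='TP')
fn = clf.correct_eval_func(y_test, raw_margin, mode='FN')
print('f1 = %.3f, TP = %d, FN = %d' % (f1, tp, fn))
```

For hard labels or per-class probabilities instead of raw margins, `predict_determine` and `predict_two_class` wrap the same `predict` call, so the sketch extends to them directly.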