From c5441e32745f44e912f8930704cd5feb2122c972 Mon Sep 17 00:00:00 2001
From: Philip Cho <chohyu01@cs.washington.edu>
Date: Wed, 29 Aug 2018 18:37:02 -0700
Subject: [PATCH] Add validate_features parameter to sklearn API

---
 python-package/xgboost/sklearn.py | 29 +++++++++++++++++++++--------
 1 file changed, 21 insertions(+), 8 deletions(-)

diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py
index f37415ac4525..df3aac0cafbb 100644
--- a/python-package/xgboost/sklearn.py
+++ b/python-package/xgboost/sklearn.py
@@ -339,7 +339,7 @@ def fit(self, X, y, sample_weight=None, eval_set=None, eval_metric=None,
             self.best_ntree_limit = self._Booster.best_ntree_limit
         return self
 
-    def predict(self, data, output_margin=False, ntree_limit=None):
+    def predict(self, data, output_margin=False, ntree_limit=None, validate_features=True):
         """
         Predict with `data`.
 
@@ -369,6 +369,9 @@ def predict(self, data, output_margin=False, ntree_limit=None):
         ntree_limit : int
             Limit number of trees in the prediction; defaults to best_ntree_limit if defined
             (i.e. it has been trained with early stopping), otherwise 0 (use all trees).
+        validate_features : bool
+            When True, validate that the Booster's and data's feature_names are identical.
+            When False, skip this validation and assume the feature_names are the same.
         Returns
         -------
         prediction : numpy array
@@ -381,7 +384,8 @@ def predict(self, data, output_margin=False, ntree_limit=None):
             ntree_limit = getattr(self, "best_ntree_limit", 0)
         return self.get_booster().predict(test_dmatrix,
                                           output_margin=output_margin,
-                                          ntree_limit=ntree_limit)
+                                          ntree_limit=ntree_limit,
+                                          validate_features=validate_features)
 
     def apply(self, X, ntree_limit=0):
         """Return the predicted leaf every tree for each sample.
@@ -604,7 +608,7 @@ def fit(self, X, y, sample_weight=None, eval_set=None, eval_metric=None,
 
         return self
 
-    def predict(self, data, output_margin=False, ntree_limit=None):
+    def predict(self, data, output_margin=False, ntree_limit=None, validate_features=True):
         """
         Predict with `data`.
 
@@ -634,6 +638,9 @@ def predict(self, data, output_margin=False, ntree_limit=None):
         ntree_limit : int
             Limit number of trees in the prediction; defaults to best_ntree_limit if defined
             (i.e. it has been trained with early stopping), otherwise 0 (use all trees).
+        validate_features : bool
+            When True, validate that the Booster's and data's feature_names are identical.
+            When False, skip this validation and assume the feature_names are the same.
         Returns
         -------
         prediction : numpy array
@@ -643,7 +650,8 @@ def predict(self, data, output_margin=False, ntree_limit=None):
             ntree_limit = getattr(self, "best_ntree_limit", 0)
         class_probs = self.get_booster().predict(test_dmatrix,
                                                  output_margin=output_margin,
-                                                 ntree_limit=ntree_limit)
+                                                 ntree_limit=ntree_limit,
+                                                 validate_features=validate_features)
         if len(class_probs.shape) > 1:
             column_indexes = np.argmax(class_probs, axis=1)
         else:
@@ -651,7 +659,7 @@ def predict(self, data, output_margin=False, ntree_limit=None):
             column_indexes[class_probs > 0.5] = 1
         return self._le.inverse_transform(column_indexes)
 
-    def predict_proba(self, data, ntree_limit=None):
+    def predict_proba(self, data, ntree_limit=None, validate_features=True):
         """
         Predict the probability of each `data` example being of a given class.
 
@@ -668,6 +676,9 @@ def predict_proba(self, data, ntree_limit=None):
         ntree_limit : int
             Limit number of trees in the prediction; defaults to best_ntree_limit if defined
             (i.e. it has been trained with early stopping), otherwise 0 (use all trees).
+        validate_features : bool
+            When True, validate that the Booster's and data's feature_names are identical.
+            When False, skip this validation and assume the feature_names are the same.
 
         Returns
         -------
@@ -678,7 +689,8 @@ def predict_proba(self, data, ntree_limit=None):
         if ntree_limit is None:
             ntree_limit = getattr(self, "best_ntree_limit", 0)
         class_probs = self.get_booster().predict(test_dmatrix,
-                                                 ntree_limit=ntree_limit)
+                                                 ntree_limit=ntree_limit,
+                                                 validate_features=validate_features)
         if self.objective == "multi:softprob":
             return class_probs
         else:
@@ -964,7 +976,7 @@ def _dmat_init(group, **params):
 
         return self
 
-    def predict(self, data, output_margin=False, ntree_limit=0):
+    def predict(self, data, output_margin=False, ntree_limit=0, validate_features=True):
 
         test_dmatrix = DMatrix(data, missing=self.missing)
         if ntree_limit is None:
@@ -972,6 +984,7 @@ def predict(self, data, output_margin=False, ntree_limit=0):
 
         return self.get_booster().predict(test_dmatrix,
                                           output_margin=output_margin,
-                                          ntree_limit=ntree_limit)
+                                          ntree_limit=ntree_limit,
+                                          validate_features=validate_features)
 
     predict.__doc__ = XGBModel.predict.__doc__