Merge pull request #221 from DoubleML/j-external-predictions
Introducing External Predictions, Dummy Learners, and Nuisance Estimation Updates
SvenKlaassen authored Dec 11, 2023
2 parents 681da5a + ee04037 commit 1773384
Showing 32 changed files with 2,052 additions and 642 deletions.
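The headline change is the new external_predictions argument of fit(), added in doubleml/double_ml.py and wired into the model classes further down. A minimal usage sketch of the feature; the data generator, learners, and the 'd' treatment column name are illustrative assumptions, not taken from this diff:

import numpy as np
import doubleml as dml
from doubleml.datasets import make_did_SZ2020
from sklearn.linear_model import LinearRegression, LogisticRegression

# hypothetical setup: a difference-in-differences model with a single treatment column 'd'
dml_data = make_did_SZ2020(n_obs=500)
dml_did = dml.DoubleMLDID(dml_data, ml_g=LinearRegression(), ml_m=LogisticRegression(),
                          n_folds=5, n_rep=1)

# externally computed nuisance predictions, e.g. from a model fitted elsewhere;
# each array must have shape (n_obs, n_rep)
ext_preds = {'d': {'ml_m': np.full((500, 1), 0.5)}}

# learners without supplied predictions ('ml_g0', 'ml_g1') are still cross-fitted internally
dml_did.fit(external_predictions=ext_preds)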
1 change: 1 addition & 0 deletions .gitignore
@@ -29,3 +29,4 @@ share/python-wheels/
MANIFEST
*.idea
*.vscode
.flake8
6 changes: 6 additions & 0 deletions doubleml/_utils.py
@@ -333,3 +333,9 @@ def _var_est(psi, psi_deriv, apply_cross_fitting, smpls, is_cluster_data,
sigma2_hat = np.multiply(scaling, gamma_hat)

return sigma2_hat, var_scaling_factor


def _cond_targets(target, cond_sample):
cond_target = target.astype(float)
cond_target[np.invert(cond_sample)] = np.nan
return cond_target
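A quick, hypothetical illustration of the new _cond_targets helper (values invented for the example): it casts the targets to float and masks every observation outside the conditioning sample with NaN, which is how the nuisance targets are restricted to the compatible subsample in the model classes below.

import numpy as np
from doubleml._utils import _cond_targets  # private helper added in this commit

y = np.array([3.0, 1.0, 4.0, 1.5])
d = np.array([1, 0, 1, 0])

# keep targets only for treated observations; all others become NaN
_cond_targets(y, cond_sample=(d == 1))
# -> array([ 3., nan,  4., nan])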
83 changes: 79 additions & 4 deletions doubleml/double_ml.py
@@ -64,6 +64,9 @@ def __init__(self,
self._sensitivity_elements = None
self._sensitivity_params = None

# initialize external predictions
self._external_predictions_implemented = False

# check resampling specifications
if not isinstance(n_folds, int):
raise TypeError('The number of folds must be of int type. '
@@ -124,7 +127,7 @@ def __init__(self,
self.draw_sample_splitting()

# initialize arrays according to obj_dml_data and the resampling settings
self._psi, self._psi_deriv, self._psi_elements,\
self._psi, self._psi_deriv, self._psi_elements, \
self._coef, self._se, self._all_coef, self._all_se, self._all_dml1_coef = self._initialize_arrays()

# also initialize bootstrap arrays with the default number of bootstrap replications
@@ -486,7 +489,7 @@ def __psi_deriv(self):
def __all_se(self):
return self._all_se[self._i_treat, self._i_rep]

def fit(self, n_jobs_cv=None, store_predictions=True, store_models=False):
def fit(self, n_jobs_cv=None, store_predictions=True, external_predictions=None, store_models=False):
"""
Estimate DoubleML models.
@@ -505,6 +508,13 @@ def fit(self, n_jobs_cv=None, store_predictions=True, store_models=False):
to analyze the fitted models or extract information like variable importance.
Default is ``False``.
external_predictions : None or dict
If ``None``, all models for the learners are fitted and evaluated. If a dictionary containing predictions
for a specific learner is supplied, the model uses the supplied nuisance predictions instead. Has to
be a nested dictionary where the outer keys refer to the treatment variables and the keys of the nested
dictionaries refer to the corresponding learners.
Default is ``None``.
Returns
-------
self : object
@@ -523,6 +533,13 @@ def fit(self, n_jobs_cv=None, store_predictions=True, store_models=False):
raise TypeError('store_models must be True or False. '
f'Got {str(store_models)}.')

# check if external predictions are implemented
if self._external_predictions_implemented:
# check prediction format
self._check_external_predictions(external_predictions)
elif not self._external_predictions_implemented and external_predictions is not None:
raise NotImplementedError(f"External predictions not implemented for {self.__class__.__name__}.")

# initialize rmse arrays for nuisance functions evaluation
self._initialize_rmses()

@@ -546,8 +563,24 @@ def fit(self, n_jobs_cv=None, store_predictions=True, store_models=False):
if self._dml_data.n_treat > 1:
self._dml_data.set_x_d(self._dml_data.d_cols[i_d])

# set the supplied predictions for the treatment and each learner (including None)
ext_prediction_dict = {}
for learner in self.params_names:
if external_predictions is None:
ext_prediction_dict[learner] = None
elif learner in external_predictions[self._dml_data.d_cols[i_d]].keys():
if isinstance(external_predictions[self._dml_data.d_cols[i_d]][learner], np.ndarray):
ext_prediction_dict[learner] = external_predictions[self._dml_data.d_cols[i_d]][learner][:, i_rep]
else:
ext_prediction_dict[learner] = None
else:
ext_prediction_dict[learner] = None

# ml estimation of nuisance models and computation of score elements
score_elements, preds = self._nuisance_est(self.__smpls, n_jobs_cv, return_models=store_models)
score_elements, preds = self._nuisance_est(self.__smpls, n_jobs_cv,
external_predictions=ext_prediction_dict,
return_models=store_models)

self._set_score_elements(score_elements, self._i_rep, self._i_treat)

# calculate rmses and store predictions and targets of the nuisance models
@@ -985,7 +1018,7 @@ def _initialize_ml_nuisance_params(self):
pass

@abstractmethod
def _nuisance_est(self, smpls, n_jobs_cv, return_models):
def _nuisance_est(self, smpls, n_jobs_cv, return_models, external_predictions):
pass

@abstractmethod
@@ -1037,6 +1070,48 @@ def _check_learner(learner, learner_name, regressor, classifier):

return learner_is_classifier

def _check_external_predictions(self, external_predictions):
if external_predictions is not None:
if not isinstance(external_predictions, dict):
raise TypeError('external_predictions must be a dictionary. '
f'{str(external_predictions)} of type {str(type(external_predictions))} was passed.')

supplied_treatments = list(external_predictions.keys())
valid_treatments = self._dml_data.d_cols
if not set(supplied_treatments).issubset(valid_treatments):
raise ValueError('Invalid external_predictions. '
f'Invalid treatment variable in {str(supplied_treatments)}. '
'Valid treatment variables ' + ' or '.join(valid_treatments) + '.')

for treatment in supplied_treatments:
if not isinstance(external_predictions[treatment], dict):
raise TypeError('external_predictions must be a nested dictionary. '
f'For treatment {str(treatment)} a value of type '
f'{str(type(external_predictions[treatment]))} was passed.')

supplied_learners = list(external_predictions[treatment].keys())
valid_learners = self.params_names
if not set(supplied_learners).issubset(valid_learners):
raise ValueError('Invalid external_predictions. '
f'Invalid nuisance learner for treatment {str(treatment)} in {str(supplied_learners)}. '
'Valid nuisance learners ' + ' or '.join(valid_learners) + '.')

for learner in supplied_learners:
if not isinstance(external_predictions[treatment][learner], np.ndarray):
raise TypeError('Invalid external_predictions. '
'The values of the nested dictionary must be a numpy array. '
'Invalid predictions for treatment ' + str(treatment) +
' and learner ' + str(learner) + '. ' +
f'Object of type {str(type(external_predictions[treatment][learner]))} was passed.')

expected_shape = (self._dml_data.n_obs, self.n_rep)
if external_predictions[treatment][learner].shape != expected_shape:
raise ValueError('Invalid external_predictions. '
f'The supplied predictions have to be of shape {str(expected_shape)}. '
'Invalid predictions for treatment ' + str(treatment) +
' and learner ' + str(learner) + '. ' +
f'Predictions of shape {str(external_predictions[treatment][learner].shape)} passed.')

def _initialize_arrays(self):
# scores
psi = np.full((self._dml_data.n_obs, self.n_rep, self._dml_data.n_coefs), np.nan)
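Taken together, the fit() docstring and _check_external_predictions above pin down the expected format: an outer dictionary keyed by treatment variable, a nested dictionary keyed by nuisance learner name, and numpy arrays of shape (n_obs, n_rep) as values. A sketch with invented dimensions and random placeholder predictions (learner names as in the DID diff below):

import numpy as np

n_obs, n_rep = 500, 3  # must match the DoubleMLData object and the model's n_rep

# outer keys: treatment column names (a subset of dml_data.d_cols)
# inner keys: nuisance learner names (a subset of the model's params_names)
external_predictions = {
    'd': {
        'ml_m': np.random.uniform(0.1, 0.9, size=(n_obs, n_rep)),  # propensity predictions
        'ml_g0': np.random.normal(size=(n_obs, n_rep)),            # outcome predictions given d == 0
        'ml_g1': np.random.normal(size=(n_obs, n_rep)),            # outcome predictions given d == 1
    }
}
# column i_rep is used in sample-splitting repetition i_rep; learners that are not listed
# keep being cross-fitted internally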
7 changes: 3 additions & 4 deletions doubleml/double_ml_cvar.py
@@ -7,7 +7,7 @@
from .double_ml import DoubleML
from .double_ml_score_mixins import LinearScoreMixin
from ._utils import _dml_cv_predict, _trimm, _predict_zero_one_propensity, \
_normalize_ipw, _dml_tune, _get_bracket_guess, _solve_ipw_score
_normalize_ipw, _dml_tune, _get_bracket_guess, _solve_ipw_score, _cond_targets
from .double_ml_data import DoubleMLData
from ._utils_resampling import DoubleMLResampling
from ._utils_checks import _check_score, _check_trimming, _check_zero_one_treatment, _check_treatment, \
@@ -207,7 +207,7 @@ def _initialize_ml_nuisance_params(self):
self._params = {learner: {key: [None] * self.n_rep for key in self._dml_data.d_cols}
for learner in ['ml_g', 'ml_m']}

def _nuisance_est(self, smpls, n_jobs_cv, return_models=False):
def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=False):
x, y = check_X_y(self._dml_data.x, self._dml_data.y,
force_all_finite=False)
x, d = check_X_y(x, self._dml_data.d,
@@ -296,8 +296,7 @@ def ipw_score(theta):
m_hat['targets'] = d

# set the target for g to be a float and only relevant values
g_hat['targets'] = g_hat['targets'].astype(float)
g_hat['targets'][d != self.treatment] = np.nan
g_hat['targets'] = _cond_targets(g_hat['targets'], cond_sample=(d == self.treatment))

if return_models:
g_hat['models'] = fitted_models['ml_g']
58 changes: 38 additions & 20 deletions doubleml/double_ml_did.py
@@ -146,8 +146,8 @@ def __init__(self,
self._trimming_rule = trimming_rule
self._trimming_threshold = trimming_threshold
_check_trimming(self._trimming_rule, self._trimming_threshold)

self._sensitivity_implemented = True
self._external_predictions_implemented = True

@property
def in_sample_normalization(self):
@@ -194,7 +194,7 @@ def _check_data(self, obj_dml_data):
'needs to be specified as treatment variable.')
return

def _nuisance_est(self, smpls, n_jobs_cv, return_models=False):
def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=False):
x, y = check_X_y(self._dml_data.x, self._dml_data.y,
force_all_finite=False)
x, d = check_X_y(x, self._dml_data.d,
@@ -203,31 +203,49 @@ def _nuisance_est(self, smpls, n_jobs_cv, return_models=False):
# nuisance g
# get train indices for d == 0
smpls_d0, smpls_d1 = _get_cond_smpls(smpls, d)
g_hat0 = _dml_cv_predict(self._learner['ml_g'], x, y, smpls=smpls_d0, n_jobs=n_jobs_cv,
est_params=self._get_params('ml_g0'), method=self._predict_method['ml_g'],
return_models=return_models)

_check_finite_predictions(g_hat0['preds'], self._learner['ml_g'], 'ml_g', smpls)
# adjust target values to consider only compatible subsamples
g_hat0['targets'] = g_hat0['targets'].astype(float)
g_hat0['targets'][d == 1] = np.nan

g_hat1 = _dml_cv_predict(self._learner['ml_g'], x, y, smpls=smpls_d1, n_jobs=n_jobs_cv,
est_params=self._get_params('ml_g1'), method=self._predict_method['ml_g'],
return_models=return_models)
# nuisance g for d==0
if external_predictions['ml_g0'] is not None:
g_hat0 = {'preds': external_predictions['ml_g0'],
'targets': None,
'models': None}
else:
g_hat0 = _dml_cv_predict(self._learner['ml_g'], x, y, smpls=smpls_d0, n_jobs=n_jobs_cv,
est_params=self._get_params('ml_g0'), method=self._predict_method['ml_g'],
return_models=return_models)

_check_finite_predictions(g_hat0['preds'], self._learner['ml_g'], 'ml_g', smpls)
# adjust target values to consider only compatible subsamples
g_hat0['targets'] = g_hat0['targets'].astype(float)
g_hat0['targets'][d == 1] = np.nan

# nuisance g for d==1
if external_predictions['ml_g1'] is not None:
g_hat1 = {'preds': external_predictions['ml_g1'],
'targets': None,
'models': None}
else:
g_hat1 = _dml_cv_predict(self._learner['ml_g'], x, y, smpls=smpls_d1, n_jobs=n_jobs_cv,
est_params=self._get_params('ml_g1'), method=self._predict_method['ml_g'],
return_models=return_models)

_check_finite_predictions(g_hat0['preds'], self._learner['ml_g'], 'ml_g', smpls)
# adjust target values to consider only compatible subsamples
g_hat1['targets'] = g_hat1['targets'].astype(float)
g_hat1['targets'][d == 0] = np.nan
_check_finite_predictions(g_hat1['preds'], self._learner['ml_g'], 'ml_g', smpls)
# adjust target values to consider only compatible subsamples
g_hat1['targets'] = g_hat1['targets'].astype(float)
g_hat1['targets'][d == 0] = np.nan

# only relevant for observational setting
m_hat = {'preds': None, 'targets': None, 'models': None}
if self.score == 'observational':
# nuisance m
m_hat = _dml_cv_predict(self._learner['ml_m'], x, d, smpls=smpls, n_jobs=n_jobs_cv,
est_params=self._get_params('ml_m'), method=self._predict_method['ml_m'],
return_models=return_models)
if external_predictions['ml_m'] is not None:
m_hat = {'preds': external_predictions['ml_m'],
'targets': None,
'models': None}
else:
m_hat = _dml_cv_predict(self._learner['ml_m'], x, d, smpls=smpls, n_jobs=n_jobs_cv,
est_params=self._get_params('ml_m'), method=self._predict_method['ml_m'],
return_models=return_models)
_check_finite_predictions(m_hat['preds'], self._learner['ml_m'], 'ml_m', smpls)
_check_is_propensity(m_hat['preds'], self._learner['ml_m'], 'ml_m', smpls, eps=1e-12)
m_hat['preds'] = _trimm(m_hat['preds'], self.trimming_rule, self.trimming_threshold)
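The DID changes above show the pattern used throughout this PR: whenever a prediction array is supplied, a {'preds': ..., 'targets': None, 'models': None} stub replaces the cross-fitting call, so nuisance targets (and hence RMSEs) and stored models are not available for that learner. One natural use case is recycling predictions from an already fitted model; the sketch below assumes the predictions property holds arrays of shape (n_obs, n_rep, n_treat) and reuses the hypothetical DID setup from the sketch near the top of this page:

import doubleml as dml
from doubleml.datasets import make_did_SZ2020
from sklearn.linear_model import LinearRegression, LogisticRegression

dml_data = make_did_SZ2020(n_obs=500)

# fit once with internal cross-fitting and keep the nuisance predictions
did_a = dml.DoubleMLDID(dml_data, ml_g=LinearRegression(), ml_m=LogisticRegression())
did_a.fit(store_predictions=True)

# reuse them as external predictions for a second model on the same data
reused = {'d': {learner: did_a.predictions[learner][:, :, 0]
                for learner in ['ml_g0', 'ml_g1', 'ml_m']}}
did_b = dml.DoubleMLDID(dml_data, ml_g=LinearRegression(), ml_m=LogisticRegression())
did_b.fit(external_predictions=reused)  # skips all nuisance estimation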