Merge pull request #221 from DoubleML/j-external-predictions
Introducing External Predictions, Dummy Learners, and Nuisance Estimation Updates
SvenKlaassen authored Dec 11, 2023
2 parents 681da5a + ee04037 commit 1773384
Showing 32 changed files with 2,052 additions and 642 deletions.
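The headline change is the new external_predictions argument of fit(), added in doubleml/double_ml.py and wired into the model classes further down. A minimal usage sketch of the feature; the data generator, learners, and the 'd' treatment column name are illustrative assumptions, not taken from this diff:

import numpy as np
import doubleml as dml
from doubleml.datasets import make_did_SZ2020
from sklearn.linear_model import LinearRegression, LogisticRegression

# hypothetical setup: a difference-in-differences model with a single treatment column 'd'
dml_data = make_did_SZ2020(n_obs=500)
dml_did = dml.DoubleMLDID(dml_data, ml_g=LinearRegression(), ml_m=LogisticRegression(),
                          n_folds=5, n_rep=1)

# externally computed nuisance predictions, e.g. from a model fitted elsewhere;
# each array must have shape (n_obs, n_rep)
ext_preds = {'d': {'ml_m': np.full((500, 1), 0.5)}}

# learners without supplied predictions ('ml_g0', 'ml_g1') are still cross-fitted internally
dml_did.fit(external_predictions=ext_preds)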
1 change: 1 addition & 0 deletions .gitignore
@@ -29,3 +29,4 @@ share/python-wheels/
MANIFEST
*.idea
*.vscode
.flake8
6 changes: 6 additions & 0 deletions doubleml/_utils.py
@@ -333,3 +333,9 @@ def _var_est(psi, psi_deriv, apply_cross_fitting, smpls, is_cluster_data,
sigma2_hat = np.multiply(scaling, gamma_hat)

return sigma2_hat, var_scaling_factor


def _cond_targets(target, cond_sample):
cond_target = target.astype(float)
cond_target[np.invert(cond_sample)] = np.nan
return cond_target
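A quick, hypothetical illustration of the new _cond_targets helper (values invented for the example): it casts the targets to float and masks every observation outside the conditioning sample with NaN, which is how the nuisance targets are restricted to the compatible subsample in the model classes below.

import numpy as np
from doubleml._utils import _cond_targets  # private helper added in this commit

y = np.array([3.0, 1.0, 4.0, 1.5])
d = np.array([1, 0, 1, 0])

# keep targets only for treated observations; all others become NaN
_cond_targets(y, cond_sample=(d == 1))
# -> array([ 3., nan,  4., nan])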
83 changes: 79 additions & 4 deletions doubleml/double_ml.py
@@ -64,6 +64,9 @@ def __init__(self,
self._sensitivity_elements = None
self._sensitivity_params = None

# initialize external predictions
self._external_predictions_implemented = False

# check resampling specifications
if not isinstance(n_folds, int):
raise TypeError('The number of folds must be of int type. '
@@ -124,7 +127,7 @@ def __init__(self,
self.draw_sample_splitting()

# initialize arrays according to obj_dml_data and the resampling settings
self._psi, self._psi_deriv, self._psi_elements,\
self._psi, self._psi_deriv, self._psi_elements, \
self._coef, self._se, self._all_coef, self._all_se, self._all_dml1_coef = self._initialize_arrays()

# also initialize bootstrap arrays with the default number of bootstrap replications
@@ -486,7 +489,7 @@ def __psi_deriv(self):
def __all_se(self):
return self._all_se[self._i_treat, self._i_rep]

def fit(self, n_jobs_cv=None, store_predictions=True, store_models=False):
def fit(self, n_jobs_cv=None, store_predictions=True, external_predictions=None, store_models=False):
"""
Estimate DoubleML models.
@@ -505,6 +508,13 @@ def fit(self, n_jobs_cv=None, store_predictions=True, store_models=False):
to analyze the fitted models or extract information like variable importance.
Default is ``False``.
external_predictions : None or dict
If ``None``, all models for the learners are fitted and evaluated. If a dictionary containing predictions
for a specific learner is supplied, the model uses the supplied nuisance predictions instead. Has to
be a nested dictionary where the outer keys refer to the treatment variables and the keys of the nested
dictionaries refer to the corresponding learners.
Default is ``None``.
Returns
-------
self : object
@@ -523,6 +533,13 @@ def fit(self, n_jobs_cv=None, store_predictions=True, store_models=False):
raise TypeError('store_models must be True or False. '
f'Got {str(store_models)}.')

# check if external predictions are implemented
if self._external_predictions_implemented:
# check prediction format
self._check_external_predictions(external_predictions)
elif not self._external_predictions_implemented and external_predictions is not None:
raise NotImplementedError(f"External predictions not implemented for {self.__class__.__name__}.")

# initialize rmse arrays for nuisance functions evaluation
self._initialize_rmses()

@@ -546,8 +563,24 @@ def fit(self, n_jobs_cv=None, store_predictions=True, store_models=False):
if self._dml_data.n_treat > 1:
self._dml_data.set_x_d(self._dml_data.d_cols[i_d])

# set the supplied predictions for the treatment and each learner (including None)
ext_prediction_dict = {}
for learner in self.params_names:
if external_predictions is None:
ext_prediction_dict[learner] = None
elif learner in external_predictions[self._dml_data.d_cols[i_d]].keys():
if isinstance(external_predictions[self._dml_data.d_cols[i_d]][learner], np.ndarray):
ext_prediction_dict[learner] = external_predictions[self._dml_data.d_cols[i_d]][learner][:, i_rep]
else:
ext_prediction_dict[learner] = None
else:
ext_prediction_dict[learner] = None

# ml estimation of nuisance models and computation of score elements
score_elements, preds = self._nuisance_est(self.__smpls, n_jobs_cv, return_models=store_models)
score_elements, preds = self._nuisance_est(self.__smpls, n_jobs_cv,
external_predictions=ext_prediction_dict,
return_models=store_models)

self._set_score_elements(score_elements, self._i_rep, self._i_treat)

# calculate rmses and store predictions and targets of the nuisance models
@@ -985,7 +1018,7 @@ def _initialize_ml_nuisance_params(self):
pass

@abstractmethod
def _nuisance_est(self, smpls, n_jobs_cv, return_models):
def _nuisance_est(self, smpls, n_jobs_cv, return_models, external_predictions):
pass

@abstractmethod
@@ -1037,6 +1070,48 @@ def _check_learner(learner, learner_name, regressor, classifier):

return learner_is_classifier

def _check_external_predictions(self, external_predictions):
if external_predictions is not None:
if not isinstance(external_predictions, dict):
raise TypeError('external_predictions must be a dictionary. '
f'{str(external_predictions)} of type {str(type(external_predictions))} was passed.')

supplied_treatments = list(external_predictions.keys())
valid_treatments = self._dml_data.d_cols
if not set(supplied_treatments).issubset(valid_treatments):
raise ValueError('Invalid external_predictions. '
f'Invalid treatment variable in {str(supplied_treatments)}. '
'Valid treatment variables ' + ' or '.join(valid_treatments) + '.')

for treatment in supplied_treatments:
if not isinstance(external_predictions[treatment], dict):
raise TypeError('external_predictions must be a nested dictionary. '
f'For treatment {str(treatment)} a value of type '
f'{str(type(external_predictions[treatment]))} was passed.')

supplied_learners = list(external_predictions[treatment].keys())
valid_learners = self.params_names
if not set(supplied_learners).issubset(valid_learners):
raise ValueError('Invalid external_predictions. '
f'Invalid nuisance learner for treatment {str(treatment)} in {str(supplied_learners)}. '
'Valid nuisance learners ' + ' or '.join(valid_learners) + '.')

for learner in supplied_learners:
if not isinstance(external_predictions[treatment][learner], np.ndarray):
raise TypeError('Invalid external_predictions. '
'The values of the nested dictionary must be a numpy array. '
'Invalid predictions for treatment ' + str(treatment) +
' and learner ' + str(learner) + '. ' +
f'Object of type {str(type(external_predictions[treatment][learner]))} was passed.')

expected_shape = (self._dml_data.n_obs, self.n_rep)
if external_predictions[treatment][learner].shape != expected_shape:
raise ValueError('Invalid external_predictions. '
f'The supplied predictions have to be of shape {str(expected_shape)}. '
'Invalid predictions for treatment ' + str(treatment) +
' and learner ' + str(learner) + '. ' +
f'Predictions of shape {str(external_predictions[treatment][learner].shape)} passed.')

def _initialize_arrays(self):
# scores
psi = np.full((self._dml_data.n_obs, self.n_rep, self._dml_data.n_coefs), np.nan)
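Taken together, the fit() docstring and _check_external_predictions above pin down the expected format: an outer dictionary keyed by treatment variable, a nested dictionary keyed by nuisance learner name, and numpy arrays of shape (n_obs, n_rep) as values. A sketch with invented dimensions and random placeholder predictions (learner names as in the DID diff below):

import numpy as np

n_obs, n_rep = 500, 3  # must match the DoubleMLData object and the model's n_rep

# outer keys: treatment column names (a subset of dml_data.d_cols)
# inner keys: nuisance learner names (a subset of the model's params_names)
external_predictions = {
    'd': {
        'ml_m': np.random.uniform(0.1, 0.9, size=(n_obs, n_rep)),  # propensity predictions
        'ml_g0': np.random.normal(size=(n_obs, n_rep)),            # outcome predictions given d == 0
        'ml_g1': np.random.normal(size=(n_obs, n_rep)),            # outcome predictions given d == 1
    }
}
# column i_rep is used in sample-splitting repetition i_rep; learners that are not listed
# keep being cross-fitted internally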
7 changes: 3 additions & 4 deletions doubleml/double_ml_cvar.py
@@ -7,7 +7,7 @@
from .double_ml import DoubleML
from .double_ml_score_mixins import LinearScoreMixin
from ._utils import _dml_cv_predict, _trimm, _predict_zero_one_propensity, \
_normalize_ipw, _dml_tune, _get_bracket_guess, _solve_ipw_score
_normalize_ipw, _dml_tune, _get_bracket_guess, _solve_ipw_score, _cond_targets
from .double_ml_data import DoubleMLData
from ._utils_resampling import DoubleMLResampling
from ._utils_checks import _check_score, _check_trimming, _check_zero_one_treatment, _check_treatment, \
@@ -207,7 +207,7 @@ def _initialize_ml_nuisance_params(self):
self._params = {learner: {key: [None] * self.n_rep for key in self._dml_data.d_cols}
for learner in ['ml_g', 'ml_m']}

def _nuisance_est(self, smpls, n_jobs_cv, return_models=False):
def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=False):
x, y = check_X_y(self._dml_data.x, self._dml_data.y,
force_all_finite=False)
x, d = check_X_y(x, self._dml_data.d,
@@ -296,8 +296,7 @@ def ipw_score(theta):
m_hat['targets'] = d

# set the target for g to be a float and only relevant values
g_hat['targets'] = g_hat['targets'].astype(float)
g_hat['targets'][d != self.treatment] = np.nan
g_hat['targets'] = _cond_targets(g_hat['targets'], cond_sample=(d == self.treatment))

if return_models:
g_hat['models'] = fitted_models['ml_g']
58 changes: 38 additions & 20 deletions doubleml/double_ml_did.py
@@ -146,8 +146,8 @@ def __init__(self,
self._trimming_rule = trimming_rule
self._trimming_threshold = trimming_threshold
_check_trimming(self._trimming_rule, self._trimming_threshold)

self._sensitivity_implemented = True
self._external_predictions_implemented = True

@property
def in_sample_normalization(self):
@@ -194,7 +194,7 @@ def _check_data(self, obj_dml_data):
'needs to be specified as treatment variable.')
return

def _nuisance_est(self, smpls, n_jobs_cv, return_models=False):
def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=False):
x, y = check_X_y(self._dml_data.x, self._dml_data.y,
force_all_finite=False)
x, d = check_X_y(x, self._dml_data.d,
@@ -203,31 +203,49 @@ def _nuisance_est(self, smpls, n_jobs_cv, return_models=False):
# nuisance g
# get train indices for d == 0
smpls_d0, smpls_d1 = _get_cond_smpls(smpls, d)
g_hat0 = _dml_cv_predict(self._learner['ml_g'], x, y, smpls=smpls_d0, n_jobs=n_jobs_cv,
est_params=self._get_params('ml_g0'), method=self._predict_method['ml_g'],
return_models=return_models)

_check_finite_predictions(g_hat0['preds'], self._learner['ml_g'], 'ml_g', smpls)
# adjust target values to consider only compatible subsamples
g_hat0['targets'] = g_hat0['targets'].astype(float)
g_hat0['targets'][d == 1] = np.nan

g_hat1 = _dml_cv_predict(self._learner['ml_g'], x, y, smpls=smpls_d1, n_jobs=n_jobs_cv,
est_params=self._get_params('ml_g1'), method=self._predict_method['ml_g'],
return_models=return_models)
# nuisance g for d==0
if external_predictions['ml_g0'] is not None:
g_hat0 = {'preds': external_predictions['ml_g0'],
'targets': None,
'models': None}
else:
g_hat0 = _dml_cv_predict(self._learner['ml_g'], x, y, smpls=smpls_d0, n_jobs=n_jobs_cv,
est_params=self._get_params('ml_g0'), method=self._predict_method['ml_g'],
return_models=return_models)

_check_finite_predictions(g_hat0['preds'], self._learner['ml_g'], 'ml_g', smpls)
# adjust target values to consider only compatible subsamples
g_hat0['targets'] = g_hat0['targets'].astype(float)
g_hat0['targets'][d == 1] = np.nan

# nuisance g for d==1
if external_predictions['ml_g1'] is not None:
g_hat1 = {'preds': external_predictions['ml_g1'],
'targets': None,
'models': None}
else:
g_hat1 = _dml_cv_predict(self._learner['ml_g'], x, y, smpls=smpls_d1, n_jobs=n_jobs_cv,
est_params=self._get_params('ml_g1'), method=self._predict_method['ml_g'],
return_models=return_models)

_check_finite_predictions(g_hat0['preds'], self._learner['ml_g'], 'ml_g', smpls)
# adjust target values to consider only compatible subsamples
g_hat1['targets'] = g_hat1['targets'].astype(float)
g_hat1['targets'][d == 0] = np.nan
_check_finite_predictions(g_hat1['preds'], self._learner['ml_g'], 'ml_g', smpls)
# adjust target values to consider only compatible subsamples
g_hat1['targets'] = g_hat1['targets'].astype(float)
g_hat1['targets'][d == 0] = np.nan

# only relevant for observational setting
m_hat = {'preds': None, 'targets': None, 'models': None}
if self.score == 'observational':
# nuisance m
m_hat = _dml_cv_predict(self._learner['ml_m'], x, d, smpls=smpls, n_jobs=n_jobs_cv,
est_params=self._get_params('ml_m'), method=self._predict_method['ml_m'],
return_models=return_models)
if external_predictions['ml_m'] is not None:
m_hat = {'preds': external_predictions['ml_m'],
'targets': None,
'models': None}
else:
m_hat = _dml_cv_predict(self._learner['ml_m'], x, d, smpls=smpls, n_jobs=n_jobs_cv,
est_params=self._get_params('ml_m'), method=self._predict_method['ml_m'],
return_models=return_models)
_check_finite_predictions(m_hat['preds'], self._learner['ml_m'], 'ml_m', smpls)
_check_is_propensity(m_hat['preds'], self._learner['ml_m'], 'ml_m', smpls, eps=1e-12)
m_hat['preds'] = _trimm(m_hat['preds'], self.trimming_rule, self.trimming_threshold)
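The DID changes above show the pattern used throughout this PR: whenever a prediction array is supplied, a {'preds': ..., 'targets': None, 'models': None} stub replaces the cross-fitting call, so nuisance targets (and hence RMSEs) and stored models are not available for that learner. One natural use case is recycling predictions from an already fitted model; the sketch below assumes the predictions property holds arrays of shape (n_obs, n_rep, n_treat) and reuses the hypothetical DID setup from the sketch near the top of this page:

import doubleml as dml
from doubleml.datasets import make_did_SZ2020
from sklearn.linear_model import LinearRegression, LogisticRegression

dml_data = make_did_SZ2020(n_obs=500)

# fit once with internal cross-fitting and keep the nuisance predictions
did_a = dml.DoubleMLDID(dml_data, ml_g=LinearRegression(), ml_m=LogisticRegression())
did_a.fit(store_predictions=True)

# reuse them as external predictions for a second model on the same data
reused = {'d': {learner: did_a.predictions[learner][:, :, 0]
                for learner in ['ml_g0', 'ml_g1', 'ml_m']}}
did_b = dml.DoubleMLDID(dml_data, ml_g=LinearRegression(), ml_m=LogisticRegression())
did_b.fit(external_predictions=reused)  # skips all nuisance estimation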