diff --git a/README.md b/README.md index c154a684c..d236ec2e7 100644 --- a/README.md +++ b/README.md @@ -401,7 +401,7 @@ The reStructuredText files that make up the documentation are stored in the [doc * June 2019: [Treatment Effects with Instruments paper](https://arxiv.org/pdf/1905.10176.pdf) -* May 2019: [Open Data Science Conference Workshop](https://staging5.odsc.com/training/portfolio/machine-learning-estimation-of-heterogeneous-treatment-effect-the-microsoft-econml-library) +* May 2019: [Open Data Science Conference Workshop](https://odsc.com/speakers/machine-learning-estimation-of-heterogeneous-treatment-effect-the-microsoft-econml-library/) * 2018: [Orthogonal Random Forests paper](http://proceedings.mlr.press/v97/oprescu19a.html) diff --git a/doc/map.svg b/doc/map.svg index f01d6c67d..6cfa16e4f 100644 --- a/doc/map.svg +++ b/doc/map.svg @@ -1,88 +1,88 @@ - + - - - + + + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - - - - + + + + Did you run an EXPERIMENT? - - - + + + Was @@ -90,32 +90,38 @@ Incomplete? - - - Exploit Explicit Randomization - + + + Exploit Explicit Randomization + - 2SLS IV + 2SLS IV - - - Coming soon: - LinearDRIV - , - SparseLinearDRIV - - + + + IntentToTreatDRIV + + + + LinearIntentToTreatDRIV + + + - Deep IV + Deep IV - - - Coming soon: DMLIV, DRIV, - DeepDMLIV - , + + + NonParamDMLIV + + + + Coming soon: + DeepDMLIV + , DeepDRIV - + LINEAR @@ -123,9 +129,9 @@ EFFECTS? - - - + + + Do you MEASURE @@ -134,15 +140,15 @@ ASSIGNMENT? - - + + LINEAR HETEROGENEITY? - + CONFIDENCE @@ -151,7 +157,7 @@ MODEL SELECTION - + Few features affect @@ -159,72 +165,73 @@ RESPONSIVENESS? - + Identify and Exploit Natural - Experimentation + Experimentation - + SparseLinearDMLCateEstimator - + SparseLinearDRLearner - - + + LinearDMLCateEstimator - + LinearDRLearner - - + + ContinuousTreatmentOrthoForest - + DiscreteTreatmentOrthoForest - + ForestDRLearner - + ForestDMLCateEstimator - - + + - MetaLearners + MetaLearners - + - DRLearner + DRLearner - + - DMLCateEstimator + DMLCateEstimator - + NonParamDMLCateEstimator - + - Coming soon: DeepRLearner + Coming soon: + DeepRLearner - - - - - - + + + + + + CAUTION @@ -234,8 +241,8 @@ not eliminate it.** - - + + Is treatment @@ -246,8 +253,8 @@ your outcome*? - - + + Set Z to be the @@ -255,7 +262,7 @@ treatment - + Can set W @@ -263,72 +270,72 @@ to None - - + + Yes - - + + Yes - - + + Yes - - + + Yes - - + + No - - + + Yes - - + + CI - - + + No - - + + No - - + + No - - + + MS - - + + No - - + + No - - + + No - - - + + + Yes - - - - - + + + + + Yes diff --git a/econml/_ortho_learner.py b/econml/_ortho_learner.py index 074c8d132..363d00dbc 100644 --- a/econml/_ortho_learner.py +++ b/econml/_ortho_learner.py @@ -27,9 +27,9 @@ class in this module implements the general logic in a very versatile way import numpy as np import copy from warnings import warn -from .utilities import (shape, reshape, ndim, hstack, cross_product, transpose, +from .utilities import (shape, reshape, ndim, hstack, cross_product, transpose, inverse_onehot, broadcast_unit_treatments, reshape_treatmentwise_effects, - StatsModelsLinearRegression, LassoCVWrapper) + StatsModelsLinearRegression, _EncoderWrapper) from sklearn.model_selection import KFold, StratifiedKFold, check_cv from sklearn.linear_model import LinearRegression, LassoCV from sklearn.preprocessing import (PolynomialFeatures, LabelEncoder, OneHotEncoder, @@ -76,7 +76,7 @@ def _crossfit(model, folds, *args, **kwargs): ------- nuisances : tuple of numpy matrices Each entry in the tuple is a nuisance parameter matrix. Each row i-th in the - matric corresponds to the value of the nuisancee parameter for the i-th input + matrix corresponds to the value of the nuisance parameter for the i-th input sample. model_list : list of objects of same type as input model The cloned and fitted models for each fold. Can be used for inspection of the @@ -85,6 +85,8 @@ def _crossfit(model, folds, *args, **kwargs): The indices of the arrays for which the nuisance value was calculated. This corresponds to the union of the indices of the test part of each fold in the input fold list. + scores : tuple of list of float or None + The out-of-sample model scores for each nuisance model Examples -------- @@ -108,7 +110,7 @@ def predict(self, X, y, W=None): y = X[:, 0] + np.random.normal(size=(5000,)) folds = list(KFold(2).split(X, y)) model = Lasso(alpha=0.01) - nuisance, model_list, fitted_inds = _crossfit(Wrapper(model), folds, X, y, W=y, Z=None) + nuisance, model_list, fitted_inds, scores = _crossfit(Wrapper(model), folds, X, y, W=y, Z=None) >>> nuisance (array([-1.105728... , -1.537566..., -2.451827... , ..., 1.106287..., @@ -121,18 +123,25 @@ def predict(self, X, y, W=None): """ model_list = [] fitted_inds = [] + calculate_scores = hasattr(model, 'score') if folds is None: # skip crossfitting model_list.append(clone(model, safe=False)) kwargs = {k: v for k, v in kwargs.items() if v is not None} model_list[0].fit(*args, **kwargs) nuisances = model_list[0].predict(*args, **kwargs) + scores = model_list[0].score(*args, **kwargs) if calculate_scores else None if not isinstance(nuisances, tuple): nuisances = (nuisances,) + if not isinstance(scores, tuple): + scores = (scores,) + + # scores entries should be lists of scores, so make each entry a singleton list + scores = tuple([s] for s in scores) first_arr = args[0] if args else kwargs.items()[0][1] - return nuisances, model_list, np.arange(first_arr.shape[0]) + return nuisances, model_list, np.arange(first_arr.shape[0]), scores for idx, (train_idxs, test_idxs) in enumerate(folds): model_list.append(clone(model, safe=False)) @@ -162,7 +171,19 @@ def predict(self, X, y, W=None): for it, nuis in enumerate(nuisance_temp): nuisances[it][test_idxs] = nuis - return nuisances, model_list, np.sort(fitted_inds.astype(int)) + if calculate_scores: + score_temp = model_list[idx].score(*args_test, **kwargs_test) + + if not isinstance(score_temp, tuple): + score_temp = (score_temp,) + + if idx == 0: + scores = tuple([] for _ in score_temp) + + for it, score in enumerate(score_temp): + scores[it].append(score) + + return nuisances, model_list, np.sort(fitted_inds.astype(int)), (scores if calculate_scores else None) class _OrthoLearner(TreatmentExpansionMixin, LinearCateEstimator): @@ -235,6 +256,9 @@ class _OrthoLearner(TreatmentExpansionMixin, LinearCateEstimator): methods. If ``discrete_treatment=True``, then the input ``T`` to both above calls will be the one-hot encoding of the original input ``T``, excluding the first column of the one-hot. + If the estimator also provides a score method with the same arguments as fit, it will be used to + calculate scores during training. + model_final: estimator for fitting the response residuals to the features and treatment residuals Must implement `fit` and `predict` methods that must have signatures:: @@ -256,6 +280,10 @@ class _OrthoLearner(TreatmentExpansionMixin, LinearCateEstimator): discrete_instrument: bool Whether the instrument values should be treated as categorical, rather than continuous, quantities + categories: 'auto' or list + The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). + The first category will be treated as the control treatment. + n_splits: int, cross-validation generator or an iterable Determines the cross-validation splitting strategy. Possible inputs for cv are: @@ -321,7 +349,7 @@ def score(self, Y, T, W=None, nuisances=None): est = _OrthoLearner(ModelNuisance(LinearRegression(), LinearRegression()), ModelFinal(), n_splits=2, discrete_treatment=False, discrete_instrument=False, - random_state=None) + categories='auto', random_state=None) est.fit(y, X[:, 0], W=X[:, 1:]) >>> est.score_ @@ -383,7 +411,7 @@ def score(self, Y, T, W=None, nuisances=None): y = T + W[:, 0] + np.random.normal(0, 0.01, size=(100,)) est = _OrthoLearner(ModelNuisance(LogisticRegression(solver='lbfgs'), LinearRegression()), ModelFinal(), n_splits=2, discrete_treatment=True, discrete_instrument=False, - random_state=None) + categories='auto', random_state=None) est.fit(y, T, W=W) >>> est.score_ @@ -408,10 +436,12 @@ def score(self, Y, T, W=None, nuisances=None): If the model_final has a score method, then `score_` contains the outcome of the final model score when evaluated on the fitted nuisances from the first stage. Represents goodness of fit, of the final CATE model. + nuisance_scores_ : tuple of lists of floats or None + The out-of-sample scores from training each nuisance model """ def __init__(self, model_nuisance, model_final, *, - discrete_treatment, discrete_instrument, n_splits, random_state): + discrete_treatment, discrete_instrument, categories, n_splits, random_state): self._model_nuisance = clone(model_nuisance, safe=False) self._models_nuisance = None self._model_final = clone(model_final, safe=False) @@ -420,8 +450,9 @@ def __init__(self, model_nuisance, model_final, *, self._discrete_instrument = discrete_instrument self._random_state = check_random_state(random_state) if discrete_treatment: - self._label_encoder = LabelEncoder() - self._one_hot_encoder = OneHotEncoder(categories='auto', sparse=False) + if categories != 'auto': + categories = [categories] # OneHotEncoder expects a 2D array with features per column + self._one_hot_encoder = OneHotEncoder(categories=categories, sparse=False, drop='first') super().__init__() @staticmethod @@ -511,26 +542,25 @@ def _fit_nuisances(self, Y, T, X=None, W=None, Z=None, sample_weight=None): stratify = self._discrete_treatment or self._discrete_instrument if self._discrete_treatment: - T = self._label_encoder.fit_transform(T.ravel()) + T = self._one_hot_encoder.fit_transform(reshape(T, (-1, 1))) if self._discrete_instrument: z_enc = LabelEncoder() Z = z_enc.fit_transform(Z.ravel()) if self._discrete_treatment: # need to stratify on combination of Z and T - to_split = T + Z * len(self._label_encoder.classes_) + to_split = inverse_onehot(T) + Z * len(self._one_hot_encoder.categories_[0]) else: to_split = Z # just stratify on Z - z_ohe = OneHotEncoder(categories='auto', sparse=False) - Z = z_ohe.fit_transform(reshape(Z, (-1, 1)))[:, 1:] + z_ohe = OneHotEncoder(categories='auto', sparse=False, drop='first') + Z = z_ohe.fit_transform(reshape(Z, (-1, 1))) self.z_transformer = FunctionTransformer( - func=(lambda Z: - z_ohe.transform( - reshape(z_enc.transform(Z.ravel()), (-1, 1)))[:, 1:]), + func=_EncoderWrapper(z_ohe, z_enc).encode, validate=False) else: - to_split = T # stratify on T if discrete, and fine to pass T as second arg to KFold.split even when not + # stratify on T if discrete, and fine to pass T as second arg to KFold.split even when not + to_split = inverse_onehot(T) if self._discrete_treatment else T self.z_transformer = None if self._n_splits == 1: # special case, no cross validation @@ -550,19 +580,15 @@ def _fit_nuisances(self, Y, T, X=None, W=None, Z=None, sample_weight=None): folds = splitter.split(np.ones((T.shape[0], 1)), to_split) if self._discrete_treatment: - # drop first column since all columns sum to one - T = self._one_hot_encoder.fit_transform(reshape(T, (-1, 1)))[:, 1:] - self._d_t = shape(T)[1:] self.transformer = FunctionTransformer( - func=(lambda T: - self._one_hot_encoder.transform( - reshape(self._label_encoder.transform(T.ravel()), (-1, 1)))[:, 1:]), + func=_EncoderWrapper(self._one_hot_encoder).encode, validate=False) - nuisances, fitted_models, fitted_inds = _crossfit(self._model_nuisance, folds, - Y, T, X=X, W=W, Z=Z, sample_weight=sample_weight) + nuisances, fitted_models, fitted_inds, scores = _crossfit(self._model_nuisance, folds, + Y, T, X=X, W=W, Z=Z, sample_weight=sample_weight) self._models_nuisance = fitted_models + self.nuisance_scores_ = scores return nuisances, fitted_inds def _fit_final(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None): diff --git a/econml/_rlearner.py b/econml/_rlearner.py index 9d6f62f10..5d0dcbe9c 100644 --- a/econml/_rlearner.py +++ b/econml/_rlearner.py @@ -34,6 +34,82 @@ from ._ortho_learner import _OrthoLearner +class _ModelNuisance: + """ + Nuisance model fits the model_y and model_t at fit time and at predict time + calculates the residual Y and residual T based on the fitted models and returns + the residuals as two nuisance parameters. + """ + + def __init__(self, model_y, model_t): + self._model_y = clone(model_y, safe=False) + self._model_t = clone(model_t, safe=False) + + def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None): + assert Z is None, "Cannot accept instrument!" + self._model_t.fit(X, W, T, sample_weight=sample_weight) + self._model_y.fit(X, W, Y, sample_weight=sample_weight) + return self + + def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None): + if hasattr(self._model_y, 'score'): + Y_score = self._model_y.score(X, W, Y, sample_weight=sample_weight) + else: + Y_score = None + if hasattr(self._model_t, 'score'): + T_score = self._model_t.score(X, W, T, sample_weight=sample_weight) + else: + T_score = None + return Y_score, T_score + + def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None): + Y_pred = self._model_y.predict(X, W) + T_pred = self._model_t.predict(X, W) + if (X is None) and (W is None): # In this case predict above returns a single row + Y_pred = np.tile(Y_pred.reshape(1, -1), (Y.shape[0], 1)) + T_pred = np.tile(T_pred.reshape(1, -1), (T.shape[0], 1)) + Y_res = Y - Y_pred.reshape(Y.shape) + T_res = T - T_pred.reshape(T.shape) + return Y_res, T_res + + +class _ModelFinal: + """ + Final model at fit time, fits a residual on residual regression with a heterogeneous coefficient + that depends on X, i.e. + + .. math :: + Y - E[Y | X, W] = \\theta(X) \\cdot (T - E[T | X, W]) + \\epsilon + + and at predict time returns :math:`\\theta(X)`. The score method returns the MSE of this final + residual on residual regression. + """ + + def __init__(self, model_final): + self._model_final = clone(model_final, safe=False) + + def fit(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None): + Y_res, T_res = nuisances + self._model_final.fit(X, T_res, Y_res, sample_weight=sample_weight, sample_var=sample_var) + return self + + def predict(self, X=None): + return self._model_final.predict(X) + + def score(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None): + Y_res, T_res = nuisances + if Y_res.ndim == 1: + Y_res = Y_res.reshape((-1, 1)) + if T_res.ndim == 1: + T_res = T_res.reshape((-1, 1)) + effects = self._model_final.predict(X).reshape((-1, Y_res.shape[1], T_res.shape[1])) + Y_res_pred = np.einsum('ijk,ik->ij', effects, T_res).reshape(Y_res.shape) + if sample_weight is not None: + return np.mean(np.average((Y_res - Y_res_pred)**2, weights=sample_weight, axis=0)) + else: + return np.mean((Y_res - Y_res_pred) ** 2) + + class _RLearner(_OrthoLearner): """ Base class for CATE learners that residualize treatment and outcome and run residual on residual regression. @@ -89,6 +165,10 @@ class _RLearner(_OrthoLearner): discrete_treatment: bool Whether the treatment values should be treated as categorical, rather than continuous, quantities + categories: 'auto' or list + The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). + The first category will be treated as the control treatment. + n_splits: int, cross-validation generator or an iterable Determines the cross-validation splitting strategy. Possible inputs for cv are: @@ -146,7 +226,7 @@ def predict(self, X): est = _RLearner(ModelFirst(LinearRegression()), ModelFirst(LinearRegression()), ModelFinal(), - n_splits=2, discrete_treatment=False, random_state=None) + n_splits=2, discrete_treatment=False, categories='auto', random_state=None) est.fit(y, X[:, 0], X=np.ones((X.shape[0], 1)), W=X[:, 1:]) >>> est.const_marginal_effect(np.ones((1,1))) @@ -177,7 +257,11 @@ def predict(self, X): model_final : object of type(model_final) An instance of the model_final object that was fitted after calling fit. score_ : float - The MSE in the final residual on residual regression, i.e. + The MSE in the final residual on residual regression + nuisance_scores_y : list of float + The out-of-sample scores for each outcome model + nuisance_scores_t : list of float + The out-of-sample scores for each treatment model .. math:: \\frac{1}{n} \\sum_{i=1}^n (Y_i - \\hat{E}[Y|X_i, W_i]\ @@ -188,74 +272,13 @@ def predict(self, X): """ def __init__(self, model_y, model_t, model_final, - discrete_treatment, n_splits, random_state): - class ModelNuisance: - """ - Nuisance model fits the model_y and model_t at fit time and at predict time - calculates the residual Y and residual T based on the fitted models and returns - the residuals as two nuisance parameters. - """ - - def __init__(self, model_y, model_t): - self._model_y = clone(model_y, safe=False) - self._model_t = clone(model_t, safe=False) - - def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None): - assert Z is None, "Cannot accept instrument!" - self._model_t.fit(X, W, T, sample_weight=sample_weight) - self._model_y.fit(X, W, Y, sample_weight=sample_weight) - return self - - def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None): - Y_pred = self._model_y.predict(X, W) - T_pred = self._model_t.predict(X, W) - if (X is None) and (W is None): # In this case predict above returns a single row - Y_pred = np.tile(Y_pred.reshape(1, -1), (Y.shape[0], 1)) - T_pred = np.tile(T_pred.reshape(1, -1), (T.shape[0], 1)) - Y_res = Y - Y_pred.reshape(Y.shape) - T_res = T - T_pred.reshape(T.shape) - return Y_res, T_res - - class ModelFinal: - """ - Final model at fit time, fits a residual on residual regression with a heterogeneous coefficient - that depends on X, i.e. - - .. math :: - Y - E[Y | X, W] = \\theta(X) \\cdot (T - E[T | X, W]) + \\epsilon - - and at predict time returns :math:`\\theta(X)`. The score method returns the MSE of this final - residual on residual regression. - """ - - def __init__(self, model_final): - self._model_final = clone(model_final, safe=False) + discrete_treatment, categories, n_splits, random_state): - def fit(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None): - Y_res, T_res = nuisances - self._model_final.fit(X, T_res, Y_res, sample_weight=sample_weight, sample_var=sample_var) - return self - - def predict(self, X=None): - return self._model_final.predict(X) - - def score(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None): - Y_res, T_res = nuisances - if Y_res.ndim == 1: - Y_res = Y_res.reshape((-1, 1)) - if T_res.ndim == 1: - T_res = T_res.reshape((-1, 1)) - effects = self._model_final.predict(X).reshape((-1, Y_res.shape[1], T_res.shape[1])) - Y_res_pred = np.einsum('ijk,ik->ij', effects, T_res).reshape(Y_res.shape) - if sample_weight is not None: - return np.mean(np.average((Y_res - Y_res_pred)**2, weights=sample_weight, axis=0)) - else: - return np.mean((Y_res - Y_res_pred)**2) - - super().__init__(ModelNuisance(model_y, model_t), - ModelFinal(model_final), + super().__init__(_ModelNuisance(model_y, model_t), + _ModelFinal(model_final), discrete_treatment=discrete_treatment, discrete_instrument=False, # no instrument, so doesn't matter + categories=categories, n_splits=n_splits, random_state=random_state) @@ -327,3 +350,11 @@ def models_y(self): @property def models_t(self): return [mdl._model_t for mdl in super().models_nuisance] + + @property + def nuisance_scores_y(self): + return self.nuisance_scores_[0] + + @property + def nuisance_scores_t(self): + return self.nuisance_scores_[1] diff --git a/econml/dml.py b/econml/dml.py index 4f7fe54b0..acf6d585f 100644 --- a/econml/dml.py +++ b/econml/dml.py @@ -37,7 +37,7 @@ import copy from warnings import warn from .utilities import (shape, reshape, ndim, hstack, cross_product, transpose, inverse_onehot, - broadcast_unit_treatments, reshape_treatmentwise_effects, + broadcast_unit_treatments, reshape_treatmentwise_effects, add_intercept, StatsModelsLinearRegression, LassoCVWrapper, check_high_dimensional) from econml.sklearn_extensions.linear_model import MultiOutputDebiasedLasso, WeightedLassoCVWrapper from econml.sklearn_extensions.ensemble import SubsampledHonestForest @@ -100,6 +100,20 @@ def predict(self, X, W): else: return self._model.predict(self._combine(X, W, n_samples, fitting=False)) + def score(self, X, W, Target, sample_weight=None): + if hasattr(self._model, 'score'): + if (not self._is_Y) and self._discrete_treatment: + # In this case, the Target is the one-hot-encoding of the treatment variable + # We need to go back to the label representation of the one-hot so as to call + # the classifier. + Target = inverse_onehot(Target) + if sample_weight is not None: + return self._model.score(self._combine(X, W, Target.shape[0]), Target, sample_weight=sample_weight) + else: + return self._model.score(self._combine(X, W, Target.shape[0]), Target) + else: + return None + class _FinalWrapper: def __init__(self, model_final, fit_cate_intercept, featurizer, use_weight_trick): @@ -112,14 +126,13 @@ def __init__(self, model_final, fit_cate_intercept, featurizer, use_weight_trick else: self._fit_cate_intercept = fit_cate_intercept if self._fit_cate_intercept: - add_intercept = FunctionTransformer(lambda F: - hstack([np.ones((F.shape[0], 1)), F]), - validate=True) + add_intercept_trans = FunctionTransformer(add_intercept, + validate=True) if featurizer: self._featurizer = Pipeline([('featurize', self._original_featurizer), - ('add_intercept', add_intercept)]) + ('add_intercept', add_intercept_trans)]) else: - self._featurizer = add_intercept + self._featurizer = add_intercept_trans else: self._featurizer = self._original_featurizer @@ -374,6 +387,10 @@ class takes as input the parameter `model_t`, which is an arbitrary scikit-learn discrete_treatment: bool, optional, default False Whether the treatment values should be treated as categorical, rather than continuous, quantities + categories: 'auto' or list, default 'auto' + The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). + The first category will be treated as the control treatment. + n_splits: int, cross-validation generator or an iterable, optional, default 2 Determines the cross-validation splitting strategy. Possible inputs for cv are: @@ -404,6 +421,7 @@ def __init__(self, fit_cate_intercept=True, linear_first_stages=False, discrete_treatment=False, + categories='auto', n_splits=2, random_state=None): @@ -422,6 +440,7 @@ def __init__(self, featurizer, linear_first_stages, discrete_treatment), model_final=_FinalWrapper(model_final, fit_cate_intercept, featurizer, False), discrete_treatment=discrete_treatment, + categories=categories, n_splits=n_splits, random_state=random_state) @@ -458,6 +477,10 @@ class LinearDMLCateEstimator(StatsModelsCateEstimatorMixin, DMLCateEstimator): discrete_treatment: bool, optional (default is ``False``) Whether the treatment values should be treated as categorical, rather than continuous, quantities + categories: 'auto' or list, default 'auto' + The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). + The first category will be treated as the control treatment. + n_splits: int, cross-validation generator or an iterable, optional (Default=2) Determines the cross-validation splitting strategy. Possible inputs for cv are: @@ -488,6 +511,7 @@ def __init__(self, fit_cate_intercept=True, linear_first_stages=True, discrete_treatment=False, + categories='auto', n_splits=2, random_state=None): super().__init__(model_y=model_y, @@ -497,6 +521,7 @@ def __init__(self, fit_cate_intercept=fit_cate_intercept, linear_first_stages=linear_first_stages, discrete_treatment=discrete_treatment, + categories=categories, n_splits=n_splits, random_state=random_state) @@ -584,6 +609,10 @@ class SparseLinearDMLCateEstimator(DebiasedLassoCateEstimatorMixin, DMLCateEstim discrete_treatment: bool, optional (default is ``False``) Whether the treatment values should be treated as categorical, rather than continuous, quantities + categories: 'auto' or list, default 'auto' + The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). + The first category will be treated as the control treatment. + n_splits: int, cross-validation generator or an iterable, optional (Default=2) Determines the cross-validation splitting strategy. Possible inputs for cv are: @@ -616,6 +645,7 @@ def __init__(self, fit_cate_intercept=True, linear_first_stages=True, discrete_treatment=False, + categories='auto', n_splits=2, random_state=None): model_final = MultiOutputDebiasedLasso( @@ -630,6 +660,7 @@ def __init__(self, fit_cate_intercept=fit_cate_intercept, linear_first_stages=linear_first_stages, discrete_treatment=discrete_treatment, + categories=categories, n_splits=n_splits, random_state=random_state) @@ -672,6 +703,21 @@ def fit(self, Y, T, X=None, W=None, sample_weight=None, sample_var=None, inferen return super().fit(Y, T, X=X, W=W, sample_weight=sample_weight, sample_var=None, inference=inference) +class _RandomFeatures(TransformerMixin): + def __init__(self, dim, bw, random_state): + self._dim = dim + self._bw = bw + self._random_state = check_random_state(random_state) + + def fit(self, X): + self.omegas = self._random_state.normal(0, 1 / self._bw, size=(shape(X)[1], self._dim)) + self.biases = self._random_state.uniform(0, 2 * np.pi, size=(1, self._dim)) + return self + + def transform(self, X): + return np.sqrt(2 / self._dim) * np.cos(np.matmul(X, self.omegas) + self.biases) + + class KernelDMLCateEstimator(DMLCateEstimator): """ A specialized version of the linear Double ML Estimator that uses random fourier features. @@ -703,6 +749,10 @@ class KernelDMLCateEstimator(DMLCateEstimator): discrete_treatment: bool, optional (default is ``False``) Whether the treatment values should be treated as categorical, rather than continuous, quantities + categories: 'auto' or list, default 'auto' + The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). + The first category will be treated as the control treatment. + n_splits: int, cross-validation generator or an iterable, optional (Default=2) Determines the cross-validation splitting strategy. Possible inputs for cv are: @@ -727,24 +777,14 @@ class KernelDMLCateEstimator(DMLCateEstimator): """ def __init__(self, model_y=WeightedLassoCVWrapper(), model_t='auto', fit_cate_intercept=True, - dim=20, bw=1.0, discrete_treatment=False, n_splits=2, random_state=None): - class RandomFeatures(TransformerMixin): - def __init__(self, random_state): - self._random_state = check_random_state(random_state) - - def fit(self, X): - self.omegas = self._random_state.normal(0, 1 / bw, size=(shape(X)[1], dim)) - self.biases = self._random_state.uniform(0, 2 * np.pi, size=(1, dim)) - return self - - def transform(self, X): - return np.sqrt(2 / dim) * np.cos(np.matmul(X, self.omegas) + self.biases) - + dim=20, bw=1.0, discrete_treatment=False, categories='auto', n_splits=2, random_state=None): super().__init__(model_y=model_y, model_t=model_t, model_final=ElasticNetCV(fit_intercept=False), - featurizer=RandomFeatures(random_state), + featurizer=_RandomFeatures(dim, bw, random_state), fit_cate_intercept=fit_cate_intercept, - discrete_treatment=discrete_treatment, n_splits=n_splits, random_state=random_state) + discrete_treatment=discrete_treatment, + categories=categories, + n_splits=n_splits, random_state=random_state) class NonParamDMLCateEstimator(_BaseDMLCateEstimator): @@ -776,6 +816,10 @@ class NonParamDMLCateEstimator(_BaseDMLCateEstimator): discrete_treatment: bool, optional (default is ``False``) Whether the treatment values should be treated as categorical, rather than continuous, quantities + categories: 'auto' or list, default 'auto' + The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). + The first category will be treated as the control treatment. + n_splits: int, cross-validation generator or an iterable, optional (Default=2) Determines the cross-validation splitting strategy. Possible inputs for cv are: @@ -804,6 +848,7 @@ def __init__(self, model_y, model_t, model_final, featurizer=None, discrete_treatment=False, + categories='auto', n_splits=2, random_state=None): @@ -816,6 +861,7 @@ def __init__(self, featurizer, False, discrete_treatment), model_final=_FinalWrapper(model_final, False, featurizer, True), discrete_treatment=discrete_treatment, + categories=categories, n_splits=n_splits, random_state=random_state) @@ -838,6 +884,10 @@ class ForestDMLCateEstimator(NonParamDMLCateEstimator): discrete_treatment: bool, optional (default is ``False``) Whether the treatment values should be treated as categorical, rather than continuous, quantities + categories: 'auto' or list, default 'auto' + The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). + The first category will be treated as the control treatment. + n_crossfit_splits: int, cross-validation generator or an iterable, optional (Default=2) Determines the cross-validation splitting strategy. Possible inputs for cv are: @@ -972,6 +1022,7 @@ class ForestDMLCateEstimator(NonParamDMLCateEstimator): def __init__(self, model_y, model_t, discrete_treatment=False, + categories='auto', n_crossfit_splits=2, n_estimators=100, criterion="mse", @@ -1004,6 +1055,7 @@ def __init__(self, super().__init__(model_y=model_y, model_t=model_t, model_final=model_final, featurizer=None, discrete_treatment=discrete_treatment, + categories=categories, n_splits=n_crossfit_splits, random_state=random_state) def _get_inference_options(self): diff --git a/econml/drlearner.py b/econml/drlearner.py index 1223ead3b..bc4136091 100644 --- a/econml/drlearner.py +++ b/econml/drlearner.py @@ -48,6 +48,116 @@ def _filter_none_kwargs(**kwargs): return out_kwargs +class _ModelNuisance: + def __init__(self, model_propensity, model_regression, min_propensity): + self._model_propensity = model_propensity + self._model_regression = model_regression + self._min_propensity = min_propensity + + def _combine(self, X, W): + return np.hstack([arr for arr in [X, W] if arr is not None]) + + def fit(self, Y, T, X=None, W=None, *, sample_weight=None): + if Y.ndim != 1 and (Y.ndim != 2 or Y.shape[1] != 1): + raise ValueError("The outcome matrix must be of shape ({0}, ) or ({0}, 1), " + "instead got {1}.".format(len(X), Y.shape)) + if (X is None) and (W is None): + raise AttributeError("At least one of X or W has to not be None!") + if np.any(np.all(T == 0, axis=0)) or (not np.any(np.all(T == 0, axis=1))): + raise AttributeError("Provided crossfit folds contain training splits that " + + "don't contain all treatments") + XW = self._combine(X, W) + filtered_kwargs = _filter_none_kwargs(sample_weight=sample_weight) + self._model_propensity.fit(XW, inverse_onehot(T), **filtered_kwargs) + self._model_regression.fit(np.hstack([XW, T]), Y, **filtered_kwargs) + return self + + def score(self, Y, T, X=None, W=None, *, sample_weight=None): + XW = self._combine(X, W) + filtered_kwargs = _filter_none_kwargs(sample_weight=sample_weight) + + if hasattr(self._model_propensity, 'score'): + propensity_score = self._model_propensity.score(XW, inverse_onehot(T), **filtered_kwargs) + else: + propensity_score = None + if hasattr(self._model_regression, 'score'): + regression_score = self._model_regression.score(np.hstack([XW, T]), Y, **filtered_kwargs) + else: + regression_score = None + + return propensity_score, regression_score + + def predict(self, Y, T, X=None, W=None, *, sample_weight=None): + XW = self._combine(X, W) + propensities = np.maximum(self._model_propensity.predict_proba(XW), self._min_propensity) + n = T.shape[0] + Y_pred = np.zeros((T.shape[0], T.shape[1] + 1)) + T_counter = np.zeros(T.shape) + Y_pred[:, 0] = self._model_regression.predict(np.hstack([XW, T_counter])).reshape(n) + Y_pred[:, 0] += (Y.reshape(n) - Y_pred[:, 0]) * np.all(T == 0, axis=1) / propensities[:, 0] + for t in np.arange(T.shape[1]): + T_counter = np.zeros(T.shape) + T_counter[:, t] = 1 + Y_pred[:, t + 1] = self._model_regression.predict(np.hstack([XW, T_counter])).reshape(n) + Y_pred[:, t + 1] += (Y.reshape(n) - Y_pred[:, t + 1]) * (T[:, t] == 1) / propensities[:, t + 1] + return Y_pred.reshape(Y.shape + (T.shape[1] + 1,)) + + +class _ModelFinal: + # Coding Remark: The reasoning around the multitask_model_final could have been simplified if + # we simply wrapped the model_final with a MultiOutputRegressor. However, because we also want + # to allow even for model_final objects whose fit(X, y) can accept X=None + # (e.g. the StatsModelsLinearRegression), we cannot take that route, because the MultiOutputRegressor + # checks that X is 2D array. + def __init__(self, model_final, featurizer, multitask_model_final): + self._model_final = clone(model_final, safe=False) + self._featurizer = clone(featurizer, safe=False) + self._multitask_model_final = multitask_model_final + return + + def fit(self, Y, T, X=None, W=None, *, nuisances, sample_weight=None, sample_var=None): + Y_pred, = nuisances + self.d_y = Y_pred.shape[1:-1] # track whether there's a Y dimension (must be a singleton) + if (X is not None) and (self._featurizer is not None): + X = self._featurizer.fit_transform(X) + filtered_kwargs = _filter_none_kwargs(sample_weight=sample_weight, sample_var=sample_var) + if self._multitask_model_final: + ys = Y_pred[..., 1:] - Y_pred[..., [0]] # subtract control results from each other arm + if self.d_y: # need to squeeze out singleton so that we fit on 2D array + ys = ys.squeeze(1) + self.model_cate = self._model_final.fit(X, ys, **filtered_kwargs) + else: + self.models_cate = [clone(self._model_final, safe=False).fit(X, Y_pred[..., t] - Y_pred[..., 0], + **filtered_kwargs) + for t in np.arange(1, Y_pred.shape[-1])] + return self + + def predict(self, X=None): + if (X is not None) and (self._featurizer is not None): + X = self._featurizer.transform(X) + if self._multitask_model_final: + pred = self.model_cate.predict(X) + if self.d_y: # need to reintroduce singleton Y dimension + return pred[:, np.newaxis, :] + return pred + else: + preds = np.array([mdl.predict(X) for mdl in self.models_cate]) + return np.moveaxis(preds, 0, -1) # move treatment dim to end + + def score(self, Y, T, X=None, W=None, *, nuisances, sample_weight=None, sample_var=None): + if (X is not None) and (self._featurizer is not None): + X = self._featurizer.transform(X) + Y_pred, = nuisances + if self._multitask_model_final: + return np.mean(np.average((Y_pred[..., 1:] - Y_pred[..., [0]] - self.model_cate.predict(X))**2, + weights=sample_weight, axis=0)) + else: + return np.mean([np.average((Y_pred[..., t] - Y_pred[..., 0] - + self.models_cate[t - 1].predict(X))**2, + weights=sample_weight, axis=0) + for t in np.arange(1, Y_pred.shape[-1])]) + + class DRLearner(_OrthoLearner): """ CATE estimator that uses doubly-robust correction techniques to account for @@ -126,6 +236,10 @@ class takes as input the parameter ``model_regressor``, which is an arbitrary sc min_propensity : float, optional, default ``1e-6`` The minimum propensity at which to clip propensity estimates to avoid dividing by zero. + categories: 'auto' or list, default 'auto' + The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). + The first category will be treated as the control treatment. + n_splits: int, cross-validation generator or an iterable, optional (default is 2) Determines the cross-validation splitting strategy. Possible inputs for cv are: @@ -250,105 +364,15 @@ def __init__(self, model_propensity=LogisticRegressionCV(cv=3, solver='lbfgs', m multitask_model_final=False, featurizer=None, min_propensity=1e-6, + categories='auto', n_splits=2, random_state=None): - class ModelNuisance: - def __init__(self, model_propensity, model_regression): - self._model_propensity = model_propensity - self._model_regression = model_regression - - def _combine(self, X, W): - return np.hstack([arr for arr in [X, W] if arr is not None]) - - def fit(self, Y, T, X=None, W=None, *, sample_weight=None): - if Y.ndim != 1 and (Y.ndim != 2 or Y.shape[1] != 1): - raise ValueError("The outcome matrix must be of shape ({0}, ) or ({0}, 1), " - "instead got {1}.".format(len(X), Y.shape)) - if (X is None) and (W is None): - raise AttributeError("At least one of X or W has to not be None!") - if np.any(np.all(T == 0, axis=0)) or (not np.any(np.all(T == 0, axis=1))): - raise AttributeError("Provided crossfit folds contain training splits that " + - "don't contain all treatments") - XW = self._combine(X, W) - filtered_kwargs = _filter_none_kwargs(sample_weight=sample_weight) - self._model_propensity.fit(XW, inverse_onehot(T), **filtered_kwargs) - self._model_regression.fit(np.hstack([XW, T]), Y, **filtered_kwargs) - return self - - def predict(self, Y, T, X=None, W=None, *, sample_weight=None): - XW = self._combine(X, W) - propensities = np.maximum(self._model_propensity.predict_proba(XW), min_propensity) - n = T.shape[0] - Y_pred = np.zeros((T.shape[0], T.shape[1] + 1)) - T_counter = np.zeros(T.shape) - Y_pred[:, 0] = self._model_regression.predict(np.hstack([XW, T_counter])).reshape(n) - Y_pred[:, 0] += (Y.reshape(n) - Y_pred[:, 0]) * np.all(T == 0, axis=1) / propensities[:, 0] - for t in np.arange(T.shape[1]): - T_counter = np.zeros(T.shape) - T_counter[:, t] = 1 - Y_pred[:, t + 1] = self._model_regression.predict(np.hstack([XW, T_counter])).reshape(n) - Y_pred[:, t + 1] += (Y.reshape(n) - Y_pred[:, t + 1]) * (T[:, t] == 1) / propensities[:, t + 1] - return Y_pred.reshape(Y.shape + (T.shape[1] + 1,)) - - class ModelFinal: - # Coding Remark: The reasoning around the multitask_model_final could have been simplified if - # we simply wrapped the model_final with a MultiOutputRegressor. However, because we also want - # to allow even for model_final objects whose fit(X, y) can accept X=None - # (e.g. the StatsModelsLinearRegression), we cannot take that route, because the MultiOutputRegressor - # checks that X is 2D array. - def __init__(self, model_final, featurizer, multitask_model_final): - self._model_final = clone(model_final, safe=False) - self._featurizer = clone(featurizer, safe=False) - self._multitask_model_final = multitask_model_final - return - - def fit(self, Y, T, X=None, W=None, *, nuisances, sample_weight=None, sample_var=None): - Y_pred, = nuisances - self.d_y = Y_pred.shape[1:-1] # track whether there's a Y dimension (must be a singleton) - if (X is not None) and (self._featurizer is not None): - X = self._featurizer.fit_transform(X) - filtered_kwargs = _filter_none_kwargs(sample_weight=sample_weight, sample_var=sample_var) - if self._multitask_model_final: - ys = Y_pred[..., 1:] - Y_pred[..., [0]] # subtract control results from each other arm - if self.d_y: # need to squeeze out singleton so that we fit on 2D array - ys = ys.squeeze(1) - self.model_cate = self._model_final.fit(X, ys, **filtered_kwargs) - else: - self.models_cate = [clone(self._model_final, safe=False).fit(X, Y_pred[..., t] - Y_pred[..., 0], - **filtered_kwargs) - for t in np.arange(1, Y_pred.shape[-1])] - return self - - def predict(self, X=None): - if (X is not None) and (self._featurizer is not None): - X = self._featurizer.transform(X) - if self._multitask_model_final: - pred = self.model_cate.predict(X) - if self.d_y: # need to reintroduce singleton Y dimension - return pred[:, np.newaxis, :] - return pred - else: - preds = np.array([mdl.predict(X) for mdl in self.models_cate]) - return np.moveaxis(preds, 0, -1) # move treatment dim to end - - def score(self, Y, T, X=None, W=None, *, nuisances, sample_weight=None, sample_var=None): - if (X is not None) and (self._featurizer is not None): - X = self._featurizer.transform(X) - Y_pred, = nuisances - if self._multitask_model_final: - return np.mean(np.average((Y_pred[..., 1:] - Y_pred[..., [0]] - self.model_cate.predict(X))**2, - weights=sample_weight, axis=0)) - else: - return np.mean([np.average((Y_pred[..., t] - Y_pred[..., 0] - - self.models_cate[t - 1].predict(X))**2, - weights=sample_weight, axis=0) - for t in np.arange(1, Y_pred.shape[-1])]) - self._multitask_model_final = multitask_model_final - super().__init__(ModelNuisance(model_propensity, model_regression), - ModelFinal(model_final, featurizer, multitask_model_final), + super().__init__(_ModelNuisance(model_propensity, model_regression, min_propensity), + _ModelFinal(model_final, featurizer, multitask_model_final), n_splits=n_splits, discrete_treatment=True, discrete_instrument=False, # no instrument, so doesn't matter + categories=categories, random_state=random_state) def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, inference=None): @@ -472,6 +496,16 @@ def models_regression(self): """ return [mdl._model_regression for mdl in super().models_nuisance] + @property + def nuisance_scores_propensity(self): + """Gets the score for the propensity model on out-of-sample training data""" + return self.nuisance_scores_[0] + + @property + def nuisance_scores_regression(self): + """Gets the score for the regression model on out-of-sample training data""" + return self.nuisance_scores_[1] + @property def featurizer(self): """ @@ -561,10 +595,13 @@ class LinearDRLearner(StatsModelsCateEstimatorDiscreteMixin, DRLearner): fit_cate_intercept : bool, optional, default True Whether the linear CATE model should have a constant term. - min_propensity : float, optional, default ``1e-6`` The minimum propensity at which to clip propensity estimates to avoid dividing by zero. + categories: 'auto' or list, default 'auto' + The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). + The first category will be treated as the control treatment. + n_splits: int, cross-validation generator or an iterable, optional (default is 2) Determines the cross-validation splitting strategy. Possible inputs for cv are: @@ -637,6 +674,7 @@ def __init__(self, featurizer=None, fit_cate_intercept=True, min_propensity=1e-6, + categories='auto', n_splits=2, random_state=None): self.fit_cate_intercept = fit_cate_intercept super().__init__(model_propensity=model_propensity, @@ -645,6 +683,7 @@ def __init__(self, featurizer=featurizer, multitask_model_final=False, min_propensity=min_propensity, + categories=categories, n_splits=n_splits, random_state=random_state) @@ -760,6 +799,10 @@ class SparseLinearDRLearner(DebiasedLassoCateEstimatorDiscreteMixin, DRLearner): min_propensity : float, optional, default ``1e-6`` The minimum propensity at which to clip propensity estimates to avoid dividing by zero. + categories: 'auto' or list, default 'auto' + The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). + The first category will be treated as the control treatment. + n_splits: int, cross-validation generator or an iterable, optional, default 2 Determines the cross-validation splitting strategy. Possible inputs for cv are: @@ -835,6 +878,7 @@ def __init__(self, max_iter=1000, tol=1e-4, min_propensity=1e-6, + categories='auto', n_splits=2, random_state=None): self.fit_cate_intercept = fit_cate_intercept model_final = DebiasedLasso( @@ -848,6 +892,7 @@ def __init__(self, featurizer=featurizer, multitask_model_final=False, min_propensity=min_propensity, + categories=categories, n_splits=n_splits, random_state=random_state) @@ -924,6 +969,10 @@ class ForestDRLearner(DRLearner): min_propensity : float, optional, default ``1e-6`` The minimum propensity at which to clip propensity estimates to avoid dividing by zero. + categories: 'auto' or list, default 'auto' + The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). + The first category will be treated as the control treatment. + n_crossfit_splits: int, cross-validation generator or an iterable, optional (Default=2) Determines the cross-validation splitting strategy. Possible inputs for cv are: @@ -1058,6 +1107,7 @@ class ForestDRLearner(DRLearner): def __init__(self, model_regression, model_propensity, min_propensity=1e-6, + categories='auto', n_crossfit_splits=2, n_estimators=1000, criterion="mse", @@ -1091,6 +1141,7 @@ def __init__(self, model_final=model_final, featurizer=None, multitask_model_final=False, min_propensity=min_propensity, + categories=categories, n_splits=n_crossfit_splits, random_state=random_state) def _get_inference_options(self): diff --git a/econml/metalearners.py b/econml/metalearners.py index db5e8c268..31a2090d9 100644 --- a/econml/metalearners.py +++ b/econml/metalearners.py @@ -14,8 +14,9 @@ from sklearn.linear_model import LogisticRegression from sklearn.pipeline import Pipeline from sklearn.utils import check_array, check_X_y -from sklearn.preprocessing import OneHotEncoder, LabelEncoder, FunctionTransformer -from .utilities import check_inputs, check_models, broadcast_unit_treatments, reshape_treatmentwise_effects, transpose +from sklearn.preprocessing import OneHotEncoder, FunctionTransformer +from .utilities import (check_inputs, check_models, broadcast_unit_treatments, reshape_treatmentwise_effects, + inverse_onehot, transpose, _EncoderWrapper) class TLearner(TreatmentExpansionMixin, LinearCateEstimator): @@ -28,16 +29,18 @@ class TLearner(TreatmentExpansionMixin, LinearCateEstimator): estimators with one estimator per treatment (including control). Must implement `fit` and `predict` methods. + categories: 'auto' or list, default 'auto' + The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). + The first category will be treated as the control treatment. """ - def __init__(self, models): + def __init__(self, models, categories='auto'): self.models = clone(models, safe=False) - self._label_encoder = LabelEncoder() - self._one_hot_encoder = OneHotEncoder(categories='auto', sparse=False) + if categories != 'auto': + categories = [categories] # OneHotEncoder expects a 2D array with features per column + self._one_hot_encoder = OneHotEncoder(categories=categories, sparse=False, drop='first') self.transformer = FunctionTransformer( - func=(lambda T: - self._one_hot_encoder.transform( - self._label_encoder.transform(T).reshape(-1, 1))[:, 1:]), + func=_EncoderWrapper(self._one_hot_encoder).encode, validate=False) super().__init__() @@ -68,9 +71,9 @@ def fit(self, Y, T, X, *, inference=None): """ # Check inputs Y, T, X, _ = check_inputs(Y, T, X, multi_output_T=False) - T = self._label_encoder.fit_transform(T) - self._one_hot_encoder.fit(T.reshape(-1, 1)) - self._d_t = (len(self._label_encoder.classes_) - 1,) + T = self._one_hot_encoder.fit_transform(T.reshape(-1, 1)) + self._d_t = T.shape[1:] + T = inverse_onehot(T) self.models = check_models(self.models, self._d_t[0] + 1) for ind in range(self._d_t[0] + 1): @@ -111,16 +114,21 @@ class SLearner(TreatmentExpansionMixin, LinearCateEstimator): Model will be trained on X|T where '|' denotes concatenation. Must implement `fit` and `predict` methods. + categories: 'auto' or list, default 'auto' + The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). + The first category will be treated as the control treatment. """ - def __init__(self, overall_model): + def __init__(self, overall_model, categories='auto'): self.overall_model = clone(overall_model, safe=False) - self._label_encoder = LabelEncoder() - self._one_hot_encoder = OneHotEncoder(categories='auto', sparse=False) + if categories != 'auto': + categories = [categories] # OneHotEncoder expects a 2D array with features per column + # Note: unlike other Metalearners, we don't drop the first column because + # we concatenate all treatments to the other features; + # We might want to revisit, though, since it's linearly determined by the others + self._one_hot_encoder = OneHotEncoder(categories=categories, sparse=False) self.transformer = FunctionTransformer( - func=(lambda T: - self._one_hot_encoder.transform( - self._label_encoder.transform(T).reshape(-1, 1))[:, 1:]), + func=_EncoderWrapper(self._one_hot_encoder, drop_first=True).encode, validate=False) super().__init__() @@ -152,7 +160,6 @@ def fit(self, Y, T, X=None, *, inference=None): if X is None: X = np.zeros((Y.shape[0], 1)) Y, T, X, _ = check_inputs(Y, T, X, multi_output_T=False) - T = self._label_encoder.fit_transform(T) T = self._one_hot_encoder.fit_transform(T.reshape(-1, 1)) self._d_t = (T.shape[1] - 1,) feat_arr = np.concatenate((X, T), axis=1) @@ -208,20 +215,24 @@ class XLearner(TreatmentExpansionMixin, LinearCateEstimator): propensity_model : estimator for the propensity function Must implement `fit` and `predict_proba` methods. The `fit` method must be able to accept X and T, where T is a shape (n, ) array. + + categories: 'auto' or list, default 'auto' + The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). + The first category will be treated as the control treatment. """ def __init__(self, models, cate_models=None, - propensity_model=LogisticRegression()): + propensity_model=LogisticRegression(), + categories='auto'): self.models = clone(models, safe=False) self.cate_models = clone(cate_models, safe=False) self.propensity_model = clone(propensity_model, safe=False) - self._label_encoder = LabelEncoder() - self._one_hot_encoder = OneHotEncoder(categories='auto', sparse=False) + if categories != 'auto': + categories = [categories] # OneHotEncoder expects a 2D array with features per column + self._one_hot_encoder = OneHotEncoder(categories=categories, sparse=False, drop='first') self.transformer = FunctionTransformer( - func=(lambda T: - self._one_hot_encoder.transform( - self._label_encoder.transform(T).reshape(-1, 1))[:, 1:]), + func=_EncoderWrapper(self._one_hot_encoder).encode, validate=False) super().__init__() @@ -251,9 +262,9 @@ def fit(self, Y, T, X, *, inference=None): """ # Check inputs Y, T, X, _ = check_inputs(Y, T, X, multi_output_T=False) - T = self._label_encoder.fit_transform(T) - self._one_hot_encoder.fit(T.reshape(-1, 1)) - self._d_t = (len(self._label_encoder.classes_) - 1,) + T = self._one_hot_encoder.fit_transform(T.reshape(-1, 1)) + self._d_t = T.shape[1:] + T = inverse_onehot(T) self.models = check_models(self.models, self._d_t[0] + 1) if self.cate_models is None: self.cate_models = [clone(model, safe=False) for model in self.models] @@ -328,20 +339,23 @@ class DomainAdaptationLearner(TreatmentExpansionMixin, LinearCateEstimator): Must implement `fit` and `predict_proba` methods. The `fit` method must be able to accept X and T, where T is a shape (n, 1) array. + categories: 'auto' or list, default 'auto' + The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). + The first category will be treated as the control treatment. """ def __init__(self, models, final_models, - propensity_model=LogisticRegression()): + propensity_model=LogisticRegression(), + categories='auto'): self.models = clone(models, safe=False) self.final_models = clone(final_models, safe=False) self.propensity_model = clone(propensity_model, safe=False) - self._label_encoder = LabelEncoder() - self._one_hot_encoder = OneHotEncoder(categories='auto', sparse=False) + if categories != 'auto': + categories = [categories] # OneHotEncoder expects a 2D array with features per column + self._one_hot_encoder = OneHotEncoder(categories=categories, sparse=False, drop='first') self.transformer = FunctionTransformer( - func=(lambda T: - self._one_hot_encoder.transform( - self._label_encoder.transform(T).reshape(-1, 1))[:, 1:]), + func=_EncoderWrapper(self._one_hot_encoder).encode, validate=False) super().__init__() @@ -371,9 +385,9 @@ def fit(self, Y, T, X, *, inference=None): """ # Check inputs Y, T, X, _ = check_inputs(Y, T, X, multi_output_T=False) - T = self._label_encoder.fit_transform(T) - self._one_hot_encoder.fit(T.reshape(-1, 1)) - self._d_t = (len(self._label_encoder.classes_) - 1,) + T = self._one_hot_encoder.fit_transform(T.reshape(-1, 1)) + self._d_t = T.shape[1:] + T = inverse_onehot(T) self.models = check_models(self.models, self._d_t[0] + 1) self.final_models = check_models(self.final_models, self._d_t[0]) self.propensity_models = [] diff --git a/econml/ortho_forest.py b/econml/ortho_forest.py index 378763237..e319c9f83 100644 --- a/econml/ortho_forest.py +++ b/econml/ortho_forest.py @@ -38,7 +38,8 @@ from .cate_estimator import BaseCateEstimator, LinearCateEstimator, TreatmentExpansionMixin from .causal_tree import CausalTree from .inference import Inference -from .utilities import reshape, reshape_Y_T, MAX_RAND_SEED, check_inputs, cross_product +from .utilities import (reshape, reshape_Y_T, MAX_RAND_SEED, check_inputs, + cross_product, inverse_onehot, _EncoderWrapper) def _build_tree_in_parallel(Y, T, X, W, @@ -687,6 +688,10 @@ class DiscreteTreatmentOrthoForest(BaseOrthoForest): helper class. The model(s) must implement `fit` and `predict` methods. If parameter is set to ``None``, it defaults to the value of `model_Y` parameter. + categories: 'auto' or list + The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). + The first category will be treated as the control treatment. + n_jobs : int, optional (default=-1) The number of jobs to run in parallel for both :meth:`fit` and :meth:`effect`. ``-1`` means using all processors. Since OrthoForest methods are @@ -712,6 +717,7 @@ def __init__(self, model_Y=WeightedLassoCVWrapper(cv=3), propensity_model_final=None, model_Y_final=None, + categories='auto', n_jobs=-1, random_state=None): # Copy and/or define models @@ -723,6 +729,7 @@ def __init__(self, self.propensity_model_final = clone(self.propensity_model, safe=False) if self.model_Y_final is None: self.model_Y_final = clone(self.model_Y, safe=False) + # Nuisance estimators shall be defined during fitting because they need to know the number of distinct # treatments nuisance_estimator = None @@ -734,8 +741,10 @@ def __init__(self, # Define moment and mean gradient estimator moment_and_mean_gradient_estimator =\ DiscreteTreatmentOrthoForest.moment_and_mean_gradient_estimator_func - # Define autoencoder - self._label_encoder = LabelEncoder() + if categories != 'auto': + categories = [categories] # OneHotEncoder expects a 2D array with features per column + self._one_hot_encoder = OneHotEncoder(categories=categories, sparse=False, drop='first') + super(DiscreteTreatmentOrthoForest, self).__init__( nuisance_estimator, second_stage_nuisance_estimator, @@ -781,18 +790,16 @@ def fit(self, Y, T, X, W=None, inference=None): # Check T is numeric T = self._check_treatment(T) # Train label encoder - T = self._label_encoder.fit_transform(T) - self._one_hot_encoder = OneHotEncoder(sparse=False, categories='auto').fit(T.reshape(-1, 1)) + T = self._one_hot_encoder.fit_transform(T.reshape(-1, 1)) # Define number of classes - self.n_T = self._label_encoder.classes_.shape[0] + self.n_T = T.shape[1] + 1 # add one to compensate for dropped first + T = inverse_onehot(T) self.nuisance_estimator = DiscreteTreatmentOrthoForest.nuisance_estimator_generator( self.propensity_model, self.model_Y, self.n_T, self.random_state, second_stage=False) self.second_stage_nuisance_estimator = DiscreteTreatmentOrthoForest.nuisance_estimator_generator( self.propensity_model_final, self.model_Y_final, self.n_T, self.random_state, second_stage=True) self.transformer = FunctionTransformer( - func=(lambda T: - self._one_hot_encoder.transform( - reshape(self._label_encoder.transform(T.ravel()), (-1, 1)))[:, 1:]), + func=_EncoderWrapper(self._one_hot_encoder).encode, validate=False) # Call `fit` from parent class return super().fit(Y, T, X, W=W, inference=inference) diff --git a/econml/ortho_iv.py b/econml/ortho_iv.py index ee923322b..7f38b93f4 100644 --- a/econml/ortho_iv.py +++ b/econml/ortho_iv.py @@ -21,32 +21,38 @@ from ._ortho_learner import _OrthoLearner from .dml import _FinalWrapper -from .utilities import (hstack, StatsModelsLinearRegression, inverse_onehot) +from .utilities import (hstack, StatsModelsLinearRegression, inverse_onehot, add_intercept) from .inference import StatsModelsInference from .cate_estimator import StatsModelsCateEstimatorMixin -# A cut-down version of the DML first stage wrapper, since we don't need to support W or linear first stages +# A cut-down version of the DML first stage wrapper, since we don't need to support linear first stages class _FirstStageWrapper: def __init__(self, model, discrete_target): self._model = clone(model, safe=False) self._discrete_target = discrete_target - def _combine(self, X, Z, n_samples, fitting=True): + def _combine(self, X, W, Z, n_samples, fitting=True): + # output is + # * a column of ones if X, W, and Z are all None + # * just X or W or Z if both of the others are None + # * hstack([arrs]) for whatever subset are not None otherwise + + # ensure Z is 2D if Z is not None: Z = Z.reshape(n_samples, -1) - if X is None: - # if both X and Z are None, just return a column of ones - return (Z if Z is not None else np.ones((n_samples, 1))) - XZ = hstack([X, Z]) if Z is not None else X - return XZ - - def fit(self, X, *args, sample_weight=None): - if len(args) == 1: - Target, = args - Z = None + + if X is None and W is None and Z is None: + return np.ones((n_samples, 1)) + + arrs = [arr for arr in [X, W, Z] if arr is not None] + + if len(arrs) == 1: + return arrs[0] else: - (Z, Target) = args + return hstack(arrs) + + def fit(self, *, X, W, Target, Z=None, sample_weight=None): if self._discrete_target: # In this case, the Target is the one-hot-encoding of the treatment variable # We need to go back to the label representation of the one-hot so as to call @@ -57,60 +63,83 @@ def fit(self, X, *args, sample_weight=None): Target = inverse_onehot(Target) if sample_weight is not None: - self._model.fit(self._combine(X, Z, Target.shape[0]), Target, sample_weight=sample_weight) + self._model.fit(self._combine(X, W, Z, Target.shape[0]), Target, sample_weight=sample_weight) + else: + self._model.fit(self._combine(X, W, Z, Target.shape[0]), Target) + + def score(self, *, X, W, Target, Z=None, sample_weight=None): + if hasattr(self._model, 'score'): + if self._discrete_target: + # In this case, the Target is the one-hot-encoding of the treatment variable + # We need to go back to the label representation of the one-hot so as to call + # the classifier. + if np.any(np.all(Target == 0, axis=0)) or (not np.any(np.all(Target == 0, axis=1))): + raise AttributeError("Provided crossfit folds contain training splits that " + + "don't contain all treatments") + Target = inverse_onehot(Target) + + if sample_weight is not None: + return self._model.score(self._combine(X, W, Z, Target.shape[0]), Target, sample_weight=sample_weight) + else: + return self._model.score(self._combine(X, W, Z, Target.shape[0]), Target) else: - self._model.fit(self._combine(X, Z, Target.shape[0]), Target) + return None - def predict(self, X, Z=None): - n_samples = X.shape[0] if X is not None else (Z.shape[0] if Z is not None else 1) + def predict(self, X, W, Z=None): + arrs = [arr for arr in [X, W, Z] if arr is not None] + n_samples = arrs[0].shape[0] if arrs else 1 if self._discrete_target: - return self._model.predict_proba(self._combine(X, Z, n_samples, fitting=False))[:, 1:] + return self._model.predict_proba(self._combine(X, W, Z, n_samples, fitting=False))[:, 1:] else: - return self._model.predict(self._combine(X, Z, n_samples, fitting=False)) + return self._model.predict(self._combine(X, W, Z, n_samples, fitting=False)) + + +class _BaseDMLATEIVModelFinal: + def __init__(self): + self._first_stage = LinearRegression(fit_intercept=False) + self._model_final = _FinalWrapper(LinearRegression(fit_intercept=False), + fit_cate_intercept=True, featurizer=None, use_weight_trick=False) + + def fit(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None): + Y_res, T_res, Z_res = nuisances + if Z_res.ndim == 1: + Z_res = Z_res.reshape(-1, 1) + # DMLATEIV is just like 2SLS; first regress T_res on Z_res, then regress Y_res on predicted T_res + T_res_pred = self._first_stage.fit(Z_res, T_res, + sample_weight=sample_weight).predict(Z_res) + # TODO: allow the final model to actually use X? Then we'd need to rename the class + # since we would actually be calculating a CATE rather than ATE. + self._model_final.fit(X=None, T_res=T_res_pred, Y_res=Y_res, sample_weight=sample_weight) + return self + + def predict(self, X=None): + # TODO: allow the final model to actually use X? + return self._model_final.predict(X=None) + + def score(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None): + Y_res, T_res, Z_res = nuisances + if Y_res.ndim == 1: + Y_res = Y_res.reshape((-1, 1)) + if T_res.ndim == 1: + T_res = T_res.reshape((-1, 1)) + # TODO: allow the final model to actually use X? + effects = self._model_final.predict(X=None).reshape((-1, Y_res.shape[1], T_res.shape[1])) + Y_res_pred = np.einsum('ijk,ik->ij', effects, T_res).reshape(Y_res.shape) + if sample_weight is not None: + return np.mean(np.average((Y_res - Y_res_pred)**2, weights=sample_weight, axis=0)) + else: + return np.mean((Y_res - Y_res_pred) ** 2) class _BaseDMLATEIV(_OrthoLearner): def __init__(self, model_nuisance, discrete_instrument=False, discrete_treatment=False, + categories='auto', n_splits=2, random_state=None): - class ModelFinal: - def __init__(self): - self._first_stage = LinearRegression(fit_intercept=False) - self._model_final = _FinalWrapper(LinearRegression(fit_intercept=False), - fit_cate_intercept=True, featurizer=None, use_weight_trick=False) - - def fit(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None): - Y_res, T_res, Z_res = nuisances - if Z_res.ndim == 1: - Z_res = Z_res.reshape(-1, 1) - # DMLATEIV is just like 2SLS; first regress T_res on Z_res, then regress Y_res on predicted T_res - T_res_pred = self._first_stage.fit(Z_res, T_res, - sample_weight=sample_weight).predict(Z_res) - # TODO: allow the final model to actually use X? Then we'd need to rename the class - # since we would actually be calculating a CATE rather than ATE. - self._model_final.fit(X=None, T_res=T_res_pred, Y_res=Y_res, sample_weight=sample_weight) - return self - - def predict(self, X=None): - # TODO: allow the final model to actually use X? - return self._model_final.predict(X=None) - - def score(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None): - Y_res, T_res, Z_res = nuisances - if Y_res.ndim == 1: - Y_res = Y_res.reshape((-1, 1)) - if T_res.ndim == 1: - T_res = T_res.reshape((-1, 1)) - # TODO: allow the final model to actually use X? - effects = self._model_final.predict(X=None).reshape((-1, Y_res.shape[1], T_res.shape[1])) - Y_res_pred = np.einsum('ijk,ik->ij', effects, T_res).reshape(Y_res.shape) - if sample_weight is not None: - return np.mean(np.average((Y_res - Y_res_pred)**2, weights=sample_weight, axis=0)) - else: - return np.mean((Y_res - Y_res_pred) ** 2) - - super().__init__(model_nuisance, ModelFinal(), + + super().__init__(model_nuisance, _BaseDMLATEIVModelFinal(), discrete_treatment=discrete_treatment, discrete_instrument=discrete_instrument, + categories=categories, n_splits=n_splits, random_state=random_state) def fit(self, Y, T, Z, W=None, *, sample_weight=None, sample_var=None, inference=None): @@ -173,6 +202,50 @@ def score(self, Y, T, Z, W=None): return super().score(Y, T, W=W, Z=Z) +class _DMLATEIVModelNuisance: + def __init__(self, model_Y_W, model_T_W, model_Z_W): + self._model_Y_W = clone(model_Y_W, safe=False) + self._model_T_W = clone(model_T_W, safe=False) + self._model_Z_W = clone(model_Z_W, safe=False) + + def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None): + assert X is None, "DML ATE IV does not accept features" + self._model_Y_W.fit(X=X, W=W, Target=Y, sample_weight=sample_weight) + self._model_T_W.fit(X=X, W=W, Target=T, sample_weight=sample_weight) + self._model_Z_W.fit(X=X, W=W, Target=Z, sample_weight=sample_weight) + return self + + def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None): + assert X is None, "DML ATE IV does not accept features" + if hasattr(self._model_Y_W, 'score'): + Y_X_score = self._model_Y_W.score(X=X, W=W, Target=Y, sample_weight=sample_weight) + else: + Y_X_score = None + if hasattr(self._model_T_W, 'score'): + T_X_score = self._model_T_W.score(X=X, W=W, Target=T, sample_weight=sample_weight) + else: + T_X_score = None + if hasattr(self._model_Z_W, 'score'): + Z_X_score = self._model_Z_W.score(X=X, W=W, Target=Z, sample_weight=sample_weight) + else: + Z_X_score = None + return Y_X_score, T_X_score, Z_X_score + + def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None): + assert X is None, "DML ATE IV does not accept features" + Y_pred = self._model_Y_W.predict(X=X, W=W) + T_pred = self._model_T_W.predict(X=X, W=W) + Z_pred = self._model_Z_W.predict(X=X, W=W) + if W is None: # In this case predict above returns a single row + Y_pred = np.tile(Y_pred.reshape(1, -1), (Y.shape[0], 1)) + T_pred = np.tile(T_pred.reshape(1, -1), (T.shape[0], 1)) + Z_pred = np.tile(Z_pred.reshape(1, -1), (Z.shape[0], 1)) + Y_res = Y - Y_pred.reshape(Y.shape) + T_res = T - T_pred.reshape(T.shape) + Z_res = Z - Z_pred.reshape(Z.shape) + return Y_res, T_res, Z_res + + class DMLATEIV(_BaseDMLATEIV): """ Implementation of the orthogonal/double ml method for ATE estimation with @@ -187,75 +260,165 @@ class DMLATEIV(_BaseDMLATEIV): a biased ATE. """ - def __init__(self, model_Y_X, model_T_X, model_Z_X, + def __init__(self, model_Y_W, model_T_W, model_Z_W, discrete_treatment=False, discrete_instrument=False, + categories='auto', n_splits=2, random_state=None): - class ModelNuisance: - def __init__(self, model_Y_X, model_T_X, model_Z_X): - self._model_Y_X = clone(model_Y_X, safe=False) - self._model_T_X = clone(model_T_X, safe=False) - self._model_Z_X = clone(model_Z_X, safe=False) - - def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None): - assert X is None, "DML ATE IV does not accept features" - self._model_Y_X.fit(W, Y, sample_weight=sample_weight) - self._model_T_X.fit(W, T, sample_weight=sample_weight) - self._model_Z_X.fit(W, Z, sample_weight=sample_weight) - return self - - def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None): - Y_pred = self._model_Y_X.predict(W) - T_pred = self._model_T_X.predict(W) - Z_pred = self._model_Z_X.predict(W) - if W is None: # In this case predict above returns a single row - Y_pred = np.tile(Y_pred.reshape(1, -1), (Y.shape[0], 1)) - T_pred = np.tile(T_pred.reshape(1, -1), (T.shape[0], 1)) - Z_pred = np.tile(Z_pred.reshape(1, -1), (Z.shape[0], 1)) - Y_res = Y - Y_pred.reshape(Y.shape) - T_res = T - T_pred.reshape(T.shape) - Z_res = Z - Z_pred.reshape(Z.shape) - return Y_res, T_res, Z_res - super().__init__(ModelNuisance(model_Y_X=_FirstStageWrapper(model_Y_X, discrete_target=False), - model_T_X=_FirstStageWrapper(model_T_X, discrete_target=discrete_treatment), - model_Z_X=_FirstStageWrapper(model_Z_X, discrete_target=discrete_instrument)), + + super().__init__(_DMLATEIVModelNuisance(model_Y_W=_FirstStageWrapper(model_Y_W, discrete_target=False), + model_T_W=_FirstStageWrapper( + model_T_W, discrete_target=discrete_treatment), + model_Z_W=_FirstStageWrapper( + model_Z_W, discrete_target=discrete_instrument)), discrete_instrument=discrete_instrument, discrete_treatment=discrete_treatment, + categories=categories, n_splits=n_splits, random_state=random_state) +class _ProjectedDMLATEIVModelNuisance: + def __init__(self, model_Y_W, model_T_W, model_T_WZ): + self._model_Y_W = clone(model_Y_W, safe=False) + self._model_T_W = clone(model_T_W, safe=False) + self._model_T_WZ = clone(model_T_WZ, safe=False) + + def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None): + assert X is None, "DML ATE IV does not accept features" + self._model_Y_W.fit(X=X, W=W, Target=Y, sample_weight=sample_weight) + self._model_T_W.fit(X=X, W=W, Target=T, sample_weight=sample_weight) + self._model_T_WZ.fit(X=X, W=W, Z=Z, Target=T, sample_weight=sample_weight) + return self + + def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None): + assert X is None, "DML ATE IV does not accept features" + if hasattr(self._model_Y_W, 'score'): + Y_X_score = self._model_Y_W.score(X=X, W=W, Target=Y, sample_weight=sample_weight) + else: + Y_X_score = None + if hasattr(self._model_T_W, 'score'): + T_X_score = self._model_T_W.score(X=X, W=W, Target=T, sample_weight=sample_weight) + else: + T_X_score = None + if hasattr(self._model_T_WZ, 'score'): + T_XZ_score = self._model_T_WZ.score(X=X, W=W, Z=Z, Target=T, sample_weight=sample_weight) + else: + T_XZ_score = None + return Y_X_score, T_X_score, T_XZ_score + + def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None): + assert X is None, "DML ATE IV does not accept features" + Y_pred = self._model_Y_W.predict(X, W) + TX_pred = self._model_T_W.predict(X, W) + TXZ_pred = self._model_T_WZ.predict(X, W, Z) + if W is None: # In this case predict above returns a single row + Y_pred = np.tile(Y_pred.reshape(1, -1), (Y.shape[0], 1)) + TX_pred = np.tile(TX_pred.reshape(1, -1), (T.shape[0], 1)) + Y_res = Y - Y_pred.reshape(Y.shape) + T_res = T - TX_pred.reshape(T.shape) + Z_res = TXZ_pred.reshape(T.shape) - TX_pred.reshape(T.shape) + return Y_res, T_res, Z_res + + class ProjectedDMLATEIV(_BaseDMLATEIV): - def __init__(self, model_Y_X, model_T_X, model_T_XZ, + def __init__(self, model_Y_W, model_T_W, model_T_WZ, discrete_treatment=False, discrete_instrument=False, + categories='auto', n_splits=2, random_state=None): - class ModelNuisance: - def __init__(self, model_Y_X, model_T_X, model_T_XZ): - self._model_Y_X = clone(model_Y_X, safe=False) - self._model_T_X = clone(model_T_X, safe=False) - self._model_T_XZ = clone(model_T_XZ, safe=False) - - def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None): - assert X is None, "DML ATE IV does not accept features" - self._model_Y_X.fit(W, Y, sample_weight=sample_weight) - self._model_T_X.fit(W, T, sample_weight=sample_weight) - self._model_T_XZ.fit(W, Z, T, sample_weight=sample_weight) - return self - - def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None): - Y_pred = self._model_Y_X.predict(W) - TX_pred = self._model_T_X.predict(W) - TXZ_pred = self._model_T_XZ.predict(W, Z) - if W is None: # In this case predict above returns a single row - Y_pred = np.tile(Y_pred.reshape(1, -1), (Y.shape[0], 1)) - TX_pred = np.tile(TX_pred.reshape(1, -1), (T.shape[0], 1)) - Y_res = Y - Y_pred.reshape(Y.shape) - T_res = T - TX_pred.reshape(T.shape) - Z_res = TXZ_pred.reshape(T.shape) - TX_pred.reshape(T.shape) - return Y_res, T_res, Z_res - - super().__init__(ModelNuisance(model_Y_X=_FirstStageWrapper(model_Y_X, discrete_target=False), - model_T_X=_FirstStageWrapper(model_T_X, discrete_target=discrete_treatment), - model_T_XZ=_FirstStageWrapper(model_T_XZ, discrete_target=discrete_treatment)), - discrete_treatment=discrete_treatment, discrete_instrument=discrete_instrument, - n_splits=n_splits, random_state=random_state) + + super().__init__(_ProjectedDMLATEIVModelNuisance( + model_Y_W=_FirstStageWrapper( + model_Y_W, discrete_target=False), + model_T_W=_FirstStageWrapper( + model_T_W, discrete_target=discrete_treatment), + model_T_WZ=_FirstStageWrapper( + model_T_WZ, discrete_target=discrete_treatment)), + discrete_treatment=discrete_treatment, discrete_instrument=discrete_instrument, + categories=categories, + n_splits=n_splits, random_state=random_state) + + +class _BaseDMLIVModelNuisance: + """ + Nuisance model fits the three models at fit time and at predict time + returns :math:`Y-\\E[Y|X]` and :math:`\\E[T|X,Z]-\\E[T|X]` as residuals. + """ + + def __init__(self, model_Y_X, model_T_X, model_T_XZ): + self._model_Y_X = clone(model_Y_X, safe=False) + self._model_T_X = clone(model_T_X, safe=False) + self._model_T_XZ = clone(model_T_XZ, safe=False) + + def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None): + # TODO: would it be useful to extend to handle controls ala vanilla DML? + assert W is None, "DML IV does not accept controls" + self._model_Y_X.fit(X=X, W=None, Target=Y, sample_weight=sample_weight) + self._model_T_X.fit(X=X, W=None, Target=T, sample_weight=sample_weight) + self._model_T_XZ.fit(X=X, W=None, Z=Z, Target=T, sample_weight=sample_weight) + return self + + def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None): + assert W is None, "DML IV does not accept controls" + if hasattr(self._model_Y_X, 'score'): + Y_X_score = self._model_Y_X.score(X=X, W=W, Target=Y, sample_weight=sample_weight) + else: + Y_X_score = None + if hasattr(self._model_T_X, 'score'): + T_X_score = self._model_T_X.score(X=X, W=W, Target=T, sample_weight=sample_weight) + else: + T_X_score = None + if hasattr(self._model_T_XZ, 'score'): + T_XZ_score = self._model_T_XZ.score(X=X, W=W, Z=Z, Target=T, sample_weight=sample_weight) + else: + T_XZ_score = None + return Y_X_score, T_X_score, T_XZ_score + + def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None): + assert W is None, "DML IV does not accept controls" + Y_pred = self._model_Y_X.predict(X, W) + TXZ_pred = self._model_T_XZ.predict(X, W, Z) + TX_pred = self._model_T_X.predict(X, W) + if X is None: # In this case predict above returns a single row + Y_pred = np.tile(Y_pred.reshape(1, -1), (Y.shape[0], 1)) + TX_pred = np.tile(TX_pred.reshape(1, -1), (T.shape[0], 1)) + Y_res = Y - Y_pred.reshape(Y.shape) + T_res = TXZ_pred.reshape(T.shape) - TX_pred.reshape(T.shape) + return Y_res, T_res + + +class _BaseDMLIVModelFinal: + """ + Final model at fit time, fits a residual on residual regression with a heterogeneous coefficient + that depends on X, i.e. + + .. math :: + Y - \\E[Y | X] = \\theta(X) \\cdot (\\E[T | X, Z] - \\E[T | X]) + \\epsilon + + and at predict time returns :math:`\\theta(X)`. The score method returns the MSE of this final + residual on residual regression. + """ + + def __init__(self, model_final): + self._model_final = clone(model_final, safe=False) + + def fit(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None): + Y_res, T_res = nuisances + self._model_final.fit(X, T_res, Y_res, sample_weight=sample_weight, sample_var=sample_var) + return self + + def predict(self, X=None): + return self._model_final.predict(X) + + def score(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None): + Y_res, T_res = nuisances + if Y_res.ndim == 1: + Y_res = Y_res.reshape((-1, 1)) + if T_res.ndim == 1: + T_res = T_res.reshape((-1, 1)) + effects = self._model_final.predict(X).reshape((-1, Y_res.shape[1], T_res.shape[1])) + Y_res_pred = np.einsum('ijk,ik->ij', effects, T_res).reshape(Y_res.shape) + if sample_weight is not None: + return np.mean(np.average((Y_res - Y_res_pred)**2, weights=sample_weight, axis=0)) + else: + return np.mean((Y_res - Y_res_pred)**2) class _BaseDMLIV(_OrthoLearner): @@ -299,6 +462,10 @@ class _BaseDMLIV(_OrthoLearner): discrete_treatment: bool, optional, default False Whether the treatment values should be treated as categorical, rather than continuous, quantities + categories: 'auto' or list, default 'auto' + The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). + The first category will be treated as the control treatment. + n_splits: int, cross-validation generator or an iterable, optional, default 2 Determines the cross-validation splitting strategy. Possible inputs for cv are: @@ -324,75 +491,12 @@ class _BaseDMLIV(_OrthoLearner): """ def __init__(self, model_Y_X, model_T_X, model_T_XZ, model_final, - discrete_instrument=False, discrete_treatment=False, n_splits=2, random_state=None): - class ModelNuisance: - """ - Nuisance model fits the three models at fit time and at predict time - returns :math:`Y-\\E[Y|X]` and :math:`\\E[T|X,Z]-\\E[T|X]` as residuals. - """ - - def __init__(self, model_Y_X, model_T_X, model_T_XZ): - self._model_Y_X = clone(model_Y_X, safe=False) - self._model_T_X = clone(model_T_X, safe=False) - self._model_T_XZ = clone(model_T_XZ, safe=False) - - def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None): - # TODO: would it be useful to extend to handle controls ala vanilla DML? - assert W is None, "DML IV does not accept controls" - self._model_Y_X.fit(X, Y, sample_weight=sample_weight) - self._model_T_X.fit(X, T, sample_weight=sample_weight) - self._model_T_XZ.fit(X, Z, T, sample_weight=sample_weight) - return self - - def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None): - Y_pred = self._model_Y_X.predict(X) - TXZ_pred = self._model_T_XZ.predict(X, Z) - TX_pred = self._model_T_X.predict(X) - if X is None: # In this case predict above returns a single row - Y_pred = np.tile(Y_pred.reshape(1, -1), (Y.shape[0], 1)) - TX_pred = np.tile(TX_pred.reshape(1, -1), (T.shape[0], 1)) - Y_res = Y - Y_pred.reshape(Y.shape) - T_res = TXZ_pred.reshape(T.shape) - TX_pred.reshape(T.shape) - return Y_res, T_res - - class ModelFinal: - """ - Final model at fit time, fits a residual on residual regression with a heterogeneous coefficient - that depends on X, i.e. - - .. math :: - Y - \\E[Y | X] = \\theta(X) \\cdot (\\E[T | X, Z] - \\E[T | X]) + \\epsilon - - and at predict time returns :math:`\\theta(X)`. The score method returns the MSE of this final - residual on residual regression. - """ - - def __init__(self, model_final): - self._model_final = clone(model_final, safe=False) - - def fit(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None): - Y_res, T_res = nuisances - self._model_final.fit(X, T_res, Y_res, sample_weight=sample_weight, sample_var=sample_var) - return self - - def predict(self, X=None): - return self._model_final.predict(X) - - def score(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None): - Y_res, T_res = nuisances - if Y_res.ndim == 1: - Y_res = Y_res.reshape((-1, 1)) - if T_res.ndim == 1: - T_res = T_res.reshape((-1, 1)) - effects = self._model_final.predict(X).reshape((-1, Y_res.shape[1], T_res.shape[1])) - Y_res_pred = np.einsum('ijk,ik->ij', effects, T_res).reshape(Y_res.shape) - if sample_weight is not None: - return np.mean(np.average((Y_res - Y_res_pred)**2, weights=sample_weight, axis=0)) - else: - return np.mean((Y_res - Y_res_pred)**2) - - super().__init__(ModelNuisance(model_Y_X, model_T_X, model_T_XZ), ModelFinal(model_final), + discrete_instrument=False, discrete_treatment=False, categories='auto', + n_splits=2, random_state=None): + super().__init__(_BaseDMLIVModelNuisance(model_Y_X, model_T_X, model_T_XZ), + _BaseDMLIVModelFinal(model_final), discrete_treatment=discrete_treatment, discrete_instrument=discrete_instrument, + categories=categories, n_splits=n_splits, random_state=random_state) def fit(self, Y, T, Z, X=None, *, sample_weight=None, sample_var=None, inference=None): @@ -521,6 +625,27 @@ def models_T_XZ(self): """ return [mdl._model for mdl in super().models_T_XZ] + @property + def nuisance_scores_Y_X(self): + """ + Get the scores for Y_X model on the out-of-sample training data + """ + return self.nuisance_scores_[0] + + @property + def nuisance_scores_T_X(self): + """ + Get the scores for T_X model on the out-of-sample training data + """ + return self.nuisance_scores_[1] + + @property + def nuisance_scores_T_XZ(self): + """ + Get the scores for T_XZ model on the out-of-sample training data + """ + return self.nuisance_scores_[2] + def cate_feature_names(self, input_feature_names=None): """ Get the output feature names. @@ -605,11 +730,16 @@ class DMLIV(_BaseDMLIV): discrete_treatment: bool, optional, default False Whether the treatment values should be treated as categorical, rather than continuous, quantities + + categories: 'auto' or list, default 'auto' + The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). + The first category will be treated as the control treatment. """ def __init__(self, model_Y_X, model_T_X, model_T_XZ, model_final, featurizer=None, fit_cate_intercept=True, - n_splits=2, discrete_instrument=False, discrete_treatment=False, random_state=None): + n_splits=2, discrete_instrument=False, discrete_treatment=False, + categories='auto', random_state=None): self.bias_part_of_coef = fit_cate_intercept self.fit_cate_intercept = fit_cate_intercept super().__init__(_FirstStageWrapper(model_Y_X, False), @@ -622,6 +752,7 @@ def __init__(self, model_Y_X, model_T_X, model_T_XZ, model_final, featurizer=Non n_splits=n_splits, discrete_instrument=discrete_instrument, discrete_treatment=discrete_treatment, + categories=categories, random_state=random_state) @@ -688,11 +819,16 @@ class NonParamDMLIV(_BaseDMLIV): discrete_treatment: bool, optional, default False Whether the treatment values should be treated as categorical, rather than continuous, quantities + + categories: 'auto' or list, default 'auto' + The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). + The first category will be treated as the control treatment. + """ def __init__(self, model_Y_X, model_T_X, model_T_XZ, model_final, featurizer=None, fit_cate_intercept=True, - n_splits=2, discrete_instrument=False, discrete_treatment=False): + n_splits=2, discrete_instrument=False, discrete_treatment=False, categories='auto'): super().__init__(_FirstStageWrapper(model_Y_X, False), _FirstStageWrapper(model_T_X, discrete_treatment), _FirstStageWrapper(model_T_XZ, discrete_treatment), @@ -702,7 +838,113 @@ def __init__(self, model_Y_X, model_T_X, model_T_XZ, model_final, use_weight_trick=True), n_splits=n_splits, discrete_instrument=discrete_instrument, - discrete_treatment=discrete_treatment) + discrete_treatment=discrete_treatment, + categories=categories) + + +class _BaseDRIVModelFinal: + """ + Final model at fit time, fits a residual on residual regression with a heterogeneous coefficient + that depends on X, i.e. + + .. math :: + Y - \\E[Y | X] = \\theta(X) \\cdot (\\E[T | X, Z] - \\E[T | X]) + \\epsilon + + and at predict time returns :math:`\\theta(X)`. The score method returns the MSE of this final + residual on residual regression. + """ + + def __init__(self, model_final, featurizer, + discrete_treatment, discrete_instrument, + fit_cate_intercept, cov_clip, opt_reweighted): + self._model_final = clone(model_final, safe=False) + self._fit_cate_intercept = fit_cate_intercept + self._original_featurizer = clone(featurizer, safe=False) + self._discrete_treatment = discrete_treatment + self._discrete_instrument = discrete_instrument + if self._fit_cate_intercept: + add_intercept_trans = FunctionTransformer(add_intercept, + validate=True) + if featurizer: + self._featurizer = Pipeline([('featurize', self._original_featurizer), + ('add_intercept', add_intercept_trans)]) + else: + self._featurizer = add_intercept_trans + else: + self._featurizer = self._original_featurizer + self._cov_clip = cov_clip + self._opt_reweighted = opt_reweighted + + def _effect_estimate(self, nuisances): + prel_theta, res_t, res_y, res_z, cov = [nuisance.reshape(nuisances[0].shape) for nuisance in nuisances] + + # Estimate final model of theta(X) by minimizing the square loss: + # (prel_theta(X) + (Y_res - prel_theta(X) * T_res) * Z_res / cov[T,Z | X] - theta(X))^2 + # We clip the covariance so that it is bounded away from zero, so as to reduce variance + # at the expense of some small bias. For points with very small covariance we revert + # to the model-based preliminary estimate and do not add the correction term. + cov_sign = np.sign(cov) + cov_sign[cov_sign == 0] = 1 + clipped_cov = cov_sign * np.clip(np.abs(cov), + self._cov_clip, np.inf) + return prel_theta + (res_y - prel_theta * res_t) * res_z / clipped_cov, clipped_cov + + def fit(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None): + self.d_y = Y.shape[1:] + self.d_t = nuisances[1].shape[1:] + self.d_z = nuisances[3].shape[1:] + + # TODO: if opt_reweighted is False, we could change the logic to support multidimensional treatments, + # instruments, and outcomes + if self.d_y and self.d_y[0] > 2: + raise AttributeError("DRIV only supports a single outcome") + + if self.d_t and self.d_t[0] > 1: + if self._discrete_treatment: + raise AttributeError("DRIV only supports binary treatments") + else: + raise AttributeError("DRIV only supports single-dimensional continuous treatments") + + if self.d_z and self.d_z[0] > 1: + if self._discrete_instrument: + raise AttributeError("DRIV only supports binary instruments") + else: + raise AttributeError("DRIV only supports single-dimensional continuous instruments") + + theta_dr, clipped_cov = self._effect_estimate(nuisances) + + if (X is not None) and (self._featurizer is not None): + X = self._featurizer.fit_transform(X) + # TODO: how do we incorporate the sample_weight and sample_var passed into this method + # as arguments? + if self._opt_reweighted: + self._model_final.fit(X, theta_dr, sample_weight=clipped_cov.ravel()**2) + else: + self._model_final.fit(X, theta_dr) + return self + + def predict(self, X=None): + if (X is not None) and (self._featurizer is not None): + X = self._featurizer.transform(X) + return self._model_final.predict(X).reshape((-1,) + self.d_y + self.d_t) + + def score(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None): + # TODO: is there a good way to incorporate the other nuisance terms in the score? + _, T_res, Y_res, _, _ = nuisances + + if Y_res.ndim == 1: + Y_res = Y_res.reshape((-1, 1)) + if T_res.ndim == 1: + T_res = T_res.reshape((-1, 1)) + if (X is not None) and (self._featurizer is not None): + X = self._featurizer.transform(X) + effects = self._model_final.predict(X).reshape((-1, Y_res.shape[1], T_res.shape[1])) + Y_res_pred = np.einsum('ijk,ik->ij', effects, T_res).reshape(Y_res.shape) + + if sample_weight is not None: + return np.mean(np.average((Y_res - Y_res_pred)**2, weights=sample_weight, axis=0)) + else: + return np.mean((Y_res - Y_res_pred) ** 2) class _BaseDRIV(_OrthoLearner): @@ -742,6 +984,10 @@ class _BaseDRIV(_OrthoLearner): discrete_treatment: bool, optional, default False Whether the treatment values should be treated as categorical, rather than continuous, quantities + categories: 'auto' or list, default 'auto' + The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). + The first category will be treated as the control treatment. + n_splits: int, cross-validation generator or an iterable, optional, default 2 Determines the cross-validation splitting strategy. Possible inputs for cv are: @@ -773,117 +1019,25 @@ def __init__(self, fit_cate_intercept=True, cov_clip=0.1, opt_reweighted=False, discrete_instrument=False, discrete_treatment=False, + categories='auto', n_splits=2, random_state=None): - class ModelFinal: - """ - Final model at fit time, fits a residual on residual regression with a heterogeneous coefficient - that depends on X, i.e. - - .. math :: - Y - \\E[Y | X] = \\theta(X) \\cdot (\\E[T | X, Z] - \\E[T | X]) + \\epsilon - - and at predict time returns :math:`\\theta(X)`. The score method returns the MSE of this final - residual on residual regression. - """ - - def __init__(self, model_final, featurizer, fit_cate_intercept): - self._model_final = clone(model_final, safe=False) - self._fit_cate_intercept = fit_cate_intercept - self._original_featurizer = clone(featurizer, safe=False) - if self._fit_cate_intercept: - add_intercept = FunctionTransformer(lambda F: - hstack([np.ones((F.shape[0], 1)), F]), - validate=True) - if featurizer: - self._featurizer = Pipeline([('featurize', self._original_featurizer), - ('add_intercept', add_intercept)]) - else: - self._featurizer = add_intercept - else: - self._featurizer = self._original_featurizer - - @staticmethod - def _effect_estimate(nuisances): - prel_theta, res_t, res_y, res_z, cov = [nuisance.reshape(nuisances[0].shape) for nuisance in nuisances] - - # Estimate final model of theta(X) by minimizing the square loss: - # (prel_theta(X) + (Y_res - prel_theta(X) * T_res) * Z_res / cov[T,Z | X] - theta(X))^2 - # We clip the covariance so that it is bounded away from zero, so as to reduce variance - # at the expense of some small bias. For points with very small covariance we revert - # to the model-based preliminary estimate and do not add the correction term. - cov_sign = np.sign(cov) - cov_sign[cov_sign == 0] = 1 - clipped_cov = cov_sign * np.clip(np.abs(cov), - self.cov_clip, np.inf) - return prel_theta + (res_y - prel_theta * res_t) * res_z / clipped_cov, clipped_cov - - def fit(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None): - self.d_y = Y.shape[1:] - self.d_t = nuisances[1].shape[1:] - self.d_z = nuisances[3].shape[1:] - - # TODO: if opt_reweighted is False, we could change the logic to support multidimensional treatments, - # instruments, and outcomes - if self.d_y and self.d_y[0] > 2: - raise AttributeError("DRIV only supports a single outcome") - - if self.d_t and self.d_t[0] > 1: - if discrete_treatment: - raise AttributeError("DRIV only supports binary treatments") - else: - raise AttributeError("DRIV only supports single-dimensional continuous treatments") - - if self.d_z and self.d_z[0] > 1: - if discrete_instrument: - raise AttributeError("DRIV only supports binary instruments") - else: - raise AttributeError("DRIV only supports single-dimensional continuous instruments") - - theta_dr, clipped_cov = self._effect_estimate(nuisances) - - if (X is not None) and (self._featurizer is not None): - X = self._featurizer.fit_transform(X) - # TODO: how do we incorporate the sample_weight and sample_var passed into this method - # as arguments? - if opt_reweighted: - self._model_final.fit(X, theta_dr, sample_weight=clipped_cov.ravel()**2) - else: - self._model_final.fit(X, theta_dr) - return self - - def predict(self, X=None): - if (X is not None) and (self._featurizer is not None): - X = self._featurizer.transform(X) - return self._model_final.predict(X).reshape((-1,) + self.d_y + self.d_t) - - def score(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None): - # TODO: is there a good way to incorporate the other nuisance terms in the score? - _, T_res, Y_res, _, _ = nuisances - - if Y_res.ndim == 1: - Y_res = Y_res.reshape((-1, 1)) - if T_res.ndim == 1: - T_res = T_res.reshape((-1, 1)) - if (X is not None) and (self._featurizer is not None): - X = self._featurizer.transform(X) - effects = self._model_final.predict(X).reshape((-1, Y_res.shape[1], T_res.shape[1])) - Y_res_pred = np.einsum('ijk,ik->ij', effects, T_res).reshape(Y_res.shape) - - if sample_weight is not None: - return np.mean(np.average((Y_res - Y_res_pred)**2, weights=sample_weight, axis=0)) - else: - return np.mean((Y_res - Y_res_pred) ** 2) self.fit_cate_intercept = fit_cate_intercept self.bias_part_of_coef = fit_cate_intercept self.cov_clip = cov_clip self.opt_reweighted = opt_reweighted - super().__init__(nuisance_models, ModelFinal(model_final, featurizer, fit_cate_intercept), + super().__init__(nuisance_models, _BaseDRIVModelFinal(model_final, + featurizer, + discrete_treatment, + discrete_instrument, + fit_cate_intercept, + cov_clip, + opt_reweighted), discrete_instrument=discrete_instrument, discrete_treatment=discrete_treatment, - n_splits=n_splits, random_state=random_state) + categories=categories, n_splits=n_splits, random_state=random_state) - def fit(self, Y, T, Z, X=None, *, sample_weight=None, sample_var=None, inference=None): + def fit(self, Y, T, Z, X=None, W=None, *, sample_weight=None, sample_var=None, inference=None): """ Estimate the counterfactual model from data, i.e. estimates function :math:`\\theta(\\cdot)`. @@ -897,6 +1051,8 @@ def fit(self, Y, T, Z, X=None, *, sample_weight=None, sample_var=None, inference Instruments for each sample X: optional(n, d_x) matrix or None (Default=None) Features for each sample + W: optional(n, d_w) matrix or None (Default=None) + Controls for each sample sample_weight: optional(n,) vector or None (Default=None) Weights for each samples sample_var: optional(n,) vector or None (Default=None) @@ -909,10 +1065,42 @@ def fit(self, Y, T, Z, X=None, *, sample_weight=None, sample_var=None, inference ------- self: _BaseDRIV instance """ - # Replacing fit from _OrthoLearner, to enforce W=None and improve the docstring - return super().fit(Y, T, X=X, W=None, Z=Z, + # Replacing fit from _OrthoLearner, to reorder arguments and improve the docstring + return super().fit(Y, T, X=X, W=W, Z=Z, sample_weight=sample_weight, sample_var=sample_var, inference=inference) + def score(self, Y, T, Z, X=None, W=None): + """ + Score the fitted CATE model on a new data set. Generates nuisance parameters + for the new data set based on the fitted nuisance models created at fit time. + It uses the mean prediction of the models fitted by the different crossfit folds. + Then calls the score function of the model_final and returns the calculated score. + The model_final model must have a score method. + + If model_final does not have a score method, then it raises an :exc:`.AttributeError` + + Parameters + ---------- + Y: (n, d_y) matrix or vector of length n + Outcomes for each sample + T: (n, d_t) matrix or vector of length n + Treatments for each sample + Z: (n, d_z) matrix or None (Default=None) + Instruments for each sample + X: optional (n, d_x) matrix or None (Default=None) + Features for each sample + W: optional(n, d_w) matrix or None (Default=None) + Controls for each sample + + Returns + ------- + score : float or (array of float) + The score of the final CATE model on the new data. Same type as the return + type of the model_final.score method. + """ + # Replacing score from _OrthoLearner, to reorder arguments and improve the docstring + return super().score(Y, T, X=X, W=W, Z=Z) + @property def original_featurizer(self): return super().model_final._original_featurizer @@ -953,6 +1141,58 @@ def cate_feature_names(self, input_feature_names=None): raise AttributeError("Featurizer does not have a method: get_feature_names!") +class _IntentToTreatDRIVModelNuisance: + """ + Nuisance model fits the three models at fit time and at predict time + returns :math:`Y-\\E[Y|X]` and :math:`\\E[T|X,Z]-\\E[T|X]` as residuals. + """ + + def __init__(self, model_Y_X, model_T_XZ, prel_model_effect): + self._model_Y_X = clone(model_Y_X, safe=False) + self._model_T_XZ = clone(model_T_XZ, safe=False) + self._prel_model_effect = clone(prel_model_effect, safe=False) + + def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None): + self._model_Y_X.fit(X=X, W=W, Target=Y, sample_weight=sample_weight) + self._model_T_XZ.fit(X=X, W=W, Z=Z, Target=T, sample_weight=sample_weight) + # we need to undo the one-hot encoding for calling effect, + # since it expects raw values + self._prel_model_effect.fit(Y, inverse_onehot(T), inverse_onehot(Z), X=X) + return self + + def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None): + if hasattr(self._model_Y_X, 'score'): + Y_X_score = self._model_Y_X.score(X=X, W=W, Target=Y, sample_weight=sample_weight) + else: + Y_X_score = None + if hasattr(self._model_T_XZ, 'score'): + T_XZ_score = self._model_T_XZ.score(X=X, W=W, Z=Z, Target=T, sample_weight=sample_weight) + else: + T_XZ_score = None + if hasattr(self._prel_model_effect, 'score'): + # we need to undo the one-hot encoding for calling effect, + # since it expects raw values + effect_score = self._prel_model_effect.score(Y, inverse_onehot(T), inverse_onehot(Z), X=X) + else: + effect_score = None + + return Y_X_score, T_XZ_score, effect_score + + def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None): + Y_pred = self._model_Y_X.predict(X, W) + T_pred_zero = self._model_T_XZ.predict(X, W, np.zeros(Z.shape)) + T_pred_one = self._model_T_XZ.predict(X, W, np.ones(Z.shape)) + delta = (T_pred_one - T_pred_zero) / 2 + T_pred_mean = (T_pred_one + T_pred_zero) / 2 + prel_theta = self._prel_model_effect.effect(X) + if X is None: # In this case predict above returns a single row + Y_pred = np.tile(Y_pred.reshape(1, -1), (Y.shape[0], 1)) + prel_theta = np.tile(prel_theta.reshape(1, -1), (T.shape[0], 1)) + Y_res = Y - Y_pred.reshape(Y.shape) + T_res = T - T_pred_mean.reshape(T.shape) + return prel_theta, T_res, Y_res, 2 * Z - 1, delta + + class _IntentToTreatDRIV(_BaseDRIV): """ Helper class for the DRIV algorithm for the intent-to-treat A/B test setting @@ -965,50 +1205,20 @@ def __init__(self, model_Y_X, model_T_XZ, fit_cate_intercept=True, cov_clip=.1, n_splits=3, - opt_reweighted=False): + opt_reweighted=False, + categories='auto'): """ """ - class ModelNuisance: - """ - Nuisance model fits the three models at fit time and at predict time - returns :math:`Y-\\E[Y|X]` and :math:`\\E[T|X,Z]-\\E[T|X]` as residuals. - """ - - def __init__(self, model_Y_X, model_T_XZ, prel_model_effect): - self._model_Y_X = clone(model_Y_X, safe=False) - self._model_T_XZ = clone(model_T_XZ, safe=False) - self._prel_model_effect = clone(prel_model_effect, safe=False) - - def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None): - assert W is None, "IntentToTreatDRIV does not accept controls" - self._model_Y_X.fit(X, Y, sample_weight=sample_weight) - self._model_T_XZ.fit(X, Z, T, sample_weight=sample_weight) - # we need to undo the one-hot encoding for calling effect, - # since it expects raw values - self._prel_model_effect.fit(Y, inverse_onehot(T), inverse_onehot(Z), X=X) - return self - - def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None): - Y_pred = self._model_Y_X.predict(X) - T_pred_zero = self._model_T_XZ.predict(X, np.zeros(Z.shape)) - T_pred_one = self._model_T_XZ.predict(X, np.ones(Z.shape)) - delta = (T_pred_one - T_pred_zero) / 2 - T_pred_mean = (T_pred_one + T_pred_zero) / 2 - prel_theta = self._prel_model_effect.effect(X) - if X is None: # In this case predict above returns a single row - Y_pred = np.tile(Y_pred.reshape(1, -1), (Y.shape[0], 1)) - prel_theta = np.tile(prel_theta.reshape(1, -1), (T.shape[0], 1)) - Y_res = Y - Y_pred.reshape(Y.shape) - T_res = T - T_pred_mean.reshape(T.shape) - return prel_theta, T_res, Y_res, 2 * Z - 1, delta # TODO: check that Y, T, Z do not have multiple columns - super().__init__(ModelNuisance(model_Y_X, model_T_XZ, prel_model_effect), model_effect, + super().__init__(_IntentToTreatDRIVModelNuisance(model_Y_X, model_T_XZ, prel_model_effect), + model_effect, featurizer=featurizer, fit_cate_intercept=fit_cate_intercept, cov_clip=cov_clip, n_splits=n_splits, discrete_instrument=True, discrete_treatment=True, + categories=categories, opt_reweighted=opt_reweighted) @@ -1082,6 +1292,10 @@ class IntentToTreatDRIV(_IntentToTreatDRIV): assumes the final_model_effect is flexible enough to fit the true CATE model. Otherwise, it method will return a biased projection to the model_effect space, biased to give more weight on parts of the feature space where the instrument is strong. + + categories: 'auto' or list, default 'auto' + The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). + The first category will be treated as the control treatment. """ def __init__(self, model_Y_X, model_T_XZ, @@ -1091,7 +1305,8 @@ def __init__(self, model_Y_X, model_T_XZ, fit_cate_intercept=True, cov_clip=.1, n_splits=3, - opt_reweighted=False): + opt_reweighted=False, + categories='auto'): model_Y_X = _FirstStageWrapper(model_Y_X, discrete_target=False) model_T_XZ = _FirstStageWrapper(model_T_XZ, discrete_target=True) prel_model_effect = _IntentToTreatDRIV(model_Y_X, @@ -1107,7 +1322,8 @@ def __init__(self, model_Y_X, model_T_XZ, fit_cate_intercept=fit_cate_intercept, cov_clip=cov_clip, n_splits=n_splits, - opt_reweighted=opt_reweighted) + opt_reweighted=opt_reweighted, + categories=categories) @property def models_Y_X(self): @@ -1117,6 +1333,18 @@ def models_Y_X(self): def models_T_XZ(self): return [mdl._model_T_XZ._model for mdl in super().models_nuisance] + @property + def nuisance_scores_Y_X(self): + return self.nuisance_scores_[0] + + @property + def nuisance_scores_T_XZ(self): + return self.nuisance_scores_[1] + + @property + def nuisance_scores_effect(self): + return self.nuisance_scores_[2] + class LinearIntentToTreatDRIV(StatsModelsCateEstimatorMixin, IntentToTreatDRIV): """ @@ -1168,6 +1396,11 @@ class LinearIntentToTreatDRIV(StatsModelsCateEstimatorMixin, IntentToTreatDRIV): assumes the final_model_effect is flexible enough to fit the true CATE model. Otherwise, it method will return a biased projection to the model_effect space, biased to give more weight on parts of the feature space where the instrument is strong. + + categories: 'auto' or list, default 'auto' + The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). + The first category will be treated as the control treatment. + """ def __init__(self, model_Y_X, model_T_XZ, @@ -1176,16 +1409,18 @@ def __init__(self, model_Y_X, model_T_XZ, fit_cate_intercept=True, cov_clip=.1, n_splits=3, - opt_reweighted=False): + opt_reweighted=False, + categories='auto'): super().__init__(model_Y_X, model_T_XZ, flexible_model_effect=flexible_model_effect, featurizer=featurizer, - fit_cate_intercept=True, + fit_cate_intercept=fit_cate_intercept, final_model_effect=StatsModelsLinearRegression(fit_intercept=False), - cov_clip=cov_clip, n_splits=n_splits, opt_reweighted=opt_reweighted) + cov_clip=cov_clip, n_splits=n_splits, opt_reweighted=opt_reweighted, + categories=categories) # override only so that we can update the docstring to indicate support for `StatsModelsInference` - def fit(self, Y, T, Z, X=None, sample_weight=None, sample_var=None, inference=None): + def fit(self, Y, T, Z, X=None, W=None, sample_weight=None, sample_var=None, inference=None): """ Estimate the counterfactual model from data, i.e. estimates function :math:`\\theta(\\cdot)`. @@ -1199,6 +1434,8 @@ def fit(self, Y, T, Z, X=None, sample_weight=None, sample_var=None, inference=No Instruments for each sample X: optional(n, d_x) matrix or None (Default=None) Features for each sample + W: optional(n, d_w) matrix or None (Default=None) + Controls for each sample sample_weight: optional(n,) vector or None (Default=None) Weights for each samples sample_var: optional(n,) vector or None (Default=None) @@ -1212,4 +1449,4 @@ def fit(self, Y, T, Z, X=None, sample_weight=None, sample_var=None, inference=No ------- self : instance """ - return super().fit(Y, T, Z, X=X, sample_weight=sample_weight, sample_var=sample_var, inference=inference) + return super().fit(Y, T, Z, X=X, W=W, sample_weight=sample_weight, sample_var=sample_var, inference=inference) diff --git a/econml/tests/test_dml.py b/econml/tests/test_dml.py index b7fa1f668..d6973ac9a 100644 --- a/econml/tests/test_dml.py +++ b/econml/tests/test_dml.py @@ -3,6 +3,7 @@ import unittest import pytest +import pickle from sklearn.linear_model import LinearRegression, Lasso, LogisticRegression from sklearn.pipeline import Pipeline from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, PolynomialFeatures @@ -134,6 +135,9 @@ def make_random(n, is_discrete, d): if not(multi) and d_y > 1: continue + # ensure we can serialize the unfit estimator + pickle.dumps(est) + for inf in infs: with self.subTest(d_w=d_w, d_x=d_x, d_y=d_y, d_t=d_t, is_discrete=is_discrete, est=est, inf=inf): @@ -144,6 +148,10 @@ def make_random(n, is_discrete, d): continue est.fit(Y, T, X, W, inference=inf) + + # ensure we can pickle the fit estimator + pickle.dumps(est) + # make sure we can call the marginal_effect and effect methods const_marg_eff = est.const_marginal_effect(X) marg_eff = est.marginal_effect(T, X) @@ -153,6 +161,12 @@ def make_random(n, is_discrete, d): np.testing.assert_array_equal( marg_eff if d_x else marg_eff[0:1], const_marg_eff) + assert isinstance(est.score_, float) + for score in est.nuisance_scores_y: + assert isinstance(score, float) + for score in est.nuisance_scores_t: + assert isinstance(score, float) + T0 = np.full_like(T, 'a') if is_discrete else np.zeros_like(T) eff = est.effect(X, T0=T0, T1=T) self.assertEqual(shape(eff), effect_shape) @@ -911,3 +925,57 @@ def _test_sparse(n_p, d_w, n_r): np.testing.assert_allclose(a, dml.coef_.reshape(-1), atol=1e-1) eff = reshape(t * np.choose(np.tile(p, 2), a), (-1,)) np.testing.assert_allclose(eff, dml.effect(x, T0=0, T1=t), atol=1e-1) + + def test_nuisance_scores(self): + X = np.random.choice(np.arange(5), size=(100, 3)) + y = np.random.normal(size=(100,)) + T = T0 = T1 = np.random.choice(np.arange(3), size=(100, 2)) + W = np.random.normal(size=(100, 2)) + for n_splits in [1, 2, 3]: + est = LinearDMLCateEstimator(n_splits=n_splits) + est.fit(y, T, X, W) + assert len(est.nuisance_scores_t) == len(est.nuisance_scores_y) == n_splits + + def test_categories(self): + dmls = [LinearDMLCateEstimator, SparseLinearDMLCateEstimator] + for ctor in dmls: + dml1 = ctor(LinearRegression(), LogisticRegression(C=1000), + fit_cate_intercept=False, discrete_treatment=True) + dml2 = ctor(LinearRegression(), LogisticRegression(C=1000), + fit_cate_intercept=False, discrete_treatment=True, categories=['c', 'b', 'a']) + + # create a simple artificial setup where effect of moving from treatment + # a -> b is 2, + # a -> c is 1, and + # b -> c is -1 (necessarily, by composing the previous two effects) + # Using an uneven number of examples from different classes, + # and having the treatments in non-lexicographic order, + # should rule out some basic issues. + + # Note that explicitly specifying the dtype as object is necessary until + # there's a fix for https://github.com/scikit-learn/scikit-learn/issues/15616 + + for dml in [dml1, dml2]: + dml.fit(np.array([2, 3, 1, 3, 2, 1, 1, 1]), + np.array(['c', 'b', 'a', 'b', 'c', 'a', 'a', 'a'], dtype='object'), np.ones((8, 1))) + + # estimated effects should be identical when treatment is explicitly given + np.testing.assert_almost_equal( + dml1.effect( + np.ones((9, 1)), + T0=np.array(['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c'], dtype='object'), + T1=np.array(['a', 'b', 'c', 'a', 'b', 'c', 'a', 'b', 'c'], dtype='object') + ), + dml2.effect( + np.ones((9, 1)), + T0=np.array(['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c'], dtype='object'), + T1=np.array(['a', 'b', 'c', 'a', 'b', 'c', 'a', 'b', 'c'], dtype='object') + ), + decimal=4) + + # but const_marginal_effect should be reordered based on the explicit cagetories + cme1 = dml1.const_marginal_effect(np.ones((1, 1))).reshape(-1) + cme2 = dml2.const_marginal_effect(np.ones((1, 1))).reshape(-1) + self.assertAlmostEqual(cme1[1], -cme2[1], places=4) # 1->3 in original ordering; 3->1 in new ordering + # 1-> 2 in original ordering; combination of 3->1 and 3->2 + self.assertAlmostEqual(cme1[0], -cme2[1] + cme2[0], places=4) diff --git a/econml/tests/test_drlearner.py b/econml/tests/test_drlearner.py index 587e844f8..435edcaae 100644 --- a/econml/tests/test_drlearner.py +++ b/econml/tests/test_drlearner.py @@ -4,6 +4,7 @@ import numpy as np import unittest import pytest +import pickle from sklearn.base import TransformerMixin from numpy.random import normal, multivariate_normal, binomial from sklearn.exceptions import DataConversionWarning @@ -109,6 +110,9 @@ def make_random(is_discrete, d): model_final=StatsModelsLinearRegression(), multitask_model_final=True)]: + # ensure that we can serialize unfit estimator + pickle.dumps(est) + # TODO: add stratification to bootstrap so that we can use it even with discrete treatments infs = [None] if isinstance(est, LinearDRLearner): @@ -118,6 +122,10 @@ def make_random(is_discrete, d): with self.subTest(d_w=d_w, d_x=d_x, d_y=d_y, d_t=d_t, is_discrete=is_discrete, est=est, inf=inf): est.fit(Y, T, X, W, inference=inf) + + # ensure that we can serialize fit estimator + pickle.dumps(est) + # make sure we can call the marginal_effect and effect methods const_marg_eff = est.const_marginal_effect( X) diff --git a/econml/tests/test_ortho_iv.py b/econml/tests/test_ortho_iv.py index cfe57930a..5e105d3f4 100644 --- a/econml/tests/test_ortho_iv.py +++ b/econml/tests/test_ortho_iv.py @@ -3,6 +3,7 @@ import unittest import pytest +import pickle from sklearn.linear_model import LinearRegression, Lasso, LogisticRegression from sklearn.pipeline import Pipeline from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, PolynomialFeatures @@ -20,7 +21,7 @@ import econml.tests.utilities # bugfix for assertWarns -class TestDML(unittest.TestCase): +class TestOrthoIV(unittest.TestCase): def test_cate_api(self): """Test that we correctly implement the CATE API.""" @@ -100,16 +101,16 @@ def const_marg_eff_shape(n, d_x, d_y, d_t_final): if not (discrete_t or discrete_z): all_infs.append(BootstrapInference(1)) - estimators = [(DMLATEIV(model_Y_X=Lasso(), - model_T_X=model_t, - model_Z_X=model_z, + estimators = [(DMLATEIV(model_Y_W=Lasso(), + model_T_W=model_t, + model_Z_W=model_z, discrete_treatment=discrete_t, discrete_instrument=discrete_z), True, all_infs), - (ProjectedDMLATEIV(model_Y_X=Lasso(), - model_T_X=model_t, - model_T_XZ=model_t, + (ProjectedDMLATEIV(model_Y_W=Lasso(), + model_T_W=model_t, + model_T_WZ=model_t, discrete_treatment=discrete_t, discrete_instrument=discrete_z), False, @@ -133,89 +134,110 @@ def const_marg_eff_shape(n, d_x, d_y, d_t_final): if not(multi) and d_y > 1 or d_t > 1 or d_z > 1: continue - for inf in infs: - with self.subTest(d_z=d_z, d_x=d_q, d_y=d_y, d_t=d_t, - discrete_t=discrete_t, discrete_z=discrete_z, - est=est, inf=inf): - Z = Z1 - T = T1 - d_t_final = d_t_final1 - X = Q - d_x = d_q - W = None - - if isinstance(est, (DMLATEIV, ProjectedDMLATEIV)): - # these support only W but not X - W = Q - X = None - d_x = None - - def fit(): - return est.fit(Y, T, Z=Z, W=W, inference=inf) - - def score(): - return est.score(Y, T, Z=Z, W=W) - else: - # these support only binary, not general discrete T and Z - if discrete_t: - T = T2 - d_t_final = d_t_final2 - - if discrete_z: - Z = Z2 - - def fit(): - return est.fit(Y, T, Z=Z, X=X, inference=inf) - - def score(): - return est.score(Y, T, Z=Z, X=X) - - marginal_effect_shape = marg_eff_shape(n, d_y, d_t_final) - const_marginal_effect_shape = const_marg_eff_shape(n, d_x, d_y, d_t_final) - - fit() - # make sure we can call the marginal_effect and effect methods - const_marg_eff = est.const_marginal_effect(X) - marg_eff = est.marginal_effect(T, X) - self.assertEqual(shape(marg_eff), marginal_effect_shape) - self.assertEqual(shape(const_marg_eff), const_marginal_effect_shape) - - np.testing.assert_array_equal( - marg_eff if d_x else marg_eff[0:1], const_marg_eff) - - T0 = np.full_like(T, 'a') if discrete_t else np.zeros_like(T) - eff = est.effect(X, T0=T0, T1=T) - self.assertEqual(shape(eff), effect_shape) - - # TODO: add tests for extra properties like coef_ where they exist - - if inf is not None: - const_marg_eff_int = est.const_marginal_effect_interval(X) - marg_eff_int = est.marginal_effect_interval(T, X) - self.assertEqual(shape(marg_eff_int), - (2,) + marginal_effect_shape) - self.assertEqual(shape(const_marg_eff_int), - (2,) + const_marginal_effect_shape) - self.assertEqual(shape(est.effect_interval(X, T0=T0, T1=T)), - (2,) + effect_shape) - - # TODO: add tests for extra properties like coef_ where they exist - - score() - - # make sure we can call effect with implied scalar treatments, - # no matter the dimensions of T, and also that we warn when there - # are multiple treatments - if d_t > 1: - cm = self.assertWarns(Warning) - else: - # ExitStack can be used as a "do nothing" ContextManager - cm = ExitStack() - with cm: - effect_shape2 = (n if d_x else 1,) + ((d_y,) if d_y > 0 else()) - eff = est.effect(X) if not discrete_t else est.effect( - X, T0='a', T1='b') - self.assertEqual(shape(eff), effect_shape2) + # ensure we can serialize unfit estimator + pickle.dumps(est) + + d_ws = [None] + if isinstance(est, LinearIntentToTreatDRIV): + d_ws.append(2) + + for d_w in d_ws: + W = make_random(False, d_w) + + for inf in infs: + with self.subTest(d_z=d_z, d_x=d_q, d_y=d_y, d_t=d_t, + discrete_t=discrete_t, discrete_z=discrete_z, + est=est, inf=inf): + Z = Z1 + T = T1 + d_t_final = d_t_final1 + X = Q + d_x = d_q + + if isinstance(est, (DMLATEIV, ProjectedDMLATEIV)): + # these support only W but not X + W = Q + X = None + d_x = None + + def fit(): + return est.fit(Y, T, Z=Z, W=W, inference=inf) + + def score(): + return est.score(Y, T, Z=Z, W=W) + else: + # these support only binary, not general discrete T and Z + if discrete_t: + T = T2 + d_t_final = d_t_final2 + + if discrete_z: + Z = Z2 + + if isinstance(est, LinearIntentToTreatDRIV): + def fit(): + return est.fit(Y, T, Z=Z, X=X, W=W, inference=inf) + + def score(): + return est.score(Y, T, Z=Z, X=X, W=W) + else: + def fit(): + return est.fit(Y, T, Z=Z, X=X, inference=inf) + + def score(): + return est.score(Y, T, Z=Z, X=X) + + marginal_effect_shape = marg_eff_shape(n, d_y, d_t_final) + const_marginal_effect_shape = const_marg_eff_shape( + n, d_x, d_y, d_t_final) + + fit() + + # ensure we can serialize fit estimator + pickle.dumps(est) + + # make sure we can call the marginal_effect and effect methods + const_marg_eff = est.const_marginal_effect(X) + marg_eff = est.marginal_effect(T, X) + self.assertEqual(shape(marg_eff), marginal_effect_shape) + self.assertEqual(shape(const_marg_eff), const_marginal_effect_shape) + + np.testing.assert_array_equal( + marg_eff if d_x else marg_eff[0:1], const_marg_eff) + + T0 = np.full_like(T, 'a') if discrete_t else np.zeros_like(T) + eff = est.effect(X, T0=T0, T1=T) + self.assertEqual(shape(eff), effect_shape) + + # TODO: add tests for extra properties like coef_ where they exist + + if inf is not None: + const_marg_eff_int = est.const_marginal_effect_interval(X) + marg_eff_int = est.marginal_effect_interval(T, X) + self.assertEqual(shape(marg_eff_int), + (2,) + marginal_effect_shape) + self.assertEqual(shape(const_marg_eff_int), + (2,) + const_marginal_effect_shape) + self.assertEqual(shape(est.effect_interval(X, T0=T0, T1=T)), + (2,) + effect_shape) + + # TODO: add tests for extra properties like coef_ where they exist + + score() + + # make sure we can call effect with implied scalar treatments, + # no matter the dimensions of T, and also that we warn when there + # are multiple treatments + if d_t > 1: + cm = self.assertWarns(Warning) + else: + # ExitStack can be used as a "do nothing" ContextManager + cm = ExitStack() + with cm: + effect_shape2 = (n if d_x else 1,) + ((d_y,) if d_y > 0 else()) + eff = est.effect(X) if not discrete_t else est.effect( + X, T0='a', T1='b') + self.assertEqual(shape(eff), effect_shape2) def test_bad_splits_discrete(self): """ diff --git a/econml/tests/test_ortho_learner.py b/econml/tests/test_ortho_learner.py index 0c0786c1b..940dfb443 100644 --- a/econml/tests/test_ortho_learner.py +++ b/econml/tests/test_ortho_learner.py @@ -28,18 +28,25 @@ def fit(self, X, y, Q, W=None): def predict(self, X, y, Q, W=None): return self._model.predict(X), y - self._model.predict(X), X + def score(self, X, y, Q, W=None): + return self._model.score(X, y) + np.random.seed(123) X = np.random.normal(size=(5000, 3)) y = X[:, 0] + np.random.normal(size=(5000,)) folds = list(KFold(2).split(X, y)) model = Lasso(alpha=0.01) - nuisance, model_list, fitted_inds = _crossfit(Wrapper(model), - folds, - X, y, y, W=y, Z=None) + nuisance, model_list, fitted_inds, scores = _crossfit(Wrapper(model), + folds, + X, y, y, W=y, Z=None) np.testing.assert_allclose(nuisance[0][folds[0][1]], model.fit(X[folds[0][0]], y[folds[0][0]]).predict(X[folds[0][1]])) np.testing.assert_allclose(nuisance[0][folds[0][0]], model.fit(X[folds[0][1]], y[folds[0][1]]).predict(X[folds[0][0]])) + np.testing.assert_allclose(scores[0][0], model.fit(X[folds[0][0]], y[folds[0][0]]).score(X[folds[0][1]], + y[folds[0][1]])) + np.testing.assert_allclose(scores[0][1], model.fit(X[folds[0][1]], y[folds[0][1]]).score(X[folds[0][0]], + y[folds[0][0]])) coef_ = np.zeros(X.shape[1]) coef_[0] = 1 [np.testing.assert_allclose(coef_, mdl._model.coef_, rtol=0, atol=0.08) for mdl in model_list] @@ -50,13 +57,17 @@ def predict(self, X, y, Q, W=None): y = X[:, 0] + np.random.normal(size=(5000,)) folds = list(KFold(2).split(X, y)) model = Lasso(alpha=0.01) - nuisance, model_list, fitted_inds = _crossfit(Wrapper(model), - folds, - X, y, None, W=y, Z=None) + nuisance, model_list, fitted_inds, scores = _crossfit(Wrapper(model), + folds, + X, y, None, W=y, Z=None) np.testing.assert_allclose(nuisance[0][folds[0][1]], model.fit(X[folds[0][0]], y[folds[0][0]]).predict(X[folds[0][1]])) np.testing.assert_allclose(nuisance[0][folds[0][0]], model.fit(X[folds[0][1]], y[folds[0][1]]).predict(X[folds[0][0]])) + np.testing.assert_allclose(scores[0][0], model.fit(X[folds[0][0]], y[folds[0][0]]).score(X[folds[0][1]], + y[folds[0][1]])) + np.testing.assert_allclose(scores[0][1], model.fit(X[folds[0][1]], y[folds[0][1]]).score(X[folds[0][0]], + y[folds[0][0]])) coef_ = np.zeros(X.shape[1]) coef_[0] = 1 [np.testing.assert_allclose(coef_, mdl._model.coef_, rtol=0, atol=0.08) for mdl in model_list] @@ -67,13 +78,17 @@ def predict(self, X, y, Q, W=None): y = X[:, 0] + np.random.normal(size=(5000,)) folds = list(KFold(2).split(X, y)) model = Lasso(alpha=0.01) - nuisance, model_list, fitted_inds = _crossfit(Wrapper(model), - folds, - X, y, None, W=None, Z=None) + nuisance, model_list, fitted_inds, scores = _crossfit(Wrapper(model), + folds, + X, y, None, W=None, Z=None) np.testing.assert_allclose(nuisance[0][folds[0][1]], model.fit(X[folds[0][0]], y[folds[0][0]]).predict(X[folds[0][1]])) np.testing.assert_allclose(nuisance[0][folds[0][0]], model.fit(X[folds[0][1]], y[folds[0][1]]).predict(X[folds[0][0]])) + np.testing.assert_allclose(scores[0][0], model.fit(X[folds[0][0]], y[folds[0][0]]).score(X[folds[0][1]], + y[folds[0][1]])) + np.testing.assert_allclose(scores[0][1], model.fit(X[folds[0][1]], y[folds[0][1]]).score(X[folds[0][0]], + y[folds[0][0]])) coef_ = np.zeros(X.shape[1]) coef_[0] = 1 [np.testing.assert_allclose(coef_, mdl._model.coef_, rtol=0, atol=0.08) for mdl in model_list] @@ -98,9 +113,9 @@ def predict(self, X, y, W=None): (np.arange(X.shape[0] // 2), np.arange(X.shape[0] // 2, X.shape[0]))] model = Lasso(alpha=0.01) with pytest.raises(AttributeError) as e_info: - nuisance, model_list, fitted_inds = _crossfit(Wrapper(model), - folds, - X, y, W=y, Z=None) + nuisance, model_list, fitted_inds, scores = _crossfit(Wrapper(model), + folds, + X, y, W=y, Z=None) np.random.seed(123) X = np.random.normal(size=(5000, 3)) @@ -108,9 +123,9 @@ def predict(self, X, y, W=None): folds = [(np.arange(X.shape[0]), np.arange(X.shape[0]))] model = Lasso(alpha=0.01) with pytest.raises(AttributeError) as e_info: - nuisance, model_list, fitted_inds = _crossfit(Wrapper(model), - folds, - X, y, W=y, Z=None) + nuisance, model_list, fitted_inds, scores = _crossfit(Wrapper(model), + folds, + X, y, W=y, Z=None) def test_ol(self): @@ -149,7 +164,8 @@ def score(self, Y, T, W=None, nuisances=None): sigma = 0.1 y = X[:, 0] + X[:, 1] + np.random.normal(0, sigma, size=(10000,)) est = _OrthoLearner(ModelNuisance(LinearRegression(), LinearRegression()), ModelFinal(), - n_splits=2, discrete_treatment=False, discrete_instrument=False, random_state=None) + n_splits=2, discrete_treatment=False, discrete_instrument=False, categories='auto', + random_state=None) est.fit(y, X[:, 0], W=X[:, 1:]) np.testing.assert_almost_equal(est.const_marginal_effect(), 1, decimal=3) np.testing.assert_array_almost_equal(est.effect(), np.ones(1), decimal=3) @@ -157,6 +173,8 @@ def score(self, Y, T, W=None, nuisances=None): np.testing.assert_almost_equal(est.score(y, X[:, 0], W=X[:, 1:]), sigma**2, decimal=3) np.testing.assert_almost_equal(est.score_, sigma**2, decimal=3) np.testing.assert_almost_equal(est.model_final.model.coef_[0], 1, decimal=3) + # Nuisance model has no score method, so nuisance_scores_ should be none + assert est.nuisance_scores_ is None # Test non keyword based calls to fit np.random.seed(123) @@ -164,7 +182,8 @@ def score(self, Y, T, W=None, nuisances=None): sigma = 0.1 y = X[:, 0] + X[:, 1] + np.random.normal(0, sigma, size=(10000,)) est = _OrthoLearner(ModelNuisance(LinearRegression(), LinearRegression()), ModelFinal(), - n_splits=2, discrete_treatment=False, discrete_instrument=False, random_state=None) + n_splits=2, discrete_treatment=False, discrete_instrument=False, + categories='auto', random_state=None) # test non-array inputs est.fit(list(y), list(X[:, 0]), None, X[:, 1:]) np.testing.assert_almost_equal(est.const_marginal_effect(), 1, decimal=3) @@ -181,7 +200,8 @@ def score(self, Y, T, W=None, nuisances=None): y = X[:, 0] + X[:, 1] + np.random.normal(0, sigma, size=(10000,)) est = _OrthoLearner(ModelNuisance(LinearRegression(), LinearRegression()), ModelFinal(), n_splits=KFold(n_splits=3), - discrete_treatment=False, discrete_instrument=False, random_state=None) + discrete_treatment=False, discrete_instrument=False, + categories='auto', random_state=None) est.fit(y, X[:, 0], None, X[:, 1:]) np.testing.assert_almost_equal(est.const_marginal_effect(), 1, decimal=3) np.testing.assert_array_almost_equal(est.effect(), np.ones(1), decimal=3) @@ -198,7 +218,7 @@ def score(self, Y, T, W=None, nuisances=None): folds = [(np.arange(X.shape[0] // 2), np.arange(X.shape[0] // 2, X.shape[0]))] est = _OrthoLearner(ModelNuisance(LinearRegression(), LinearRegression()), ModelFinal(), n_splits=KFold(n_splits=3), discrete_treatment=False, - discrete_instrument=False, random_state=None) + discrete_instrument=False, categories='auto', random_state=None) est.fit(y, X[:, 0], None, X[:, 1:]) np.testing.assert_almost_equal(est.const_marginal_effect(), 1, decimal=3) np.testing.assert_array_almost_equal(est.effect(), np.ones(1), decimal=3) @@ -239,7 +259,8 @@ def predict(self, X=None): sigma = 0.1 y = X[:, 0] + X[:, 1] + np.random.normal(0, sigma, size=(10000,)) est = _OrthoLearner(ModelNuisance(LinearRegression(), LinearRegression()), ModelFinal(), - n_splits=2, discrete_treatment=False, discrete_instrument=False, random_state=None) + n_splits=2, discrete_treatment=False, discrete_instrument=False, + categories='auto', random_state=None) est.fit(y, X[:, 0], W=X[:, 1:]) np.testing.assert_almost_equal(est.const_marginal_effect(), 1, decimal=3) np.testing.assert_array_almost_equal(est.effect(), np.ones(1), decimal=3) @@ -247,6 +268,55 @@ def predict(self, X=None): assert est.score_ is None np.testing.assert_almost_equal(est.model_final.model.coef_[0], 1, decimal=3) + def test_ol_nuisance_scores(self): + class ModelNuisance: + def __init__(self, model_t, model_y): + self._model_t = model_t + self._model_y = model_y + + def fit(self, Y, T, W=None): + self._model_t.fit(W, T) + self._model_y.fit(W, Y) + return self + + def predict(self, Y, T, W=None): + return Y - self._model_y.predict(W), T - self._model_t.predict(W) + + def score(self, Y, T, W=None): + return (self._model_t.score(W, Y), self._model_y.score(W, T)) + + class ModelFinal: + + def __init__(self): + return + + def fit(self, Y, T, W=None, nuisances=None): + Y_res, T_res = nuisances + self.model = LinearRegression(fit_intercept=False).fit(T_res.reshape(-1, 1), Y_res) + return self + + def predict(self, X=None): + return self.model.coef_[0] + + np.random.seed(123) + X = np.random.normal(size=(10000, 3)) + sigma = 0.1 + y = X[:, 0] + X[:, 1] + np.random.normal(0, sigma, size=(10000,)) + est = _OrthoLearner(ModelNuisance(LinearRegression(), LinearRegression()), ModelFinal(), + n_splits=2, discrete_treatment=False, discrete_instrument=False, + categories='auto', random_state=None) + est.fit(y, X[:, 0], W=X[:, 1:]) + np.testing.assert_almost_equal(est.const_marginal_effect(), 1, decimal=3) + np.testing.assert_array_almost_equal(est.effect(), np.ones(1), decimal=3) + np.testing.assert_array_almost_equal(est.effect(T0=0, T1=10), np.ones(1) * 10, decimal=2) + np.testing.assert_almost_equal(est.model_final.model.coef_[0], 1, decimal=3) + nuisance_scores_y = est.nuisance_scores_[0] + nuisance_scores_t = est.nuisance_scores_[1] + assert len(nuisance_scores_y) == len(nuisance_scores_t) == 2 # as many scores as splits + # y scores should be positive, since W predicts Y somewhat + # t scores might not be, since W and T are uncorrelated + np.testing.assert_array_less(0, nuisance_scores_y) + def test_ol_discrete_treatment(self): class ModelNuisance: def __init__(self, model_t, model_y): @@ -287,7 +357,8 @@ def score(self, Y, T, W=None, nuisances=None): sigma = 0.01 y = T + X[:, 0] + np.random.normal(0, sigma, size=(10000,)) est = _OrthoLearner(ModelNuisance(LogisticRegression(solver='lbfgs'), LinearRegression()), ModelFinal(), - n_splits=2, discrete_treatment=True, discrete_instrument=False, random_state=None) + n_splits=2, discrete_treatment=True, discrete_instrument=False, + categories='auto', random_state=None) est.fit(y, T, W=X) np.testing.assert_almost_equal(est.const_marginal_effect(), 1, decimal=3) np.testing.assert_array_almost_equal(est.effect(), np.ones(1), decimal=3) diff --git a/econml/utilities.py b/econml/utilities.py index f583a0b21..4343d7e90 100644 --- a/econml/utilities.py +++ b/econml/utilities.py @@ -411,6 +411,23 @@ def t(X): return _apply(t, X) +def add_intercept(X): + """ + Adds an intercept feature to an array by prepending a column of ones. + + Parameters + ---------- + X : array-like + Input array. Must be 2D. + + Returns + ------- + arr : ndarray + `X` with a column of ones prepended + """ + return hstack([np.ones((X.shape[0], 1)), X]) + + def reshape_Y_T(Y, T): """ Reshapes Y and T when Y.ndim = 2 and/or T.ndim = 1. @@ -1371,3 +1388,24 @@ def predict(self, XZ): @property def coef_(self): return np.concatenate((model.coef_ for model in self.models)) + + +class _EncoderWrapper: + """ + Wraps a OneHotEncoder (and optionally also a LabelEncoder). + + Useful mainly so that the `encode` method can be used in a FunctionTransformer, + which would otherwise need a lambda (which can't be pickled). + """ + + def __init__(self, one_hot_encoder, label_encoder=None, drop_first=False): + self._label_encoder = label_encoder + self._one_hot_encoder = one_hot_encoder + self._drop_first = drop_first + + def encode(self, arr): + if self._label_encoder: + arr = self._label_encoder.transform(arr.ravel()) + + result = self._one_hot_encoder.transform(reshape(arr, (-1, 1))) + return result[:, 1:] if self._drop_first else result diff --git a/notebooks/Deep IV Examples.ipynb b/notebooks/Deep IV Examples.ipynb index 035b73178..10b86d173 100644 --- a/notebooks/Deep IV Examples.ipynb +++ b/notebooks/Deep IV Examples.ipynb @@ -21,7 +21,7 @@ "source": [ "# Deep IV: Use Case and Examples\n", "\n", - "Deep IV uses deep neural networks in a two-stage instrumental variable (IV) estimation of causal effects, as described in [this ICML publication](http://proceedings.mlr.press/v70/hartford17a/hartford17a.pdf) or in the `econml` [specification](https://econml.azurewebsites.net/spec/estimation/iv.html#deep-instrumental-variables). In the EconML SDK, we have implemented Deep IV estimation on top of the Keras framework for building and training neural networks. In this notebook, we'll demonstrate how to use the SDK to apply Deep IV to synthetic data.\n", + "Deep IV uses deep neural networks in a two-stage instrumental variable (IV) estimation of causal effects, as described in [this ICML publication](http://proceedings.mlr.press/v70/hartford17a/hartford17a.pdf) or in the `econml` [specification](https://econml.azurewebsites.net/spec/estimation/deepiv.html). In the EconML SDK, we have implemented Deep IV estimation on top of the Keras framework for building and training neural networks. In this notebook, we'll demonstrate how to use the SDK to apply Deep IV to synthetic data.\n", "\n", "### Data\n", "\n", diff --git a/notebooks/Metalearners Examples.ipynb b/notebooks/Metalearners Examples.ipynb index e37a1e1ba..ff1f9b929 100644 --- a/notebooks/Metalearners Examples.ipynb +++ b/notebooks/Metalearners Examples.ipynb @@ -215,8 +215,7 @@ "# Instantiate X learner\n", "models = GradientBoostingRegressor(n_estimators=100, max_depth=6, min_samples_leaf=int(n/100))\n", "propensity_model = RandomForestClassifier(n_estimators=100, max_depth=6, \n", - " min_samples_leaf=int(n/100),\n", - " class_weight='balanced_subsample')\n", + " min_samples_leaf=int(n/100))\n", "X_learner = XLearner(models=models, propensity_model=propensity_model)\n", "# Train X_learner\n", "X_learner.fit(Y, T, X)\n", @@ -234,8 +233,7 @@ "models = GradientBoostingRegressor(n_estimators=100, max_depth=6, min_samples_leaf=int(n/100))\n", "final_models = GradientBoostingRegressor(n_estimators=100, max_depth=6, min_samples_leaf=int(n/100))\n", "propensity_model = RandomForestClassifier(n_estimators=100, max_depth=6, \n", - " min_samples_leaf=int(n/100),\n", - " class_weight='balanced_subsample')\n", + " min_samples_leaf=int(n/100))\n", "DA_learner = DomainAdaptationLearner(models=models, final_models=final_models, \n", " propensity_model=propensity_model)\n", "# Train DA_learner\n", @@ -255,8 +253,7 @@ "outcome_model = GradientBoostingRegressor(n_estimators=100, max_depth=6, min_samples_leaf=int(n/100))\n", "pseudo_treatment_model = GradientBoostingRegressor(n_estimators=100, max_depth=6, min_samples_leaf=int(n/100))\n", "propensity_model = RandomForestClassifier(n_estimators=100, max_depth=6, \n", - " min_samples_leaf=int(n/100),\n", - " class_weight='balanced_subsample')\n", + " min_samples_leaf=int(n/100))\n", "\n", "DR_learner = DRLearner(model_regression=outcome_model, model_propensity=propensity_model,\n", " model_final=pseudo_treatment_model, n_splits=5)\n",