diff --git a/docs/tutorial.md b/docs/tutorial.md index 77ab603e..961771aa 100644 --- a/docs/tutorial.md +++ b/docs/tutorial.md @@ -48,6 +48,7 @@ Supported covariance types are "iid", "HC1-3", CRV1 and CRV3 (one-way clustering `.vcov()` method: ```py + fixest.vcov({'CRV1':'group_id'}).summary() # >>> fixest.vcov({'CRV1':'group_id'}).summary() # diff --git a/poetry.lock b/poetry.lock index b4d54aa2..0e98b127 100644 --- a/poetry.lock +++ b/poetry.lock @@ -94,14 +94,15 @@ redis = ["redis (>=2.10.5)"] [[package]] name = "certifi" -version = "2022.12.7" +version = "2023.5.7" + description = "Python package for providing Mozilla's CA Bundle." category = "main" optional = false python-versions = ">=3.6" files = [ - {file = "certifi-2022.12.7-py3-none-any.whl", hash = "sha256:4ad3232f5e926d6718ec31cfc1fcadfde020920e278684144551c91769c7bc18"}, - {file = "certifi-2022.12.7.tar.gz", hash = "sha256:35824b4c3a97115964b408844d64aa14db1cc518f6562e8d7261699d1350a9e3"}, + {file = "certifi-2023.5.7-py3-none-any.whl", hash = "sha256:c6c2e98f5c7869efca1f8916fed228dd91539f9f1b444c314c06eef02980c716"}, + {file = "certifi-2023.5.7.tar.gz", hash = "sha256:0f0d56dc5a6ad56fd4ba36484d6cc34451e1c6548c61daad8c320169f91eddc7"}, ] [[package]] @@ -570,14 +571,14 @@ testing = ["covdefaults (>=2.3)", "coverage (>=7.2.3)", "diff-cover (>=7.5)", "p [[package]] name = "fonttools" -version = "4.39.3" +version = "4.39.4" description = "Tools to manipulate font files" category = "main" optional = false python-versions = ">=3.8" files = [ - {file = "fonttools-4.39.3-py3-none-any.whl", hash = "sha256:64c0c05c337f826183637570ac5ab49ee220eec66cf50248e8df527edfa95aeb"}, - {file = "fonttools-4.39.3.zip", hash = "sha256:9234b9f57b74e31b192c3fc32ef1a40750a8fbc1cd9837a7b7bfc4ca4a5c51d7"}, + {file = "fonttools-4.39.4-py3-none-any.whl", hash = "sha256:106caf6167c4597556b31a8d9175a3fdc0356fdcd70ab19973c3b0d4c893c461"}, + {file = "fonttools-4.39.4.zip", hash = "sha256:dba8d7cdb8e2bac1b3da28c5ed5960de09e59a2fe7e63bb73f5a59e57b0430d2"}, ] [package.extras] @@ -652,14 +653,14 @@ files = [ [[package]] name = "griffe" -version = "0.27.3" +version = "0.27.5" description = "Signatures for entire Python programs. Extract the structure, the frame, the skeleton of your project, to generate API documentation or find breaking changes in your API." 
category = "dev" optional = true python-versions = ">=3.7" files = [ - {file = "griffe-0.27.3-py3-none-any.whl", hash = "sha256:094513b209d4acd4b2680c2415d3af5f8ed925714795380c2a7d070e222e0b27"}, - {file = "griffe-0.27.3.tar.gz", hash = "sha256:a3d0f75aa76b80f181f818cf605f658a69fccf287aaeeeafc7a6cf4e6a2ca27e"}, + {file = "griffe-0.27.5-py3-none-any.whl", hash = "sha256:15b48fc3cebfc1c1a1a2f6e8177f6644a4a54517322e08e224fdf671454b34d7"}, + {file = "griffe-0.27.5.tar.gz", hash = "sha256:96fbc7a264bdb32b4da227bed6a16f2509e028a12d7471dbb48c2785bb01817f"}, ] [package.dependencies] @@ -2514,24 +2515,28 @@ test = ["pytest (>=6.0.0)"] [[package]] name = "wildboottest" -version = "0.1.10" +version = "0.1.13" description = "Wild Cluster Bootstrap Inference for Linear Models in Python" category = "main" optional = false python-versions = ">=3.7,<3.11" -files = [ - {file = "wildboottest-0.1.10-py3-none-any.whl", hash = "sha256:b7aad4fe357af49bd130491ce9332bceac9c0e47b0d467a93b0e125d1c659853"}, - {file = "wildboottest-0.1.10.tar.gz", hash = "sha256:c2f2653ecf588768b4946f1127d22bb39d8cf68b4f288cf92b8b0e46ec34f7ed"}, -] +files = [] +develop = false [package.dependencies] -numba = ">=0.55,<0.56" -numpy = ">=1.18,<2.0" -pandas = ">=1.4,<2.0" -poetry = ">=1.4.2,<2.0.0" -pytest = ">=7.2.0,<8.0.0" -statsmodels = ">=0.13,<0.14" -tabulate = ">=0.9.0,<0.10.0" +numba = "^0.55" +numpy = "^1.18" +pandas = "^1.4" +poetry = "^1.4.2" +pytest = "^7.2.0" +statsmodels = "^0.13" +tabulate = "^0.9.0" + +[package.source] +type = "git" +url = "https://github.com/s3alfisc/wildboottest.git" +reference = "main" +resolved_reference = "5e7723d611102da6bb892a9370d6c5ef7e99822f" [[package]] name = "wrapt" @@ -2722,4 +2727,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more [metadata] lock-version = "2.0" python-versions = ">=3.8,<3.11" -content-hash = "5aa6a6f0ca4277279991a4def7acb72f109ea65178c60fdfb6676f93526b499e" +content-hash = "d814efcfb5cbc47db937d7bc7b943fa761d624aec7b242156da157d519f2a0f3" diff --git a/pyfixest/FormulaParser.py b/pyfixest/FormulaParser.py index 4100ea3b..e3b74035 100644 --- a/pyfixest/FormulaParser.py +++ b/pyfixest/FormulaParser.py @@ -51,15 +51,50 @@ def __init__(self, fml): fml_split = fml.split('|') depvars, covars = fml_split[0].split("~") - if len(fml_split) > 1: - fevars = fml_split[1] - else: + if len(fml_split) == 1: fevars = "0" + endogvars = None + instruments = None + elif len(fml_split) == 2: + if "~" in fml_split[1]: + fevars = "0" + endogvars, instruments = fml_split[1].split("~") + else: + fevars = fml_split[1] + endogvars = None + instruments = None + elif len(fml_split) == 3: + fevars = fml_split[1] + endogvars, instruments = fml_split[2].split("~") + + if endogvars is not None: + if len(endogvars) > len(instruments): + raise ValueError("The IV system is underdetermined. Only fully determined systems are allowed. Please provide as many instruments as endogenous variables.") + elif len(endogvars) < len(instruments): + raise ValueError("The IV system is overdetermined. Only fully determined systems are allowed. 
Please provide as many instruments as endogenous variables.") + else: + pass # Parse all individual formula components into lists self.depvars = depvars.split("+") self.covars = _unpack_fml(covars) self.fevars = _unpack_fml(fevars) + # no fancy syntax for endogvars, instruments allowed + self.endogvars = endogvars + self.instruments = instruments + + if instruments is not None: + self.is_iv = True + # all rhs variables for the first stage (endog variable replaced with instrument) + first_stage_covars_list = covars.split("+") + first_stage_covars_list[first_stage_covars_list.index(endogvars)] = instruments + self.first_stage_covars_list = "+".join(first_stage_covars_list) + self.covars_first_stage = _unpack_fml(self.first_stage_covars_list) + self.depvars_first_stage = endogvars + else: + self.is_iv = False + self.covars_first_stage = None + self.depvars_first_stage = None if self.covars.get("i") is not None: self.ivars = dict() @@ -81,7 +116,10 @@ def __init__(self, fml): # Pack the formula components back into strings self.covars_fml = _pack_to_fml(self.covars) self.fevars_fml = _pack_to_fml(self.fevars) - + if instruments is not None: + self.covars_first_stage_fml = _pack_to_fml(self.covars_first_stage) + else: + self.covars_first_stage_fml = None #if "^" in self.covars: # raise CovariateInteractionError("Please use 'i()' or ':' syntax to interact covariates.") @@ -93,63 +131,105 @@ def __init__(self, fml): - def get_fml_dict(self): + def get_fml_dict(self, iv = False): """ Returns a dictionary of all fevars & formula without fevars. The keys are the fixed effect variable combinations. The values are lists of formula strings that do not include the fixed effect variables. + Args: + iv (bool): If True, the formula dictionary will be returned for the first stage of an IV regression. + If False, the formula dictionary will be returned for the second stage of an IV regression / OLS regression. Returns: dict: A dictionary of the form {"fe1+fe2": ['Y1 ~ X', 'Y2~X'], "fe1+fe3": ['Y1 ~ X', 'Y2~X']} where the keys are the fixed effect variable combinations and the values are lists of formula strings that do not include the fixed effect variables. + If IV is True, creates an instance named fml_dict_iv. Otherwise, creates an instance named fml_dict. 
""" - self.fml_dict = dict() + + fml_dict = dict() for fevar in self.fevars_fml: res = [] for depvar in self.depvars: - for covar in self.covars_fml: - res.append(depvar + '~' + covar) - self.fml_dict[fevar] = res + if iv: + for covar in self.covars_first_stage_fml: + res.append(depvar + '~' + covar) + else: + for covar in self.covars_fml: + res.append(depvar + '~' + covar) + fml_dict[fevar] = res + if iv: + self.fml_dict_iv = fml_dict + else: + self.fml_dict = fml_dict - def _transform_fml_dict(self): + def _transform_fml_dict(self, iv = False): fml_dict2 = dict() - for fe in self.fml_dict.keys(): + if iv: - fml_dict2[fe] = dict() + for fe in self.fml_dict_iv.keys(): - for fml in self.fml_dict.get(fe): - depvars, covars = fml.split("~") - if fml_dict2[fe].get(depvars) is None: - fml_dict2[fe][depvars] = [covars] - else: - fml_dict2[fe][depvars].append(covars) + fml_dict2[fe] = dict() - self.fml_dict2 = fml_dict2 + for fml in self.fml_dict_iv.get(fe): + depvars, covars = fml.split("~") + if fml_dict2[fe].get(depvars) is None: + fml_dict2[fe][depvars] = [covars] + else: + fml_dict2[fe][depvars].append(covars) + else: + + for fe in self.fml_dict.keys(): + + fml_dict2[fe] = dict() + + for fml in self.fml_dict.get(fe): + depvars, covars = fml.split("~") + if fml_dict2[fe].get(depvars) is None: + fml_dict2[fe][depvars] = [covars] + else: + fml_dict2[fe][depvars].append(covars) + + if iv: + self.fml_dict2_iv = fml_dict2 + else: + self.fml_dict2 = fml_dict2 - def get_var_dict(self): + def get_var_dict(self, iv = False): """ Create a dictionary of all fevars and list of covars and depvars used in regression with those fevars. The keys are the fixed effect variable combinations. The values are lists of variables (dependent variables and covariates) of the resespective regressions. + Args: + iv (bool): If True, the formula dictionary will be returned for the first stage of an IV regression. + Returns: dict: A dictionary of the form {"fe1+fe2": ['Y1', 'X1', 'X2'], "fe1+fe3": ['Y1', 'X1', 'X2']} where the keys are the fixed effect variable combinations and the values are lists of variables (dependent variables and covariates) used in the regression with those fixed effect variables. 
""" - self.var_dict = dict() - for fevar in self.fevars_fml: - self.var_dict[fevar] = _flatten_list(self.depvars) + _flatten_list(list(self.covars.values())) + var_dict = dict() + if iv: + for fevar in self.fevars_fml: + var_dict[fevar] = _flatten_list(self.depvars) + _flatten_list(list(self.covars_first_stage.values())) + else: + for fevar in self.fevars_fml: + var_dict[fevar] = _flatten_list(self.depvars) + _flatten_list(list(self.covars.values())) + + if iv: + self.var_dict_iv = var_dict + else: + self.var_dict = var_dict def _unpack_fml(x): diff --git a/pyfixest/__pycache__/FormulaParser.cpython-310.pyc b/pyfixest/__pycache__/FormulaParser.cpython-310.pyc index 48494aa2..98fee1c7 100644 Binary files a/pyfixest/__pycache__/FormulaParser.cpython-310.pyc and b/pyfixest/__pycache__/FormulaParser.cpython-310.pyc differ diff --git a/pyfixest/__pycache__/feols.cpython-310.pyc b/pyfixest/__pycache__/feols.cpython-310.pyc index 073d15f6..2b044de4 100644 Binary files a/pyfixest/__pycache__/feols.cpython-310.pyc and b/pyfixest/__pycache__/feols.cpython-310.pyc differ diff --git a/pyfixest/__pycache__/fixest.cpython-310.pyc b/pyfixest/__pycache__/fixest.cpython-310.pyc index e6287d5a..a4c59184 100644 Binary files a/pyfixest/__pycache__/fixest.cpython-310.pyc and b/pyfixest/__pycache__/fixest.cpython-310.pyc differ diff --git a/pyfixest/__pycache__/utils.cpython-310.pyc b/pyfixest/__pycache__/utils.cpython-310.pyc index dbd838c5..555fa464 100644 Binary files a/pyfixest/__pycache__/utils.cpython-310.pyc and b/pyfixest/__pycache__/utils.cpython-310.pyc differ diff --git a/pyfixest/feols.py b/pyfixest/feols.py index d931f95c..96c18017 100644 --- a/pyfixest/feols.py +++ b/pyfixest/feols.py @@ -22,6 +22,8 @@ class Feols: Dependent variable of the regression. X : Union[np.ndarray, pd.DataFrame] Independent variable of the regression. + Z: Union[np.ndarray, pd.DataFrame] + Instruments of the regression. Attributes ---------- @@ -29,6 +31,8 @@ class Feols: The dependent variable of the regression. X : np.ndarray The independent variable of the regression. + Z : np.ndarray + The instruments of the regression. N : int The number of observations. k : int @@ -48,7 +52,7 @@ class Feols: """ - def __init__(self, Y: np.ndarray, X: np.ndarray) -> None: + def __init__(self, Y: np.ndarray, X: np.ndarray, Z: np.ndarray) -> None: if not isinstance(Y, (np.ndarray)): raise TypeError("Y must be a numpy array.") @@ -57,9 +61,12 @@ def __init__(self, Y: np.ndarray, X: np.ndarray) -> None: self.Y = Y self.X = X + self.Z = Z if self.X.ndim != 2: raise ValueError("X must be a 2D array") + if self.Z.ndim != 2: + raise ValueError("Z must be a 2D array") self.N, self.k = X.shape @@ -68,14 +75,14 @@ def get_fit(self) -> None: Regression estimation for a single model, via ordinary least squares (OLS). 
''' - self.tXX = np.transpose(self.X) @ self.X - self.tXXinv = np.linalg.inv(self.tXX) + self.tZX = np.transpose(self.Z) @ self.X + self.tZXinv = np.linalg.inv(self.tZX) - self.tXy = (np.transpose(self.X) @ self.Y) - beta_hat = self.tXXinv @ self.tXy + self.tZy = (np.transpose(self.Z) @ self.Y) + beta_hat = self.tZXinv @ self.tZy self.beta_hat = beta_hat.flatten() self.Y_hat = (self.X @ self.beta_hat) - self.u_hat = (self.Y - self.Y_hat) + self.u_hat = (self.Y.flatten() - self.Y_hat) def get_vcov(self, vcov: Union[str, Dict[str, str], List[str]]) -> None: ''' @@ -150,6 +157,9 @@ def get_vcov(self, vcov: Union[str, Dict[str, str], List[str]]) -> None: self.is_clustered = True + if self.is_iv: + if self.vcov_type in ["CRV3"]: + raise ValueError("CRV3 inference is not supported for IV regressions.") # compute vcov if self.vcov_type == 'iid': @@ -164,7 +174,13 @@ def get_vcov(self, vcov: Union[str, Dict[str, str], List[str]]) -> None: ) # only relevant factor for iid in ssc: fixef.K - self.vcov = self.ssc * self.tXXinv * (np.sum(self.u_hat ** 2) / (self.N - 1)) + if self.is_iv == False: + self.vcov = self.ssc * self.tZXinv * (np.sum(self.u_hat ** 2) / (self.N - 1)) + else: + sigma2 = (np.sum(self.u_hat ** 2) / (self.N - 1)) + tZZinv = np.linalg.inv(np.transpose(self.Z) @ self.Z) # k x k + tXZ = np.transpose(self.X) @ self.Z + self.vcov = self.ssc * np.linalg.inv(tXZ @ tZZinv @ self.tZX ) * sigma2 # elif self.vcov_type == 'hetero': @@ -180,15 +196,27 @@ def get_vcov(self, vcov: Union[str, Dict[str, str], List[str]]) -> None: if vcov_type_detail in ["hetero", "HC1"]: u = self.u_hat elif vcov_type_detail in ["HC2", "HC3"]: - leverage = np.sum(self.X * (self.X @ self.tXXinv), axis=1) + leverage = np.sum(self.X * (self.X @ self.tZXinv), axis=1) if vcov_type_detail == "HC2": u = self.u_hat / np.sqrt(1 - leverage) else: u = self.u_hat / (1-leverage) - meat = np.transpose(self.X) * (u ** 2) @ self.X - # set off diagonal elements to zero - self.vcov = self.ssc * self.tXXinv @ meat @ self.tXXinv + if self.is_iv == False: + meat = np.transpose(self.Z) * (u ** 2) @ self.Z + # set off diagonal elements to zero + self.vcov = self.ssc * self.tZXinv @ meat @ self.tZXinv + else: + tZZinv = np.linalg.inv(np.transpose(self.Z) @ self.Z) # k x k + tXZ = np.transpose(self.X) @ self.Z # k x k + if u.ndim == 1: + u = u.reshape((self.N,1)) + Omega = np.transpose(self.Z) @ (self.Z * (u ** 2)) # k x k + meat = tXZ @ tZZinv @ Omega @ tZZinv @ self.tZX # k x k + bread = np.linalg.inv(tXZ @ tZZinv @ self.tZX) + self.vcov = self.ssc * bread @ meat @ bread + + elif self.vcov_type == "CRV": @@ -218,16 +246,24 @@ def get_vcov(self, vcov: Union[str, Dict[str, str], List[str]]) -> None: if vcov_type_detail == "CRV1": + meat = np.zeros((self.k, self.k)) - for igx, g, in enumerate(clustid): + for _, g, in enumerate(clustid): - Xg = self.X[np.where(cluster_df == g)] + Zg = self.Z[np.where(cluster_df == g)] ug = self.u_hat[np.where(cluster_df == g)] - score_g = (np.transpose(Xg) @ ug).reshape((self.k, 1)) + score_g = (np.transpose(Zg) @ ug).reshape((self.k, 1)) meat += np.dot(score_g, score_g.transpose()) - self.vcov = self.ssc * self.tXXinv @ meat @ self.tXXinv + if self.is_iv == False: + self.vcov = self.ssc * self.tZXinv @ meat @ self.tZXinv + else: + tZZinv = np.linalg.inv(np.transpose(self.Z) @ self.Z) # k x k + tXZ = np.transpose(self.X) @ self.Z # k x k + meat = tXZ @ tZZinv @ meat @ tZZinv @ self.tZX + bread = np.linalg.inv(tXZ @ tZZinv @ self.tZX) + self.vcov = self.ssc * bread @ meat @ bread elif vcov_type_detail == 
"CRV3": @@ -238,6 +274,9 @@ def get_vcov(self, vcov: Union[str, Dict[str, str], List[str]]) -> None: #if self.has_fixef: # raise ValueError("CRV3 inference is currently not supported with fixed effects.") + if self.is_iv: + raise ValueError("CRV3 inference is not supported with IV estimation.") + k_params = self.k beta_hat = self.beta_hat @@ -360,11 +399,13 @@ def get_wildboottest(self, B:int, cluster : Union[np.ndarray, pd.Series, pd.Data Returns: a pd.DataFrame with bootstrapped t-statistic and p-value ''' + if self.is_iv: + raise ValueError("Wild cluster bootstrap is not supported with IV estimation.") if self.has_fixef: raise ValueError("Wild cluster bootstrap is not supported with fixed effects.") xnames = self.coefnames.to_list() - Y = self.Y + Y = self.Y.flatten() X = self.X # later: allow r <> 0 and custom R diff --git a/pyfixest/fixest.py b/pyfixest/fixest.py index fb556e13..f26d6761 100644 --- a/pyfixest/fixest.py +++ b/pyfixest/fixest.py @@ -66,8 +66,9 @@ def _demean(self, data: pd.DataFrame, fval: str, ivars: List[str], drop_ref: str None ''' - YX_dict = dict() + YXZ_dict = dict() na_dict = dict() + var_dict = dict() if fval != "0": fe, fe_na = self._clean_fe(data, fval) @@ -77,6 +78,8 @@ def _demean(self, data: pd.DataFrame, fval: str, ivars: List[str], drop_ref: str fe_na = None dict2fe = self.fml_dict2.get(fval) + if self.is_iv: + dict2fe_iv = self.fml_dict2_iv.get(fval) # create lookup table with NA index key # for each regression, check if lookup table already @@ -87,35 +90,72 @@ def _demean(self, data: pd.DataFrame, fval: str, ivars: List[str], drop_ref: str lookup_demeaned_data = dict() + # loop over both dict2fe and dict2fe_iv (if the latter is not None) for depvar in dict2fe.keys(): # [(0, 'X1+X2'), (1, ['X1+X3'])] - for c, covar in enumerate(dict2fe.get(depvar)): + for _, covar in enumerate(dict2fe.get(depvar)): covar2 = covar depvar2 = depvar fml = depvar2 + " ~ " + covar2 - Y, X = model_matrix(fml, data) + if self.is_iv: + instruments2 = dict2fe_iv.get(depvar)[0] + endogvar = list(set(covar2.split("+")) - set(instruments2.split("+")))[0] + instrument = list(set(instruments2.split("+")) - set(covar2.split("+")))[0] + + if len([instrument]) > 1 or len([endogvar]) > 1: + raise ValueError("Currently, IV estimation is only supported with one endogeneous variable and one instrument.") + + fml2 = instrument + "+" + fml + rhs, lhs = model_matrix(fml2, data) + + Y = rhs[[depvar]] + I = rhs[[instrument]] + X = lhs + else: + Y, X = model_matrix(fml, data) + na_index = list(set(data.index) - set(Y.index)) if self.ivars is not None: if drop_ref is not None: X = X.drop(drop_ref, axis=1) - dep_varnames = list(Y.columns) - co_varnames = list(X.columns) - var_names = list(dep_varnames) + list(co_varnames) + y_names = list(Y.columns) + x_names = list(X.columns) + yxz_names = list(y_names) + list(x_names) + if self.is_iv: + iv_names = list(I.columns) + #z_varnames = list(set(x_names) - set([endogvar])) + #z_varnames.append(instrument) + x_names_copy = x_names.copy() + x_names_copy.remove(endogvar) + z_names = x_names_copy + [instrument] + cols = yxz_names + iv_names + else: + iv_names = None + z_names = None + cols = yxz_names if self.ivars is not None: - self.icovars = [s for s in co_varnames if s.startswith( + self.icovars = [s for s in x_names if s.startswith( ivars[0]) and s.endswith(ivars[1])] else: self.icovars = None + Y = Y.to_numpy() + #if Y.ndim != 1: + # Y = Y.reshape((len(Y), 1)) + #.reshape((len(Y), 1)) X = X.to_numpy() + if self.is_iv: + I = I.to_numpy() + if I.ndim != 
1: + I = I.reshape((len(I), 1)) if Y.shape[1] > 1: raise ValueError( @@ -126,39 +166,50 @@ def _demean(self, data: pd.DataFrame, fval: str, ivars: List[str], drop_ref: str na_index = (na_index + fe_na) fe2 = np.delete(fe, na_index, axis=0) # drop intercept - intercept_index = co_varnames.index("Intercept") + intercept_index = x_names.index("Intercept") X = np.delete(X, intercept_index, axis = 1) - co_varnames.remove("Intercept") - var_names.remove("Intercept") + #if self.is_iv: + # I = np.delete(I, intercept_index, axis = 1) + x_names.remove("Intercept") + yxz_names.remove("Intercept") + if self.is_iv: + z_names.remove("Intercept") + cols.remove("Intercept") # check if variables have already been demeaned Y = np.delete(Y, fe_na, axis=0) X = np.delete(X, fe_na, axis=0) - YX = np.concatenate([Y, X], axis=1) + if self.is_iv: + I = np.delete(I, fe_na, axis=0) + + if self.is_iv: + YXZ = np.concatenate([Y, X, I], axis = 1) + else: + YXZ = np.concatenate([Y, X], axis=1) na_index_str = ','.join(str(x) for x in na_index) # check if looked dict has data for na_index if lookup_demeaned_data.get(na_index_str) is not None: # get data out of lookup table: list of [algo, data] - algorithm, YX_demeaned_old = lookup_demeaned_data.get( + algorithm, YXZ_demeaned_old = lookup_demeaned_data.get( na_index_str) # get not yet demeaned covariates var_diff_names = list( - set(var_names) - set(YX_demeaned_old.columns))[0] - var_diff_index = list(var_names).index(var_diff_names) - var_diff = YX[:, var_diff_index] + set(yxz_names) - set(YXZ_demeaned_old.columns))[0] + var_diff_index = list(yxz_names).index(var_diff_names) + var_diff = YXZ[:, var_diff_index] if var_diff.ndim == 1: var_diff = var_diff.reshape(len(var_diff), 1) - YX_demean_new = algorithm.residualize(var_diff) - YX_demeaned = np.concatenate( - [YX_demeaned_old, YX_demean_new], axis=1) - YX_demeaned = pd.DataFrame(YX_demeaned) + YXZ_demean_new = algorithm.residualize(var_diff) + YXZ_demeaned = np.concatenate( + [YXZ_demeaned_old, YXZ_demean_new], axis=1) + YXZ_demeaned = pd.DataFrame(YXZ_demeaned) - YX_demeaned.columns = list( - YX_demeaned_old.columns) + [var_diff_names] + YXZ_demeaned.columns = list( + YXZ_demeaned_old.columns) + [var_diff_names] else: # not data demeaned yet for NA combination @@ -177,26 +228,42 @@ def _demean(self, data: pd.DataFrame, fval: str, ivars: List[str], drop_ref: str np.where(algorithm._singleton_indices))[0].tolist() na_index += dropped_singleton_indices - YX_demeaned = algorithm.residualize(YX) - YX_demeaned = pd.DataFrame(YX_demeaned) - YX_demeaned.columns = list( - dep_varnames) + list(co_varnames) + YXZ_demeaned = algorithm.residualize(YXZ) + YXZ_demeaned = pd.DataFrame(YXZ_demeaned) + + #if self.is_iv: + # cols += iv_names + + YXZ_demeaned.columns = cols lookup_demeaned_data[na_index_str] = [ - algorithm, YX_demeaned] + algorithm, YXZ_demeaned] else: # if no fixed effects - YX = np.concatenate([Y, X], axis=1) - YX_demeaned = pd.DataFrame(YX) - YX_demeaned.columns = list( - dep_varnames) + list(co_varnames) + if self.is_iv: + YXZ = np.concatenate([Y, X, I], axis=1) + else: + YXZ = np.concatenate([Y, X], axis=1) + + YXZ_demeaned = pd.DataFrame(YXZ) + + #if self.is_iv: + # cols += list(iv_names) + + YXZ_demeaned.columns = cols - cols = list(dep_varnames) + list(co_varnames) - YX_dict[fml] = YX_demeaned[cols] + YXZ_dict[fml] = YXZ_demeaned#[cols] na_dict[fml] = na_index + var_dict[fml] = dict({ + 'y_names': y_names, + 'x_names': x_names, + 'iv_names': iv_names, + 'z_names': z_names + }) - return YX_dict, na_dict + + return 
YXZ_dict, na_dict, var_dict def feols(self, fml: str, vcov: Union[None, str, Dict[str, str]] = None, ssc=ssc(), fixef_rm: str = "none") -> None: ''' @@ -232,22 +299,45 @@ def feols(self, fml: str, vcov: Union[None, str, Dict[str, str]] = None, ssc=ssc fml = 'Y1 + Y2 ~ csw(X1, X2, X3) | sw(X4, X5) + X6' ''' - # no whitespace in formula - fml = re.sub(r'\s+', ' ', fml) + self.fml = fml.replace(" ", "") + self.split = None + + # deparse formula, at least partially fxst_fml = FixestFormulaParser(fml) + if fxst_fml.is_iv: + self.is_iv = True + else: + self.is_iv = False + + # add function argument to these methods for IV fxst_fml.get_fml_dict() fxst_fml.get_var_dict() fxst_fml._transform_fml_dict() - self.fml = fml - self.split = None + if self.is_iv: + # create required dicts for first stage IV regressions + fxst_fml.get_fml_dict(iv = True) + fxst_fml.get_var_dict(iv = True) + fxst_fml._transform_fml_dict(iv = True) + self.fml_dict = fxst_fml.fml_dict self.var_dict = fxst_fml.var_dict self.fml_dict2 = fxst_fml.fml_dict2 + + if self.is_iv: + self.fml_dict_iv = fxst_fml.fml_dict_iv + self.var_dict_iv = fxst_fml.var_dict_iv + self.fml_dict2_iv = fxst_fml.fml_dict2_iv + else: + self.fml_dict_iv = self.fml_dict + self.var_dict_iv = self.var_dict + self.fml_dict2_iv = self.fml_dict2 + self.ivars = fxst_fml.ivars + self.ssc_dict = ssc if fixef_rm == "singleton": self.drop_singletons = True @@ -293,6 +383,8 @@ def feols(self, fml: str, vcov: Union[None, str, Dict[str, str]] = None, ssc=ssc self.dropped_data_dict = dict() self.demeaned_data_dict = dict() + # names of depvar, X, Z matrices + self.yxz_name_dict = dict() estimate_full_model = True estimate_split_model = False @@ -330,23 +422,26 @@ def feols(self, fml: str, vcov: Union[None, str, Dict[str, str]] = None, ssc=ssc for _, fval in enumerate(fixef_keys): self.demeaned_data_dict[fval] = [] self.dropped_data_dict[fval] = [] + self.yxz_name_dict[fval] = [] data = self.data - demeaned_data, dropped_data = self._demean( + demeaned_data, dropped_data, yxz_name_dict= self._demean( data, fval, ivars, drop_ref) self.demeaned_data_dict[fval].append(demeaned_data) self.dropped_data_dict[fval].append(dropped_data) - + self.yxz_name_dict[fval].append(yxz_name_dict) # here: if estimate_split_model: for _, fval in enumerate(fixef_keys): self.demeaned_data_dict[fval] = [] self.dropped_data_dict[fval] = [] + self.yxz_name_dict[fval] = [] for x in self.split_categories: sub_data = self.data[x == self.splitvar] - demeaned_data, dropped_data = self._demean( + demeaned_data, dropped_data, yxz_name_dict = self._demean( sub_data, fval, ivars, drop_ref) self.demeaned_data_dict[fval].append(demeaned_data) self.dropped_data_dict[fval].append(dropped_data) + self.yxz_name_dict[fval].append(yxz_name_dict) # estimate models based on demeaned model matrix and dependent variables for _, fval in enumerate(self.fml_dict.keys()): @@ -372,10 +467,25 @@ def feols(self, fml: str, vcov: Union[None, str, Dict[str, str]] = None, ssc=ssc split_log = None full_fml = fml2 - Y = model_frame.iloc[:, 0].to_numpy() - X = model_frame.iloc[:, 1:] + name_dict = self.yxz_name_dict[fval][0][fml] + depvar_name = name_dict["y_names"] + xvar_names = name_dict["x_names"] + if name_dict["z_names"] is None: + zvar_names = name_dict["x_names"] + else: + zvar_names = name_dict["z_names"] + + Y = model_frame[depvar_name] + X = model_frame[xvar_names] + Z = model_frame[zvar_names] + colnames = X.columns + zcolnames = Z.columns + + Y = Y.to_numpy() X = X.to_numpy() + Z = Z.to_numpy() + N = X.shape[0] k = 
X.shape[1] @@ -386,9 +496,10 @@ def feols(self, fml: str, vcov: Union[None, str, Dict[str, str]] = None, ssc=ssc ". The model is skipped. As you are running a regression via `i()` syntax, maybe you need to drop a level via i(var1, var2, ref = ...)?") else: raise ValueError( - "The design Matrix X does not have full rank for the regression with fml" + fml2 + ". The model is skipped. ") + "The design Matrizx X does not have full rank for the regression with fml" + fml2 + ". The model is skipped. ") - FEOLS = Feols(Y, X) + FEOLS = Feols(Y, X, Z) + FEOLS.is_iv = self.is_iv FEOLS.fml = fml2 FEOLS.ssc_dict = self.ssc_dict FEOLS.get_fit() @@ -507,8 +618,6 @@ def summary(self) -> None: None ''' - print('---') - for x in list(self.model_res.keys()): split = x.split("|") @@ -528,13 +637,20 @@ def summary(self) -> None: } ) + if fxst.is_iv: + estimation_method = "IV" + else: + estimation_method = "OLS" + + print('###') print('') + print('Model: ', estimation_method) + print('Dep. var.: ', depvar) if fe is not None: print('Fixed effects: ', fe) # if fxst.split_log is not None: # print('Split. var: ', self.split + ":" + fxst.split_log) - print('Dep. var.: ', depvar) print('Inference: ', fxst.vcov_log) print('Observations: ', fxst.N) print('') diff --git a/pyfixest/utils.py b/pyfixest/utils.py index 926b9239..9e43831f 100644 --- a/pyfixest/utils.py +++ b/pyfixest/utils.py @@ -11,7 +11,7 @@ def get_data(seed = 1234): np.random.seed(seed) - N = 1000 + N = 2000 k = 20 G = 25 X = np.random.normal(0, 1, N * k).reshape((N,k)) @@ -39,6 +39,8 @@ def get_data(seed = 1234): data['Y'][0] = np.nan data['X1'][1] = np.nan + + data["Z1"] = data["X1"] + np.random.normal(0, 1, data.shape[0]) #data['X2'][2] = np.nan diff --git a/pyproject.toml b/pyproject.toml index ce910876..c27ae38c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,7 @@ [tool.poetry] name = "pyfixest" -version = "0.4.3" +version = "0.5" + description = "Experimental draft package for high dimensional fixed effect estimation" authors = ["Alexander Fischer "] license = "MIT" diff --git a/readme.md b/readme.md index 1a4cd817..701b66ca 100644 --- a/readme.md +++ b/readme.md @@ -20,40 +20,81 @@ from pyfixest.utils import get_data data = get_data() fixest = pf.Fixest(data = data) +# OLS Estimation fixest.feols("Y~X1 | csw0(X2, X3)", vcov = {'CRV1':'group_id'}) fixest.summary() # ### # +# Model: OLS +# Dep. var.: Y +# Inference: {'CRV1': 'group_id'} +# Observations: 1998 +# +# Estimate Std. Error t value Pr(>|t|) +# Intercept -3.941395 0.221365 -17.804976 2.442491e-15 +# X1 -0.273096 0.165154 -1.653580 1.112355e-01 # --- # ### # +# Model: OLS # Dep. var.: Y +# Fixed effects: X2 # Inference: {'CRV1': 'group_id'} -# Observations: 998 +# Observations: 1998 # -# Estimate Std. Error t value Pr(>|t|) -# Intercept 6.648203 0.220649 30.130262 0.00000 -# X1 -0.141200 0.211081 -0.668937 0.50369 +# Estimate Std. Error t value Pr(>|t|) +# X1 -0.260472 0.163874 -1.589472 0.125042 # --- # ### # -# Fixed effects: X2 +# Model: OLS # Dep. var.: Y +# Fixed effects: X2+X3 # Inference: {'CRV1': 'group_id'} -# Observations: 998 +# Observations: 1998 # -# Estimate Std. Error t value Pr(>|t|) -# X1 -0.142274 0.210556 -0.675707 0.499383 +# Estimate Std. Error t value Pr(>|t|) +# X1 0.03975 0.107003 0.371481 0.713538 # --- + + +# IV Estimation +fixest = pf.Fixest(data = data) +fixest.feols("Y~X1 | csw0(X2, X3) | X1 ~ Z1", vcov = {'CRV1':'group_id'}) +fixest.summary() # ### # -# Fixed effects: X2+X3 +# Model: IV +# Dep. 
var.: Y +# Inference: {'CRV1': 'group_id'} +# Observations: 1998 +# +# Estimate Std. Error t value Pr(>|t|) +# Intercept -3.941293 0.221354 -17.805377 2.442491e-15 +# X1 -0.265817 0.261940 -1.014803 3.203217e-01 +# --- +# ### +# +# Model: IV # Dep. var.: Y +# Fixed effects: X2 # Inference: {'CRV1': 'group_id'} -# Observations: 998 +# Observations: 1998 # # Estimate Std. Error t value Pr(>|t|) -# X1 -0.096317 0.204801 -0.470296 0.638247 +# X1 -0.259964 0.264817 -0.981674 0.336054 +# --- +# ### +# +# Model: IV +# Dep. var.: Y +# Fixed effects: X2+X3 +# Inference: {'CRV1': 'group_id'} +# Observations: 1998 +# +# Estimate Std. Error t value Pr(>|t|) +# X1 0.041142 0.201983 0.203688 0.840315 +# --- ``` diff --git a/tests/__pycache__/test_errors.cpython-310-pytest-7.3.1.pyc b/tests/__pycache__/test_errors.cpython-310-pytest-7.3.1.pyc index cf3bc0b3..0a80ddba 100644 Binary files a/tests/__pycache__/test_errors.cpython-310-pytest-7.3.1.pyc and b/tests/__pycache__/test_errors.cpython-310-pytest-7.3.1.pyc differ diff --git a/tests/__pycache__/test_vs_fixest.cpython-310-pytest-7.3.1.pyc b/tests/__pycache__/test_vs_fixest.cpython-310-pytest-7.3.1.pyc index 53fb61c3..c832b2ac 100644 Binary files a/tests/__pycache__/test_vs_fixest.cpython-310-pytest-7.3.1.pyc and b/tests/__pycache__/test_vs_fixest.cpython-310-pytest-7.3.1.pyc differ diff --git a/tests/test_errors.py b/tests/test_errors.py index 6566552c..5bdf55f6 100644 --- a/tests/test_errors.py +++ b/tests/test_errors.py @@ -84,3 +84,25 @@ def test_depvar_numeric(): with pytest.raises(ValueError): fixest.feols('Y ~ X1') + +def test_iv_errors(): + + data = get_data() + data["Z1"] = data["X1"] + np.random.normal(0, 1, data.shape[0]) + data["Z2"] = data["X2"] + np.random.normal(0, 1, data.shape[0]) + + + fixest = Fixest(data) + with pytest.raises(ValueError): + fixest.feols('Y ~ X1 | Z1 + Z2 ~ X1 ') + with pytest.raises(ValueError): + fixest.feols('Y ~ X1 | Z1 ~ X1 + X2') + with pytest.raises(ValueError): + fixest.feols('Y ~ X1 | Z1 + Z2 ~ X1 + X2') + with pytest.raises(ValueError): + fixest.feols('Y ~ X1 | Z1 ~ X1 ', vcov = {"CRV3":"group_id"}) + + + + + diff --git a/tests/test_vs_fixest.py b/tests/test_vs_fixest.py index 695a928a..812e0f44 100644 --- a/tests/test_vs_fixest.py +++ b/tests/test_vs_fixest.py @@ -172,7 +172,7 @@ def test_py_vs_r2(data, fml_multi): # suppress correction for fixed effects fixest.setFixest_ssc(fixest.ssc(True, "none", True, "min", "min", False)) - r_fml = _py_fml_to_r_fml(fml_multi) + r_fml = _py_fml_to_r_fml(fml_multi, False) pyfixest = Fixest(data = data).feols(fml_multi) py_coef = pyfixest.coef()['Estimate'] @@ -217,7 +217,7 @@ def test_py_vs_r_i(data, fml_i): # suppress correction for fixed effects fixest.setFixest_ssc(fixest.ssc(True, "none", True, "min", "min", False)) - r_fml = _py_fml_to_r_fml(fml_i) + r_fml = _py_fml_to_r_fml(fml_i, False) pyfixest = Fixest(data = data).feols(fml_i, vcov = 'iid') py_coef = pyfixest.coef()['Estimate'] @@ -319,8 +319,103 @@ def test_py_vs_r_split(data, fml_split): +@pytest.mark.parametrize("fml_iv", [ -def _py_fml_to_r_fml(py_fml): + "Y ~ X1 | X1 ~ Z1", + #"Y ~ X1 + X2 | X1 ~ Z1", + + "Y ~ X1 | X2 | X1 ~ Z1", + "Y ~ X1 | X2 + X3 | X1 ~ Z1", + + #"Y ~ X1 + C(X4) | X2 + X3 | X1 ~ Z1", + "Y ~ X1 + X2| X3 | X1 ~ Z1", + + #"Y ~ X1 + X2 | X1 + X2 ~ Z1 + Z2", + + +]) + + +def test_py_vs_r_iv(data, fml_iv): + + ''' + tests for instrumental variables regressions + ''' + + np.random.seed(1235) + + data["Z1"] = data["X1"] * np.random.normal(data.shape[0]) + + # iid errors + pyfixest = Fixest(data 
= data).feols(fml_iv, vcov = 'iid') + + py_coef = np.sort(pyfixest.coef()['Estimate']) + py_se = np.sort(pyfixest.se()['Std. Error']) + py_pval = np.sort(pyfixest.pvalue()['Pr(>|t|)']) + py_tstat = np.sort(pyfixest.tstat()['t value']) + + fml_r = _py_fml_to_r_fml(fml_iv, True) + + r_fixest = fixest.feols( + ro.Formula(fml_r), + se = 'iid', + data=data, + ssc = fixest.ssc(True, "none", True, "min", "min", False) + ) + + if not np.allclose((np.array(py_coef)), np.sort(stats.coef(r_fixest))): + raise ValueError("py_coef != r_coef") + if not np.allclose((np.array(py_se)), np.sort(fixest.se(r_fixest))): + raise ValueError("py_se != r_se for iid errors") + if not np.allclose((np.array(py_pval)), np.sort(fixest.pvalue(r_fixest))): + raise ValueError("py_pval != r_pval for iid errors") + if not np.allclose(np.array(py_tstat), np.sort(fixest.tstat(r_fixest))): + raise ValueError("py_tstat != r_tstat for iid errors") + + # heteroskedastic errors + pyfixest.vcov("HC1") + py_se = pyfixest.se()['Std. Error'] + py_pval = pyfixest.pvalue()['Pr(>|t|)'] + py_tstat = pyfixest.tstat()['t value'] + + r_fixest = fixest.feols( + ro.Formula(fml_r), + se = 'hetero', + data=data, + ssc = fixest.ssc(True, "none", True, "min", "min", False) + ) + + if not np.allclose((np.array(py_se)), (fixest.se(r_fixest))): + raise ValueError("py_se != r_se for HC1 errors") + if not np.allclose((np.array(py_pval)), (fixest.pvalue(r_fixest))): + raise ValueError("py_pval != r_pval for HC1 errors") + if not np.allclose(np.array(py_tstat), fixest.tstat(r_fixest)): + raise ValueError("py_tstat != r_tstat for HC1 errors") + + # cluster robust errors + pyfixest.vcov({'CRV1':'group_id'}) + py_se = pyfixest.se()['Std. Error'] + py_pval = pyfixest.pvalue()['Pr(>|t|)'] + py_tstat = pyfixest.tstat()['t value'] + + r_fixest = fixest.feols( + ro.Formula(fml_r), + cluster = ro.Formula('~group_id'), + data=data, + ssc = fixest.ssc(True, "none", True, "min", "min", False) + ) + + if not np.allclose((np.array(py_se)), (fixest.se(r_fixest))): + raise ValueError("py_se != r_se for CRV1 errors") + if not np.allclose((np.array(py_pval)), (fixest.pvalue(r_fixest))): + raise ValueError("py_pval != r_pval for CRV1 errors") + if not np.allclose(np.array(py_tstat), fixest.tstat(r_fixest)): + raise ValueError("py_tstat != r_tstat for CRV1 errors") + + + + +def _py_fml_to_r_fml(py_fml, is_iv = False): ''' pyfixest multiple estimation fml syntax to fixest multiple depvar @@ -328,11 +423,46 @@ def _py_fml_to_r_fml(py_fml): i.e. 
'Y1 + Y2 ~ X' -> 'c(Y1, Y2) ~ X' ''' - fml_split = py_fml.split("~") - depvars = fml_split[0] - covars = fml_split[1] - depvars = "c(" + ",".join(depvars.split("+")) + ")" - return depvars + "~" + covars + if is_iv == False: + + fml_split = py_fml.split("~") + depvars = fml_split[0] + covars = fml_split[1] + depvars = "c(" + ",".join(depvars.split("+")) + ")" + + return depvars + "~" + covars + + else: + + fml2 = py_fml.split("|") + + if len(fml2) == 2: + + covars = fml2[0].split("~")[1] + covars = covars.split("+") + depvar = fml2[0].split("~")[0] + endogvars = fml2[1].split("~")[0] + exogvars = list(set(covars) - set([endogvars])) + if exogvars == []: + exogvars = "1" + else: + exogvars = "+".join(exogvars) + + return depvar + "~" + exogvars + "|" + fml2[1] + + elif len(fml2) == 3: + + covars = fml2[0].split("~")[1] + covars = covars.split("+") + depvar = fml2[0].split("~")[0] + endogvars = fml2[2].split("~")[0] + exogvars = list(set(covars) - set([endogvars])) + if exogvars == []: + exogvars = "1" + else: + exogvars = "+".join(exogvars) + + return depvar + "~" + exogvars + "|" + fml2[1] + "|" + fml2[2]
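
The IV support in FormulaParser.py rests on splitting a two- or three-part formula such as `Y ~ X1 | X2 + X3 | X1 ~ Z1` into covariates, fixed effects, endogenous variables, and instruments. The sketch below is an illustrative simplification of that branching, not the package's `FixestFormulaParser` class; the function name `split_iv_formula` is made up for this example.

```python
# Simplified sketch of the new formula branching in FixestFormulaParser
# (illustrative only; the real class also unpacks sw()/csw()/i() syntax).
def split_iv_formula(fml: str):
    parts = fml.replace(" ", "").split("|")
    depvars, covars = parts[0].split("~")
    fevars, endogvars, instruments = "0", None, None
    if len(parts) == 2:
        if "~" in parts[1]:          # "Y ~ X1 | X1 ~ Z1": IV without fixed effects
            endogvars, instruments = parts[1].split("~")
        else:                        # "Y ~ X1 | X2": fixed effects, no IV
            fevars = parts[1]
    elif len(parts) == 3:            # "Y ~ X1 | X2 | X1 ~ Z1": fixed effects + IV
        fevars = parts[1]
        endogvars, instruments = parts[2].split("~")
    return depvars, covars, fevars, endogvars, instruments

print(split_iv_formula("Y ~ X1 | X2 + X3 | X1 ~ Z1"))
# ('Y', 'X1', 'X2+X3', 'X1', 'Z1')
```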
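The estimation changes in Feols.get_fit() and get_vcov() amount to the just-identified IV estimator `(Z'X)^{-1} Z'Y` with a sandwich variance built from `X'Z`, `(Z'Z)^{-1}`, and `Z'X`. The numpy sketch below mirrors those formulas on simulated data; the data-generating process and names (`N`, `Z`, `X`, `Y`) are illustrative, and the small-sample correction (`self.ssc`) applied in the package is omitted.

```python
import numpy as np

# Simulate a just-identified IV setting: one instrument, one endogenous regressor.
rng = np.random.default_rng(1234)
N = 1_000
Z = np.column_stack([np.ones(N), rng.normal(size=N)])       # instrument matrix (incl. intercept)
x_endog = 0.8 * Z[:, 1] + rng.normal(size=N)                 # regressor correlated with the instrument
X = np.column_stack([np.ones(N), x_endog])                   # regressor matrix (incl. intercept)
Y = X @ np.array([1.0, 2.0]) + rng.normal(size=N)

# Point estimate, as in get_fit(): beta = (Z'X)^{-1} Z'Y
tZX = Z.T @ X
tZXinv = np.linalg.inv(tZX)
beta_hat = tZXinv @ (Z.T @ Y)
u_hat = Y - X @ beta_hat

# Heteroskedasticity-robust sandwich, as in the IV branch of get_vcov()
tXZ = X.T @ Z
tZZinv = np.linalg.inv(Z.T @ Z)
Omega = Z.T @ (Z * (u_hat ** 2)[:, None])                    # meat: Z' diag(u^2) Z
bread = np.linalg.inv(tXZ @ tZZinv @ tZX)
vcov = bread @ (tXZ @ tZZinv @ Omega @ tZZinv @ tZX) @ bread # no ssc factor applied here
print(beta_hat, np.sqrt(np.diag(vcov)))
```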
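For CRV1 inference the same sandwich is used, but the meat is a sum of outer products of cluster-level scores `Z_g' u_g`. A minimal sketch of that loop, assuming plain numpy arrays for `Z`, residuals `u`, and a cluster id vector (the helper name `crv1_meat` is hypothetical):

```python
import numpy as np

def crv1_meat(Z: np.ndarray, u: np.ndarray, cluster: np.ndarray) -> np.ndarray:
    """Sum of outer products of cluster scores Z_g' u_g, as accumulated in get_vcov()."""
    k = Z.shape[1]
    meat = np.zeros((k, k))
    for g in np.unique(cluster):
        Zg = Z[cluster == g]
        ug = u[cluster == g]
        score_g = (Zg.T @ ug).reshape((k, 1))
        meat += score_g @ score_g.T
    # The variance is then ssc * bread @ meat @ bread, with the same bread as in the
    # heteroskedasticity-robust IV case above.
    return meat
```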