Fix #94 and #95

Issues resolved: 1) qrsolve(XX, XY) suffers from numerical inaccuracies on some cases, so we fix to qrsolve(X, Y) if we don't have weights (Note: this might be a bit slower as the XX and XY are already precomputed. It could also be optimized but for now let's leave it as it is. 2) The methods used to find out omitted variables were different in different points. When demeaning, we looked if the regressors were close to zero, but used absolute values (1e-8) which doesn't work if there is a huge scaling difference between Y and X. Now we will assumme the var has been absorbed by the absvars if the ratio z'z/w'w < (1e-9) where z=old variable and w= demeaned variable. This 1e-9 will also increase with tolerance() as it's just tolerance*1e-1 We also use invsym() as the main driver for whether to drop or not, as that is what it's used on the built-in tools. Then the inputs of qrsolve() will exclude the omitted variables, to prevent any issue.
sergiocorreia · Apr 17, 2017 · 79a8c01 · 79a8c01
1 parent 9503eee
commit 79a8c01
Show file tree

Hide file tree

Showing 7 changed files with 204 additions and 60 deletions.
diff --git a/src/reghdfe.ado b/src/reghdfe.ado
@@ -1,4 +1,4 @@
-*! version 4.1.4 17apr2017
+*! version 4.2.0 17apr2017
 
 program reghdfe, eclass
 	* Intercept old+version
@@ -448,6 +448,7 @@ program RegressOLS, eclass
 		matrix colnames `b' = `indepvars'
 		matrix colnames `V' = `indepvars'
 		matrix rownames `V' = `indepvars'
+		_ms_findomitted `b' `V'
 		ereturn post `b' `V', `esample' buildfvinfo depname(`depvar') 
 	}
 	else {

diff --git a/src/reghdfe.pkg b/src/reghdfe.pkg
@@ -29,7 +29,7 @@ d Required packages:
 d 	ftools
 d 	boottest (for Stata version 12 or earlier)
 d
-d Distribution-Date: 20170406
+d Distribution-Date: 20170417
 d
 
 f reghdfe.ado

diff --git a/src/reghdfe.sthlp b/src/reghdfe.sthlp
@@ -1,5 +1,5 @@
 {smcl}
-{* *! version 4.1.4 17apr2017}{...}
+{* *! version 4.2.0 17apr2017}{...}
 {vieweralsosee "[R] areg" "help areg"}{...}
 {vieweralsosee "[R] xtreg" "help xtreg"}{...}
 {vieweralsosee "[R] ivregress" "help ivregress"}{...}

diff --git a/src/reghdfe_class.mata b/src/reghdfe_class.mata
@@ -102,6 +102,7 @@ class FixedEffects
     `StringRowVector'       dofadjustments // firstpair pairwise cluster continuous
     `Varname'               groupvar
     `String'                residuals
+    `RowVector'             kept // 1 if the regressors are not deemed as omitted (by partial_out+cholsolve+invsym)
     `String'                diopts
 
     // Output
@@ -187,7 +188,6 @@ class FixedEffects
     // -data- is either a varlist or a matrix
     `Variables'             y
     `Varlist'               vars
-    `RowVector'             idx
     `Integer'               i
 
     if (eltype(data) == "string") {
@@ -214,13 +214,6 @@ class FixedEffects
 
         assert_msg(cols(y)==cols(vars), "st_data() constructed more columns than expected")
 
-        if (timeit) timer_on(52)
-        idx = `selectindex'(colmax(abs(colminmax(y))) :== 0)
-        if (timeit) timer_off(52)
-        if (cols(idx) & verbose>-1) {
-            printf("{err}WARNING: after demeaning, some variables are just zeros: %s\n", invtokens(vars[idx]))
-        }
-
         if (timeit) timer_on(53)
         _partial_out(y, save_tss)
         if (timeit) timer_off(53)
@@ -306,6 +299,7 @@ class FixedEffects
     `Real'                  y_mean
     `Vector'                lhs
     `Vector'                alphas
+    `StringRowVector'       vars
     `Integer'               i
 
     if (args()<2 | save_tss==.) save_tss = 0
@@ -349,6 +343,9 @@ class FixedEffects
     if (timeit) timer_off(60)
 
 
+    // Compute 2-norm of each var, to see if we need to drop as regressors
+    kept = diagonal(cross(y, y))'
+
     // Intercept LSMR case
     if (acceleration=="lsmr") {
         // RRE benchmarking
@@ -366,45 +363,54 @@ class FixedEffects
         }
         return
     }
+    else {
+        // Standardize variables
+        if (verbose > 0) printf("{txt}    - Standardizing variables\n")
+        if (timeit) timer_on(61)
+        stdevs = reghdfe_standardize(y)
+        if (timeit) timer_off(61)
 
-    // Standardize variables
-    if (verbose > 0) printf("{txt}    - Standardizing variables\n")
-    if (timeit) timer_on(61)
-    stdevs = reghdfe_standardize(y)
-    if (timeit) timer_off(61)
-
-    // RRE benchmarking
-    rre_true_residual = rre_true_residual / stdevs[1]
-    if (compute_rre) rre_depvar_norm = norm(y[., 1])
+        // RRE benchmarking
+        if (compute_rre) {
+            rre_true_residual = rre_true_residual / stdevs[1]
+            rre_depvar_norm = norm(y[., 1])
+        }
 
-    // Solve
-    if (verbose>0) printf("{txt}    - Running solver (acceleration={res}%s{txt}, transform={res}%s{txt} tol={res}%-1.0e{txt})\n", acceleration, transform, tolerance)
-    if (verbose==1) printf("{txt}    - Iterating:")
-    if (verbose>1) printf("{txt}      ")
-    converged = 0
-    iteration_count = 0
-    if (timeit) timer_on(62)
-    y = (*func_accel)(this, y, funct_transform) :* stdevs // this is like python's self
-    if (timeit) timer_off(62)
-    // converged gets updated by check_convergence()
-    
-    if (prune) {
-        assert(G==2)
-        if (timeit) timer_on(63)
-        _expand_1core(y)
-        if (timeit) timer_off(63)
+        // Solve
+        if (verbose>0) printf("{txt}    - Running solver (acceleration={res}%s{txt}, transform={res}%s{txt} tol={res}%-1.0e{txt})\n", acceleration, transform, tolerance)
+        if (verbose==1) printf("{txt}    - Iterating:")
+        if (verbose>1) printf("{txt}      ")
+        converged = 0 // converged will get updated by check_convergence()
+        iteration_count = 0
+        if (timeit) timer_on(62)
+        y = (*func_accel)(this, y, funct_transform) :* stdevs // 'this' is like python's self
+        if (timeit) timer_off(62)
+        
+        if (prune) {
+            assert(G==2)
+            if (timeit) timer_on(63)
+            _expand_1core(y)
+            if (timeit) timer_off(63)
+        }
     }
 
     // Standardizing makes it hard to detect values that are perfectly collinear with the absvars
-    // in which case they should be 0.00 but they end up as 1e-16
+    // in which case they should be 0.00 but they end up as e.g. 1e-16
     // EG: reghdfe price ibn.foreign , absorb(foreign)
 
     // This will edit to zero entire columns where *ALL* values are very close to zero
     if (timeit) timer_on(64)
-    needs_zeroing = colmax(abs(colminmax(y))) :< 1e-8 // chose something relatively close to epsilon() ~ 1e-16
-    needs_zeroing = `selectindex'(needs_zeroing)
+    kept = (diagonal(cross(y, y))' :/ kept) :> (tolerance*1e-1)
+    needs_zeroing = `selectindex'(!kept)
     if (cols(needs_zeroing)) {
         y[., needs_zeroing] = J(rows(y), cols(needs_zeroing), 0)
+        vars = tokens(varlist)
+        assert_msg(cols(vars)==cols(y), "cols(vars)!=cols(y)")
+        for (i=1; i<=cols(vars); i++) {
+            if (!kept[i] & verbose>-1) {
+                printf("{txt}note: omitted %s; close to zero after partialling-out (tol =%3.1e)\n", vars[i], tolerance*1e-1)
+            }
+        }
     }
     if (timeit) timer_off(64)
 }

diff --git a/src/reghdfe_common.mata b/src/reghdfe_common.mata
@@ -132,7 +132,6 @@ mata:
 	`Variable'				resid
 	`Real'					eps
 	`Integer'				i
-	`StringRowVector'		indepvars
 
 	if (S.timeit) timer_on(90)
 	reghdfe_solve_ols(S, X, b=., V=., N=., rank=., df_r=., resid=.)
@@ -142,15 +141,14 @@ mata:
 	
 	// Add "o." prefix to omitted regressors
 	eps = sqrt(epsilon(1))
-	indepvars = tokens(S.indepvars)
 	for (i=1; i<=rows(b); i++) {
-		if (abs(b[i])<eps) {
-			printf("{txt}note: %s omitted because of collinearity\n", indepvars[i])
-			stata(sprintf("_ms_put_omit %s", indepvars[i]))
-			indepvars[i] = st_global("s(ospec)")
+		if (b[i]==0 & S.verbose > -1) {
+			printf("{txt}note: %s omitted because of collinearity\n", tokens(S.indepvars)[i])
+			//stata(sprintf("_ms_put_omit %s", indepvars[i]))
+			//indepvars[i] = st_global("s(ospec)")
+			// This is now one in reghdfe.ado with -_ms_findomitted-
 		}
 	}
-	S.indepvars = invtokens(indepvars)
 
 	st_matrix(Vname, V)
 	st_numscalar(nname, N)
@@ -178,12 +176,11 @@ mata:
                                   `Integer' df_r,
                                   `Vector' resid)
 {
-	// Hack: the first col of X is actually y
+	// Hack: the first col of X is actually y!
 	`Integer'				K, KK
 	`Matrix'				xx, inv_xx, W, inv_V
 	`Vector' 				xy, w
 	`Integer'				used_df_r
-	`RowVector'				kept
 
 	if (S.vcetype == "unadjusted" & S.weight_type=="pweight") S.vcetype = "robust"
 	if (S.verbose > 0) printf("\n{txt} ## Solving least-squares regression of partialled-out variables\n\n")
@@ -215,8 +212,31 @@ mata:
 	xy = K ? xx[| 2 , 1 \ K+1 , 1 |] : J(0, 1, .)
 	xx = K ? xx[| 2 , 2 \ K+1 , K+1 |] : J(0, 0, .)
 	if (S.timeit) timer_off(91)
-	
-	b = reghdfe_cholqrsolve(xx, xy, 1) // qr or chol?
+
+	// This matrix indicates what regressors are not collinear
+	S.kept = K ? S.kept[2..K+1] : J(1, 0, .)
+
+	// Bread of the robust VCV matrix
+	// Compute this early so we can update the list of collinear regressors
+	if (S.timeit) timer_on(95)
+	inv_xx = reghdfe_rmcoll(tokens(S.indepvars), xx, S.kept)
+	S.df_m = rank = K - diag0cnt(inv_xx)
+	KK = rank + S.df_a
+	S.df_r = N - KK // replaced when clustering
+	if (S.timeit) timer_off(95)
+
+	// Compute betas
+	if (S.has_weights) {
+		b = reghdfe_cholqrsolve(xx, xy, 1) // qr or chol?
+	}
+	else {
+		// This is more numerically accurate but doesn't handle weights
+		// See: http://www.stata.com/statalist/archive/2012-02/msg00956.html
+		b = J(K, 1, 0)
+		if (cols(S.kept)) {
+			b[S.kept] = qrsolve(X[., 1:+S.kept], X[., 1])
+		}
+	}
 
 	if (S.timeit) timer_on(92)
 	resid = X * (1 \ -b) // y - X * b
@@ -236,13 +256,6 @@ mata:
 	S.rss = quadcross(resid, w, resid) // do before reghdfe_robust() modifies w
 	if (S.timeit) timer_off(94)
 
-	// Bread of the robust VCV matrix
-	if (S.timeit) timer_on(95)
-	inv_xx = reghdfe_rmcoll(tokens(S.indepvars), xx, kept=.) // invsym(xx, 1..K)
-	S.df_m = rank = K - diag0cnt(inv_xx)
-	KK = rank + S.df_a
-	S.df_r = N - KK // replaced when clustering
-	if (S.timeit) timer_off(95)
 
 	// Compute full VCE
 	if (S.timeit) timer_on(96)
@@ -263,16 +276,16 @@ mata:
 
 	// Wald test: joint significance
 	if (S.timeit) timer_on(97)
-	inv_V = invsym(V[kept, kept]) // this might not be of full rank but numerical inaccuracies hide it
+	inv_V = invsym(V[S.kept, S.kept]) // this might not be of full rank but numerical inaccuracies hide it
 	if (diag0cnt(inv_V)) {
 		if (S.verbose > -1) printf("{txt}(Warning: missing F statistic; dropped variables due to collinearity or too few clusters)\n")
 		W = .
 	}
-	else if (length(b[kept])==0) {
+	else if (length(b[S.kept])==0) {
 		W = .
 	}
 	else {
-		W = b[kept]' * inv_V * b[kept] / S.df_m
+		W = b[S.kept]' * inv_V * b[S.kept] / S.df_m
 		if (missing(W) & S.verbose > -1) printf("{txt}(Warning: missing F statistic)\n")
 	}
 	if (S.timeit) timer_off(97)

diff --git a/test/omitted.do b/test/omitted.do
@@ -0,0 +1,123 @@
+noi cscript "reghdfe: ensure we omit the correct variables" adofile reghdfe
+
+* Setup
+	local included_e ///
+		scalar: N rmse tss rss r2 r2_a df_r df_m ll ll_0 /// mss F
+		matrix: trim_b trim_V ///
+		macros: wexp wtype
+
+* [TEST] Simplest case
+	sysuse auto, clear
+	gen z = 0
+	reghdfe price z, noab
+	matrix b = e(b)
+	assert b[1,1]==0
+	matrix list b
+	_ms_omit_info b
+	assert r(k_omit)==1
+
+
+* [TEST] Simple case
+* https://github.com/sergiocorreia/reghdfe/issues/93
+
+	sysuse auto, clear
+	gen lowrep = rep78<4
+	loc lhs price
+	loc rhs lowrep
+	loc absvars rep78
+
+	* 1. Run benchmark
+	areg `lhs' `rhs', absorb(`absvars')
+	matrix list e(b)
+
+	trim_cons
+	local bench_df_a = e(df_a)
+	storedresults save benchmark e()
+
+	* 2. Run reghdfe
+	reghdfe `lhs' `rhs', absorb(`absvars') keepsingletons verbose(-1)
+	matrix list e(b)
+	notrim
+	storedresults compare benchmark e(), tol(1e-12) include(`included_e')
+	assert `bench_df_a'==e(df_a)-1
+
+* [TEST] https://github.com/sergiocorreia/reghdfe/issues/95
+	clear
+	set obs 100
+	set seed 123
+	gen double y = runiform() * 1e8
+	gen double z = runiform()
+	gen byte c = 1
+
+	loc lhs z
+	loc rhs y
+	loc absvars c
+
+	* 1. Run benchmark
+	areg `lhs' `rhs', absorb(`absvars')
+	matrix list e(b)
+
+	trim_cons
+	local bench_df_a = e(df_a)
+	storedresults save benchmark e()
+
+	* 2. Run reghdfe
+	reghdfe `lhs' `rhs', absorb(`absvars') keepsingletons verbose(-1)
+	matrix list e(b)
+	notrim
+	storedresults compare benchmark e(), tol(1e-12) include(`included_e')
+	assert `bench_df_a'==e(df_a)-1
+
+
+* [TEST] https://github.com/sergiocorreia/reghdfe/issues/94
+	clear
+input w x y unity
+w          x          y      unity
+1    -1.10565   -1.223344       1
+1   -1.109139   -1.127615       1
+0   -.8497003   -1.207231       1
+1   -.2734231   -1.294683       1
+0    .0063879   -.8343916       1
+1    .7669702   -.8245103       1 
+end
+
+	gen z = 0
+	reghdfe y z, noab
+
+	reghdfe y i.w##i.unity x, a(unity) tol(1e-12)
+	matrix list e(b)
+	matrix list e(V)
+
+	areg y i.w##i.unity x, a(unity)
+	matrix list e(b)
+	matrix list e(V)
+
+
+	loc lhs y
+	loc rhs i.w##i.unity x
+	loc absvars unity
+
+	* 1. Run benchmark
+	areg `lhs' `rhs', absorb(`absvars')
+	matrix list e(b)
+
+	trim_cons
+	local bench_df_a = e(df_a)
+	storedresults save benchmark e()
+
+	* 2. Run reghdfe
+	reghdfe `lhs' `rhs', absorb(`absvars') keepsingletons verbose(-1)
+	matrix list e(b)
+	notrim
+	matrix b = e(b)
+	matrix V = e(V)
+	assert abs(SR__benchmark_trim_b[1,2] - b[1,1]) < 1e-8
+	assert abs(SR__benchmark_trim_b[1,6] - b[1,4]) < 1e-8
+	assert abs(SR__benchmark_trim_V[2,2] - V[1,1]) < 1e-8
+	assert abs(SR__benchmark_trim_V[6,6] - V[4,4]) < 1e-8
+
+	* Done!
+
+storedresults drop benchmark
+clear matrix
+exit