-
Notifications
You must be signed in to change notification settings - Fork 37
/
lasso_example_null_CV.py
122 lines (93 loc) · 4.42 KB
/
lasso_example_null_CV.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import functools
import numpy as np
from scipy.stats import norm as ndist
import regreg.api as rr
from selection.tests.instance import gaussian_instance
from knockoffs import lasso_glmnet
from core import (infer_full_target,
split_sampler, # split_sampler not working yet
normal_sampler,
logit_fit,
probit_fit)
def simulate(n=100, p=50, s=10, signal=(0, 0), sigma=2, alpha=0.1):
    """Run one round of selective vs. naive inference after lasso/CV selection.

    Draws a Gaussian regression instance (null signal by default), selects
    variables with a cross-validated lasso, then for the first selected
    variable computes a selective pivot/interval (via ``infer_full_target``)
    and a naive normal-theory pivot/interval for comparison.

    Parameters
    ----------
    n, p, s : int
        Sample size, number of features, and sparsity of the instance.
    signal : tuple
        Signal-strength range passed to ``gaussian_instance``; ``(0, 0)``
        is the global null.
    sigma : float
        Noise standard deviation of the instance.
    alpha : float
        Nominal level; intervals target 1 - alpha coverage.

    Returns
    -------
    tuple of lists
        ``(pivots, covered, lengths, naive_pivots, naive_covered,
        naive_lengths)`` — each holds at most one entry because only the
        first selected variable is analyzed.
    """

    # description of statistical problem
    X, y, truth = gaussian_instance(n=n,
                                    p=p,
                                    s=s,
                                    equicorrelated=False,
                                    rho=0.0,
                                    sigma=sigma,
                                    signal=signal,
                                    random_signs=True,
                                    scale=False)[:3]

    XTX = X.T.dot(X)
    XTXi = np.linalg.inv(XTX)
    resid = y - X.dot(XTXi.dot(X.T.dot(y)))
    # OLS residual variance estimate, used as the dispersion throughout
    dispersion = np.linalg.norm(resid)**2 / (n - p)

    # covariance of the sufficient statistic X^T y under the model;
    # NOTE: a smooth (normal_sampler) variant was previously built here but
    # never used, so only the splitting sampler is constructed.
    covS = dispersion * X.T.dot(X)
    splitting_sampler = split_sampler(X * y[:, None], covS)

    def meta_algorithm(X, XTXi, resid, sampler):
        # Rebuild a response from a (possibly perturbed) sufficient
        # statistic, then run CV-lasso selection on it.
        S = sampler(scale=0.)  # deterministic with scale=0
        ynew = X.dot(XTXi).dot(S) + resid  # will be ok for n>p and non-degen X
        G = lasso_glmnet(X, ynew, *[None]*4)
        select = G.select()
        return set(list(select[0]))

    selection_algorithm = functools.partial(meta_algorithm, X, XTXi, resid)

    # run selection algorithm
    observed_set = selection_algorithm(splitting_sampler)

    # find the target, based on the observed outcome
    # we just take the first target
    pivots, covered, lengths = [], [], []
    naive_pivots, naive_covered, naive_lengths = [], [], []

    for idx in list(observed_set)[:1]:
        print("variable: ", idx, "total selected: ", len(observed_set))
        true_target = truth[idx]

        # selective pivot and interval, learned via the probit fit
        (pivot,
         interval) = infer_full_target(selection_algorithm,
                                       observed_set,
                                       idx,
                                       splitting_sampler,
                                       dispersion,
                                       hypothesis=true_target,
                                       fit_probability=probit_fit,
                                       alpha=alpha,
                                       B=500)

        pivots.append(pivot)
        covered.append((interval[0] < true_target) * (interval[1] > true_target))
        lengths.append(interval[1] - interval[0])

        # naive (no selection adjustment) normal-theory counterpart
        target_sd = np.sqrt(dispersion * XTXi[idx, idx])
        observed_target = np.squeeze(XTXi[idx].dot(X.T.dot(y)))
        quantile = ndist.ppf(1 - 0.5 * alpha)
        naive_interval = (observed_target - quantile * target_sd,
                          observed_target + quantile * target_sd)

        naive_pivots.append((1 - ndist.cdf((observed_target - true_target) / target_sd)))  # one-sided
        naive_covered.append((naive_interval[0] < true_target) * (naive_interval[1] > true_target))
        naive_lengths.append(naive_interval[1] - naive_interval[0])

    return pivots, covered, lengths, naive_pivots, naive_covered, naive_lengths
if __name__ == "__main__":
    import statsmodels.api as sm
    import matplotlib.pyplot as plt

    np.random.seed(1)

    # grid on which the empirical CDFs of the pivots are evaluated
    grid = np.linspace(0, 1, 101)

    # accumulators across simulation rounds: selective vs. naive
    sel_pivots, sel_lengths, sel_cover = [], [], []
    naive_pivot_acc, naive_length_acc, naive_cover_acc = [], [], []

    plt.clf()
    for trial in range(500):
        (pivots_i, cover_i, lengths_i,
         naive_pivots_i, naive_cover_i, naive_lengths_i) = simulate()

        sel_cover.extend(cover_i)
        sel_pivots.extend(pivots_i)
        sel_lengths.extend(lengths_i)
        naive_pivot_acc.extend(naive_pivots_i)
        naive_cover_acc.extend(naive_cover_i)
        naive_length_acc.extend(naive_lengths_i)

        # running summaries of pivot distribution, length and coverage
        print("selective:", np.mean(sel_pivots), np.std(sel_pivots), np.mean(sel_lengths), np.mean(sel_cover))
        print("naive:", np.mean(naive_pivot_acc), np.std(naive_pivot_acc), np.mean(naive_length_acc), np.mean(naive_cover_acc))
        print("len ratio selective divided by naive:", np.mean(np.array(sel_lengths) / np.array(naive_length_acc)))

        # refresh the diagnostic ECDF plot every other iteration
        if trial > 0 and trial % 2 == 0:
            plt.clf()
            plt.plot(grid, sm.distributions.ECDF(sel_pivots)(grid), 'r', label='Selective', linewidth=3)
            plt.plot([0, 1], [0, 1], 'k--', linewidth=2)
            plt.plot(grid, sm.distributions.ECDF(naive_pivot_acc)(grid), 'b', label='Naive', linewidth=3)
            plt.legend()
            plt.savefig('lasso_example_null_CV.pdf')