calc_linearsvm_pvals_gaonkar.py
"""
Computes an SVM classification between two classes and then uses the analytical approximation of Gaonkar et al. 2015
(doi: https://dx.doi.org/10.1016/j.media.2015.06.008) to calculate a p-value for each weight.
"""
import numpy as np
from scipy import stats


def get_C(X):
    """
    Calculates the C matrix (equation 4 in the paper). Splits the computation into subparts to make it (slightly)
    more tractable.
    :param X: data matrix (numpy.array, (m, d)) (m=num_subjects, d=num_dimensions)
    :return: C (numpy.array, (d, m))
    """
    J = np.ones((X.shape[0], 1))        # column of ones, shape (m, 1)
    cov = np.dot(X, X.T)                # Gram matrix X X^T, shape (m, m)
    inverted_cov = np.linalg.pinv(cov)  # pseudo-inverse (X X^T)^+
    XXTinv_J = np.dot(inverted_cov, J)
    minus_JT_XXTinv_J_inv = np.linalg.pinv(np.dot(np.dot(-J.T, inverted_cov), J))
    JT_XXTinv = np.dot(J.T, inverted_cov)
    # C = X^T [ (X X^T)^+ - (X X^T)^+ J (J^T (X X^T)^+ J)^+ J^T (X X^T)^+ ];
    # the minus sign of the middle term is carried inside minus_JT_XXTinv_J_inv
    return np.dot(X.T, inverted_cov + np.dot(np.dot(XXTinv_J, minus_JT_XXTinv_J_inv), JT_XXTinv))
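
# Illustrative shape check for get_C (hypothetical values, random data):
#     X = np.random.randn(10, 5)    # 10 subjects, 5 dimensions
#     C = get_C(X)                  # C.shape == (5, 10)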


def get_sigma_2(C, rho):
    """
    Calculates the variance (equation 5 in the paper) based on C and rho
    :param C: matrix (numpy.array) (d, m) (d=num_dimensions, m=num_subjects)
    :param rho: proportion of the positive class in the classification (i.e. sum(y==1)/y.size)
    :return: variance for each component (numpy.array (d, ))
    """
    # 4*rho - 4*rho**2 == 4*rho*(1 - rho), the variance of a +/-1 label drawn with P(y == 1) = rho
    return (4. * rho - 4 * rho**2) * np.sum(C**2, axis=1)


def get_sj(w):
    """
    Margin-informed statistic (equation 8 in the paper). Based on the true (unpermuted) weights w
    :param w: weights w from the SVM (numpy.array, (d, ))
    :return: sj (numpy.array, (d, ))
    """
    return w / np.dot(w, w)  # each weight scaled by the squared L2 norm of w


def get_zscore(sigma_2, sj):
    """
    Transforms s_j so that it follows a standard normal distribution (N(0, 1); equation 15 in the paper)
    :param sigma_2: variance vector (numpy.array, (d, ))
    :param sj: margin-informed statistic (numpy.array, (d, ))
    :return: standardized sj (numpy.array, (d, ))
    """
    normalization_factor = np.sum(sigma_2)
    return (normalization_factor / np.sqrt(sigma_2)) * sj


def get_pval(z_scores):
    """
    Calculates p-values for z-scored values as 2 * (1 - cdf(abs(z-score))) (two-tailed test).
    :param z_scores: z-scored s_j values (numpy.array, (d, ))
    :return: p-values corresponding to the z-scores (numpy.array, (d, ))
    """
    return (1 - stats.norm.cdf(np.abs(z_scores))) * 2
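
# An equivalent, numerically more precise form for large |z| is
# 2 * stats.norm.sf(np.abs(z_scores)); norm.sf computes the survival
# function 1 - cdf directly instead of subtracting from 1.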


def get_rho(y):
    """
    Returns the proportion of the positive class, a value in [0, 1]
    :param y: classification labels (positive class is assumed to be labeled as 1)
    :return: proportion of positive class
    """
    return np.sum(y == 1, dtype=float) / y.size
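
# Example: get_rho(np.array([1, 1, 0, 0])) -> 0.5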


def calculate_p_values(X, y, w):
    """
    Based on the data (X), labels (y) and the trained weights of the SVM (w), analytically calculates the p-value
    for each weight
    :param X: data matrix (numpy.array, (m, d); (m=num_subjects, d=num_dimensions))
    :param y: labels (numpy.array (m, ))
    :param w: SVM weights (numpy.array (d, ))
    :return: p-values for each weight (numpy.array (d, )) and the signs of the s_j statistic (numpy.array (d, ))
    """
    rho = get_rho(y)
    C = get_C(X)
    sigma_2 = get_sigma_2(C, rho)
    sj = get_sj(w)
    sj_sign = np.sign(sj)
    sj_zscored = get_zscore(sigma_2, sj)
    p = get_pval(sj_zscored)
    # to prevent problems when taking the log10 later, set p == 0 values to machine epsilon
    p[p == 0] = np.finfo(np.float64).eps
    return p, sj_sign
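

# ---------------------------------------------------------------------------
# Minimal usage sketch. scikit-learn is an assumed dependency here (it is not
# imported by the module above), and the data, shapes and SVM settings are
# illustrative only, not part of the original analysis.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from sklearn.svm import LinearSVC  # assumption: scikit-learn is installed

    rng = np.random.RandomState(0)
    m, d = 100, 20                                # m subjects, d dimensions
    X = rng.randn(m, d)
    y = (rng.rand(m) > 0.5).astype(int)           # labels in {0, 1}; positive class is 1
    X[y == 1, 0] += 2.0                           # make the first feature discriminative

    svm = LinearSVC(C=1.0, max_iter=10000)
    svm.fit(X, y)
    w = svm.coef_.ravel()                         # weight vector, shape (d, )

    p, sj_sign = calculate_p_values(X, y, w)
    print("smallest p-value: feature %d, p = %g" % (np.argmin(p), p.min()))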