-
Notifications
You must be signed in to change notification settings - Fork 0
/
correlation1.py
288 lines (267 loc) · 9.97 KB
/
correlation1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
import sys
from scipy import stats
#import scipy
from pandas import DataFrame
import seaborn as sns
import matplotlib.pyplot as plt
#Dictionary of all column vectors made, cps, epa, and tag. Keys are the gene names (ex. EF_002456T0).
annot_cols = {}
#Having an arbitrary list of all the cps, epa, and tag genes and serotypes so that all columns stick to the same order
#row_names = []
def columnize(filename):
file0 = open(filename, "r")
line = file0.readline()
while line != "DATA\n":
line = file0.readline()
if line.startswith("FIELD_LABELS"):
genes_in_order = line.split()[1:]
for i in range(len(genes_in_order)):
if "cps" in filename:
new_gene_name = "cps_" + genes_in_order[i]
elif "tag" in filename:
new_gene_name = "tag_" + genes_in_order[i]
elif "epa" in filename:
new_gene_name = "epa_" + genes_in_order[i]
if genes_in_order[i] == "EF_002454T0":
new_gene_name = "cpsF"
genes_in_order[i] = new_gene_name
annot_cols[new_gene_name] = []
line = file0.readline()
while line != "":
for i in range(len(genes_in_order)):
gene = genes_in_order[i]
digit = int(line.split()[i+1])
annot_cols[gene].append(digit)
line = file0.readline()
#columnize("sorted_cps_heatmap.txt")
#columnize("sorted_epa_heatmap.txt")
#columnize("sorted_tag_heatmap.txt")
columnize("sorted_cps_heatmap400.txt")
columnize("sorted_epa_heatmap400.txt")
columnize("sorted_tag_heatmap400.txt")
#Uncomment to include capsule type columns
"""annot_cols["T1"] = []
annot_cols["T2"] = []
annot_cols["T5"] = []
serotype_file = open("sorted_serotype_colorstrip.txt", "r")
line = serotype_file.readline()
while line != "DATA\n":
line = serotype_file.readline()
line = serotype_file.readline()
while line != "":
serotype = line.split()[2]
if serotype == "T1":
annot_cols["T1"].append(1)
annot_cols["T2"].append(0)
annot_cols["T5"].append(0)
elif serotype == "T2":
annot_cols["T1"].append(0)
annot_cols["T2"].append(1)
annot_cols["T5"].append(0)
elif serotype == "T5":
annot_cols["T1"].append(0)
annot_cols["T2"].append(0)
annot_cols["T5"].append(1)
else:
annot_cols["T1"].append(0)
annot_cols["T2"].append(0)
annot_cols["T5"].append(0)
line = serotype_file.readline()"""
snp_annot_cols = {}
def columnize2(filename):
for i in range(1276, 1294):
snp_annot_cols["EF" + str(i)] = []
file0 = open("../nonsynonymous/" + filename, "r")
if filename == "sorted_frameshift_only_heatmap.txt" or filename == "sorted_nonsyn_only_heatmap.txt":
line = file0.readline()
while line != "DATA\n":
line = file0.readline()
line = file0.readline()
while line != "":
for i in range(18):
digit = int(line.split()[i+1])
if digit == 0:
snp_annot_cols["EF" + str(1276+i)].append(0)
else:
snp_annot_cols["EF" + str(1276+i)].append(1)
line = file0.readline()
#Don't forget to uncomment the necessary lines after line 187
#columnize2("sorted_frameshift_only_heatmap.txt")
#columnize2("sorted_nonsyn_only_heatmap.txt")
#columnize2("sorted_nonsyn_and_no_frameshift_heatmap.txt")
#columnize2("sorted_nonsyn_and_frameshift_heatmap.txt")
def columnize3(filename):
for i in range(1276, 1294):
snp_annot_cols["EF" + str(i)] = []
snp_annot_cols["EF1277-1278_intergenic"] = []
snp_annot_cols["EF1283-1284_intergenic"] = []
snp_annot_cols["EF1292-1293_intergenic"] = []
file0 = open("../snps/" + filename, "r")
line = file0.readline()
while line != "DATA\n":
line = file0.readline()
line = file0.readline()
while line != "":
for i in range(18):
digit = int(line.split()[i+1])
if digit == 0:
snp_annot_cols["EF" + str(1276+i)].append(0)
elif digit == 1:
snp_annot_cols["EF" + str(1276+i)].append(1)
else:
print("Not binary")
sys.exit()
digit = int(line.split()[19])
if digit == 0:
snp_annot_cols["EF1277-1278_intergenic"].append(0)
elif digit == 1:
snp_annot_cols["EF1277-1278_intergenic"].append(1)
else:
print("Not binary")
sys.exit()
digit = int(line.split()[20])
if digit == 0:
snp_annot_cols["EF1283-1284_intergenic"].append(0)
elif digit == 1:
snp_annot_cols["EF1283-1284_intergenic"].append(1)
else:
print("Not binary")
sys.exit()
digit = int(line.split()[21])
if digit == 0:
snp_annot_cols["EF1292-1293_intergenic"].append(0)
elif digit == 1:
snp_annot_cols["EF1292-1293_intergenic"].append(1)
else:
print("Not binary")
sys.exit()
line = file0.readline()
#Don't forget to uncomment the necessary lines after line 187
#columnize3("sorted_one_col_per_gene_binary_heatmap.txt")
def columnize4(filename):
file0 = open(filename, "r")
line = file0.readline()
while line != "DATA\n":
line = file0.readline()
if line.startswith("FIELD_LABELS"):
genes_in_order = line.split()[1:]
for gene in genes_in_order:
snp_annot_cols[gene] = []
line = file0.readline()
while line != "":
for i in range(len(genes_in_order)):
gene = genes_in_order[i]
digit = int(line.split()[i+1])
snp_annot_cols[gene].append(digit)
line = file0.readline()
return genes_in_order
#Don't forget to uncomment the necessary lines after line 187
#row_names = columnize4("../snps/sorted_150bp_binary_heatmap.txt")
row_names = columnize4("../snps/sorted_150bp_400genomes_heatmap.txt")
#row_names2 = columnize4("../snps/sorted_EF1291_heatmap_new.txt")
#row_names = columnize4("../nonsynonymous/sorted_nonsyn_EF1291_heatmap.txt")
#row_names = columnize4("../nonsynonymous/sorted_nonsyn_and_no_frameshift_EF1291_heatmap.txt")
#row_names = columnize4("../nonsynonymous/sorted_frameshift_only_EF1291_heatmap.txt")
#row_names = columnize4("../nonsynonymous/sorted_nonsyn_and_frameshift_EF1291_heatmap.txt")
#pearsonr will error if every value in a column is the same, so this part checks for that and prints the columns that are constant. They will also be omitted from the heatmap
phage2_genes_to_skip = []
for phage2_gene in snp_annot_cols:
column = snp_annot_cols[phage2_gene]
all_same = True
val = column[0]
for i in column:
if i != val:
all_same = False
break
if all_same:
print("The matrix column,", phage2_gene, "consists of all", str(val) + "'s. Omitting from heatmap.")
phage2_genes_to_skip.append(phage2_gene)
"""If using columnize2:
corr_cols = {}
for capsule_gene in annot_cols:
column = []
for i in range(1276, 1294):
if "EF" + str(i) not in phage2_genes_to_skip:
#pearsonr( , )[0] is the statistic, [1] is the p-value
cell_value = stats.pearsonr(annot_cols[capsule_gene], snp_annot_cols["EF" + str(i)])[0]
column.append(cell_value)
corr_cols[capsule_gene] = column
row_names = []
for i in range(1276, 1294):
if "EF" + str(i) not in phage2_genes_to_skip:
row_names.append("EF" + str(i))
#End of columnize2"""
"""If using columnize3:
corr_cols = {}
for capsule_gene in annot_cols:
column = []
for i in range(1276, 1294):
if "EF" + str(i) not in phage2_genes_to_skip:
#pearsonr( , )[0] is the statistic, [1] is the p-value
cell_value = stats.pearsonr(annot_cols[capsule_gene], snp_annot_cols["EF" + str(i)])[0]
column.append(cell_value)
if "EF1277-1278_intergenic" not in phage2_genes_to_skip:
cell_value = stats.pearsonr(annot_cols[capsule_gene], snp_annot_cols["EF1277-1278_intergenic"])[0]
column.append(cell_value)
if "EF1283-1284_intergenic" not in phage2_genes_to_skip:
cell_value = stats.pearsonr(annot_cols[capsule_gene], snp_annot_cols["EF1283-1284_intergenic"])[0]
column.append(cell_value)
if "EF1292-1293_intergenic" not in phage2_genes_to_skip:
cell_value = stats.pearsonr(annot_cols[capsule_gene], snp_annot_cols["EF1292-1293_intergenic"])[0]
column.append(cell_value)
corr_cols[capsule_gene] = column
row_names = []
for i in range(1276, 1294):
if "EF" + str(i) not in phage2_genes_to_skip:
row_names.append("EF" + str(i))
if "EF1277-1278_intergenic" not in phage2_genes_to_skip:
row_names.append("EF1277-1278_intergenic")
if "EF1283-1284_intergenic" not in phage2_genes_to_skip:
row_names.append("EF1283-1284_intergenic")
if "EF1292-1293_intergenic" not in phage2_genes_to_skip:
row_names.append("EF1292-1293_intergenic")
#End of columnize3"""
"""If using columnize4:"""
#for finding the positions of the blackest cells in EF1291
#saved = []
saved = {}
corr_cols = {}
for capsule_gene in annot_cols:
column = []
for chunk in snp_annot_cols:
#if int(chunk) > 12896 and int(chunk) < 13077:
#pearsonr( , )[0] is the statistic, [1] is the p-value
cell_value = stats.pearsonr(annot_cols[capsule_gene], snp_annot_cols[chunk])[0]
pval = stats.pearsonr(annot_cols[capsule_gene], snp_annot_cols[chunk])[1]
column.append(cell_value)
if abs(cell_value) > 0.5:
print("Phage 2 =", chunk, "cell wall gene =", capsule_gene, "correlation =", cell_value, "p-value =", pval)
#if capsule_gene in ["epa_EF_002160T0", "epa_EF_002161T0", "epa_EF_002162T0", "epa_EF_002163T0", "epa_EF_002164T0"]:
#print("Phage 2 =", chunk, "epa =", capsule_gene, "correlation =", cell_value, "p-value =", pval)
#else:
#print("ALERT", "Phage 2 =", chunk, "epa =", capsule_gene, "correlation =", cell_value, "p-value =", pval)
# if capsule_gene == "epa_EF_002164T0":
# print(chunk, "correlation =", cell_value)
#if cell_value < -0.4:
#saved.append(cell_value)
#saved[chunk] = cell_value
# saved[chunk] = (cell_value, pval)
corr_cols[capsule_gene] = column
#for finding the positions of the blackest cells in EF1291
#sns.histplot(saved, binwidth=0.01)
#plt.savefig("correlation_distribution_nonsyn")
#for tuple0 in sorted(saved.items(), key=lambda item: item[1][0]):
#print(tuple0[0] + "\t" + str(tuple0[1]))
# print(tuple0[0] + "\t" + str(tuple0[1][0]) + "\t" + str(tuple0[1][1]))
#sys.exit()
#End of columnize4"""
#Only for the 3rd call to columnize4:
#row_names = []
#for name in row_names2:
# if int(name) > 12896 and int(name) < 13077:
# row_names.append(name)
#df = DataFrame(data=corr_cols, index=row_names)
#Only uncomment one of the following three
#sns.clustermap(data=df, vmin=-1, vmax=1).savefig("150bp_400_genomes_correlation_heatmap")
#sns.clustermap(data=df, row_cluster=False, vmin=-1, vmax=1).savefig("150bp_400_genomes_correlation_heatmap_in_order")
#sns.clustermap(data=df, metric="correlation").savefig("correlation_heatmap_diff_metric")