-
Notifications
You must be signed in to change notification settings - Fork 0
/
Filtering.py
63 lines (49 loc) · 1.78 KB
/
Filtering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os
import shutil
"""
Program to filter the HGDP data for selecting only those SNPs of interest. In this case,
we are using the data collected from Eupedia to select the SNPs related to disease.
Author: Carmen Bravo
Version: 2.0
"""
# Default file locations. Input names are resolved against the current
# working directory; the output folder is an absolute path.
HGDP_FILE = "HGDP_FinalReport_Forward.txt"
RISK_FILE = "RiskSNPs.txt"
RISK_LIST_FILE = "RiskSNPsList.txt"
OUTPUT_DIR = "/Users/MissBravo/Desktop/Omics Final Project/Data_treatment/SelectedSNPs"


def _first_field(row):
    """Return the first whitespace-separated field of row, or None if row is blank."""
    fields = row.split(None, 1)
    return fields[0] if fields else None


def _select_hgdp_rows(rows, risk_ids, rows_out, ids_out):
    """Copy the HGDP rows whose SNP identifier is listed in risk_ids.

    Every matching row is written to rows_out, duplicates included. Each
    matching identifier is written to ids_out once, on its first occurrence.
    Blank rows are skipped instead of raising IndexError.

    Returns the set of identifiers that were selected.
    """
    selected = set()
    for row in rows:
        snp_id = _first_field(row)
        if snp_id is None or snp_id not in risk_ids:
            continue
        rows_out.write(row)
        if snp_id not in selected:
            selected.add(snp_id)
            ids_out.write(snp_id + "\n")
    return selected


def _select_risk_rows(rows, selected_ids, rows_out, repeated_out):
    """Copy the risk-allele rows whose SNP identifier is in selected_ids.

    Every matching row is written to rows_out. An identifier is written to
    repeated_out each time it shows up again after its first matching row.
    Blank rows are skipped instead of raising IndexError.
    """
    seen = set()
    for row in rows:
        snp_id = _first_field(row)
        if snp_id is None or snp_id not in selected_ids:
            continue
        rows_out.write(row)
        if snp_id in seen:
            repeated_out.write(snp_id + "\n")
        else:
            seen.add(snp_id)


def main(hgdp_path=HGDP_FILE, risk_path=RISK_FILE,
         risk_list_path=RISK_LIST_FILE, output_dir=OUTPUT_DIR):
    """Filter the HGDP report and the risk-allele table down to shared SNPs.

    Args:
        hgdp_path: HGDP report; the first line is a header, and the first
            field of every other row is the SNP identifier.
        risk_path: risk-allele table; the first line is a header, and the
            first field of every other row is the SNP identifier.
        risk_list_path: whitespace-separated list of SNP identifiers of interest.
        output_dir: folder that receives the four output files. It is deleted
            first if it already exists, then created again.

    Raises:
        IOError/OSError: if an input file cannot be opened or the output
            folder cannot be removed or created.
    """
    # The "U" mode flag is not passed: it raises ValueError on Python 3.11+,
    # and Python 3 text mode already translates newlines by default.
    # All inputs are opened before the output folder is touched, so a missing
    # input file aborts the run without deleting earlier results.
    with open(hgdp_path) as hgdp, open(risk_path) as risk:
        with open(risk_list_path) as risk_list:
            risk_ids = set(risk_list.read().split())

        # Create new folder with output files. Delete the folder if it is already there.
        if os.path.exists(output_dir):
            shutil.rmtree(output_dir)
        os.makedirs(output_dir)

        hgdp_out_path = os.path.join(output_dir, "SelectedHgdpSNPs.txt")
        ids_out_path = os.path.join(output_dir, "SelectedSNPsList.txt")
        risk_out_path = os.path.join(output_dir, "SelectedRiskSNPs.txt")
        repeated_out_path = os.path.join(output_dir, "RepeatedSNPsList.txt")

        # Selecting the rows of interest (the ones that correspond to disease SNPs).
        with open(hgdp_out_path, "w") as hgdp_out:
            with open(ids_out_path, "w") as ids_out:
                # The header contains the subjects' identifiers.
                hgdp_out.write(hgdp.readline())
                selected_ids = _select_hgdp_rows(hgdp, risk_ids, hgdp_out, ids_out)

        # Select SNPs from the risk alleles list.
        with open(risk_out_path, "w") as risk_out:
            with open(repeated_out_path, "w") as repeated_out:
                # The header contains SNPs and disease information.
                risk_out.write(risk.readline())
                _select_risk_rows(risk, selected_ids, risk_out, repeated_out)


if __name__ == "__main__":
    main()