-
Notifications
You must be signed in to change notification settings - Fork 59
/
parse_dailymed.py
209 lines (193 loc) · 7.99 KB
/
parse_dailymed.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
import urllib2, os, cPickle, re, time
from bs4 import BeautifulSoup
from toolbox import text_utilities, parse_drugbank
def main():
spl_id = "6ff35080-77ba-44b2-b889-95c6a4b5af94" #"3a0a6ef4-fb2c-47be-82d7-d2675059c45b" #"4ff62a5a-77ba-4bf5-0497-95ccca842315" #"b9ff238b-1e7c-4663-aa70-81425d0ced9e" #"770b5db0-3565-409d-a274-caa132fb8d89" #"3f43fcc5-1a77-41e8-9bc9-079e73a3fa57" #ddef12df-a0bd-4f58-bd3e-da975581d01b" #"f278f0d8-4062-420a-97c3-3a8b88253a08" #"0aa4ac07-bd02-46b2-9cf0-0643e0e1a2da" #"9c983da1-a8e0-479e-8816-5d5668ea242e" #"3313fa81-9c4c-4f03-a48c-34900fcb0848" #"e2158832-ad0b-4b9c-bbf0-1608a177bf85" #"29a6f213-bc93-4dbd-bab0-744722b6f0b8"
spl_id2 = "b16c7832-2fd9-49af-b923-1dc0d91fd6e2" #"57bccb29-1c47-4c64-ab6a-77960a91cc20" #"9c983da1-a8e0-479e-8816-5d5668ea242e"
#drug = "methotrexate"
#disease = "type 2 diabetes mellitus"
rxnorm_mapping_file = "/home/emre/arastirma/data/drug/fda/dailymed/rxnorm_mappings.txt"
spl_id_to_names = get_rxnorm_mapping(rxnorm_mapping_file)
print spl_id_to_names[spl_id]
#return
for spl in (spl_id,): # spl_id2):
name, indication, contraindication, warning = fetch_spl_data("/home/emre/arastirma/data/drug/fda/dailymed/spl", spl)
print spl, name
print indication
print contraindication
#print warning
return
def fetch_spl_data(output_dir, spl):
out_file = output_dir + "/%s.html" % spl
if os.path.exists(out_file):
f = open(out_file)
name, indication, contraindication, warning = read_spl_data(f.read())
f.close()
return name, indication, contraindication, warning
response = get_data(parameter=spl)
#print response
if response is None:
return None, None, None, None
result = []
f = open(out_file, 'w')
for row in response:
if row.strip() == "":
continue
f.write(row.replace("\r", ""))
result.append(row)
f.close()
name, indication, contraindication, warning = read_spl_data("".join(result)) # response.read()
return name, indication, contraindication, warning
def get_drugbank_id_to_spl_id_mapping(rxnorm_mapping_file, name_to_drug, synonym_to_drug, selected_drugs=None, dump_file=None):
if dump_file is not None and os.path.exists(dump_file):
drugbank_id_to_spl_ids = cPickle.load(open(dump_file))
return drugbank_id_to_spl_ids
spl_id_to_names = get_rxnorm_mapping(rxnorm_mapping_file)
drugbank_id_to_spl_ids = {}
for spl_id, names in spl_id_to_names.iteritems():
drugbank_id = None
for name in names:
if name.find(" / ") != -1: # Skip drug combinations
continue
drugbank_id, drugbank_name = parse_drugbank.get_drugbank_id_from_name(name, name_to_drug, synonym_to_drug, regex_db_name = True)
if drugbank_id is not None:
break
if drugbank_id is None:
continue
#print name, drugbank_name
drugbank_id_to_spl_ids.setdefault(drugbank_id, []).append(spl_id)
cPickle.dump(drugbank_id_to_spl_ids, open(dump_file, 'w'))
return drugbank_id_to_spl_ids
def get_rxnorm_mapping(mapping_file):
"""
SETID|SPL_VERSION|RXCUI|RXSTRING|RXTTY
"""
spl_id_to_names = {}
f = open(mapping_file)
header = f.readline()
for line in f:
words = line.strip().split("|")
spl_id = words[0]
name = words[3]
spl_id_to_names.setdefault(spl_id, []).append(name)
f.close()
return spl_id_to_names
def get_data(command="setid", parameter=None):
response = None
url = 'http://dailymed.nlm.nih.gov/dailymed/drugInfo.cfm?%s=%s' % (command, parameter)
#user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
#values = {}
#headers = { 'User-Agent' : user_agent }
#data = urllib.urlencode(values)
#req = urllib2.Request(url, "", headers)
req = urllib2.Request(url)
try:
response = urllib2.urlopen(req)
except:
print "Problem with response:", parameter
return response
def read_spl_data(html_doc):
name = None
indication = []
contraindication = []
warning = []
soup = BeautifulSoup(html_doc, "lxml")
valid_tags = set(["p", "li", "dd"]) # "dt"
for tag in soup.find_all(['h1']):
#print tag, tag.get("class")
cls = tag.get("class")
tag = tag.find_next()
cls_inner = tag.get("class")
#print tag, cls
if cls is not None and cls[0] == "Highlights":
if tag.string is None:
continue
header = tag.string.encode().strip().lower()
#print header
flag = None
if header.find("contraindication") != -1:
flag = "contraindication"
elif header.find("indication") != -1 or header in ("uses", "use", "usage"): # "indications", "indications and usage"
flag = "indication"
if header.find("warning") != -1 and header.find("boxed") == -1:
flag = "warning"
if flag is not None:
for tag_p in tag.find_all_next():
try:
#print "II:", tag_p.name
if tag_p.name == "h1": # in ("h1", "h2"):
break
except:
continue
if tag_p.name in valid_tags:
txt = tag_p.get_text(" ").strip()
if txt == "":
continue
tag2 = tag.find_next("h1")
txt2 = tag2.get_text(" ").strip()
idx = txt.find(txt2)
if idx != -1:
txt = txt[:idx]
txt = " ".join(txt.split())
txt = txt.encode("ascii", "ignore")
if txt == "":
continue
if flag == "indication":
indication.append(txt)
elif flag == "contraindication":
contraindication.append(txt)
elif flag == "warning":
warning.append(txt)
elif cls_inner is not None and cls_inner[0] == "long-title":
if name is not None:
print "Multiple name:", name
name = tag.get_text().encode("ascii", "ignore")
words = name.split(" - ")
name = words[0].strip().lower()
if len(indication) == 0: # Consider moving this to above, there are labels with highlights (with less info)
for tag in soup.find_all(['a']):
href = tag.get("href")
if href is not None and href == "#":
if tag.string is None:
continue
header = tag.string.encode("ascii", "ignore").strip().lower()
flag = None
if header.find("contraindication") != -1:
flag = "contraindication"
elif header.find("indication") != -1 or header in ("uses", "use", "usage"): # "indications", "indications and usage"
flag = "indication"
if header.find("warning") != -1 and header.find("boxed") == -1:
flag = "warning"
if flag is not None:
for tag_p in tag.find_all_next(["p", "li", "dd", "a"]):
try:
if tag_p.name == "a":
href = tag_p.get("href")
if href is not None and href == "#":
break
except:
print "Exception:", tag_p.name
continue
if tag_p.name in valid_tags:
#print tag_p.name, tag_p.get_text(" ")
txt = tag_p.get_text(" ").strip()
if txt == "":
continue
tag2 = tag_p.find_next("a", href="#")
txt2 = tag2.get_text(" ").strip()
idx = txt.find(txt2)
if idx != -1:
txt = txt[:idx]
txt = " ".join(txt.split())
txt = txt.encode("ascii", "ignore")
if txt == "":
continue
if flag == "indication":
indication.append(txt)
elif flag == "contraindication":
contraindication.append(txt)
elif flag == "warning":
warning.append(txt)
return name, indication, contraindication, warning
if __name__ == "__main__":
main()