-
Notifications
You must be signed in to change notification settings - Fork 59
/
parse_fda.py
558 lines (515 loc) · 21.9 KB
/
parse_fda.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
import urllib2, os, cPickle, json, re, time
import configuration, text_utilities, stat_utilities
import sys, ssl
# Module-level configuration and openFDA query constants.
CONFIG = configuration.Configuration()
try:
    API_USER_KEY = CONFIG.get("FDA_API_KEY") # change this value with custom API key
except Exception:
    # A missing key is not fatal: requests are then sent without the
    # "api_key" parameter.  Catching Exception (rather than a bare except)
    # lets KeyboardInterrupt / SystemExit propagate.
    print("Warning: openFDA API key not found!")
    API_USER_KEY = None

# Page size sent as "limit=" in paged queries (count queries use 10 * LIMIT).
LIMIT = 100

# openFDA drug/event field names used to build search and count expressions.
FIELD_DRUG = "patient.drug.medicinalproduct"
FIELD_DISEASE = "patient.drug.drugindication"
FIELD_EFFECT = "patient.reaction.reactionmeddrapt"

# Minimum number of reports for a drug-disease pair to be considered.
N_MIN = 5
def main():
    """Ad-hoc smoke test: run two count queries and print a few results.

    Only the two ``get_counts_from_data`` calls are executed; everything
    after the first ``return`` is unreachable scratch code kept for reference.
    """
    # Other drugs tried: "montelukast", "vitamin e" ("vit. e"), "donepezil"
    drug = "methotrexate"
    # Other diseases tried: "alzheimer", "asthma", "acute lymphocytic leukaemia",
    # "rheumatoid arthritis", "bone sarcoma"
    disease = "type 2 diabetes mellitus"
    condition = None #"drug ineffective"

    # Drugs reported together with a given reaction.
    by_reaction = get_counts_from_data(drug=None, disease=None, condition="toxicity")
    print(by_reaction.items()[:3])

    # Reactions reported for a given drug.
    by_drug = get_counts_from_data(drug="acetaminophen", disease=None, condition=None)
    print(by_drug.items()[:3])
    return

    # NOTE: unreachable because of the return above.
    print(get_counts_for_drug(drug, disease, condition))
    values, values_eff = get_drug_treatment(drug, disease)
    print(["%.2f(%d) %s" % entry for entry in values[:5]])
    print(values_eff)
    return
def get_disease_specific_drugs(drug_to_diseases, phenotype_to_mesh_id):
    """Group drugs by the (lower-cased) phenotype they are reported for.

    drug_to_diseases maps a DrugBank id to a list of 6-tuples
    (phenotype, dui, n, count_max, count_ineff, count_adverse).
    phenotype_to_mesh_id maps a phenotype name to its MeSH id.

    A drug is assigned to a phenotype only if count_max is at least
    10 * N_MIN and the record's MeSH id occurs in phenotype_to_mesh_id.
    Returns a dict: lower-cased phenotype name -> set of DrugBank ids.
    """
    # Invert the phenotype -> MeSH id mapping so records can be matched by id.
    phenotype_by_mesh_id = dict(
        (mesh_id, phenotype) for phenotype, mesh_id in phenotype_to_mesh_id.items()
    )
    drugs_by_disease = {}
    for drugbank_id, records in drug_to_diseases.items():
        for _phenotype, dui, _n, count_max, _count_ineff, _count_adverse in records:
            well_supported = count_max >= 10 * N_MIN
            if well_supported and dui in phenotype_by_mesh_id:
                key = phenotype_by_mesh_id[dui].lower()
                drugs_by_disease.setdefault(key, set()).add(drugbank_id)
    return drugs_by_disease
def get_drug_disease_mapping(selected_drugs, drug_to_name, drug_to_synonyms, mesh_id_to_name, mesh_id_to_name_with_synonyms, dump_file, resume_from="DB00229"):
    """Build (or load from cache) the DrugBank drug -> disease mapping from openFDA.

    If dump_file exists it is unpickled and returned as is.  Otherwise, for
    each selected drug the most frequent FDA name is chosen, its indications
    are fetched, mapped to MeSH terms, and the reaction counts for each
    drug-indication pair are appended to ``dump_file + ".txt"``.  That text
    file is then read back in full, pickled to dump_file and returned.

    Parameters:
        selected_drugs: iterable of DrugBank ids to process, in order.
        drug_to_name: DrugBank id -> primary drug name.
        drug_to_synonyms: DrugBank id -> iterable of synonyms.
        mesh_id_to_name: MeSH id -> preferred name.
        mesh_id_to_name_with_synonyms: MeSH id -> iterable of names.
        dump_file: path of the pickle cache.
        resume_from: DrugBank id at which processing starts; earlier ids in
            selected_drugs are skipped.  The default reproduces the previously
            hard-coded id; pass None to process every drug.

    Returns:
        dict: DrugBank id -> list of
        (phenotype, mesh_id, n, count_max, count_ineff, count_adverse).
    """
    if os.path.exists(dump_file):
        # NOTE: pickle executes arbitrary code on load; dump_file must be a
        # trusted cache written by this function.
        with open(dump_file) as cache:
            return cPickle.load(cache)

    txt_file = dump_file + ".txt"
    drug_to_diseases = {} # (mesh_id, mesh_term, n, n_max, ri)
    exp = re.compile(r"-\d-")

    # Normalised MeSH name (commas removed, lower case) -> MeSH ids.
    mesh_name_to_ids = {}
    for mesh_id, names in mesh_id_to_name_with_synonyms.items():
        if mesh_id.startswith("Q"):
            continue
        for name in names:
            name = name.replace(",", "").lower()
            mesh_name_to_ids.setdefault(name, []).append(mesh_id)

    not_found_in_mesh = set()
    modified_in_mesh = set()
    multiple_mesh_id = {}

    started = resume_from is None
    for drugbank_id in selected_drugs:
        if drugbank_id == resume_from:
            started = True
        if not started:
            continue
        # Find the most common drug name in FDA for the DrugBank drug
        names = [ drug_to_name[drugbank_id] ]
        for synonym in drug_to_synonyms.get(drugbank_id, ()):
            if "[" in synonym or "{" in synonym:
                continue
            if exp.search(synonym):
                continue
            names.append(synonym)
        drug, _ = choose_fda_drug_name(names)
        if drug is None: # No match in FDA api
            continue
        # Get diseases for that drug in FDA
        diseases = get_diseases_for_drug(drug)
        if len(diseases) == 0:
            continue
        # The context manager closes the log even if a query below raises.
        with open(txt_file, 'a') as f:
            for n, disease in diseases:
                if n < N_MIN:
                    continue
                disease = disease.lower()
                phenotype = convert_fda_name_to_mesh(disease, mesh_name_to_ids)
                if phenotype is None:
                    not_found_in_mesh.add(disease)
                    continue
                duis = mesh_name_to_ids[phenotype]
                if len(duis) > 1:
                    multiple_mesh_id[phenotype] = duis
                    continue
                dui = duis[0]
                if dui not in mesh_id_to_name:
                    continue
                phenotype_mod = mesh_id_to_name[dui]
                if phenotype != phenotype_mod: # matched to synonym or removed 's
                    modified_in_mesh.add((phenotype, phenotype_mod))
                # API can not deal with x^s y / x's y such as crohn's disease and non-hodgkin's lymphoma
                idx = disease.find("^s")
                if idx == -1:
                    idx = disease.find("'s")
                if idx != -1:
                    phenotype = disease[:idx]
                # Get efficacy for each disease
                values, values_eff = get_drug_treatment(drug, phenotype)
                time.sleep(0.3) # 240 request / min limit
                if values is None or len(values) == 0:
                    continue
                z_max, count_max, term = values[0]
                z_ineff, count_ineff, z_adverse, count_adverse = values_eff
                # Safety reports provide multiple drug-disease pairs count_max is more reliable than n
                f.write("%s\t%s\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\n" % (drugbank_id, drug_to_name[drugbank_id], drug, disease, phenotype_mod, dui, n, count_max, count_ineff, count_adverse))

    print("Not found in MeSH: %s" % (not_found_in_mesh,))
    print("Modified in MeSH: %s" % (modified_in_mesh,))
    print("Multiple id in MeSH: %s" % (multiple_mesh_id,))

    # Rebuild the mapping from the complete log, which may also hold rows
    # appended by earlier runs.
    with open(txt_file) as f:
        for line in f:
            (drugbank_id, name, fda_name, fda_disease, phenotype, dui, n, count_max, count_ineff, count_adverse) = line.strip().split("\t")
            for mesh_id in dui.split(","):
                drug_to_diseases.setdefault(drugbank_id, []).append((phenotype, mesh_id, int(n), int(count_max), int(count_ineff), int(count_adverse)))
    with open(dump_file, 'w') as cache:
        cPickle.dump(drug_to_diseases, cache)
    return drug_to_diseases
# Hand-curated MeSH heading -> FDA indication term overrides (lower case).
_MESH_TO_FDA_TERM = {
    "arrhythmias, cardiac": "arrhythmia",
    "colitis, ulcerative": "colitis ulcerative",
    "lung diseases, obstructive": "chronic obstructive pulmonary disease",
    "sarcoma": "bone sarcoma",
    "liver cirrhosis": "hepatic cirrhosis",
    "liver cirrhosis, biliary": "biliary cirrhosis primary",
    "anemia, hemolytic": "anemia",
    "coronary artery disease": "coronary artery disease",
    "varicose veins": "varicose vein",
    "blood coagulation disorders": "disseminated intravascular coagulation",
    "mycobacterium infections": "mycobacterium avium complex infection",
}


def convert_mesh_name_to_fda_name(disease):
    """Translate a MeSH disease heading into the term used to query openFDA.

    The name is lower-cased first so that the override table and the suffix
    rules also apply to mixed-case headings.  Rules, in order of precedence:

    1. explicit override from the curated table;
    2. drop a trailing " disease" ("alzheimer disease" -> "alzheimer");
    3. replace a trailing " neoplasms" with " cancer";
    4. singularise a trailing " disorders" / " diseases";
    5. reverse comma-separated parts
       ("diabetes mellitus, type 2" -> "type 2 diabetes mellitus");
    6. otherwise return the lower-cased name unchanged.
    """
    disease = disease.lower()
    if disease in _MESH_TO_FDA_TERM:
        return _MESH_TO_FDA_TERM[disease]
    # Chop "disease" (alzheimer, crohn)
    if disease.endswith(" disease"):
        return disease[:-len(" disease")]
    # Replace "neoplasms" with "cancer"
    if disease.endswith(" neoplasms"):
        return disease[:-len(" neoplasms")] + " cancer"
    # Get rid of the "s" for disorders / diseases
    if disease.endswith((" disorders", " diseases")):
        return disease[:-1]
    # Reverse the order for ","; splitting on the bare comma also handles
    # headings written without a space after it.
    if "," in disease:
        parts = [part.strip() for part in disease.split(",")]
        return " ".join(part for part in reversed(parts) if part)
    return disease
def convert_fda_name_to_mesh(disease, mesh_name_to_ids):
    """Find the MeSH name (a key of mesh_name_to_ids) best matching an FDA term.

    Possessive markers ("^s" and "'s") are removed first.  An exact key match
    is returned immediately.  Otherwise the term is tokenised with
    text_utilities.tokenize_disease_name and every MeSH name containing more
    than half of the tokens becomes a candidate, scored by the number of
    matched tokens divided by the number of words in the MeSH name; the
    highest (score, name) pair wins.  Returns None when nothing qualifies.
    """
    cleaned = disease.replace("^s", "").replace("'s","")
    if cleaned in mesh_name_to_ids:
        return cleaned

    # Get words skipping disease / disorder / syndrome / plural / 's
    words = text_utilities.tokenize_disease_name(cleaned, exact=False)
    candidates = []
    for mesh_name in mesh_name_to_ids:
        lowered = mesh_name.lower()
        hits = sum([ 1 for word in words if word.strip() in lowered ])
        if hits > len(words) / 2.0:
            score = float(hits) / len(mesh_name.split())
            candidates.append((score, mesh_name))
    if not candidates:
        return None
    return max(candidates)[1]
def get_data_helper(command, parameter, parameter2=None, parameter_effect=None, skip=0, max_attempts=3):
    """Run one openFDA drug/event query and return the decoded JSON response.

    Parameters:
        command: one of "drug", "disease", "drug-effect-all",
            "effect-drug-all", "drug-disease", "drug-disease-effect".
        parameter: primary search value (drug, disease or reaction depending
            on command); spaces become "+" and the value is upper-cased.
        parameter2: disease for the "drug-disease*" commands.
        parameter_effect: reaction for "drug-disease-effect".
        skip: paging offset sent with limit=LIMIT; None sends a single request
            with limit=10*LIMIT and no skip.
        max_attempts: how many times the request is issued when the response
            body cannot be decoded (previously retried without bound).

    Returns:
        The parsed JSON object, or None if the URL could not be opened or no
        attempt produced a decodable response.

    Raises:
        ValueError: for an unknown command.
    """
    parameter = parameter.replace(" ", "+") #.replace("-", "+")
    parameter = parameter.upper()
    if command == "drug":
        txt = '%s.exact:"%s"' % (FIELD_DRUG, parameter)
    elif command == "disease":
        parameter = parameter.replace("-", "+")
        txt = '%s:"%s"' % (FIELD_DISEASE, parameter)
    elif command == "drug-effect-all":
        txt = '%s.exact:"%s"&count=%s.exact' % (FIELD_DRUG, parameter, FIELD_EFFECT)
    elif command == "effect-drug-all":
        txt = '%s.exact:"%s"&count=%s.exact' % (FIELD_EFFECT, parameter, FIELD_DRUG)
    elif command == "drug-disease":
        assert parameter2 is not None
        parameter2 = parameter2.replace(" ", "+").replace("-", "+")
        txt = '%s.exact:"%s"+AND+%s:"%s"' % (FIELD_DRUG, parameter, FIELD_DISEASE, parameter2)
    elif command == "drug-disease-effect":
        assert (parameter2 is not None and parameter_effect is not None)
        parameter2 = parameter2.replace(" ", "+") #.replace("-", "+")
        parameter_effect = parameter_effect.replace(" ", "+") #.replace("-", "+")
        txt = '%s.exact:"%s"+AND+%s:"%s"+AND+%s:"%s"' % (FIELD_DRUG, parameter, FIELD_DISEASE, parameter2, FIELD_EFFECT, parameter_effect)
    else:
        raise ValueError("Unknown command: " + command)

    if skip is not None:
        limit_txt = "&limit=%d&skip=%d" % (LIMIT, skip)
    else:
        limit_txt = "&limit=%d" % (LIMIT * 10)
    if API_USER_KEY is None:
        key_txt = ""
    else:
        key_txt = "api_key=%s&" % API_USER_KEY
    url = 'https://api.fda.gov/drug/event.json?%ssearch=%s%s' % (key_txt, txt, limit_txt)

    req = urllib2.Request(url)
    # "except Exception" keeps the best-effort behaviour but, unlike a bare
    # except, does not trap KeyboardInterrupt, so the loop can be interrupted.
    for _attempt in range(max_attempts):
        try:
            response = urllib2.urlopen(req)
        except Exception:
            print("Problem with url: %s" % url)
            return None
        try:
            return json.load(response)
        except Exception:
            print("Problem with response: %s %s" % (parameter, parameter2))
        finally:
            response.close()
    return None
def get_data(command, parameter, parameter2=None, parameter_effect=None):
    """Fetch all result rows of a paged openFDA query.

    Pages of LIMIT rows are requested through get_data_helper until a page
    comes back with fewer than LIMIT rows.  A failed request
    (get_data_helper returning None) ends paging instead of raising
    TypeError; rows collected so far are returned.

    Returns:
        list of the concatenated "results" entries of every page.
    """
    result = []
    offset = 0
    while True:
        page = get_data_helper(command, parameter, parameter2, parameter_effect, skip=offset)
        if page is None:
            break
        rows = page.get("results", [])
        # Progress trace: offset and number of rows in this page.
        print("%d %d" % (offset, len(rows)))
        result += rows
        offset += LIMIT
        if len(rows) < LIMIT:
            break
    return result
def get_count_data(command, parameter, parameter2=None, parameter_effect=None):
    """Run a single, unpaged query (skip=None) and return the raw response.

    Thin wrapper over get_data_helper; returns whatever it returns.
    """
    return get_data_helper(command, parameter, parameter2, parameter_effect, skip=None)
def choose_fda_drug_name(names):
    """Pick, among candidate names, the one with the most openFDA reports.

    Each name is lower-cased and its report count is obtained with
    get_counts_for_drug (one request per name, followed by a 0.3 s pause).
    Ties on the count are broken by the lexicographically largest name.

    Returns:
        (name, count) for the best candidate, or (None, None) when names is
        empty or the best candidate has no count.
    """
    values = []
    for name in names:
        name = name.lower()
        N, n, M, k = get_counts_for_drug(name, None, None)
        time.sleep(0.3) # 240 request / min limit
        values.append((N, name))
    if not values:
        # Previously an IndexError on values[-1].
        return None, None

    # Missing counts (None) rank below every real count; spelling that out in
    # the key avoids relying on Python 2's None < int ordering.
    count, best_name = max(values, key=lambda pair: (pair[0] is not None, pair[0], pair[1]))
    if count is None:
        return None, None
    return best_name, int(count)
def _term_counts_from_response(response):
    """Turn a count response into {lower-cased term: count}; None passes through."""
    if response is None:
        return None
    return dict((row["term"].lower(), int(row["count"])) for row in response["results"])


def get_counts_from_data(drug=None, disease=None, condition=None):
    """Count reports / reactions for the given drug, disease and reaction.

    Supported argument combinations:
        drug + disease: dict reaction -> number of occurrences in the reports
            returned for that drug-disease pair.
        drug only: dict reaction -> count for that drug, or None if the
            request failed.
        condition only: dict drug -> count for that reaction, or None if the
            request failed.
        drug + disease + condition: int, number of matching reaction entries
            in reports that mention both the drug and the indication.

    Raises:
        ValueError: for any other combination (these previously failed with
            an AttributeError inside the query builder).
    """
    has_drug = drug is not None
    has_disease = disease is not None
    has_condition = condition is not None

    if has_drug and has_disease and not has_condition:
        condition_to_count = {}
        for row in get_data("drug-disease", drug, disease):
            for reaction in row["patient"]["reaction"]:
                if "reactionmeddrapt" not in reaction:
                    continue
                name = reaction["reactionmeddrapt"].lower()
                condition_to_count[name] = condition_to_count.get(name, 0) + 1
        return condition_to_count

    if has_drug and not has_disease and not has_condition:
        return _term_counts_from_response(get_count_data("drug-effect-all", drug))

    if has_condition and not has_drug and not has_disease:
        return _term_counts_from_response(get_count_data("effect-drug-all", condition))

    if not (has_drug and has_disease and has_condition):
        raise ValueError("Unsupported combination of drug / disease / condition")

    drug_lc = drug.lower()
    disease_lc = disease.lower()
    condition_lc = condition.lower()
    matches = 0
    for row in get_data("drug-disease-effect", drug, disease, condition):
        # Reset per report: the flags used to be initialised once before the
        # loop, so after the first hit every later report counted as a match.
        has_drug_entry = False
        has_indication = False
        # NOTE(review): the drug-name and indication checks are treated as
        # independent per drug entry; the source indentation was lost, so
        # confirm this matches the intended nesting.
        for entry in row["patient"]["drug"]:
            if drug_lc in entry.get("medicinalproduct", "").lower():
                has_drug_entry = True
            if "drugindication" in entry and entry["drugindication"].lower() == disease_lc:
                has_indication = True
        if not (has_drug_entry and has_indication):
            continue
        for reaction in row["patient"]["reaction"]:
            if "reactionmeddrapt" not in reaction:
                continue
            if reaction["reactionmeddrapt"].lower() == condition_lc:
                matches += 1
    return matches
# Upper bound on request attempts when a response body cannot be decoded.
_MAX_COUNT_ATTEMPTS = 3


def _build_count_search(command, parameter, parameter2, parameter_effect):
    """Return the "search=...&count=..." expression for a get_counts command."""
    if command == "drug": # number of safety reports for that drug
        return '%s:"%s"&count=%s.exact' % (FIELD_DRUG, parameter, FIELD_DRUG)
    if command == "disease": # number of safety reports for that disease
        return '%s:"%s"&count=%s.exact' % (FIELD_DISEASE, parameter, FIELD_DISEASE)
    if command == "drug-disease": # number of safety reports for that drug and disease pair
        assert parameter2 is not None
        return '%s:"%s"+AND+%s:"%s"&count=%s.exact' % (FIELD_DRUG, parameter, FIELD_DISEASE, parameter2, FIELD_DRUG)
    if command == "disease-drug":
        assert parameter2 is not None
        return '%s:"%s"+AND+%s:"%s"&count=%s.exact' % (FIELD_DISEASE, parameter2, FIELD_DRUG, parameter, FIELD_DISEASE)
    if command == "drug-effect": # number of safety reports for that drug and reaction pair
        assert parameter_effect is not None
        return '%s:"%s"+AND+%s:"%s"&count=%s.exact' % (FIELD_DRUG, parameter, FIELD_EFFECT, parameter_effect, FIELD_DRUG)
    if command == "disease-effect": # number of safety reports for that disease and reaction pair
        assert parameter_effect is not None
        return '%s:"%s"+AND+%s:"%s"&count=%s.exact' % (FIELD_DISEASE, parameter, FIELD_EFFECT, parameter_effect, FIELD_DISEASE)
    if command in ("drug-disease-effect", "disease-drug-effect"): # drug, disease and reaction triple
        assert (parameter2 is not None and parameter_effect is not None)
        # Same search for both; only the field that is counted differs.
        count_field = FIELD_DRUG if command == "drug-disease-effect" else FIELD_DISEASE
        return '%s:"%s"+AND+%s:"%s"+AND+%s:"%s"&count=%s.exact' % (FIELD_DRUG, parameter, FIELD_DISEASE, parameter2, FIELD_EFFECT, parameter_effect, count_field)
    if command == "drug-disease2": # returns all diseases
        return '%s:"%s"&count=%s.exact' % (FIELD_DRUG, parameter, FIELD_DISEASE)
    if command == "disease-drug2": # returns all drugs
        return '%s:"%s"&count=%s.exact' % (FIELD_DISEASE, parameter, FIELD_DRUG)
    if command == "drug-disease-effect2": # returns all reactions and their counts
        assert parameter2 is not None
        return '%s:"%s"+AND+%s:"%s"&count=%s.exact' % (FIELD_DRUG, parameter, FIELD_DISEASE, parameter2, FIELD_EFFECT)
    if command == "drug-effect-all2": # number of safety reports for that drug
        return '%s:"%s"&count=%s.exact' % (FIELD_DRUG, parameter, FIELD_EFFECT)
    raise ValueError("Unknown command: " + command)


def get_counts(command, parameter, parameter2=None, parameter_effect=None):
    """Query openFDA term counts for a drug / disease / reaction combination.

    Parameters:
        command: query kind (see _build_count_search).  Commands ending in
            "2" return the full list of count rows.
        parameter: drug name, or the disease for the "disease",
            "disease-effect" and "disease-drug2" commands.
        parameter2: disease for the drug-disease style commands.
        parameter_effect: reaction term for the "*-effect" commands.

    Returns:
        * commands ending in "2": the list of {"term", "count"} rows;
        * other commands: the count of the first returned term containing the
          queried name (lower case), or None if no term matches;
        * on an HTTP error: 0 when parameter_effect was given, else None;
        * None when no decodable response was obtained.

    Raises:
        ValueError: for an unknown command.
    """
    parameter_org = parameter
    parameter2_org = parameter2
    parameter = parameter.replace(" ", "+") #.replace("-", "+")
    if parameter2 is not None:
        parameter2 = parameter2.replace(" ", "+") #.replace("-", "+")
    if parameter_effect is not None:
        parameter_effect = parameter_effect.replace(" ", "+") #.replace("-", "+")
    txt = _build_count_search(command, parameter, parameter2, parameter_effect)

    if API_USER_KEY is None:
        url = 'https://api.fda.gov/drug/event.json?search=%s&limit=%d' % (txt, 10*LIMIT)
    else:
        url = 'https://api.fda.gov/drug/event.json?api_key=%s&search=%s&limit=%d' % (API_USER_KEY, txt, 10*LIMIT)

    req = urllib2.Request(url)
    # Default context: negotiates the best TLS version both sides support and
    # verifies the certificate, unlike SSLContext(PROTOCOL_TLSv1).
    gcontext = ssl.create_default_context()
    response = None
    for _attempt in range(_MAX_COUNT_ATTEMPTS):
        try:
            handle = urllib2.urlopen(req, context=gcontext)
        except urllib2.HTTPError:
            if parameter2 is not None or parameter_effect is not None:
                print("Problem with response (probably no info): %s" % url)
            # An HTTP error on a reaction query is reported as a zero count.
            return 0 if parameter_effect is not None else None
        try:
            response = json.load(handle)
            break
        except Exception:
            print("Problem with response: %s %s" % (parameter, parameter2))
        finally:
            handle.close()
    if response is None:
        return None

    if command.endswith("2"):
        return response["results"]

    val = parameter_org.lower() #parameter.lower().replace("+", " ")
    if command in ("disease-drug", "disease-drug-effect"):
        val = parameter2_org.lower() #parameter2.lower().replace("+", " ")
    for row in response["results"]:
        # note that 's are ^s in the results
        if val in row["term"].lower():
            return int(row["count"])
    return None
def z_scorize_counts(count_term_pairs):
    """Convert (count, term) pairs into (z, count, term) sorted by z descending.

    z is (count - mean) / sigma, with mean and sigma taken from
    stat_utilities.calc_mean_and_sigma over all counts; when sigma is 0 the
    value is just count - mean.  An empty input yields an empty list
    (previously an IndexError).
    """
    pairs = list(count_term_pairs)
    if not pairs:
        return []
    counts = tuple(count for count, _term in pairs)
    m, s = stat_utilities.calc_mean_and_sigma(counts)
    values = []
    for count, term in pairs:
        val = count - m
        if s != 0:
            # float() guards against integer division under Python 2.
            val = val / float(s)
        values.append((val, count, term))
    values.sort(reverse=True)
    return values
def get_efficacy_values(values):
    """Sum z-scores and counts of the two efficacy-related reaction terms.

    values is an iterable of (z, count, term).  Only the exact terms
    "DRUG INEFFECTIVE" and "CONDITION AGGRAVATED" contribute.

    Returns (z_ineff, count_ineff, z_adverse, count_adverse).
    """
    ineffective = [0, 0]   # [summed z, summed count]
    aggravated = [0, 0]
    for z, count, term in values:
        if term == "CONDITION AGGRAVATED":
            target = aggravated
        elif term == "DRUG INEFFECTIVE":
            target = ineffective
        else:
            continue
        target[0] += z
        target[1] += count
    return ineffective[0], ineffective[1], aggravated[0], aggravated[1]
def get_counts_for_drug(drug, disease=None, condition=None):
    """Collect the report counts for a drug via get_counts.

    Returns (N, n, M, k):
        N: count for the drug alone;
        n: count for the drug-disease pair (None without disease);
        M: count for the drug-reaction pair (None without condition);
        k: count for the drug-disease-reaction triple (None unless both
           disease and condition are given).
    """
    with_condition = condition is not None
    n = k = M = None

    N = get_counts("drug", drug)
    if disease is not None:
        n = get_counts("drug-disease", drug, disease)
        if with_condition:
            k = get_counts("drug-disease-effect", drug, disease, condition)
    if with_condition:
        M = get_counts("drug-effect", drug, None, condition)
    return N, n, M, k
def get_counts_for_disease(drug, disease=None, condition=None):
    """Collect the report counts for a disease via get_counts.

    Returns (N, n, M, k):
        N: count for the disease alone;
        n: count for the drug-disease pair (None without drug);
        M: count for the disease-reaction pair (None without condition);
        k: count for the drug-disease-reaction triple (None unless drug and
           condition are both given).

    With disease=None (the default) no query is issued and all four values
    are None; previously this raised AttributeError inside get_counts.
    """
    N = n = M = k = None
    if disease is None:
        return N, n, M, k

    N = get_counts("disease", disease)
    if drug is not None:
        n = get_counts("drug-disease", drug, disease) #"disease-drug"
        if condition is not None:
            k = get_counts("drug-disease-effect", drug, disease, condition) #"disease-drug-effect"
    if condition is not None:
        M = get_counts("disease-effect", disease, None, condition)
    return N, n, M, k
def get_counts_for_drug_and_disease(drug, disease, condition=None):
    """Return (n, k) for a drug-disease pair, counted on the disease field.

    n comes from the "disease-drug" query; k from "disease-drug-effect" and
    is None when no condition is given.
    """
    n = get_counts("disease-drug", drug, disease)
    k = None
    if condition is not None:
        k = get_counts("disease-drug-effect", drug, disease, condition)
    return n, k
def get_drug_treatment(drug, disease):
    """Fetch the reactions reported for a drug used for a disease, z-scored.

    Returns:
        (values, values_eff) where values is the list of (z, count, term)
        from z_scorize_counts (highest z first) and values_eff is the
        4-tuple from get_efficacy_values; (None, None) when the query
        returned no data.
    """
    response = get_counts("drug-disease-effect2", drug, disease)
    # An empty result list is treated like a missing response; it used to
    # crash inside z_scorize_counts.
    if not response:
        return None, None
    pairs = [(int(row["count"]), row["term"]) for row in response]
    values = z_scorize_counts(pairs)
    values_eff = get_efficacy_values(values)
    return values, values_eff
def get_drugs_for_disease(disease):
    """Return [(count, drug_term), ...] for all drugs reported for a disease.

    An empty list is returned when the query yields nothing: get_counts
    returns None on an HTTP error, which previously raised TypeError here.
    """
    try:
        response = get_counts("disease-drug2", disease)
    except urllib2.HTTPError:
        print("No info for %s" % (disease,))
        return []
    if response is None:
        return []
    values = []
    for row in response:
        term = row["term"]
        count = int(row["count"])
        values.append((count, term))
    return values
def get_diseases_for_drug(drug):
    """Return [(count, indication_term), ...] for all indications of a drug.

    Yields an empty list when get_counts raises urllib2.HTTPError or returns
    None.
    """
    try:
        rows = get_counts("drug-disease2", drug)
    except urllib2.HTTPError:
        print("No info for %s" % (drug,))
        return []
    # for some weird reason humorsol returns a result but not indication
    if rows is None:
        return []
    pairs = []
    for row in rows:
        term, count = row["term"], int(row["count"])
        pairs.append((count, term))
    return pairs
# Run the ad-hoc checks in main() only when executed as a script.
if __name__ == "__main__":
    main()