forked from UUDBMI/anu_masterclass
-
Notifications
You must be signed in to change notification settings - Fork 0
/
nlp_pneumonia_utils.py
407 lines (345 loc) · 16.8 KB
/
nlp_pneumonia_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
import os
import zipfile
import codecs
import sklearn.metrics
import pandas as pd
import pyConTextNLP
from pyConTextNLP import pyConTextGraph
from textblob import TextBlob
from radnlp.data import classrslts
import radnlp.view as rview
from IPython.display import display, HTML, Image
from pyConTextNLP.itemData import itemData
# this class encapsulates all data related to a span (text sequence) annotation
# including the text it "covers" and the type (i.e. "concept") of the annotation
class Annotation(object):
def __init__(self):
self.start_index = -1
self.end_index = -1
self.type = ''
self.spanned_text = ''
# adding this so that pyConText's HTML markup can work seamlessly
def getSpan(self):
return (self.start_index, self.end_index)
def getCategory(self):
# pyConText graph objects actually expect a list here
return [self.type]
# this class encapsulates all data for a document which has been annotated_doc_map
# this includes the original text, its annotations and also
class AnnotatedDocument(object):
def __init__(self):
self.text = ''
self.annotations = []
# NOTE : This "positive_label" relates to positive/possible cases of pneumonia
self.positive_label = -1
def read_brat_annotations(lines):
annotations = []
# BRAT FORMAT is:
# NUMBER[TAB]TYPE[SPACE]START_INDEX[SPACE]END_INDEX[SPACE]SPANNED_TEXT
for line in lines:
line = str(line)
tab_tokens = line.split('\t')
space_tokens = tab_tokens[1].split()
anno = Annotation()
anno.spanned_text = tab_tokens[-1]
anno.type = space_tokens[0]
anno.start_index = int(space_tokens[1])
anno.end_index = int(space_tokens[2])
annotations.append(anno)
return annotations
def read_annotations(archive_file, force_redownload=False):
annotated_doc_map = read_doc_annotations(archive_file, force_redownload)
return list(annotated_doc_map.values())
def read_doc_annotations(archive_file, force_redownload=False):
print('Reading annotations from file : ' + archive_file)
if 'http' in archive_file:
if force_redownload or not os.path.isfile(filename):
print('Downloading remote file : ' + archive_file)
urllib.request.urlretrieve(archive_file, filename)
filename = archive_file.split('/')[-1]
else:
filename = archive_file
annotated_doc_map = {}
print('Opening local file : ' + filename)
z = zipfile.ZipFile(filename, "r")
zinfo = z.namelist()
for name in zinfo:
if name.endswith('.txt') or name.endswith('.ann'):
basename = name.split('.')[0]
if basename not in annotated_doc_map:
annotated_doc_map[basename] = AnnotatedDocument()
anno_doc = annotated_doc_map[basename]
# handle text and BRAT annotation files (.ann) differently
if name.endswith('.txt'):
with z.open(name) as f1:
anno_doc.text = f1.read().decode('utf8')
else:
with z.open(name) as f1:
# handle this as utf8 or we get back byte arrays
anno_doc.annotations = read_brat_annotations(codecs.iterdecode(f1, 'utf8'))
# now let's finally assign a 0 or 1 to each document based on whether we see our expected type for the pneumonia label
for key, anno_doc in annotated_doc_map.items():
annos = anno_doc.annotations
anno_doc.positive_label = 0
for anno in annos:
# NOTE : This "positive_label" relates to positive/possible cases of pneumonia
if anno.type == 'DOCUMENT_PNEUMONIA_YES':
anno_doc.positive_label = 1
return annotated_doc_map
def calculate_prediction_metrics(gold_docs, prediction_function):
gold_labels = [x.positive_label for x in gold_docs]
pred_labels = []
for gold_doc in gold_docs:
pred_label = prediction_function(gold_doc.text)
pred_labels.append(pred_label)
# now let's use scikit-learn to compute some metrics
precision = sklearn.metrics.precision_score(gold_labels, pred_labels)
recall = sklearn.metrics.recall_score(gold_labels, pred_labels)
f1 = sklearn.metrics.f1_score(gold_labels, pred_labels)
# let's use Pandas to make a confusion matrix for us
confusion_matrix_df = pd.crosstab(pd.Series(gold_labels, name='Actual'),
pd.Series(pred_labels, name='Predicted'))
print('Precision : {0}'.format(precision))
print('Recall : {0}'.format(recall))
print('F1: {0}'.format(f1))
print('\nConfusion Matrix : ')
display(confusion_matrix_df)
def __insert_color_custom(txt, s, c):
"""insert HTML span style into txt. The span will change the color of the
text located between s[0] and s[1]:
txt: txt to be modified
s: span of where to insert tag
c: color to set the span to"""
return txt[:s[0]] + '<span style="font-weight: bold;color: {0};">'.format(c) + \
txt[s[0]:s[1]] + '</span>' + txt[s[1]:]
# helper functions to highlight annotations from BRAT
def mark_text(txt, nodes, colors={"name": "red", "pet": "blue"}, default_color="black"):
# this function had to be copied and modified from pyConTextNLP.display.html.mark_text
# so that the default_color could be passed through
if not nodes:
return txt
else:
n = nodes.pop(-1)
return mark_text(__insert_color(txt,
n.getSpan(),
colors.get(n.getCategory()[0], default_color)),
nodes,
colors=colors,
# this was not being passed through
default_color=default_color)
# helper functions to highlight annotations from BRAT
def mark_text_custom(txt, nodes, colors={"name": "red", "pet": "blue"}, default_color="black"):
# this function had to be copied and modified from pyConTextNLP.display.html.mark_text
# so that the default_color could be passed through
if not nodes:
return txt
else:
n = nodes.pop(-1)
return mark_text_custom(__insert_color_custom(txt,
n.getSpan(),
colors.get(n.getCategory()[0], default_color)),
nodes,
colors=colors,
# this was not being passed through
default_color=default_color)
def pneumonia_annotation_html_markup(anno_doc):
from pyConTextNLP.display.html import __sort_by_span
# this bit mimics 'mark_document_with_html' from pyConTextNLP.display.html
colors = {}
colors['DOCUMENT_PNEUMONIA_YES'] = 'red'
colors['DOCUMENT_PNEUMONIA_NO'] = 'green'
colors['SPAN_POSITIVE_PNEUMONIA_EVIDENCE'] = 'red'
default_color = 'red'
html = """<p> {0} </p>""".format(" ".join([mark_text_custom(anno_doc.text,
__sort_by_span(anno_doc.annotations),
colors=colors,
default_color=default_color)]))
return html
def mark_document_with_html(doc, colors={"name": "red", "pet": "blue"}, default_color="black"):
"""takes a ConTextDocument object and returns an HTML paragraph with marked phrases in the
object highlighted with the colors coded in colors
doc: ConTextDocument
colors: dictionary keyed by ConText category with values valid HTML colors
"""
from pyConTextNLP.display.html import __sort_by_span
from pyConTextNLP.utils import get_document_markups
return """<p> {0} </p>""".format(" ".join([mark_text_custom(m.graph['__txt'],
__sort_by_span(m.nodes()),
colors=colors,
default_color=default_color) for m in
get_document_markups(doc)]))
def clearPyConTextRegularExpressions():
if len(pyConTextGraph.compiledRegExprs) > 0:
print('Clearing pyConText compiled regular expressions')
pyConTextGraph.compiledRegExprs = {}
def view_single_sentence_graph(sentence, modifiers, targets):
context = markup_context_document(sentence, modifiers, targets)
class_result = classrslts(context_document=context, exam_type="Chest X-Ray", report_text=sentence,
classification_result='N/A')
rview.markup_to_pydot(class_result)
display(Image("tmp.png"))
print(sentence)
class DocumentClassifier(object):
def __init__(self, ruleFile, debug=False, modifiers=None, targets=None,expected_value=None):
self.rules = {}
self.rules_ele_list = {}
self.conclusions = []
self.debug = debug
self.modifiers = modifiers
self.targets = targets
self.save_markups = False
self.expected_value=expected_value
if ruleFile is not None:
if ruleFile.endswith('.csv') or ruleFile.endswith('.tsv') or ruleFile.endswith('.txt'):
f = open(os.path.join(os.getcwd(), ruleFile), 'r')
rows = f.readlines()
else:
rows = ruleFile.split('\n')
else:
print('DocumentClassifier can only take rules in string or in csv, tsv or txt file')
if modifiers is not None and targets is not None:
if isinstance(modifiers, str) and isinstance(targets, str):
if (modifiers.endswith('.csv') or modifiers.endswith('.tsv') or modifiers.endswith('.txt')) \
and (targets.endswith('.csv') or targets.endswith('.tsv') or targets.endswith('.txt')):
self.setModifiersTargetsFromFiles(modifiers, targets)
else:
self.setModifiersTargets(modifiers, targets)
priority = 0
for row in rows:
row = row.strip()
if len(row) == 0:
continue
if row.startswith('#'):
continue
cells = row.split('\t')
if len(cells) == 1:
self.default_conclusion = cells[0]
continue
conclusion = cells[0]
annotation_type = cells[1]
if annotation_type not in self.rules:
self.rules[annotation_type] = []
rules = self.rules[annotation_type]
rules.append(priority)
self.rules_ele_list[priority] = set(cells[2:])
self.conclusions.append(conclusion)
priority += 1
def setModifiersTargets(self, modifiers, targets):
self.modifiers = modifiers
self.targets = targets
def setModifiersTargetsFromFiles(self, modifiers_file, targets_file):
self.modifiers = pyConTextNLP.itemData.instantiateFromCSVtoitemData(os.path.join(os.getcwd(), modifiers_file))
self.targets = pyConTextNLP.itemData.instantiateFromCSVtoitemData(os.path.join(os.getcwd(), targets_file))
def checkMatch(self, annotation_type, modifiers, current_conclusions):
if annotation_type not in self.rules:
return
current_priority=-1
for priority in self.rules[annotation_type]:
if len(self.rules_ele_list[priority]) == 0:
if len(modifiers) == 0:
if annotation_type not in current_conclusions or current_conclusions[annotation_type] > priority:
current_conclusions[annotation_type] = priority
current_priority=priority
elif self.rules_ele_list[priority].issubset(modifiers):
if annotation_type not in current_conclusions or current_conclusions[annotation_type] > priority:
current_conclusions[annotation_type] = priority
current_priority = priority
break;
if self.expected_value is not None and self.conclusions[current_priority] in self.expected_value :
return True
return False
def reset_saved_predictions(self):
self.saved_markups_map = {}
self.save_markups = True
self.expected_value=None
def predict(self, doc, expected_value={'indicate_pneumonia'}, doc_name=None):
if self.save_markups:
self.expected_value = expected_value
conclusions = self.classify_doc(doc, None, doc_name)
if expected_value.issubset(conclusions):
return 1
return 0
def classify_doc(self, doc, debug=None, doc_name=None):
if debug is None:
debug = self.debug
if self.modifiers is None or self.targets is None:
print('DocumentClassifier\'s "modifiers" and/or "targets" has not been set yet.\n' +
'Use function: setModifiersTargets(modifiers, targets) or setModifiersTargetsFromFiles(modifiers_file,' + 'targets_file) to set them up.')
markups = markup_context_document(doc, self.modifiers, self.targets)
if doc_name is not None and self.save_markups and len(markups.getDocumentGraph().nodes())>0:
self.saved_markups_map[doc_name] = markups
return self.classify_markups(markups, debug, doc_name)
def classify_markups(self, markups, debug=None, doc_name=None):
if debug is None:
debug = self.debug
current_conclusions = {}
annotation_id_modifiers = {}
annotation_id_type = {}
nodes_on_edge = set()
target_id_nodes={}
if debug:
annotation_id_phrase = {}
# regroup modifiers by each annotation
for e in markups.getDocumentGraph().edges():
annotation_id = e[1].getTagID()
target_id_nodes[annotation_id]=e[1]
nodes_on_edge.add(annotation_id)
annotation_type = e[1].getCategory()[0]
modifier = e[0].getCategory()[0]
nodes_on_edge.add(e[0].getTagID())
if annotation_id not in annotation_id_modifiers:
annotation_id_modifiers[annotation_id] = set()
annotation_id_modifiers[annotation_id].add(modifier)
annotation_id_type[annotation_id] = annotation_type
if debug:
annotation_id_phrase[annotation_id] = e[1].getPhrase()
# add annotations with no modifier
for node in markups.getDocumentGraph().nodes():
if node.getTagID() not in nodes_on_edge:
annotation_id = node.getTagID()
target_id_nodes[annotation_id] = node
annotation_id_modifiers[annotation_id] = set()
annotation_id_type[annotation_id] = node.getCategory()[0]
if debug:
annotation_id_phrase[annotation_id] = node.getPhrase()
# update decision for each annotation and its modifiers in its id order (assume the ids are assigned based on the positions)
for annotation_id in sorted(annotation_id_type.keys()):
annotation_type = annotation_id_type[annotation_id]
modifiers = annotation_id_modifiers[annotation_id]
self.checkMatch(annotation_type, modifiers, current_conclusions)
if debug:
print('Based on annotation: "' + annotation_id_phrase[
annotation_id] + '" has type: \t"' + annotation_type + '"\twith modifiers: ' + str(modifiers))
print('\tCurrent conclusion is:\t' + str(self.get_conclusion_set(current_conclusions)) + '\n')
if len(current_conclusions) == 0:
return {self.default_conclusion}
return self.get_conclusion_set(current_conclusions)
def get_conclusion_set(self, current_conclusions):
conclusion = set()
for priority in current_conclusions.values():
conclusion.add(self.conclusions[priority])
return conclusion
def markup_sentence(s, modifiers, targets, prune_inactive=True, verbose=False):
"""
"""
markup = pyConTextGraph.ConTextMarkup()
markup.setRawText(s)
markup.cleanText()
markup.markItems(targets, mode="target")
markup.markItems(modifiers, mode="modifier")
markup.pruneMarks()
markup.dropMarks('Exclusion')
# apply modifiers to any targets within the modifiers scope
markup.applyModifiers()
markup.pruneSelfModifyingRelationships()
if prune_inactive:
markup.dropInactiveModifiers()
return markup
def markup_context_document(report_text, modifiers, targets):
context = pyConTextGraph.ConTextDocument()
# we will use TextBlob for breaking up sentences
sentences = [s.raw for s in TextBlob(report_text).sentences]
for sentence in sentences:
m = markup_sentence(sentence, modifiers=modifiers, targets=targets)
context.addMarkup(m)
return context