-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathNLPFromText.py
185 lines (160 loc) · 7.54 KB
/
NLPFromText.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
import pandas as pd
import numpy
import operator
import re
# The reports are processed chunk by chunk. Henry Ford tends to send reports in
# a very standardized format, so the predictable parts are handled with plain
# pattern matching; whatever is left can go through metamap or similar later.
df = pd.read_csv("~/Desktop/DeleteMeSoon/LungAdenocarcinomaText.csv", low_memory=False)

# The report text is formatted, so discrete chunks are separated by a double
# line break. Each report is lower-cased and split into its list of chunks.
df['line'] = df['result'].apply(lambda report: report.lower().split("\n\n"))

# Per-report fields, all starting out empty:
#   source          - the hospital that ordered the test
#   testType        - the kind of report this is
#   reported        - the day the report was done
#   physician       - the physician who oversaw the procedure
#   clinicalHistory - the clinical history of the patient
source = testType = reported = physician = clinicalHistory = ""

# Special marker for an amended report; nothing consumes it yet.
amendedReport = False

# Every distinct test type encountered across the reports.
allTests = set()
# Each report is a list of chunks with a variable-length preamble. The loop
# below carves the preamble off and keeps the rest: "j" is the chunk cursor and,
# once the preamble is parsed, chunk [j] is the one holding "patient name".

# Rows made of three or more '=' are a visual barrier only and are dropped.
# Compiled once here rather than once per report.
regex = re.compile(r'(==)=+')

# Section headings looked for in the body of a report. Body lines are joined
# with " | " (a character that does not show up in the reports), so a heading
# wrapped in pipes only matches when it stands alone on its line.
sections = [
    '| clinical history |',
    'operative diagnoses',
    'pathological diagnosis:',
    '| comment |',
    'procedures/addenda',
    'gross description',
    'microscopic description',
    'in-situ hybridization:',
    'icd code(s):',
    'billing fee code(s):',
    'operation/specimen:',
    'clia signout facility:',
    'interpretation |',
    'clinical panel',
    '*** end of report ***',
    '***electronically signed out***',
    'diagnostic interpretation:',
    'disclaimers:',
    'manual microdissection:',
    'dna quality:',
    'mean amplicon depth:',
    'laboratory notes',
    'gene location transcript cdna protein dp exon af label',
    'gene location transcript cdna protein dp exon af interpretation',
    '| results-comments |',
    'cytogenic impression: |',
    'karyotype: |',
    'clonal description |',
]


def _parse_header(chunks):
    """Work out the test type from the chunks that follow the contact block.

    Returns a ``(test_type, next_index, amended)`` tuple: the test type text
    (it may carry a leading space), the chunk index from which the search for
    the patient block should start, and whether an "amended" marker was seen
    in the header itself.
    """
    header_lines = chunks[1].split("\n")
    # Physician first: the test type is the first line of the next chunk.
    if 'physician' in header_lines[0]:
        return chunks[2].split("\n")[0], 3, False
    if len(header_lines) > 1:
        # NOTE(review): both sub-cases are assumed to resume at chunk 2 - confirm.
        if 'amended' in header_lines[1]:
            # The marker sits under the test type, so the report is amended.
            return header_lines[0], 2, True
        return header_lines[1], 2, False
    if "report" in header_lines[0]:
        return chunks[2].split("\n")[0], 3, False
    return header_lines[0], 2, False


def _find_chunk(chunks, start, marker, row):
    """Return the index of the first chunk at or after ``start`` containing ``marker``.

    Raises IndexError, naming the report row, when no such chunk exists.
    """
    for idx in range(start, len(chunks)):
        if marker in chunks[idx]:
            return idx
    raise IndexError(
        "report {}: no chunk containing {!r} at or after chunk {}".format(row, marker, start)
    )


def _extract_report_date(patient_info):
    """Return the token following 'reported:' (or 'autopsy date:'), else None.

    Splitting on any whitespace keeps doubled spaces from producing an empty
    "date"; 'reported:' wins when both markers are present.
    """
    tokens = patient_info.split()
    for pos, token in enumerate(tokens[:-1]):
        if token == 'reported:':
            return tokens[pos + 1]
    for pos in range(len(tokens) - 2):
        if tokens[pos] == 'autopsy' and tokens[pos + 1] == 'date:':
            return tokens[pos + 2]
    return None


def _locate_sections(text, names):
    """Return ``(headings, offsets)`` for the headings found in ``text``.

    Both lists are ordered by the offset of each heading's first occurrence.
    """
    found = sorted((text.index(name), name) for name in names if name in text)
    return [name for _, name in found], [offset for offset, _ in found]


for i, chunks in enumerate(df['line']):
    # Henry Ford ALWAYS includes their contact info first. Just take the hospital name.
    source = chunks[0].split("\n")[0]

    # Next comes either the physician or the test type. The amended flag is
    # reset for every report so one amended report does not mark all later ones.
    testType, j, amendedReport = _parse_header(chunks)

    # Walk forward to the patient block, noting an "amended" marker on the way.
    patientChunk = _find_chunk(chunks, j, "patient name", i)
    if any("amended" in chunk for chunk in chunks[j:patientChunk]):
        amendedReport = True
    j = patientChunk

    # When the report was done. The date gets two tries (this chunk and the
    # next one); otherwise it is treated as not present.
    patientInfo = chunks[j].replace('\n', ' ')
    hasDate = 'reported:' in patientInfo or 'autopsy date:' in patientInfo
    if not hasDate and j + 1 < len(chunks):
        j = j + 1
        patientInfo = chunks[j].replace('\n', ' ')
    reported = _extract_report_date(patientInfo) or "Not Reported"

    # The physician is assumed to ALWAYS come at or after the reported date.
    j = _find_chunk(chunks, j, 'physician(s):', i)
    for physicianLine in chunks[j].split('\n'):
        if 'physician(s):' in physicianLine:
            # Text between the first and second colon; the last matching line wins.
            physician = physicianLine.split(':')[1]

    # Now take the REST of the report and drop the '=' barrier rows.
    restOfReport = " ".join(chunks[j + 1:]).split("\n")
    filtered = " | ".join(row for row in restOfReport if not regex.search(row))

    # Headings present in this report, in the order they appear.
    sectionList, sectionPos = _locate_sections(filtered, sections)

    if 'clinical panel' in sectionList and testType == 'hematolymphoid sequencing panel (51 genes).':
        # Index of the section of interest; nothing consumes it yet.
        if 'diagnostic interpretation:' in sectionList:
            position = sectionList.index('diagnostic interpretation:')
        else:
            position = sectionList.index('clinical panel')

    allTests.add(testType)

    if testType == 'microsatellite instability testing (msi).':
        if '| results-comments |' in sectionList:
            position = sectionList.index('| results-comments |')
        elif 'interpretation |' in sectionList:
            # Print everything after the heading, skipping exactly the heading
            # that was matched.
            heading = 'interpretation |'
            position = sectionList.index(heading)
            print(filtered[sectionPos[position] + len(heading):])

    # TODO: pull out the table that follows the
    # 'gene location transcript cdna protein dp exon af label' (or
    # '... af interpretation') heading.

# allTests now holds every distinct test type seen; print it to inspect them.