SimpleScratch.py
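# Scratch pipeline: for each pathology report narrative, extract header fields
# (MRN, name, accession, DOB, test type), build a bag-of-words, run MetaMap to
# collect candidate concept tags, and write the combined results to CSV.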
import pandas as pd
import numpy as np
# For regex
import re
import regex
import os
import nltk
import json
from NumWords import text2int
import math
from sklearn import tree
from collections import Counter
from datetime import datetime
from lifelines import KaplanMeierFitter
import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.evaluate import mcnemar
from MetaMapForLots import metamapstringoutput
# import operator
from subprocess import call
import itertools
import sys
import csv
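# NOTE: NumWords (text2int) and MetaMapForLots (metamapstringoutput) appear to
# be local helper modules rather than PyPI packages.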
pd.set_option("display.max_rows", None, "display.max_columns", None)
# Now beginning the sorting out!
pathReports = pd.read_csv("/Users/bholmes/Desktop/DeleteMeSoon/MSMDR Narratives/PathReports.csv", low_memory=False)
bowList = []
tagsList = []
reportIdList = []
patientIdList = []
testTypeList = []
obs = pd.read_csv("/Users/bholmes/Desktop/NDI/GoogleSheets/obitSSA.csv", low_memory=False)
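# Quick sanity check: print the obituary sheet's columns, then pause until Enter is pressed.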
print(obs.columns)
input()
#for x in range(0, len(pathReports['description'])):
for x in range(0, 10):  # debugging subset; use the commented-out range above for the full run
    # These lists go into making the dataframe containing the tags for each report.
    matchedWordsList = []
    candidateScoreList = []
    candidateMatchedList = []
    candidatePreferredList = []
    semTypesList = []
    negatedList = []
    sourcesList = []
    # We'll pull out the data as usual from each test - probably won't keep it all
    if x % 100 == 0:
        print(x, ' of ', len(pathReports['description']))
    lower = pathReports['description'][x].lower()
    lower = re.sub(' +', ' ', lower)
    splitReport = lower.split('\n')
    # We'll pull out the MRN
    try:
        mrnIndex = lower.index("rec.:")
        MRN = lower[mrnIndex + 5:mrnIndex + 14].strip()
    except Exception:
        MRN = ''
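    # NOTE: the fixed 9-character slice after "rec.:" assumes a fixed-width MRN
    # field in these reports; adjust if the header format differs.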
    # And the name
    nameIndex = lower.index('name:')
    endName = lower.index('accession')
    nameBit = lower[nameIndex + 5:endName]
    firstName = nameBit.split(',')[1].strip()
    lastName = nameBit.split(',')[0].strip()
    middleName = ''
    if len(firstName.split()) > 1:
        middleName = firstName.split()[1]
        firstName = firstName.split()[0]
    # And the accession
    accession = lower[endName + len('accession #:'):mrnIndex - 5].strip()
    # And the DOB
    dobIndex = lower.index('dob:')
    endDob = lower.index('(age')
    dob = lower[dobIndex + 4:endDob].strip()
    index = [idx for idx, s in enumerate(splitReport) if 'patient name:' in s][0]
    indexTT = index - 1
    testType = splitReport[indexTT]
    # Pull out the test type: walk upward past blank, dashed, or 'amended' lines
    while testType == '' or 'amended' in testType.lower() or testType.lower().replace('-', '') == '':
        indexTT = indexTT - 1
        testType = splitReport[indexTT].strip()
    if testType.endswith('.'):
        testType = testType[:-1]
    testTypeOrig = testType
    testTypeList.append(testType)
    reportIdList.append(pathReports['id'][x])
    patientIdList.append(pathReports['patientid'][x])
    # Now let's get a Bag of Words. We'll sentence-tokenize and remove punctuation.
    corpus = nltk.sent_tokenize(lower)
    for i in range(len(corpus)):
        corpus[i] = re.sub(r'\W', ' ', corpus[i])
        corpus[i] = re.sub(r'\s+', ' ', corpus[i])
    # Now we'll count word frequencies across the whole report
    wordfreq = {}
    for sentence in corpus:
        tokens = nltk.word_tokenize(sentence)
        for token in tokens:
            if token not in wordfreq:
                wordfreq[token] = 1
            else:
                wordfreq[token] += 1
    bowList.append(wordfreq)
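    # NOTE: Counter is already imported above, so the counting loop could be
    # written equivalently as:
    #   wordfreq = Counter(t for s in corpus for t in nltk.word_tokenize(s))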
    file = '/Users/bholmes/Desktop/DeleteMeSoon/orus/MetaMapInput/sampleInput'
    lower = ' '.join(lower.split('\n'))
    # Strip non-ASCII characters, since MetaMap generally expects plain ASCII input
    lower = lower.encode("ascii", "ignore")
    lower = lower.decode()
    lower = lower + "\n"
    with open(file, 'w') as filetowrite:
        filetowrite.write(lower)
    # Now let's run MetaMap on it
    os.chdir('/Users/bholmes/public_mm')
    # Start the SKR/MedPost part-of-speech tagger server that MetaMap depends on
    call(["bin/skrmedpostctl", "start"])
    #call(["bin/wsdserverctl", "start"])
    # This uses MetaMap to create an output file based on the input file.
    # os.system does not raise on failure, so check its exit status explicitly.
    try:
        status = os.system("bin/metamap --JSONf 2 -Q 0 --prune 20 --negex --nomap NoMapFile /Users/bholmes/Desktop/DeleteMeSoon/orus/MetaMapInput/sampleInput "
                           "/Users/bholmes/Desktop/DeleteMeSoon/orus/MetaMapInput/sampleOutput.txt")
        if status != 0:
            continue
    except Exception:
        continue
    call(["bin/skrmedpostctl", "stop"])
    #call(["bin/wsdserverctl", "stop"])
    with open('/Users/bholmes/Desktop/DeleteMeSoon/orus/MetaMapInput/sampleOutput.txt') as json_file:
        data = json.load(json_file)
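    # Shape implied by the lookups below (abbreviated; assumed from the --JSONf output):
    # {"AllDocuments": [{"Document": {"Utterances": [{"Phrases": [{"Mappings":
    #     [{"MappingCandidates": [{"CandidateScore": ..., "MatchedWords": [...],
    #       "SemTypes": [...], "Sources": [...], "Negated": ...}]}]}]}]}}]}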
    # We go by utterance -> phrase -> mapping -> mappingCandidates
    for utterance in data['AllDocuments'][0]['Document']['Utterances']:
        for phrase in utterance['Phrases']:
            for mapping in phrase['Mappings']:
                for mappingCandidate in mapping['MappingCandidates']:
                    matchedWordsList.append(' '.join(mappingCandidate['MatchedWords']))
                    candidateScoreList.append(mappingCandidate['CandidateScore'])
                    candidateMatchedList.append(mappingCandidate['CandidateMatched'])
                    candidatePreferredList.append(mappingCandidate['CandidatePreferred'])
                    semTypesList.append(' '.join(mappingCandidate['SemTypes']))
                    negatedList.append(' '.join(mappingCandidate['Negated']))
                    sourcesList.append(' '.join(mappingCandidate['Sources']))
    tagsDF = pd.DataFrame(list(zip(matchedWordsList, semTypesList, sourcesList, candidateMatchedList, candidatePreferredList, candidateScoreList, negatedList)),
                          columns=['MatchedWords', 'SemTypes', 'Sources', 'CandidateMatched', 'CandidatePreferred', 'CandidateScore', 'Negated'])
    tagsDF = tagsDF.drop_duplicates().reset_index(drop=True)
    tagsList.append(tagsDF)
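# Assemble one row per report: bag of words, MetaMap tags table, report id, patient id, test type.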
resultDF = pd.DataFrame(list(zip(bowList, tagsList, reportIdList, patientIdList, testTypeList)),
                        columns=['BagOfWords', 'Tags', 'ReportId', 'PatientId', 'TestType'])
resultDF.to_csv("~/Desktop/DeleteMeSoon/Artifact/BoWAndTagsForNarratives.csv", index=False)
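# NOTE: to_csv stringifies the dict and DataFrame columns; if the bags of words
# and tag tables need to be read back as objects, resultDF.to_pickle may be a
# better fit than CSV.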