# convert_mimic.py (forked from ruotianluo/self-critical.pytorch)
# This edition takes the FINDINGS section only.
import json
import random
import re
def txt2string(fpath):
    # Parse a raw report text file into {section header: section text}.
    # A line containing ':' starts a new section, unless the would-be key is
    # empty or ends in a digit (e.g. a timestamp); any other non-empty line
    # is appended to the most recent section.
    finaldict = {}
    last_key = ''
    with open(fpath, 'r') as f:
        for line in f:
            line = line.strip().lower()
            if line != '':
                if ':' in line:
                    key, value = line.split(':', 1)
                    if key != '' and not key[-1].isnumeric():
                        finaldict[key] = value.strip()
                        last_key = key
                    elif last_key != '':
                        finaldict[last_key] += ' ' + line
                elif last_key != '':
                    finaldict[last_key] += ' ' + line
    return finaldict
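# A minimal usage sketch for txt2string; the sample text below is
# hypothetical, not taken from the dataset. Call _demo_txt2string() to try it.
def _demo_txt2string():
    import tempfile
    sample = 'FINDINGS: The lungs are clear.\nNo effusion.\nIMPRESSION: No acute process.\n'
    with tempfile.NamedTemporaryFile('w', suffix='.txt', delete=False) as tmp:
        tmp.write(sample)
    print(txt2string(tmp.name))
    # -> {'findings': 'the lungs are clear. no effusion.',
    #     'impression': 'no acute process.'}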
def get_label():
    sampleList = [0, 0, 0, 0, 0, 0, 0, 1, 1, 2]  # 0 train, 1 validate, 2 test
    x = random.choice(sampleList)
    return x
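# Sanity-check sketch: sampling from sampleList above yields roughly a
# 70/20/10 train/validate/test split.
def _demo_get_label(n=10000):
    from collections import Counter
    return Counter(get_label() for _ in range(n))  # e.g. {0: ~7000, 1: ~2000, 2: ~1000}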
def clean_report_mimic_cxr(report):
    # Normalise a raw report: collapse repeated underscores, double spaces
    # and double periods, strip enumeration markers ('1. ', '. 2. ', ...),
    # lowercase, and split into sentences.
    report_cleaner = lambda t: t.replace('\n', ' ').replace('__', '_').replace('__', '_').replace('__', '_') \
        .replace('__', '_').replace('__', '_').replace('__', '_').replace('__', '_').replace('  ', ' ') \
        .replace('  ', ' ').replace('  ', ' ').replace('  ', ' ').replace('  ', ' ').replace('  ', ' ') \
        .replace('..', '.').replace('..', '.').replace('..', '.').replace('..', '.').replace('..', '.') \
        .replace('..', '.').replace('..', '.').replace('..', '.').replace('1. ', '').replace('. 2. ', '. ') \
        .replace('. 3. ', '. ').replace('. 4. ', '. ').replace('. 5. ', '. ').replace(' 2. ', '. ') \
        .replace(' 3. ', '. ').replace(' 4. ', '. ').replace(' 5. ', '. ') \
        .strip().lower().split('. ')
    # Remove residual punctuation from each sentence; the hyphen is escaped
    # so the character class does not form an unintended range.
    sent_cleaner = lambda t: re.sub(r'[.,?;*!%^&_+():\-\[\]{}]', '', t.replace('"', '').replace('/', '')
                                    .replace('\\', '').replace("'", '').strip().lower())
    # sent_cleaner returns a string, so drop empty sentences by comparing to
    # '' (the original compared to [], which is always True for a string).
    tokens = [sent_cleaner(sent) for sent in report_cleaner(report) if sent_cleaner(sent) != '']
    report = ' . '.join(tokens) + ' .'
    return report
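# Illustrative example on a hypothetical input, showing enumeration
# stripping and sentence-level punctuation removal:
def _demo_clean_report():
    out = clean_report_mimic_cxr('1. The lungs are clear.  2. No effusion..\n')
    assert out == 'the lungs are clear . no effusion .'
    return out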
file_name = '/media/hdd/donghao/imcaption/R2Gen/data/mimic_cxr/annotation.json'
# file_name = '/media/hdd/donghao/imcaption/R2Gen/data/iu_xray/annotation.json'
with open(file_name) as json_file:
    data = json.load(json_file)
keys = data.keys()
# Flatten every split into one record per image. A study can have several
# images; each image gets its own record sharing the study's cleaned report.
# The three splits are processed identically, in the original
# train/test/val order.
final_list = []
count = 0
report_num = {'train': 0, 'val': 0, 'test': 0}
for split in ['train', 'test', 'val']:
    for each_item in data[split]:
        report_num[split] += 1
        text = clean_report_mimic_cxr(each_item['report'])
        tokens = [token for token in text.split(' ') if token != '']
        for impath in each_item['image_path']:
            count += 1
            study = {
                'file_path': impath,
                'sentids': [count],
                'imgid': count,
                'sentences': [{'raw': text, 'imgid': count,
                               'sentid': count, 'tokens': tokens}],
                'study_id': each_item['study_id'],
                'subject_id': each_item['subject_id'],
                'split': each_item['split'],
            }
            final_list.append(study)
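# Each element of final_list is one image-level record in a Karpathy-style
# layout; the values below are hypothetical, for illustration only:
# {'file_path': 'p10/p10000001/s50000001/view1.jpg',
#  'sentids': [1], 'imgid': 1,
#  'sentences': [{'raw': 'the lungs are clear .', 'imgid': 1, 'sentid': 1,
#                 'tokens': ['the', 'lungs', 'are', 'clear', '.']}],
#  'study_id': 50000001, 'subject_id': 10000001, 'split': 'train'}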
print('number of image records:', len(final_list))
print('train reports:', report_num['train'],
      'val reports:', report_num['val'],
      'test reports:', report_num['test'])
print('annotation splits:', list(keys))
# with open('data/dataset_mimic_rm.json', 'w') as outfile:
# json.dump({'images': final_list, 'dataset': 'mimic-rm'}, outfile)
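# To inspect the dump afterwards (assuming the two lines above are
# uncommented and the script has been run):
# with open('data/dataset_mimic_rm.json') as f:
#     ds = json.load(f)
# print(ds['dataset'], len(ds['images']))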