forked from baiwenjia/ukbb_cardiac
-
Notifications
You must be signed in to change notification settings - Fork 0
/
convert_data2.py
196 lines (174 loc) · 7.25 KB
/
convert_data2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
# Copyright 2017, Wenjia Bai. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an 'AS IS' BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
The data converting script for UK Biobank Application 2964, which contributes
the manual annotations of 5,000 subjects.
This script assumes that the images and annotations have already been downloaded
as zip files. It decompresses the zip files, sorts the DICOM files into subdirectories
according to the information provided in the manifest.csv spreadsheet, parses manually
annotated contours from the cvi42 xml files, reads the matching DICOM images and cvi42
contours, and finally saves them as nifti images.
"""
import csv
import glob
import os
import re
import shutil
import subprocess
import sys
import time

import dateutil.parser
import pandas as pd

import parse_cvi42_xml
from biobank_utils import *
# Filename pattern of a UK Biobank cardiac MR zip, e.g. '1234567_20208_2_0.zip'.
# Field 20208 = long-axis cine, field 20209 = short-axis cine.
# Group 1 captures the subject eid, group 2 the instance number.
# Raw string avoids invalid '\d' escape warnings; the dot before 'zip' is
# escaped (the original '.' matched any character) and '$' anchors the end so
# names such as 'x_20208_2_0.zip.partial' are no longer accepted.
DICOM_ZIP_PATTERN = re.compile(r'(\d+)_2020[89]_(\d+)_0\.zip$')
# Subdirectories of the data dir holding the long- and short-axis zip files.
LONG_AXIS_DIR = "20208-long"
SHORT_AXIS_DIR = "20209-short"
if __name__ == '__main__':
    # Command line: convert_data2.py <data_dir> [<output_dir>]
    if len(sys.argv) < 2 or len(sys.argv) > 3:
        print(f'Usage: {sys.argv[0]} <data_dir> [<output_dir>]', file=sys.stderr)
        sys.exit(1)
    # data_path is the path to the ukbb data.
    # Structure of the input data dir:
    # <data_dir>/
    # |
    # +--<20208-long>/
    # |  |
    # |  +--<eid1>_20208_<num>_0.zip
    # |  +--<eid2>_20208_<num>_0.zip
    # |  ...
    # |
    # +--<20209-short>/
    #    |
    #    +--<eid1>_20209_<num>_0.zip
    #    +--<eid2>_20209_<num>_0.zip
    #    ...
    # -----------------------------------------
    # Structure of the output dir:
    # <output_dir>/
    # |
    # +--<eid1>_<num>/
    # |  |
    # |  +--la_2ch.nii.gz
    # |  +--la_3ch.nii.gz
    # |  +--la_4ch.nii.gz
    # |  +--sa.nii.gz
    # ...
    # -----------------------------------------
    data_path = sys.argv[1].rstrip('/')  # tolerate trailing slash(es)
    output_dir = sys.argv[2] if len(sys.argv) > 2 else f'{data_path}.converted'

    la_data_dir = os.path.join(data_path, LONG_AXIS_DIR)
    sa_data_dir = os.path.join(data_path, SHORT_AXIS_DIR)
    for d in (la_data_dir, sa_data_dir):
        if not os.path.isdir(d):
            print(f'{d} not found', file=sys.stderr)
            sys.exit(1)

    # Scratch space for decompressed dicom files. os.path.expanduser works even
    # when $HOME is unset, unlike os.getenv('HOME') which would return None and
    # crash os.path.join.
    tmp_dir = os.path.join(os.path.expanduser('~'), 'tmp')
    os.makedirs(tmp_dir, exist_ok=True)
    os.makedirs(output_dir, exist_ok=True)

    # Map from '<eid>_<num>' to the list of zip paths (relative to data_path)
    # belonging to that subject/instance: long-axis and short-axis series.
    zip_files = {}
    for axis_dir, rel_dir in ((la_data_dir, LONG_AXIS_DIR),
                              (sa_data_dir, SHORT_AXIS_DIR)):
        for item in os.listdir(axis_dir):
            m = DICOM_ZIP_PATTERN.match(item)
            if not m:
                continue
            eid, num = m.group(1), m.group(2)
            zip_files.setdefault(f'{eid}_{num}', []).append(
                os.path.join(rel_dir, item))

    skipped = 0
    # Convert image data for each <eid>_<num>
    for eid_and_num, zip_list in sorted(zip_files.items()):
        sub_output_dir = os.path.join(output_dir, eid_and_num)
        if os.path.exists(sub_output_dir):
            # Already converted in a previous run; count silently and report
            # the total just before the next real conversion starts.
            skipped += 1
            continue
        if skipped > 0:
            print(f'{skipped} completed subjects skipped')
            skipped = 0
        print(f'converting {eid_and_num}...', end='', flush=True)

        # Work in a '.working' dir renamed to sub_output_dir only on success,
        # so an interrupted run never leaves a half-written output dir that the
        # skip-check above would mistake for a completed subject.
        tmp_sub_output_dir = os.path.join(output_dir, f'{eid_and_num}.working')
        if os.path.exists(tmp_sub_output_dir):
            shutil.rmtree(tmp_sub_output_dir)
        os.mkdir(tmp_sub_output_dir)

        # Decompress the zip files for this <eid>_<num>
        dicom_dir = os.path.join(tmp_dir, f'{eid_and_num}_dicom')
        if not os.path.exists(dicom_dir):
            os.mkdir(dicom_dir)
        for f in zip_list:
            zip_path = os.path.join(data_path, f)  # convert to full path
            # Argument-list form avoids the shell quoting/injection problems of
            # os.system with paths containing spaces or metacharacters, and
            # lets us notice a failed extraction instead of ignoring it.
            result = subprocess.run(['unzip', '-q', '-o', zip_path,
                                     '-d', dicom_dir])
            if result.returncode != 0:
                print(f'Warning: unzip returned {result.returncode} '
                      f'for {zip_path}', file=sys.stderr)

        # Process the manifest file. Some (or all?) archives ship it with a
        # misspelt '.cvs' extension, so fall back to that name.
        source_manifest = os.path.join(dicom_dir, 'manifest.csv')
        if not os.path.exists(source_manifest):
            source_manifest = os.path.join(dicom_dir, 'manifest.cvs')
        dest_manifest = os.path.join(dicom_dir, 'manifest2.csv')
        process_manifest(source_manifest, dest_manifest)
        try:
            # pandas >= 1.3: skip malformed rows
            df2 = pd.read_csv(dest_manifest, on_bad_lines='skip')
        except TypeError:
            # older pandas without on_bad_lines (error_bad_lines was removed
            # in pandas 2.0, so this is the legacy fallback)
            df2 = pd.read_csv(dest_manifest, error_bad_lines=False)

        # Organise the dicom files: group them into one subdirectory per
        # imaging series. NOTE(review): 'series discription' [sic] must match
        # the column name written by process_manifest — do not fix the
        # spelling here without changing it there too.
        for series_name, series_df in df2.groupby('series discription'):
            series_dir = os.path.join(dicom_dir, series_name)
            if not os.path.exists(series_dir):
                os.mkdir(series_dir)
            for series_file in series_df['filename']:
                sf_path = os.path.join(dicom_dir, series_file)
                if os.path.exists(sf_path):
                    shutil.move(sf_path, series_dir)

        # Rare cases when no dicom file exists, e.g. 12xxxxx/1270299
        if not os.listdir(dicom_dir):
            print('Warning: empty dicom directory! Skip this one.')
            # Tidy the scratch dir so it does not accumulate across runs.
            shutil.rmtree(dicom_dir, ignore_errors=True)
            continue

        # Convert dicom files into nifti images
        dset = Biobank_Dataset(dicom_dir, None)
        dset.read_dicom_images()
        dset.convert_dicom_to_nifti(tmp_sub_output_dir)

        # Remove intermediate files
        shutil.rmtree(dicom_dir, ignore_errors=True)

        # Rename the temp working dir to the final dir, marking this
        # <eid>_<num> as complete for future runs.
        shutil.move(tmp_sub_output_dir, sub_output_dir)
        print('done')

    if skipped > 0:
        print(f'{skipped} completed subjects skipped')