list_dataset_files.py
#!/usr/bin/env python3
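"""List the full image file paths of a dataset, one path per line.

A minimal usage sketch (the flag values are examples; assumes the default
dataset configuration file datasets/datasets.conf is present):

    python3 list_dataset_files.py --dataset coco:train2014 \
        --output_path file_lists --num_files 4
"""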
import argparse
import os
import sys

from dataset import get_loader, DatasetParams

try:
    from tqdm import tqdm
except ImportError:
    print('WARNING: tqdm module not found. Install it if you want a fancy progress bar :-)')

    def tqdm(x, disable=False):
        return x


def main(args):
    if args.output_file:
        file_name = args.output_file
    else:
        if args.environment is not None:
            environment = args.environment
        else:
            environment = os.getenv('HOSTNAME')
            if environment is None:
                environment = 'unknown_host'
        file_name = 'image_file_list-{}-{}.txt'.format(args.dataset, environment)

    file_name = os.path.join(args.output_path, file_name)
    os.makedirs(args.output_path, exist_ok=True)

    # If we want to generate multiple files we need to add an "_X_of_Y" suffix
    # to the file name to indicate which file out of the set each one is:
    if args.num_files > 1:
        file_name_list = []
        for i in range(args.num_files):
            file_name_i = os.path.splitext(file_name)[0] + '_{}_of_{}.txt'.format(
                i + 1, args.num_files)
            file_name_list.append(file_name_i)
    else:
        file_name_list = None
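
    # For example, --num_files 4 would produce names like (hypothetical host
    # name, default naming scheme):
    #   file_lists/image_file_list-coco:train2014-myhost_1_of_4.txt
    #   ...
    #   file_lists/image_file_list-coco:train2014-myhost_4_of_4.txt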

    # Check that we are not overwriting anything (including the per-file
    # names used when --num_files > 1):
    for fn in (file_name_list or [file_name]):
        if os.path.exists(fn):
            print('ERROR: {} exists, please remove it first if you really want '
                  'to replace it.'.format(fn))
            sys.exit(1)

    dataset_configs = DatasetParams(args.dataset_config_file)
    dataset_params = dataset_configs.get_params(args.dataset)

    for d in dataset_params:
        # Tell the dataset to output full image paths instead of image ids:
        d.config_dict['return_full_image_path'] = True

    # Ask the loader to iterate over images instead of all (image, caption) pairs:
    data_loader, _ = get_loader(dataset_params, vocab=None, transform=None,
                                batch_size=args.batch_size, shuffle=False,
                                num_workers=args.num_workers,
                                ext_feature_sets=None,
                                skip_images=True,
                                iter_over_images=True)
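    # Note: the loader is assumed to yield 5-tuples per batch; with
    # return_full_image_path set above, the fourth element carries the full
    # image paths and the remaining elements are unused here.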
print("Getting file paths from dataset {}...".format(args.dataset))
show_progress = sys.stderr.isatty()
for i, (_, _, _,
paths, _) in enumerate(tqdm(data_loader, disable=not show_progress)):
if args.num_files == 1:
_file_name = file_name
else:
n = int(i * data_loader.batch_size * args.num_files / len(data_loader.dataset))
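            # Worked example (hypothetical numbers): with batch_size=128,
            # num_files=4 and a dataset of 100000 images, batch i=500 gives
            # n = int(500 * 128 * 4 / 100000) = 2, i.e. the "_3_of_4" file.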
            _file_name = file_name_list[n]
        with open(_file_name, 'a') as f:
            for path in paths:
                f.write(path + '\n')

        # Print log info when the progress bar is not shown:
        if not show_progress and ((i + 1) % args.log_step == 0):
            print('Batch [{}/{}]'.format(i + 1, len(data_loader)))
            sys.stdout.flush()

    print("Wrote the paths of {} image files".format(len(data_loader.dataset)))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', type=str,
                        default='coco:train2014',
                        help='dataset that defines the images whose paths are listed')
    parser.add_argument('--dataset_config_file', type=str,
                        default='datasets/datasets.conf',
                        help='location of the dataset configuration file')
    parser.add_argument('--batch_size', type=int, default=128)
    parser.add_argument('--num_workers', type=int, default=2)
    parser.add_argument('--output_file', type=str, default='',
                        help='file for saving the file list; if no name is specified it '
                             'defaults to "image_file_list-<dataset>-<environment>.txt"')
    parser.add_argument('--environment', type=str,
                        help='optionally specify the environment where the paths are '
                             'valid; by default the value of the $HOSTNAME environment '
                             'variable is used')
    parser.add_argument('--log_step', type=int, default=10,
                        help='how often to log progress when no progress bar is shown')
    parser.add_argument('--num_files', type=int, default=1,
                        help='how many output files to generate')
    parser.add_argument('--output_path', type=str, default='file_lists',
                        help='path where the generated file lists are saved')

    main(args=parser.parse_args())
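
# A minimal sketch of consuming a generated file list (hypothetical downstream
# code, not part of this script): read one path per line, e.g.
#
#     with open('file_lists/image_file_list-coco:train2014-myhost.txt') as f:
#         image_paths = [line.rstrip('\n') for line in f]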