-
Notifications
You must be signed in to change notification settings - Fork 9
/
dataExtraction.py
146 lines (98 loc) · 3.61 KB
/
dataExtraction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
# -*- coding: utf-8 -*-
"""
Created on Sat Jun 15 10:40:26 2019
@author: MAGESHWARAN
"""
import os
from urllib import request
import tarfile
from tqdm import tqdm
def createFolders(base_folder, folder_name):
    """
    Create a folder inside ``base_folder`` unless it is already there.

    Input:
        base_folder : path of the directory that will contain the folder
        folder_name : name of the folder to be created
    Output:
        Returns the path of the folder (newly created or already present)
    Raises:
        FileExistsError if the target path exists but is not a directory
    """
    folder = os.path.join(base_folder, folder_name)

    if not os.path.isdir(folder):
        # makedirs also creates any missing parent directories, and
        # exist_ok=True tolerates the folder appearing between the check
        # above and this call (e.g. created by a concurrent run).
        os.makedirs(folder, exist_ok=True)
        print("new ", folder_name, " created")

    return folder
def getFiles(url, folder):
    """
    Download the file found at ``url`` into ``folder``.

    The local file name is the last "/"-separated component of the url,
    with surrounding whitespace removed.

    Input:
        url : url of the file
        folder: Path of the folder where the file is to be downloaded
    Output:
        Returns nothing
    """
    filename = url.split("/")[-1].strip()

    # Fetch before opening the destination so that a failed request does not
    # leave an empty file behind; the context managers close the response
    # and the file even when an exception is raised.
    with request.urlopen(url) as response:
        payload = response.read()

    with open(os.path.join(folder, filename), 'wb') as f:
        f.write(payload)
def getUrls(txt_file, folder):
    """
    Read the urls listed in a text file, one per line.

    Each line is stripped of surrounding whitespace (including the trailing
    newline) and blank lines are dropped, so every returned entry is a
    non-empty string.

    Input:
        txt_file : txt file (images.txt or crops.txt)
        folder: Path of the folder containing the txt file
    Output:
        Returns list of urls read from the input file
    """
    with open(os.path.join(folder, txt_file)) as imagefiles:
        stripped = (line.strip() for line in imagefiles)
        # A blank line would otherwise become an empty "url" that yields an
        # empty file name further down the pipeline.
        urls = [entry for entry in stripped if entry]

    return urls
def downloadImages(url_list, folder):
    """
    Download every url in ``url_list`` into ``folder``.

    Each file is saved under the last "/"-separated component of its url.
    Entries that are empty after stripping whitespace are skipped. Progress
    is reported through tqdm. A failing download raises and stops the loop.

    Input:
        url_list : urls read from txt files
        folder: Path of the folder where the files are to be downloaded
    Output:
        Returns nothing
    """
    for image in tqdm(url_list):
        # Lines read from a text file carry a trailing newline and the file
        # may contain blank lines; a blank entry has no file name to save to.
        image = image.strip()
        if not image:
            continue

        filename = image.split("/")[-1].strip()

        # Fetch first so a failed request leaves no empty file behind; the
        # context managers release both handles even on error.
        with request.urlopen(image) as response:
            payload = response.read()

        with open(os.path.join(folder, filename), 'wb') as f:
            f.write(payload)
def getSampleTestset(sample_testset, folder):
    """
    Download the sample test set archive and extract it into ``folder``.

    The archive is only extracted when its local file name ends with
    "tar.gz"; otherwise the downloaded file is left as is.

    Input:
        sample_testset : list containing url and filename
        folder: Path of the folder where the files are to be downloaded
    Output:
        Returns nothing
    """
    filename = os.path.join(folder, sample_testset[1])

    # Fetch before opening the destination so a failed request does not
    # leave an empty archive behind; handles are closed even on error.
    with request.urlopen(sample_testset[0]) as response:
        payload = response.read()

    with open(filename, "wb") as f:
        f.write(payload)

    if filename.endswith("tar.gz"):
        with tarfile.open(filename, "r:gz") as tar:
            # The archive comes from the network, so refuse members that
            # would escape ``folder`` (absolute paths, "..", unsafe links)
            # whenever this Python version provides extraction filters.
            if hasattr(tarfile, "data_filter"):
                tar.extractall(folder, filter="data")
            else:
                tar.extractall(folder)
if __name__ == "__main__":
    # Remote text files; each is later read line by line as a list of urls.
    files_url = [
        "https://s3.amazonaws.com/msd-cvteam/interview_tasks/crops_images_association_2/images.txt",
        "https://s3.amazonaws.com/msd-cvteam/interview_tasks/crops_images_association_2/crops.txt",
    ]
    # Sample archive given as [url, local file name].
    testset = ["https://bit.ly/2VoBYo1", "sample_testset.tar.gz"]

    # Folder layout under the current working directory:
    # Dataset/, Dataset/Images/ and Dataset/Crops/.
    base_dir = os.getcwd()
    data_folder = createFolders(base_dir, "Dataset")
    images_folder = createFolders(data_folder, "Images")
    crops_folder = createFolders(data_folder, "Crops")

    # Fetch both url listings into the Dataset folder.
    for listing_url in files_url:
        getFiles(listing_url, data_folder)

    # Read both listings, then download images followed by crops.
    image_url = getUrls("images.txt", data_folder)
    crop_url = getUrls("crops.txt", data_folder)
    downloadImages(image_url, images_folder)
    downloadImages(crop_url, crops_folder)

    # Finally download (and unpack) the sample test set.
    getSampleTestset(testset, data_folder)