get_cellar_docs.py
#!/usr/bin/python
# coding=utf-8
"""Send GET requests to the EU CELLAR endpoint and download a zip file for each
given document id under the CELLAR URI."""
import requests
import zipfile
import io
import os
from datetime import datetime
from get_cellar_ids import get_cellar_info_from_endpoint, get_cellar_ids_from_json_results, cellar_ids_to_file, \
    get_cellar_ids_from_csv_file
from get_text_from_cellar_files import get_text
from utils.file_utils import text_to_str, get_subdir_list_from_path, print_list_to_file, to_json_output_file
from threading import Thread


def check_ids_to_download(id_list, dir_to_check):
    """
    Check whether each id in the given CELLAR id_list is already present
    in the directory containing previously downloaded files.
    The directory contains subdirectories named after a CELLAR id.
    Return a list of CELLAR ids absent from the subdirectory names.
    :param id_list: list
    :param dir_to_check: str
    :return: list
    """
    # Get the CELLAR ids from the subdirectories containing the files already downloaded
    downloaded_files_list = get_subdir_list_from_path(dir_to_check)
    # print('ALREADY_DOWNLOADED:', len(downloaded_files_list))
    in_dir_name = 'id_logs/in_dir_lists/'
    os.makedirs(os.path.dirname(in_dir_name), exist_ok=True)
    # NB: 'timestamp' is a module-level variable defined before this function is called
    print_list_to_file(in_dir_name + 'in_dir_' + timestamp + '.txt', downloaded_files_list)
    # Get the list of ids that have not yet been downloaded
    missing_ids_list = list(set(id_list) - set(downloaded_files_list))
    # print('SET_DIFF:', len(missing_ids_list))
    new_ids_dir_name = 'id_logs/cellar_ids/'
    os.makedirs(os.path.dirname(new_ids_dir_name), exist_ok=True)
    print_list_to_file(new_ids_dir_name + 'cellar_ids_' + timestamp + '.txt', missing_ids_list)
    return missing_ids_list
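
# For illustration (hypothetical ids): if id_list is ['a1', 'b2', 'c3'] and
# dir_to_check already holds subdirectories 'a1' and 'c3', the set difference
# above leaves ['b2'], so only the missing document is fetched on the next run.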


def rest_get_call(cellar_id):
    """Send a GET request to download a zip file for the given id under the CELLAR URI."""
    url = 'http://publications.europa.eu/resource/cellar/' + cellar_id
    headers = {
        'Accept': "application/zip;mtype=fmx4, application/xml;mtype=fmx4, application/xhtml+xml, "
                  "text/html, text/html;type=simplified, application/msword, text/plain, "
                  "application/xml;notice=object",
        'Accept-Language': "eng",
        'Content-Type': "application/x-www-form-urlencoded",
        'Host': "publications.europa.eu",
    }
    response = requests.get(url, headers=headers)
    return response
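
# For example, a (hypothetical) id 'abc123' resolves to
# http://publications.europa.eu/resource/cellar/abc123; depending on the content
# negotiation in the Accept header above, CELLAR returns either a zip archive of
# the document's files or a single document body.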


def download_zip(response, folder_path):
    """
    Download the zip file returned by the RESTful GET request.
    Source: https://stackoverflow.com/questions/9419162/download-returned-zip-file-from-url
    """
    z = zipfile.ZipFile(io.BytesIO(response.content))
    z.extractall(folder_path)
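
# Note that the archive is unpacked straight from memory (io.BytesIO wraps
# response.content), so only the extracted member files are written to disk
# under folder_path; no intermediate .zip file is saved.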


def process_range(sub_list, folder_path):
    """
    Process a list of ids to download the corresponding zip files.
    :param sub_list: list of str
    :param folder_path: str
    :return: None (writes downloaded files and logs to disk)
    """
    # Keep track of downloads
    zip_files = []
    single_files = []
    other_downloads = []
    # Count downloads
    count_cellar_ids = 0
    count_zip = 0
    count_single = 0
    count_other = 0
    for cellar_id in sub_list:
        # Strip whitespace once, so the folder name and the request use the same id
        cellar_id = cellar_id.strip()
        count_cellar_ids += 1
        # Specify sub_folder_path to store the results of the request
        sub_folder_path = folder_path + cellar_id
        # Send a RESTful GET request for the given id
        response = rest_get_call(cellar_id)
        # If the response's header contains the string 'Content-Type'
        if 'Content-Type' in response.headers:
            # If the string 'zip' appears as a value of 'Content-Type'
            if 'zip' in response.headers['Content-Type']:
                count_zip += 1
                zip_files.append(cellar_id)
                # Extract the contents of the zip file into the given folder
                download_zip(response, sub_folder_path)
            # If the value of 'Content-Type' is not 'zip'
            else:
                count_single += 1
                single_files.append(cellar_id)
                # Create a directory with the cellar_id name
                # and write the returned content to a file
                # with the same name
                out_file = sub_folder_path + '/' + cellar_id + '.html'
                os.makedirs(os.path.dirname(out_file), exist_ok=True)
                with open(out_file, 'w') as f:
                    f.write(response.text)
        # If the response's header does not contain the string 'Content-Type'
        else:
            count_other += 1
            other_downloads.append(cellar_id)
            # print('NO_CONTENT_TYPE:', response.content)
            # Write the returned content to a file
            # out_file = sub_folder_path + '/' + cellar_id + '.xml'
            # with open(out_file, 'wb') as f:
            #     f.write(response.text)
    # log_text = ("\nQuery file: " + __file__ +
    #             "\nDownload date: " + str(datetime.today()) +
    #             "\n\nNumber of zip files downloaded: " + str(count_zip) +
    #             "\nNumber of non-zip files downloaded: " + str(count_single) +
    #             "\nNumber of other downloads: " + str(count_other) +
    #             "\nTotal number of cellar ids processed: " + str(count_zip + count_single + count_other) +
    #             "\n\nTotal number of downloaded files: " + str(count_zip + count_single) +
    #             "\nTotal number of cellar ids: " + str(len(id_list)) +
    #             "\n\n========================================\n"
    #             )
    #
    # print(log_text)
    # Write the list of other (failed) downloads to a file
    # NB: all threads of a run share this path, so concurrent threads
    # may overwrite one another's failure log
    id_logs_path = 'id_logs/failed_' + timestamp + '.txt'
    os.makedirs(os.path.dirname(id_logs_path), exist_ok=True)
    with open(id_logs_path, 'w+') as f:
        if len(other_downloads) != 0:
            f.write('Failed downloads ' + timestamp + '\n' + str(other_downloads))


# Program starts here
# ===================
timestamp = str(datetime.now().strftime("%Y%m%d-%H%M%S"))  # e.g. '20190107-153045'
# Get the SPARQL query from the given file
sparql_query = text_to_str('queries/sparql_queries/financial_domain_sparql_2019-01-07.rq')
# print('SPARQL_QUERY:', sparql_query)
# Get CELLAR information from the EU SPARQL endpoint (in JSON format)
sparql_query_results = get_cellar_info_from_endpoint(sparql_query)
# Output the SPARQL results to a file
sparql_query_results_dir = "queries/sparql_query_results/"
os.makedirs(os.path.dirname(sparql_query_results_dir), exist_ok=True)
sparql_query_results_file = sparql_query_results_dir + "query_results_" + timestamp + ".json"
to_json_output_file(sparql_query_results_file, sparql_query_results)
# Create a list of ids from the SPARQL query results (in JSON format)
id_list = sorted(get_cellar_ids_from_json_results(sparql_query_results))
# print('ID_LIST:', len(id_list), id_list[:10])
# # ALTERNATIVELY
# # If you already have a CSV file with cellar ids,
# # e.g., copy-pasted from browser results,
# # specify file (path) containing the cellar IDs
# # Input format: cellarURIs,lang,mtypes,workTypes,subjects,subject_ids
# cellar_ids_file = 'queries/sparql_query_results/query_results_2019-01-07.csv'
# #
# # Create a list of CELLAR ids from the given CSV file
# id_list = get_cellar_ids_from_csv_file(cellar_ids_file)
# Output retrieved CELLAR ids list to txt file
# with each ID on a new line
cellar_ids_to_file(id_list, timestamp)
# Create a list of not-yet-downloaded file ids by comparing the results in id_list with files present in the given directory
# dir_to_check = None
dir_to_check = "data/cellar_files_20201214-165041/"
# dir_to_check = "dir_with_previously_downloaded_files/"
if dir_to_check and os.path.exists(dir_to_check):
    id_list = check_ids_to_download(id_list, dir_to_check)
# print('NEW_FILES_TO_DOWNLOAD:', len(id_list))
# Specify folder path to store downloaded files
dwnld_folder_path = "data/cellar_files_" + timestamp + "/"
# Run multiple threads in parallel to download the files
# using the process_range(sub_list, dwnld_folder_path) function
# Adapted from: https://stackoverflow.com/questions/16982569/making-multiple-api-calls-in-parallel-using-python-ipython
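# For illustration: with nthreads = 3 and ids ['a', 'b', 'c', 'd', 'e'] (hypothetical),
# the slices id_list[0::3], id_list[1::3] and id_list[2::3] yield ['a', 'd'],
# ['b', 'e'] and ['c'], so every id is handled by exactly one thread.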
nthreads = 11
threads = []
for i in range(nthreads):
    # print('ID_LIST:', id_list[i::nthreads])
    # Take every nthreads-th id, starting at offset i (round-robin split)
    sub_list = id_list[i::nthreads]
    t = Thread(target=process_range, args=(sub_list, dwnld_folder_path))
    threads.append(t)
# Start the threads
for t in threads:
    t.start()
# Wait for the threads to finish
for t in threads:
    t.join()
# Generate text files for downloaded XML and HTML files
# Set replace_existing to True to replace existing text files.
# To process only new files, set replace_existing to False (default).
# Usage: get_text(input_path, output_dir, replace_existing=False)
txt_folder_path = "data/text_files_" + dwnld_folder_path.split('_')[-1]
# print('TXT_DIR_PATH:', txt_folder_path)
get_text(dwnld_folder_path, txt_folder_path, replace_existing=False)
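
# A run with the default paths above leaves behind (assuming the helper
# functions write where their names suggest):
#   queries/sparql_query_results/query_results_<timestamp>.json  - raw SPARQL results
#   id_logs/                                                     - id lists and failed downloads
#   data/cellar_files_<timestamp>/<cellar_id>/                   - downloaded documents
#   data/text_files_<timestamp>/                                 - extracted plain text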