# check_if_ctd_data.py
import gzip
import io
import os
import tempfile
import zipfile

import pandas as pd
import requests


def check_zip_files(url):
    # Download the zip archive into a temporary directory, extract it,
    # and look for text files inside.
    # https://stackoverflow.com/questions/55718917/download-zip-file-locally-to-tempfile-extract-files-to-tempfile-and-list-the-f
    results = requests.get(url)
    tmpdir = tempfile.mkdtemp()
    zip_path = os.path.join(tmpdir, 'zip_folder.zip')
    with open(zip_path, 'wb') as f:
        f.write(results.content)
    with zipfile.ZipFile(zip_path) as archive:
        archive.extractall(path=tmpdir)
    for filename in os.listdir(tmpdir):
        if 'txt' in filename:
            pass
    # TODO. If the content encoding is gzip or zip, skip the file but keep a
    # record in a zip file log: store the page, dataset id, and content type.


def is_ctd_column(column_names, needles, exclude):
    # Case-insensitive prefix match: a column counts as a CTD column when it
    # starts with one of the needle strings and not with an excluded string.
    column_names = [name.lower() for name in column_names]
    found_ctd_column = False
    needle_vals_found = []
    exclude_vals_found = []
    # Find excluded values, if any
    for val in exclude:
        val_found = [name for name in column_names if name.startswith(val)]
        if val_found:
            exclude_vals_found.extend(val_found)
    # Find needle values, if any
    for val in needles:
        val_found = [name for name in column_names if name.startswith(val)]
        if val_found:
            needle_vals_found.extend(val_found)
    # Remove excluded columns from the needle columns
    for excluded in exclude_vals_found:
        if excluded in needle_vals_found:
            needle_vals_found.remove(excluded)
    if needle_vals_found:
        found_ctd_column = True
        # Index of the first matching column name
        ctd_col_index = column_names.index(needle_vals_found[0])
    else:
        ctd_col_index = None
    print('needle vals found', needle_vals_found)
    return found_ctd_column, ctd_col_index
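

# A quick illustration of the heuristic above with made-up column names:
# 'lat_start' matches the needle 'lat' but is excluded, so only 'lat'
# survives and its index is returned.
def _example_is_ctd_column():
    found, index = is_ctd_column(['CTDPRS', 'lat_start', 'lat'],
                                 ['lat'], ['lat_start'])
    assert found is True and index == 2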


def check_column_names_for_ctd(column_names):
    # A dataset is treated as CTD data when pressure, temperature, salinity,
    # latitude, and longitude columns are all present.
    pressure_exists, pressure_col_index = is_ctd_column(column_names, ['ctdprs', 'press'], [])
    temperature_exists, temperature_col_index = is_ctd_column(column_names, ['ctdtmp', 'temp'], [])
    salinity_exists, salinity_col_index = is_ctd_column(column_names, ['ctdsal', 'sal'], [])
    latitude_exists, latitude_col_index = is_ctd_column(column_names, ['lat'], ['lat_range', 'lat_start'])
    longitude_exists, longitude_col_index = is_ctd_column(column_names, ['lon'], ['lon_range', 'lon_start'])
    # TODO. Exclude lat_start or not? Maybe lat_end?
    is_ctd = all([pressure_exists, temperature_exists, salinity_exists,
                  latitude_exists, longitude_exists])
    print(pressure_col_index, latitude_col_index, longitude_col_index)
    return is_ctd, pressure_col_index, latitude_col_index, longitude_col_index
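

# Illustrative call with a hypothetical BCO-DMO-style header: all five
# required columns are present, so the dataset is flagged as CTD data.
def _example_check_column_names_for_ctd():
    header = ['CTDPRS', 'CTDTMP', 'CTDSAL', 'lat', 'lon', 'depth']
    is_ctd, prs_idx, lat_idx, lon_idx = check_column_names_for_ctd(header)
    assert is_ctd is True and (prs_idx, lat_idx, lon_idx) == (0, 3, 4)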


def create_dataset_dataframe(header_list, data_list):
    # Build a DataFrame from the reconstructed header and rows; return None
    # if the rows and header are incompatible.
    try:
        df = pd.DataFrame(data_list, columns=header_list)
    except Exception:
        df = None
    return df


def get_data_list(text, line_count):
    # Data rows start after the comment lines and the two header lines
    data_list = []
    split_text = text.split('\n')
    for line in split_text[line_count + 2:]:
        data_list.append(line.split('\t'))
    return data_list


def fix_two_headers_into_one(text):
    # Some datasets have a newline in the header, so the header is split over
    # two lines. Combine the two header lines into one list of column names.
    split_text = text.split('\n')
    # Count leading comment lines to find where the header starts
    line_count = 0
    for line in split_text:
        if line.startswith('#'):
            line_count = line_count + 1
            continue
        else:
            break
    # TODO. Assuming only two header lines. Check if there are more comment
    # lines and log that the data can't be read.
    first_line = split_text[line_count].rstrip('\n')
    second_line = split_text[line_count + 1].rstrip('\n')
    first_line_list = first_line.split('\t')
    second_line_list = second_line.split('\t')
    first_line_list.extend(second_line_list)
    header_list = first_line_list
    return header_list, line_count
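

# A small end-to-end check of the two-line header repair, using a made-up
# tab-separated file whose header is split across two lines.
def _example_fix_two_headers():
    text = ('# comment line\n'
            'CTDPRS\tCTDTMP\n'
            'lat\tlon\n'
            '10\t5.2\t41.5\t-70.6\n')
    header_list, line_count = fix_two_headers_into_one(text)
    assert header_list == ['CTDPRS', 'CTDTMP', 'lat', 'lon']
    data_list = get_data_list(text, line_count)
    assert data_list[0] == ['10', '5.2', '41.5', '-70.6']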


def get_data_text(url, processing_log, dataset_id, page, number_of_datasets_per_page):
    # Return the dataset's text, or None if the request failed
    data_text = None
    try:
        response = requests.get(url, timeout=600)
        data_text = response.text
    except requests.exceptions.RequestException as e:
        print(e)
        with open(processing_log, 'a+') as f:
            log_text = f"Can't read text file. {e} error at dataset id: {dataset_id} on page {page + 1} with {number_of_datasets_per_page} data sets on page\n"
            f.write(log_text)
    return data_text


def get_data_url(dataset_id):
    url = f'http://www.bco-dmo.org/dataset/{dataset_id}/data/download'
    return url


def put_dataset_into_dataframe(dataset_id, processing_log, page, index, number_of_datasets_per_page):
    url = get_data_url(dataset_id)
    # Get the Content-Type header to see if the download is a zip archive
    try:
        response = requests.get(url, timeout=600)
        content_type = response.headers['Content-Type']
    except requests.exceptions.RequestException as e:
        with open(processing_log, 'a+') as f:
            log_text = f"Can't read text file. {e} error at dataset id: {dataset_id} on page {page + 1} with {number_of_datasets_per_page} data sets on page\n"
            f.write(log_text)
        return None
    # TODO. Get ctd data out of zip file to get lon/lat data
    if content_type == 'application/zip':
        # check_zip_files(url)
        return None
    text = get_data_text(url, processing_log, dataset_id, page, number_of_datasets_per_page)
    if text is None:
        with open(processing_log, 'a+') as f:
            log_text = f"Can't read text file. UTF-8 decode error at dataset id: {dataset_id} on page {page + 1} with {number_of_datasets_per_page} data sets on page\n"
            f.write(log_text)
        return None
    try:
        # If this fails, the dataset usually has its header split over two
        # lines where there should be one
        dataset_df = pd.read_csv(io.StringIO(text), comment="#", sep="\t")
    except Exception:
        # Log the two-line header and try to repair it
        with open(processing_log, 'a+') as f:
            log_text = f"Can't read data into pandas. Try to account for header split over two lines at dataset id: {dataset_id} on page {page + 1} with {number_of_datasets_per_page} data sets on page\n"
            f.write(log_text)
        header_list, line_count = fix_two_headers_into_one(text)
        data_list = get_data_list(text, line_count)
        dataset_df = create_dataset_dataframe(header_list, data_list)
    return dataset_df
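

# A minimal driver sketch showing how the pieces fit together for one dataset.
# The dataset id, page values, and log filename here are hypothetical; the
# real script presumably loops over pages of BCO-DMO dataset listings.
if __name__ == '__main__':
    dataset_df = put_dataset_into_dataframe(
        dataset_id=3773, processing_log='processing_log.txt',
        page=0, index=0, number_of_datasets_per_page=10)
    if dataset_df is not None:
        is_ctd, prs_idx, lat_idx, lon_idx = check_column_names_for_ctd(
            list(dataset_df.columns))
        print('dataset is CTD data:', is_ctd)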