erddap_query.py
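"""
Search an ERDDAP server for tabledap datasets matching a space/time window
(query), then download the distinct latitude/longitude coordinates for a
random sample of the matching datasets (get_coordinates).
"""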
import io
import random

import erddapy
from erddapy import ERDDAP
import pandas as pd
import requests

# some configuration: identify ourselves to the ERDDAP servers we query
headers = {'User-agent': '{}-{}'.format(erddapy.__version__, "erddap-colocate-ohw19"),
           'From': '[email protected]'}
def query(url, **kw):
    """Search one ERDDAP server for tabledap datasets matching the constraints in kw."""
    df = pd.DataFrame()
    # strip any trailing slash to prevent a '//' in the generated URLs:
    url = url.rstrip("/")
    e = ERDDAP(
        server=url,
        protocol='tabledap',
        response='csv'
    )
    # submit the search query:
    try:
        print("Testing ERDDAP {}".format(url))
        search_url = e.get_search_url(**kw)
        r = requests.get(search_url, headers=headers)
        r.raise_for_status()
        df = pd.read_csv(io.StringIO(r.text))
        print("ERDDAP {} returned results from URL: {}".format(url, search_url))
        df['server'] = url
        # drop rows without a tabledap endpoint (e.g. griddap-only datasets):
        df.dropna(subset=['tabledap'], inplace=True)
        return df[['server', 'Dataset ID', 'tabledap', 'Institution', 'Summary']]
    except Exception as ex:
        # an HTTPError can occur if the server has no datasets within the query window; just log it:
        if type(ex).__name__ == "HTTPError":
            print(ex)
        return None
def get_coordinates(df, **kw):
    '''
    Download distinct latitude/longitude pairs for up to 10 randomly chosen
    datasets from df, constrained by the space/time window in kw.

    Example ERDDAP tabledap URL:
    dataset_url = '%s/tabledap/%s.csvp?latitude,longitude,time&longitude>=-72.0&longitude<=-69&latitude>=38&latitude<=41&time>=1278720000.0&time<=1470787200.0&distinct()' % (all_datasets['server'].iloc[int(i)],all_datasets['Dataset ID'].iloc[int(i)])
    '''
    df_coords = pd.DataFrame()
    # iterate the passed DataFrame (df) in random order, stopping either at
    # final_dataset_limit (currently 10) or at the end of df:
    final_dataset_limit = 10
    datasets_found = 0
    if df.shape[0] < final_dataset_limit:
        final_dataset_limit = df.shape[0]
    # a random permutation of df's row positions:
    index_random = random.sample(range(df.shape[0]), df.shape[0])
    print("index_random: {}".format(index_random))
    for i in index_random:
        server_url = df['server'].iloc[int(i)]
        dataset_id = df['Dataset ID'].iloc[int(i)]
        institution = df['Institution'].iloc[int(i)]
        # skip some difficult datasets (e.g. ROMS/DOP model output) for now:
        if "ROMS" in dataset_id or "DOP" in dataset_id:
            continue
        e = ERDDAP(
            server=server_url,
            protocol='tabledap',
            response='csv'
        )
        try:
            print("datasets_found: {}".format(datasets_found))
            # Set e.constraints here rather than passing constraints to
            # e.get_download_url so that the '>=' / '<=' suffixes on the keys
            # match ERDDAP's API (the parameter signature differs from the
            # search API used above); 'distinct': () requests only unique
            # coordinate pairs.
            e.constraints = {
                "time>=": kw['min_time'],
                "time<=": kw['max_time'],
                "longitude>=": kw['min_lon'],
                "longitude<=": kw['max_lon'],
                "latitude>=": kw['min_lat'],
                "latitude<=": kw['max_lat'],
                "distinct": ()
            }
            # generate a download URL and submit a csv download request via pandas:
            url = e.get_download_url(
                response="csvp",
                dataset_id=dataset_id,
                variables=["latitude", "longitude"]
            )
print("Download URL: {}".format(url))
#coords = pd.read_csv(url, headers=headers)
coords = pd.read_csv(url)
coords['dataset_count'] = i
coords['dataset_download_url'] = url
coords['Dataset ID'] = dataset_id
coords['Institution'] = institution
            # look up the dataset's cdm_data_type from its metadata:
            metadata_url = e.get_info_url(
                dataset_id=dataset_id,
                response='csv'
            )
            metadata = pd.read_csv(metadata_url)
            coords['cdm_data_type'] = "".join(metadata.loc[metadata["Attribute Name"] == "cdm_data_type", "Value"])
            # ToDo: filter variables by attribute, e.g.:
            # e.get_var_by_attr(dataset_id, standard_name='northward_sea_water_velocity')
            print(coords.head())
            df_coords = pd.concat([df_coords, coords])
            # reaching this point means the dataset query succeeded; increment the
            # counter and break out of the loop once we hit final_dataset_limit:
            datasets_found += 1
            print("new dataset acquired; datasets_found: {}".format(datasets_found))
            if datasets_found == final_dataset_limit:
                break
        except Exception as ex:
            # an HTTPError can occur if the dataset has no features within the query window; just log it:
            if type(ex).__name__ == "HTTPError":
                print(ex)
    return df_coords
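
# Minimal usage sketch (not part of the original module): the server URL and
# query window below are illustrative assumptions; the window mirrors the
# example in the get_coordinates docstring above.
if __name__ == "__main__":
    kw = {
        "min_lon": -72.0, "max_lon": -69.0,
        "min_lat": 38.0, "max_lat": 41.0,
        "min_time": "2010-07-10T00:00:00Z",  # ~1278720000.0 seconds since epoch
        "max_time": "2016-08-10T00:00:00Z",  # ~1470787200.0 seconds since epoch
    }
    # e.g. NOAA CoastWatch's public ERDDAP (any tabledap-capable server works):
    datasets = query("https://coastwatch.pfeg.noaa.gov/erddap", **kw)
    if datasets is not None and not datasets.empty:
        print(get_coordinates(datasets, **kw))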