stub out search API latest x days #33
pdurbin committed Oct 8, 2019
1 parent 8311830 commit cfad3d9
Showing 2 changed files with 95 additions and 5 deletions.
52 changes: 52 additions & 0 deletions config.json.33-scratch1
@@ -0,0 +1,52 @@
{
    "installations": [
        "https://data.inra.fr",
        "https://data.qdr.syr.edu",
        "https://dataverse.harvard.edu"
    ],
    "api_response_cache_dir": "/Users/pdurbin/github/iqss/dataverse-metrics/cache",
    "aggregate_output_dir": "/Users/pdurbin/github/iqss/dataverse-metrics",
    "num_months_to_process": 12,
    "num_days_to_process": 7,
    "endpoints": {
        "single": [
            "dataverses/byCategory",
            "datasets/bySubject"
        ],
        "daily": [
            "dataverse/published",
            "dataset/published"
        ],
        "monthly": [
            "dataverses/toMonth",
            "datasets/toMonth",
            "files/toMonth",
            "downloads/toMonth"
        ],
        "monthly_itemized": []
    },
    "blacklists": {
        "datasets/bySubject": [
            "Not specified",
            "Other"
        ]
    },
    "colors": {
        "dataverses/toMonth": "#CF3636",
        "dataverses/byCategory": [
            "#B22200",
            "#006699"
        ],
        "datasets/toMonth": "#E58433",
        "datasets/bySubject": [
            "#B22200",
            "#006699"
        ],
        "files/toMonth": "#006699",
        "downloads/toMonth": "#B94617"
    },
    "github_repos": [
        "https://github.com/IQSS/dataverse"
    ]
}
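The notable additions in this scratch config are the "daily" endpoint group and the num_days_to_process key, which the new code in download.py below uses to bound the search window. A minimal sketch of how the window falls out of the value 7 (assuming a run on 2019-10-08, the commit date):

from datetime import datetime, timedelta

num_days_to_process = 7  # value from the config above

todayobj = datetime.today()  # assume 2019-10-08 for this sketch
earlier_date = todayobj - timedelta(days=num_days_to_process + 1)  # one day of padding
print(earlier_date.strftime('%Y-%m-%d'))  # 2019-09-30, window start
print(todayobj.strftime('%Y-%m-%d'))      # 2019-10-08, window end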
48 changes: 43 additions & 5 deletions download.py
@@ -1,7 +1,8 @@
import json
import sys
import os
from datetime import datetime, date
from datetime import datetime, date, timedelta
import calendar
try:
import urllib.request as urlrequest
@@ -20,20 +21,57 @@ def main():
    installations = config['installations']
    api_response_cache_dir = config['api_response_cache_dir']
    num_months_to_process = config['num_months_to_process']
    num_days_to_process = config['num_days_to_process']
    monthly_endpoints = config['endpoints']['monthly']
    daily_endpoints = config['endpoints']['daily']
    single_endpoints = config['endpoints']['single']
    monthly_itemized_endpoints = config['endpoints']['monthly_itemized']
    github_repos = config.get('github_repos')

    for installation in installations:
        process_monthly_endpoints(installation, monthly_endpoints, api_response_cache_dir, num_months_to_process)
        process_daily_endpoints(installation, daily_endpoints, api_response_cache_dir, num_days_to_process)
        pass
        # process_monthly_endpoints(installation, monthly_endpoints, api_response_cache_dir, num_months_to_process)
        # "monthly itemized" metrics are downloaded the same way as regular monthly metrics:
        process_monthly_endpoints(installation, monthly_itemized_endpoints, api_response_cache_dir, num_months_to_process)
        process_single_endpoints(installation, single_endpoints, api_response_cache_dir)
        # process_monthly_endpoints(installation, monthly_itemized_endpoints, api_response_cache_dir, num_months_to_process)
        # process_single_endpoints(installation, single_endpoints, api_response_cache_dir)

    if github_repos:
        for repo in github_repos:
            process_github_repo(repo, api_response_cache_dir)
            pass
            # process_github_repo(repo, api_response_cache_dir)


def process_daily_endpoints(installation, daily_endpoints, api_response_cache_dir, num_days_to_process):
    for endpoint in daily_endpoints:
        process_daily_endpoint(installation, endpoint, api_response_cache_dir, num_days_to_process)

def process_daily_endpoint(installation, endpoint, api_response_cache_dir, num_days_to_process):
    dvtype = endpoint.split('/')[0]
    todayobj = datetime.today()
    today = todayobj.strftime('%Y-%m-%d')
    # pad the window by one day so the last num_days_to_process full days are covered
    earlier_date = todayobj - timedelta(days=num_days_to_process + 1)
    start_date = earlier_date.strftime('%Y-%m-%d')
    # 1000 is the max per_page the search API allows! Iterate in the future if need be.
    per_page = str(1000)
    optionalHarvestedString = ''
    if dvtype == 'dataset':
        # the minus sign ("-") means NOT, i.e. exclude harvested datasets
        optionalHarvestedString = '&fq=-metadataSource%3A"Harvested"'
    # e.g. fq=dateSort:[2015-05-01T00\:00\:00Z+TO+2015-05-10T00\:00\:00Z]
    url = installation + '/api/search?q=*&per_page=' + per_page + '&type=' + dvtype + optionalHarvestedString + '&fq=dateSort:[' + start_date + 'T00\\:00\\:00Z+TO+' + today + 'T00\\:00\\:00Z]'
    print(url)
    path = api_response_cache_dir + '/daily/' + dvtype
    if not os.path.exists(path):
        os.makedirs(path)
    response = urlrequest.urlopen(url)
    json_out = get_remote_json(response)
    o = urlparse(installation)
    hostname = o.hostname
    filename = hostname + '.json'
    with open(path + '/' + filename, 'w') as outfile:
        json.dump(json_out, outfile, indent=4)


def process_monthly_endpoints(installation, monthly_endpoints, api_response_cache_dir, num_months_to_process):
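For reference, with the config above and assuming a run on 2019-10-08, the new process_daily_endpoint would print and fetch a URL along these lines for each installation (here for type=dataset against dataverse.harvard.edu):

https://dataverse.harvard.edu/api/search?q=*&per_page=1000&type=dataset&fq=-metadataSource%3A"Harvested"&fq=dateSort:[2019-09-30T00\:00\:00Z+TO+2019-10-08T00\:00\:00Z]

The JSON response is then cached as <api_response_cache_dir>/daily/dataset/dataverse.harvard.edu.json.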
