This repository has been archived by the owner on Aug 20, 2018. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhxl-datasets.py
57 lines (45 loc) · 1.58 KB
/
hxl-datasets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
"""Crawl HDX to list data providers and first dates of HXL-tagged datasets
CKAN API documentation: http://docs.ckan.org/en/latest/api/
Python CKAN library: https://github.com/ckan/ckanapi
Started by David Megginson, 2017-09-17
"""
import ckanapi, time, sys, csv
#: Pause, in seconds, inserted between requests so HDX gets a break.
DELAY = 2

#: How many datasets are requested from the search API per call.
CHUNK_SIZE = 100

#: Base URL of the CKAN instance being crawled.
CKAN_URL = 'https://data.humdata.org'

# Client used for all API calls against HDX.
ckan = ckanapi.RemoteCKAN(CKAN_URL)

# Every row of the report is written as CSV to standard output.
output = csv.writer(sys.stdout)

# Paging state for the crawl over the datasets ("packages") on HDX:
# `start` is the offset of the next chunk to request; `result_count` is the
# total number of matches. The total begins as a large placeholder so the
# crawl loop runs at least once, and is reset from the first search result.
start, result_count = 0, 999999

# Column headings; one data row per dataset follows in the same order.
output.writerow(
    ['Dataset name', 'Dataset title', 'HDX org', 'Source', 'Date created', 'Date updated']
)
# Page through every dataset tagged "hxl" and write one CSV row per dataset.
# Progress messages go to stderr so they never mix with the CSV on stdout.
while start < result_count:
    result = ckan.action.package_search(fq='tags:hxl', start=start, rows=CHUNK_SIZE)
    # The first response replaces the placeholder total with the real count.
    result_count = result['count']
    packages = result['results']
    print("Read {} package(s)...".format(len(packages)), file=sys.stderr)
    if not packages:
        # Nothing came back even though the reported count says more remain:
        # stop rather than keep requesting ever-higher offsets.
        break
    for package in packages:
        # A dataset with a null organization or without a 'dataset_source'
        # field yields an empty cell instead of aborting the crawl part-way
        # through the output.
        # NOTE(review): presumably HDX always supplies both -- confirm.
        org = package.get('organization') or {}
        output.writerow([
            package['name'],
            package['title'],
            org.get('name', ''),
            package.get('dataset_source', ''),
            # First 10 characters only -- presumably the YYYY-MM-DD part of
            # the timestamp.
            package['metadata_created'][:10],
            package['metadata_modified'][:10],
        ])
    # Advance by the number of rows actually received, so a page shorter than
    # CHUNK_SIZE cannot cause datasets to be skipped.
    start += len(packages)
    if start < result_count:
        time.sleep(DELAY)  # give HDX a short rest before the next request
# end