forked from OCHA-DAP/hdx-ext-scraper-geoboundaries
-
Notifications
You must be signed in to change notification settings - Fork 1
/
geoboundaries.py
executable file
·122 lines (109 loc) · 4.43 KB
/
geoboundaries.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#!/usr/bin/python
"""
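geoBoundaries:
--------------
Groups geoBoundaries boundary metadata by country and builds, for each
country, an HDX dataset whose resources point at the geoBoundaries download
URLs.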
"""
import logging
from hdx.data.dataset import Dataset
from hdx.data.hdxobject import HDXError
from hdx.data.resource import Resource
from hdx.location.country import Country
from hdx.utilities.dictandlist import dict_of_lists_add
from hdx.utilities.path import get_filename_from_url
from slugify import slugify
# Module-level logger, named after this module; used by every function below.
logger = logging.getLogger(__name__)
def get_data(downloader, url):
    """Download the boundary metadata at ``url`` and group it by country.

    Entries whose ``worldBankIncomeGroup`` is "High-income Countries" or
    "No income group available" are left out; their ``boundaryName`` values
    are collected and reported in a single info log line once all entries
    have been read.

    Args:
        downloader: Object providing ``download(url)`` and ``get_json()``.
        url: Location of the boundary metadata to download.

    Returns:
        Dictionary filled through ``dict_of_lists_add``, keyed by each
        retained entry's ``boundaryISO`` value.
    """
    skipped_income_groups = (
        "High-income Countries",
        "No income group available",
    )
    downloader.download(url)
    boundaries_by_country = {}
    skipped_names = set()
    for entry in downloader.get_json():
        if entry["worldBankIncomeGroup"] in skipped_income_groups:
            skipped_names.add(entry["boundaryName"])
        else:
            dict_of_lists_add(boundaries_by_country, entry["boundaryISO"], entry)
    # Sorted so the log line is stable from run to run.
    skipped_listing = ", ".join(sorted(skipped_names))
    logger.info(
        f"Ignoring high income/no income available countries: {skipped_listing}"
    )
    return boundaries_by_country
def get_name_url(url):
    """Return the filename derived from ``url`` together with ``url`` itself.

    Args:
        url: URL to derive the name from.

    Returns:
        Tuple ``(filename, url)`` where ``filename`` is the result of
        ``get_filename_from_url(url)``.
    """
    filename = get_filename_from_url(url)
    return filename, url
def generate_dataset(countryiso3, admin_boundaries):
    """Build the HDX dataset describing one country's geoBoundaries data.

    One dataset is created for the country and four resources are added per
    admin boundary entry (simplified GeoJSON, GeoJSON, TopoJSON and the
    static download link, the last one typed as ``shp``).

    Args:
        countryiso3: ISO3 code of the country the dataset is for.
        admin_boundaries: Iterable of boundary metadata dictionaries for the
            country. Each one is read for the keys ``boundaryType``,
            ``boundarySourceURL``, ``boundaryYearRepresented``,
            ``boundarySource``, ``simplifiedGeometryGeoJSON``,
            ``gjDownloadURL``, ``tjDownloadURL`` and ``staticDownloadLink``.

    Returns:
        Tuple ``(boundarytypes, dataset, resource_names)`` where
        ``boundarytypes`` lists the boundary types in sorted order,
        ``dataset`` is the populated ``Dataset`` and ``resource_names`` lists
        the names of the resources added. ``(None, None, None)`` is returned
        when adding the country location raises ``HDXError`` or when every
        boundary's source URL contains ``data.humdata.org``.
    """
    countryname = Country.get_country_name_from_iso3(countryiso3)
    title = f"{countryname} - Subnational Administrative Boundaries"
    logger.info(f"Creating dataset: {title}")
    name = f"geoBoundaries admin boundaries for {countryname}"
    slugified_name = slugify(name).lower()
    dataset = Dataset({"name": slugified_name, "title": title})
    try:
        dataset.add_country_location(countryiso3)
    except HDXError as e:
        logger.error(f"{title} has a problem! {e}")
        return None, None, None
    dataset.set_maintainer("0ec5ff66-dc01-4087-bb82-1d01f3b1c1ce")
    dataset.set_organization("8be95204-f453-4b66-a4f6-dbe84cb0bdee")
    dataset.set_expected_update_frequency("Live")
    dataset.add_tags(["administrative boundaries-divisions", "geodata", "gazetteer"])
    logger.info(f"Dataset added: {dataset}")

    # A set, so a provider shared by several admin levels is listed only once.
    sources = set()
    dataset_years = set()
    resource_names = []

    def add_resource(admin_boundary, key, description, filetype="geojson"):
        """Add the download link stored under ``key`` as a dataset resource."""
        # The boundary is passed in explicitly rather than captured from the
        # enclosing loop variable, so the helper does not rely on late binding.
        name, url = get_name_url(admin_boundary[key])
        resource = Resource({"name": name, "url": url, "description": description})
        resource.set_file_type(filetype)
        resource_names.append(name)
        dataset.add_update_resource(resource)

    all_hdx = True
    boundarytypes = []
    logger.info(f"Entered into resource method: {dataset}")
    for admin_boundary in sorted(admin_boundaries, key=lambda x: x["boundaryType"]):
        if "data.humdata.org" not in admin_boundary["boundarySourceURL"]:
            all_hdx = False
        # Years may arrive formatted like "2019.0"; strip the ".0".
        dataset_years.add(admin_boundary["boundaryYearRepresented"].replace(".0", ""))
        logger.info(f"Printing all_hdx variable: {all_hdx}")
        logger.info(f"Admin Boundary: {admin_boundary}")
        source = admin_boundary["boundarySource"]
        # Skip empty/missing sources so they cannot break sorting or leave
        # stray separators in the joined source string.
        if source:
            sources.add(source)
        logger.info(f"printing dataset sources: {source}")
        boundarytype = admin_boundary["boundaryType"]
        logger.info(f"Admin Boundary Type: {boundarytype}")
        boundarytypes.append(boundarytype)
        add_resource(
            admin_boundary,
            "simplifiedGeometryGeoJSON",
            f"Simplified GeoJSON {boundarytype} boundaries for {countryname}",
        )
        add_resource(
            admin_boundary,
            "gjDownloadURL",
            f"GeoJSON {boundarytype} boundaries for {countryname}",
        )
        add_resource(
            admin_boundary,
            "tjDownloadURL",
            f"TopoJSON {boundarytype} boundaries for {countryname}",
        )
        add_resource(
            admin_boundary,
            "staticDownloadLink",
            f"Other formats including shape file {boundarytype} boundaries for {countryname}",
            "shp",
        )
    if all_hdx:
        logger.info(
            f"Ignoring {countryname} as data for all admin levels comes from HDX!"
        )
        return None, None, None

    # Only a lone ADM0 (country outline) entry counts as non-subnational.
    dataset.set_subnational(boundarytypes != ["ADM0"])

    dataset_years = sorted(dataset_years)
    dataset.set_reference_period_year_range(dataset_years[0], dataset_years[-1])
    # NOTE(review): the "CHecking line" messages look like leftover debug
    # traces; they are kept verbatim so the log output is unchanged.
    logger.info("CHecking line 1")
    # Join with a visible separator; joining with "" ran the source names
    # together into one unreadable string.
    dataset_source = ", ".join(sorted(sources))
    dataset["dataset_source"] = dataset_source
    logger.info("CHecking line 2")
    logger.info(
        f"checking sources: {dataset_source}"
    )
    logger.info("Getting Out Of resource Method")
    return boundarytypes, dataset, resource_names