-
Notifications
You must be signed in to change notification settings - Fork 113
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Adding script to download specific indicators from world bank WDI dataset (#968)
* Adding script to download specific indicators from world bank WDI dataset * Adding script to download specific indicators from world bank WDI dataset * Adding script to download specific indicators from world bank WDI dataset * Adding script to download specific indicators from world bank WDI dataset
- Loading branch information
Showing
2 changed files
with
144 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
Author: @cbari | ||
|
||
This script downloads a specific set of indicators from the World Bank and | ||
writes a cleaned CSV to a specified output path. The script loops through | ||
the set of indicators and downloads data for all dates/countries for each | ||
indicator. | ||
|
||
Currently, the resulting CSV is used in a private Data Commons instance, but it can be | ||
reused as necessary. |
135 changes: 135 additions & 0 deletions
135
scripts/world_bank/wdi/download_indicators/wdi_download_indicators.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,135 @@ | ||
"""Download World Bank WDI via API and write cleaned csv to out_path.""" | ||
|
||
import io | ||
import urllib.request | ||
import zipfile | ||
|
||
from absl import app | ||
from absl import flags | ||
import numpy as np | ||
import pandas as pd | ||
|
||
# Command-line flag holding the destination of the generated CSV. It has no
# default, so its value is None unless --out_path is passed on the command line.
_OUT_PATH = flags.DEFINE_string(
    name='out_path',
    default=None,
    help='CNS path to write output.',
)
|
||
# World Bank indicator codes to download. Each code is substituted into the
# API URL by DownloadAndParseCsvs, and the list order determines the order of
# the indicators in the output CSV.
#
# NOTE(review): the topic labels below are inferred from the code prefixes
# only; confirm them against the World Bank indicator catalogue.
indicators = [
    # SP.* -- presumably population / demographics.
    'SP.POP.TOTL',
    'SP.POP.TOTL.FE.IN',
    'SP.POP.TOTL.MA.IN',
    'SP.POP.0014.TO.ZS',
    'SP.POP.1564.TO.ZS',
    'SP.POP.65UP.TO.ZS',
    'SP.DYN.LE00.IN',
    'SP.URB.TOTL.IN.ZS',
    'SP.URB.TOTL',
    # SP.POP.*.FE.* -- presumably female age bands.
    'SP.POP.0014.FE.ZS',
    'SP.POP.1519.FE.5Y',
    'SP.POP.2024.FE.5Y',
    'SP.POP.2529.FE.5Y',
    'SP.POP.3034.FE.5Y',
    'SP.POP.3539.FE.5Y',
    'SP.POP.4044.FE.5Y',
    'SP.POP.4549.FE.5Y',
    'SP.POP.5054.FE.5Y',
    'SP.POP.5559.FE.5Y',
    'SP.POP.6064.FE.5Y',
    'SP.POP.65UP.FE.ZS',
    # SP.POP.*.MA.* -- presumably male age bands.
    'SP.POP.0014.MA.ZS',
    'SP.POP.1519.MA.5Y',
    'SP.POP.2024.MA.5Y',
    'SP.POP.2529.MA.5Y',
    'SP.POP.3034.MA.5Y',
    'SP.POP.3539.MA.5Y',
    'SP.POP.4044.MA.5Y',
    'SP.POP.4549.MA.5Y',
    'SP.POP.5054.MA.5Y',
    'SP.POP.5559.MA.5Y',
    'SP.POP.6064.MA.5Y',
    'SP.POP.65UP.MA.ZS',
    # SE.ADT.* -- presumably literacy.
    'SE.ADT.LITR.ZS',
    'SE.ADT.LITR.FE.ZS',
    'SE.ADT.LITR.MA.ZS',
    'SE.ADT.1524.LT.ZS',
    'SE.ADT.1524.LT.FE.ZS',
    'SE.ADT.1524.LT.MA.ZS',
    # NY.* / NV.* -- presumably national accounts and sector value added.
    'NY.GDP.MKTP.CD',
    'NY.GDP.MKTP.KD.ZG',
    'NV.AGR.TOTL.ZS',
    'NV.AGR.TOTL.CD',
    'NV.SRV.TOTL.ZS',
    'NV.SRV.TOTL.CD',
    'NV.IND.TOTL.ZS',
    'NV.IND.TOTL.CD',
    'NY.GNP.PCAP.PP.CD',
    # SI.* / IT.* -- presumably inequality and internet usage.
    'SI.POV.GINI',
    'IT.NET.USER.ZS',
]
|
||
|
||
# Inclusive range of year columns copied from each downloaded table.
_FIRST_YEAR = 1960
_LAST_YEAR = 2021

# Bulk-download endpoint; returns a zip archive for a single indicator.
_API_URL_TEMPLATE = (
    'https://api.worldbank.org/v2/country/all/indicator/{indicator}'
    '?source=2&downloadformat=csv'
)

# Column order of the CSV written to --out_path.
_OUTPUT_COLUMNS = [
    'indicatorcode',
    'statvar',
    'measurementmethod',
    'observationabout',
    'observationdate',
    'observationvalue',
    'unit',
]


def _fetch_indicator_archive(indicator: str) -> bytes:
    """Downloads the zip archive for one indicator and returns its bytes."""
    url = _API_URL_TEMPLATE.format(indicator=indicator)
    # The context manager closes the HTTP response even if read() fails.
    with urllib.request.urlopen(url) as response:
        return response.read()


def _load_indicator_table(archive_bytes: bytes) -> pd.DataFrame:
    """Parses the data table out of a downloaded indicator archive.

    Archive members whose name starts with 'API_' are treated as the data
    file; every other member is ignored. Lines preceding the header (the
    first line starting with '"Country') are skipped before parsing.

    Args:
        archive_bytes: Raw bytes of the downloaded zip archive.

    Returns:
        The parsed table, or an empty DataFrame when the archive has no
        'API_' member. If several members match, the last one wins.

    Raises:
        ValueError: If an 'API_' member contains no header line.
        zipfile.BadZipFile: If archive_bytes is not a valid zip archive.
    """
    table = pd.DataFrame()
    with zipfile.ZipFile(io.BytesIO(archive_bytes)) as archive:
        for member in archive.namelist():
            if not member.startswith('API_'):
                continue
            # Decompress once and reuse the bytes for both the header scan
            # and the actual parse.
            payload = archive.read(member)
            header_row = None
            for line_number, line in enumerate(io.BytesIO(payload)):
                if line.startswith(b'"Country'):
                    header_row = line_number
                    break
            if header_row is None:
                raise ValueError(
                    f'No header line starting with "Country found in {member}')
            table = pd.read_csv(io.BytesIO(payload), skiprows=header_row)
    return table


def _observation_rows(table: pd.DataFrame) -> list:
    """Flattens one wide indicator table into per-year output rows.

    Rows without a country code are dropped. Missing values are skipped per
    observation rather than per row, because a row can have data for some
    years and not for others.

    Args:
        table: Table with 'Country Code', 'Indicator Code' and one column
            per year (named by the year as a string).

    Returns:
        A list of rows matching _OUTPUT_COLUMNS, ordered by table row and
        then by year.
    """
    # Only use year columns that are present, so a table that lacks some
    # year does not raise KeyError.
    years = [
        year for year in range(_FIRST_YEAR, _LAST_YEAR + 1)
        if str(year) in table.columns
    ]
    rows = []
    for _, record in table.iterrows():
        country_code = record['Country Code']
        if pd.isna(country_code):
            continue
        indicator_code = record['Indicator Code']
        place = 'dcid:country/' + country_code
        stat_var = 'worldBank/' + indicator_code.replace('.', '_')
        for year in years:
            value = record[str(year)]
            if pd.isna(value):
                continue
            rows.append([
                indicator_code,
                stat_var,
                'WorldBank_WDI_CSV',
                place,
                year,
                value,
                '',
            ])
    return rows


def DownloadAndParseCsvs() -> None:
    """Downloads every configured indicator and writes one combined CSV.

    Loops through `indicators`, downloads the data for all countries/dates
    for each one, flattens it to one row per (indicator, country, year)
    observation and writes the result to the path held by _OUT_PATH.

    If no observations are collected, a CSV containing only the header row
    is written.

    Raises:
        urllib.error.URLError: If a download fails.
        zipfile.BadZipFile: If a response is not a valid zip archive.
        ValueError: If a data file has no recognisable header line.
    """
    observations = []
    for indicator in indicators:
        table = _load_indicator_table(_fetch_indicator_archive(indicator))
        observations.extend(_observation_rows(table))

    # Build the frame straight from the row lists: this keeps numeric values
    # numeric and also works when there are no rows at all.
    out_df = pd.DataFrame(observations, columns=_OUTPUT_COLUMNS)
    # newline='' lets pandas control line endings when given a file handle.
    with open(_OUT_PATH.value, 'w', newline='') as f_out:
        out_df.to_csv(f_out, index=False)
|
||
|
||
def main(argv: list[str]) -> None:
    """Validates the command line and runs the download.

    Args:
        argv: Command-line arguments left over after flag parsing; only the
            program name is expected.

    Raises:
        app.UsageError: If extra positional arguments are given, or if
            --out_path was not provided.
    """
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')
    # Fail before any download starts; otherwise a missing output path is
    # only discovered when the result is written, after all the work is done.
    if not _OUT_PATH.value:
        raise app.UsageError('--out_path is required.')
    DownloadAndParseCsvs()
|
||
|
||
# Script entry point: when executed directly (not imported), hand control to
# absl's app.run, passing main as the function to run.
if __name__ == '__main__':
    app.run(main)