Skip to content

Commit

Permalink
Adding script to download specific indicators from world bank WDI dat…
Browse files Browse the repository at this point in the history
…aset (#968)

* Adding script to download specific indicators from world bank WDI dataset

* Adding script to download specific indicators from world bank WDI dataset

* Adding script to download specific indicators from world bank WDI dataset

* Adding script to download specific indicators from world bank WDI dataset
  • Loading branch information
cbari123 authored Jan 27, 2024
1 parent 0623f70 commit 4488bca
Show file tree
Hide file tree
Showing 2 changed files with 144 additions and 0 deletions.
9 changes: 9 additions & 0 deletions scripts/world_bank/wdi/download_indicators/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
Author: @cbari

This script downloads a specific set of indicators from the World Bank and
writes a cleaned CSV to a specified output path. The script loops through
the set of indicators and, for each indicator, downloads the data for all
countries (the script currently extracts the years 1960-2021).

Currently, the resulting CSV is used in a private Data Commons instance, but it
can be reused as necessary.
135 changes: 135 additions & 0 deletions scripts/world_bank/wdi/download_indicators/wdi_download_indicators.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
"""Download World Bank WDI via API and write cleaned csv to out_path."""

import io
import urllib.request
import zipfile

from absl import app
from absl import flags
import numpy as np
import pandas as pd

# Flag holding the destination of the generated CSV. It has no default
# (None), so a path has to be supplied on the command line.
_OUT_PATH = flags.DEFINE_string(
    name='out_path',
    default=None,
    help='CNS path to write output.',
)

# World Bank indicator codes to download. DownloadAndParseCsvs substitutes
# each code into the API URL path and issues one request per entry, so adding
# a code here adds one more download.
indicators = [
'SP.POP.TOTL',
'SP.POP.TOTL.FE.IN',
'SP.POP.TOTL.MA.IN',
'SP.POP.0014.TO.ZS',
'SP.POP.1564.TO.ZS',
'SP.POP.65UP.TO.ZS',
'SP.DYN.LE00.IN',
'SP.URB.TOTL.IN.ZS',
'SP.URB.TOTL',
'SP.POP.0014.FE.ZS',
'SP.POP.1519.FE.5Y',
'SP.POP.2024.FE.5Y',
'SP.POP.2529.FE.5Y',
'SP.POP.3034.FE.5Y',
'SP.POP.3539.FE.5Y',
'SP.POP.4044.FE.5Y',
'SP.POP.4549.FE.5Y',
'SP.POP.5054.FE.5Y',
'SP.POP.5559.FE.5Y',
'SP.POP.6064.FE.5Y',
'SP.POP.65UP.FE.ZS',
'SP.POP.0014.MA.ZS',
'SP.POP.1519.MA.5Y',
'SP.POP.2024.MA.5Y',
'SP.POP.2529.MA.5Y',
'SP.POP.3034.MA.5Y',
'SP.POP.3539.MA.5Y',
'SP.POP.4044.MA.5Y',
'SP.POP.4549.MA.5Y',
'SP.POP.5054.MA.5Y',
'SP.POP.5559.MA.5Y',
'SP.POP.6064.MA.5Y',
'SP.POP.65UP.MA.ZS',
'SE.ADT.LITR.ZS',
'SE.ADT.LITR.FE.ZS',
'SE.ADT.LITR.MA.ZS',
'SE.ADT.1524.LT.ZS',
'SE.ADT.1524.LT.FE.ZS',
'SE.ADT.1524.LT.MA.ZS',
'NY.GDP.MKTP.CD',
'NY.GDP.MKTP.KD.ZG',
'NV.AGR.TOTL.ZS',
'NV.AGR.TOTL.CD',
'NV.SRV.TOTL.ZS',
'NV.SRV.TOTL.CD',
'NV.IND.TOTL.ZS',
'NV.IND.TOTL.CD',
'NY.GNP.PCAP.PP.CD',
'SI.POV.GINI',
'IT.NET.USER.ZS',
]


def _count_preamble_lines(lines) -> int:
    """Return how many lines precede the first one starting with '"Country'.

    Args:
        lines: Iterable of raw (bytes) lines of a downloaded CSV member.

    Returns:
        The number of lines before the header row; equals the total line
        count when no header row is present.
    """
    skipped = 0
    for line in lines:
        if line.decode('utf-8').startswith('"Country'):
            break
        skipped += 1
    return skipped


def _read_indicator_csv(payload: bytes):
    """Parse the 'API_*' member of a downloaded zip archive.

    Args:
        payload: Raw bytes of the zip archive returned by the API.

    Returns:
        A DataFrame read from the member whose name starts with 'API_'
        (the last one, if there are several), or None when there is no
        such member.
    """
    csv_data = None
    with zipfile.ZipFile(io.BytesIO(payload)) as archive:
        for filename in archive.namelist():
            if not filename.startswith('API_'):
                continue
            # The header offset is computed per member so that a previous
            # member's preamble length can never leak into this one.
            with archive.open(filename) as f:
                start_index = _count_preamble_lines(f)
            with archive.open(filename) as f:
                csv_data = pd.read_csv(f, skiprows=start_index)
    return csv_data


def _observation_rows(csv_data, years) -> list:
    """Flatten one wide indicator table into per-(country, year) rows.

    Args:
        csv_data: DataFrame with 'Country Code', 'Indicator Code' and one
            column per year (named by the year as a string).
        years: Iterable of integer years to extract.

    Returns:
        A list of 7-element lists in output-column order. Rows without a
        country or indicator code, years without a column, and missing (NaN)
        observations are skipped.
    """
    # Only keep years that actually exist as columns, so a shorter download
    # does not raise KeyError.
    year_columns = [(year, str(year))
                    for year in years
                    if str(year) in csv_data.columns]
    rows = []
    for _, row in csv_data.iterrows():
        country_code = row['Country Code']
        indicator_code = row['Indicator Code']
        if pd.isna(country_code) or pd.isna(indicator_code):
            continue
        country_str = 'dcid:country/' + country_code
        sv_string = 'worldBank/' + indicator_code.replace('.', '_')
        for year, column in year_columns:
            value = row[column]
            # Test the value itself: `True in pd.isna(row)` would test the
            # row's index labels and therefore never skip anything.
            if pd.isna(value):
                continue
            rows.append([
                indicator_code,
                sv_string,
                'WorldBank_WDI_CSV',
                country_str,
                year,
                value,
                '',
            ])
    return rows


def DownloadAndParseCsvs() -> None:
    """Download every indicator in `indicators` and write one combined CSV.

    For each indicator code the zip archive is fetched from the World Bank
    API, its 'API_*' CSV member is parsed, and one output row is produced per
    country and year (1960-2021) that has a value. Missing observations are
    omitted rather than written as 'nan'. The combined table is written to
    the path held by the `out_path` flag.

    Raises:
        urllib.error.URLError: If a download fails.
        zipfile.BadZipFile: If a response is not a valid zip archive.
    """
    dat = []
    for indicator in indicators:
        # Close the HTTP response as soon as the payload has been read.
        with urllib.request.urlopen(
                f'http://api.worldbank.org/v2/country/all/indicator/{indicator}?source=2&downloadformat=csv'
        ) as resp:
            payload = resp.read()
        csv_data = _read_indicator_csv(payload)
        if csv_data is None:
            continue
        dat.extend(_observation_rows(csv_data, range(1960, 2022)))

    columns = [
        'indicatorcode',
        'statvar',
        'measurementmethod',
        'observationabout',
        'observationdate',
        'observationvalue',
        'unit',
    ]
    # reshape keeps the array two-dimensional even when no rows were
    # collected, so an empty result still yields a header-only CSV.
    out_df = pd.DataFrame(
        np.array(dat).reshape(-1, len(columns)),
        columns=columns,
    )
    with open(_OUT_PATH.value, 'w+') as f_out:
        out_df.to_csv(f_out, index=False)


def main(argv: list[str]) -> None:
    """Validate the command line, then download and write the indicators.

    Args:
        argv: Positional command-line arguments left after flag parsing;
            only the program name is expected.

    Raises:
        app.UsageError: If extra positional arguments are given or the
            `out_path` flag is not set.
    """
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')
    # Fail before any download starts: without this check a missing flag is
    # only noticed when the output file is opened, after all requests ran.
    if not _OUT_PATH.value:
        raise app.UsageError('The --out_path flag is required.')
    DownloadAndParseCsvs()


if __name__ == '__main__':
    app.run(main)

0 comments on commit 4488bca

Please sign in to comment.