-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwb_scrape_metricCodes.py
66 lines (55 loc) · 2.56 KB
/
wb_scrape_metricCodes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
# Scrape the World Bank indicator listing page for a (believed complete) list of metrics and the codes used to request them through the API
def wb_scrape_metricCodes(output_path, timeout=30):
    """Scrape World Bank indicator codes and names and export them as CSV.

    Downloads the "all indicators" listing page, collects every link whose
    ``href`` starts with ``/indicator/`` and pairs it with the ``<h3>``
    section heading it falls under. One row per indicator is written with
    the columns ``code``, ``name`` and ``category``.

    Headings and links are matched through their integer ``data-reactid``
    attribute: an indicator is assigned to the heading with the largest id
    that is still strictly smaller than the indicator's own id.
    NOTE(review): this assumes the page still emits ``data-reactid``
    attributes numbered top to bottom; if it does not, the CSV will only
    contain the header row.

    Args:
        output_path: Destination handed to ``DataFrame.to_csv``.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        None. A confirmation message is printed after the file is written.

    Raises:
        requests.RequestException: If the request times out, fails, or the
            server answers with an HTTP error status.
    """
    url = 'https://data.worldbank.org/indicator?tab=all'
    # Without a timeout requests can block indefinitely on a stalled server.
    response = requests.get(url=url, timeout=timeout)
    # Fail loudly instead of parsing an error page into an empty CSV.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    section_names = _wb_section_names(soup)
    indicator_rows = _wb_indicator_rows(soup)
    df = _wb_attach_categories(indicator_rows, section_names)

    df.to_csv(output_path, index=False)
    print("Worldbank metric codes have been exported to csv file.")


def _wb_react_id(tag):
    """Return the tag's ``data-reactid`` as an int, or None if missing or non-numeric."""
    try:
        return int(tag.get('data-reactid'))
    except (TypeError, ValueError):
        # TypeError: attribute absent (None); ValueError: not a number.
        return None


def _wb_section_names(soup):
    """Map each ``<h3>`` heading's integer react id to its link text."""
    sections = {}
    for heading in soup.find_all('h3'):
        react_id = _wb_react_id(heading)
        anchor = heading.a
        # Headings without an id or without a nested link are not sections.
        if react_id is None or anchor is None:
            continue
        sections[react_id] = anchor.text
    return sections


def _wb_indicator_rows(soup):
    """Collect ``[react_id, code, name]`` for every ``/indicator/`` link."""
    prefix = '/indicator/'
    rows = []
    for link in soup.find_all('a'):
        href = link.get('href')
        if not isinstance(href, str) or not href.startswith(prefix):
            continue
        react_id = _wb_react_id(link)
        if react_id is None:
            continue
        # Keep only the indicator code: drop the path prefix and any
        # query string such as "?view=chart".
        code = href[len(prefix):].split('?', 1)[0]
        rows.append([react_id, code, link.text])
    return rows


def _wb_attach_categories(indicator_rows, section_names):
    """Build the output frame and label each indicator with its section name."""
    df = pd.DataFrame(indicator_rows, columns=['dataReactId', 'code', 'name'])
    # Object dtype so string names can be written without a float -> str
    # upcast, which recent pandas versions deprecate.
    df['category'] = pd.Series(np.nan, index=df.index, dtype=object)
    # Walk the sections from the bottom of the page upwards; each section
    # claims the still-unlabelled indicators that come after it.
    for section_id in sorted(section_names, reverse=True):
        unassigned = df['dataReactId'].gt(section_id) & df['category'].isna()
        df.loc[unassigned, 'category'] = section_names[section_id]
    # The react id was only needed for matching; it is not exported.
    return df.drop(columns=['dataReactId'])