-
-
Notifications
You must be signed in to change notification settings - Fork 32
/
polio_utils.py
139 lines (114 loc) · 4.46 KB
/
polio_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import requests
from datetime import date, timedelta
import pdfplumber
import pandas as pd
import tabula
from owid import catalog
def find_latest_polio_data(url_stub: str, days_to_sub: int, timeout: float = 30.0):
    """Request the polio PDF stamped ``days_to_sub`` days before today.

    The URL is built as ``{url_stub}{YYYYMMDD}.pdf``, where the date stamp is
    today's local date minus ``days_to_sub`` days.

    Args:
        url_stub: URL prefix up to (not including) the date stamp.
        days_to_sub: Number of days to subtract from today's date.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. Without a timeout a stalled connection would block
            the caller indefinitely.

    Returns:
        The ``requests.Response`` of the GET request. The status is not
        checked here; callers are expected to inspect ``res.ok``.
    """
    target_day = date.today() - timedelta(days=days_to_sub)
    polio_date = target_day.strftime("%Y%m%d")
    url = f"{url_stub}{polio_date}.pdf"
    return requests.get(url, timeout=timeout)
def extract_wild_cases(file_path: str) -> pd.DataFrame:
    """Extract wild polio case counts from the first page of a PDF.

    The table on page 1 is read with pdfplumber. Row 2 of the extracted table
    supplies the column headers and rows 0-2 are discarded. Only the first
    ``current_year - 2015`` columns are kept, the column whose header is
    ``None`` is renamed to ``entity``, and rows up to and including the one
    labelled ``"Total (Type1)"`` are retained.

    Args:
        file_path: Path to the PDF file.

    Returns:
        A long-format DataFrame with columns ``entity``, ``year`` and
        ``wild_polio_cases``.
    """
    # Use a context manager so the PDF file handle is always released.
    with pdfplumber.open(file_path) as pdf:
        table = pdf.pages[0].extract_table()

    raw = pd.DataFrame(table)
    raw.columns = raw.iloc[2]

    # The number of retained columns grows by one each calendar year.
    col_lim = date.today().year - 2015
    cases = (
        raw.drop([0, 1, 2])
        .iloc[:, 0:col_lim]
        .rename(columns={None: "entity"})
    )

    years = cases.columns.drop("entity")
    # Label-based slice: keeps every row up to and including the total row.
    selected = cases.set_index("entity").loc[:"Total (Type1)"].reset_index(level=0)

    df_cases = pd.melt(selected, id_vars=["entity"], value_vars=years)
    df_cases.columns = ["entity", "year", "wild_polio_cases"]
    return df_cases
def extract_historical_wild_cases(file_path: str) -> pd.DataFrame:
    """Extract wild polio case counts from an older PDF with a fixed layout.

    The table on page 1 is read with pdfplumber. Row 2 of the extracted table
    supplies the column headers and rows 0-2 are discarded. Only the first
    six columns are kept, the column whose header is ``None`` is renamed to
    ``entity``, and rows up to and including the one labelled ``"Total"`` are
    retained.

    Args:
        file_path: Path to the PDF file.

    Returns:
        A long-format DataFrame with columns ``entity``, ``year`` and
        ``wild_polio_cases``.
    """
    # Use a context manager so the PDF file handle is always released.
    with pdfplumber.open(file_path) as pdf:
        table = pdf.pages[0].extract_table()

    raw = pd.DataFrame(table)
    raw.columns = raw.iloc[2]

    cases = (
        raw.drop([0, 1, 2])
        .iloc[:, 0:6]
        .rename(columns={None: "entity"})
    )

    years = cases.columns.drop("entity")
    # Label-based slice: keeps every row up to and including the total row.
    selected = cases.set_index("entity").loc[:"Total"].reset_index(level=0)

    df_cases = pd.melt(selected, id_vars=["entity"], value_vars=years)
    df_cases.columns = ["entity", "year", "wild_polio_cases"]
    return df_cases
def download_polio_data(url_stub: str, max_days_back: int = 365):
    """Fetch the most recent dated polio PDF available under ``url_stub``.

    Starting from today's date and stepping back one day at a time, the dated
    URL is requested until a response with ``ok`` set is received.

    Args:
        url_stub: URL prefix up to (not including) the date stamp.
        max_days_back: Maximum number of days to step back before giving up.
            Bounding the search prevents an endless stream of requests when
            the URL stub is wrong or the server is unavailable.

    Returns:
        The first successful ``requests.Response``.

    Raises:
        RuntimeError: If no successful response is obtained within
            ``max_days_back`` days of today.
    """
    for days_back in range(max_days_back + 1):
        res = find_latest_polio_data(url_stub, days_to_sub=days_back)
        if res.ok:
            return res
    raise RuntimeError(
        f"No polio data found at {url_stub!r} within the last "
        f"{max_days_back} days"
    )
def extract_vd_cases(file_path: str):
    """Extract vaccine-derived polio case counts from the first page of a PDF.

    The first table found by tabula on page 1 is used. Its first column holds
    the strain label, which is forward-filled down the rows; the second column
    holds the entity, and the remaining retained columns are labelled as
    consecutive years starting at 2016. Only the first ``current_year - 2014``
    columns are kept.

    Args:
        file_path: Path to the PDF file.

    Returns:
        A DataFrame with one row per entity and year, the columns ``cVDPV1``,
        ``cVDPV2`` and ``cVDPV3`` (missing values replaced by 0) and their
        row-wise sum in ``total_cVDPV``.
    """
    tables = tabula.read_pdf(file_path, pages=1)
    df = pd.DataFrame(tables[0])

    # ``fillna(method="ffill")`` is deprecated in recent pandas; ``ffill`` is
    # the direct equivalent.
    df.iloc[:, 0] = df.iloc[:, 0].ffill()

    # The number of retained columns grows by one each calendar year.
    col_lim = date.today().year - 2014
    # Copy so the column relabelling below works on an independent frame.
    df = df.iloc[:, 0:col_lim].copy()

    # Two leading label columns, then one column per year from 2016 onwards.
    years_str = [str(year) for year in range(2016, df.shape[1] - 2 + 2016)]
    df.columns = ["strain", "entity"] + years_str
    df = df.drop([0])

    dfm = pd.melt(df, id_vars=["entity", "strain"], value_vars=years_str)
    dfm = dfm[dfm.strain != "Gender"]

    # pivot_table aggregates duplicates with its default function (mean).
    df_p = pd.pivot_table(
        dfm, values="value", index=["entity", "variable"], columns=["strain"]
    ).reset_index()

    # NOTE(review): the trailing "1" on the strain labels is presumably a
    # footnote marker fused onto the name during PDF extraction - confirm.
    df_p = df_p.rename(
        columns={
            "variable": "year",
            "cVDPV11": "cVDPV1",
            "cVDPV21": "cVDPV2",
            "cVDPV31": "cVDPV3",
        }
    )

    strain_cols = ["cVDPV1", "cVDPV2", "cVDPV3"]
    df_p[strain_cols] = df_p[strain_cols].fillna(0)
    df_p["total_cVDPV"] = df_p[strain_cols].sum(axis=1)
    return df_p
def owid_population() -> pd.DataFrame:
    """Load the OWID population table as flat entity/year/population rows.

    Looks up the ``population`` table in the ``key_indicators`` dataset of
    the ``owid`` namespace via the OWID catalog, flattens its index into
    columns and renames ``country`` to ``entity``.

    Returns:
        A DataFrame restricted to the columns ``entity``, ``year`` and
        ``population``.
    """
    matches = catalog.find("population", dataset="key_indicators", namespace="owid")
    table = matches.load().reset_index()
    table = table.rename(columns={"country": "entity"})
    return table[["entity", "year", "population"]]
def standardise_countries(country: pd.Series) -> pd.Series:
    """Map country names to their Our World In Data names.

    The mapping is read from
    ``data/countries_to_standardise_country_standardized.csv`` (columns
    ``Country`` and ``Our World In Data Name``). Leading and trailing
    whitespace is stripped from both the mapping keys and the input names
    before the lookup.

    Args:
        country: Series of country names (strings) to standardise.

    Returns:
        A Series with the same index as ``country`` holding the standardised
        names.

    Raises:
        KeyError: If any stripped input name has no entry in the mapping
            file; the message lists every unmapped name.
    """
    mapping_table = pd.read_csv(
        "data/countries_to_standardise_country_standardized.csv",
        usecols=["Country", "Our World In Data Name"],
    )
    mapping_table["Country"] = mapping_table["Country"].str.strip()

    # Select the value column explicitly: ``squeeze()`` would collapse a
    # single-row mapping to a scalar, which has no ``to_dict``.
    name_map = mapping_table.set_index("Country")["Our World In Data Name"].to_dict()

    cleaned = country.map(str.strip)

    # Report every unmapped name at once rather than failing on the first.
    unmapped = cleaned[~cleaned.isin(list(name_map))].unique().tolist()
    if unmapped:
        raise KeyError(f"No standardised name for countries: {unmapped}")

    return cleaned.map(name_map)
def get_who_data_and_regions():
    """Load WHO polio incidence data and the country-to-region lookup.

    Reads the ``Polio`` sheet of ``data/incidence_series.xls``. Country names
    from the ``Cname`` column are passed through ``standardise_countries``.

    Returns:
        A tuple ``(who_melt, regions)``:

        * ``who_melt`` - long-format DataFrame with columns ``entity``,
          ``year`` (int) and ``total_polio``.
        * ``regions`` - DataFrame of unique ``WHO_REGION`` / ``entity`` pairs.
    """
    who_polio = pd.read_excel("data/incidence_series.xls", sheet_name="Polio")

    regions = (
        who_polio[["WHO_REGION", "Cname"]]
        .drop_duplicates()
        .rename(columns={"Cname": "entity"})
    )
    regions["entity"] = standardise_countries(regions["entity"])

    # Drop the descriptive columns so only Cname and the per-year columns
    # remain for melting.
    incidence = who_polio.drop(columns=["Disease", "WHO_REGION", "ISO_code"])
    who_melt = pd.melt(incidence, id_vars=["Cname"])
    who_melt["entity"] = standardise_countries(who_melt["Cname"])
    who_melt = who_melt[["entity", "variable", "value"]].rename(
        columns={"variable": "year", "value": "total_polio"}
    )
    who_melt["year"] = who_melt["year"].astype(int)
    return who_melt, regions