-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcovid-tracking.py
138 lines (116 loc) · 6.72 KB
/
covid-tracking.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import datetime
import sys
import pytz
"""Federal data combiner tool. Downloads federal testing, hospitalization, and case/death data and combines it
into unified state-level output.
This script outputs a csv file named federal-covid-data-DATE.csv and is intended to be run in the Google Colab
environment, but passing the 'STDOUT' argument will output to STDOUT instead for command-line use.
Note: We at The COVID Tracking Project will not be maintaining this script in the event that the federal data pages
change in impactful ways. This is simply a set of instructions for interested data users.
Changelog:
2021-03-15: Initial release
"""
HHS_TESTING_URL = "https://beta.healthdata.gov/api/views/j8mb-icvb/rows.csv?accessType=DOWNLOAD"
HHS_HOSPITALIZATION_URL = "https://beta.healthdata.gov/api/views/g62h-syeh/rows.csv?accessType=DOWNLOAD"
CDC_CASE_DEATH_URL = "https://data.cdc.gov/api/views/9mfq-cb36/rows.csv?accessType=DOWNLOAD"
COMBINE_NY_NYC = True
def get_hospitalization_dailies():
"""build a dataframe containing the combination of several days of recent HHS hospitalization daily data"""
csv_urls = get_hospitalization_csv_urls()
data_frames = []
for url in csv_urls:
data = pd.read_csv(url, parse_dates=['reporting_cutoff_start'])
# set each daily file's date to the reporting_cutoff_start date + 4 days
data['date'] = data['reporting_cutoff_start'] + pd.DateOffset(days=4)
data['date'] = data['date'].dt.date
data_frames.append(data)
hospitalization_dailies = pd.concat(data_frames)
return hospitalization_dailies
def parse_dates(date_col):
if date_col.str.contains(":").any():
return pd.to_datetime(date_col, format='%m/%d/%Y %H:%M:%S %p').dt.date
else:
return pd.to_datetime(date_col, format='%Y-%m-%d').dt.date
# download and parse all three data files
[testing, hospitalization, case_death] = [pd.read_csv(url) for url in
[HHS_TESTING_URL, HHS_HOSPITALIZATION_URL, CDC_CASE_DEATH_URL]]
# testing data comes out with one row per state/date/outcome combination.
# unpack that and squash it into one row per state/date only
testing = testing.set_index(
['state', 'date', 'overall_outcome']).unstack(level=-1)
testing = testing[['new_results_reported', 'total_results_reported']]
testing.columns = ['_'.join(tup).rstrip('_') for tup in testing.columns.values]
testing = testing.reset_index()
# allow for a choice of inconsistent datetime formats
testing['date'] = parse_dates(testing['date'])
# the HHS hospitalization time series is only updated weekly. To compensate, we download the latest daily data
# and merge it on top of the weekly data, taking only the most recent values for a given state/date
hospitalization['date'] = parse_dates(hospitalization['date'])
# hospitalization_dailies = get_hospitalization_dailies()
# we want to use the HHS weekly time series up until its last day, then fill in the rest of the data from the daily
# files. we overwrite the last day of the time series with the dailies because the dailies come out after the weekly
# hospitalization_dailies = hospitalization_dailies[hospitalization_dailies['date']
# >= hospitalization['date'].max()]
hospitalization.set_index(['state', 'date'])
# hospitalization = hospitalization.merge(hospitalization_dailies, how='outer')
# the keep='last' here keeps just the daily data when both duplicate weekly and daily data exist
hospitalization = hospitalization.drop_duplicates(
subset=['date', 'state'], keep='last', ignore_index=True)
# select a subset of columns to include in the output
hospitalization = hospitalization[[
'state',
'date',
'inpatient_beds_used_covid',
'total_adult_patients_hospitalized_confirmed_and_suspected_covid',
'total_adult_patients_hospitalized_confirmed_covid',
'total_pediatric_patients_hospitalized_confirmed_and_suspected_covid',
'total_pediatric_patients_hospitalized_confirmed_covid',
'staffed_icu_adult_patients_confirmed_and_suspected_covid',
'staffed_icu_adult_patients_confirmed_covid',
'previous_day_admission_adult_covid_confirmed',
'previous_day_admission_adult_covid_suspected',
'previous_day_admission_pediatric_covid_confirmed',
'previous_day_admission_pediatric_covid_suspected',
'inpatient_beds_used_covid_coverage',
'total_adult_patients_hospitalized_confirmed_and_suspected_covid_coverage',
'staffed_icu_adult_patients_confirmed_and_suspected_covid_coverage',
'previous_day_admission_adult_covid_confirmed_coverage']]
# HHS hospitalization data gets better on and after 7/15/20
hospitalization = hospitalization[hospitalization['date'] >= datetime.date(
year=2020, month=7, day=15)]
# case/death data: pick a subset of columns and prepare to merge
#case_death = case_death[
# ['submission_date', 'state', 'tot_cases', 'conf_cases', 'prob_cases', 'new_case', 'pnew_case', 'tot_death',
# 'conf_death', 'prob_death', 'new_death', 'pnew_death']]
case_death = case_death.rename(columns={'submission_date': 'date'})
# sum NY and NYC case/death data into one row
if COMBINE_NY_NYC:
# set all NYC rows to NY, group and sum the NY rows, and combine with the rest of the rows
case_death.loc[case_death['state'] == 'NYC', ['state']] = 'NY'
NY_combined = case_death[case_death['state']
== 'NY'].groupby(["date", "state"]).sum()
case_death = NY_combined.reset_index().append(
case_death[case_death['state'] != 'NY'])
# merge all the dataframes together into one big combination
combined = pd.merge(left=testing, right=hospitalization,
on=['state', 'date'], how='outer')
case_death['date'] = pd.to_datetime(case_death['date'], format='%m/%d/%Y')
combined['date'] = pd.to_datetime(combined['date'], format='%Y-%m-%d')
combined = combined.merge(case_death, on=['state', 'date'], how='outer')
combined.sort_values(by=['date', 'state'], inplace=True, ignore_index=True)
combined.to_csv('covid.csv', index=False)
# and output the data
new_combined = pd.DataFrame(
{'date': combined['date'], 'state': combined['state'], 'hospitalized': combined['inpatient_beds_used_covid'], 'deathIncrease': combined['new_death'], 'totalTestResultsIncrease': (combined['new_results_reported_Inconclusive'] + combined['new_results_reported_Negative']+combined['new_results_reported_Positive']), 'positiveIncrease': combined['new_case']+ combined['pnew_case']})
new_combined.to_csv('covid-tracking.csv', index=False)
# tell Google Colab to have the user download the output
# or do nothing if we're not in a Colab environment
try:
from google.colab import files
files.download(outfile_name)
except ModuleNotFoundError:
pass