This repository has been archived by the owner on Oct 25, 2023. It is now read-only.
-
-
Notifications
You must be signed in to change notification settings - Fork 13
/
build_database.py
49 lines (44 loc) · 1.8 KB
/
build_database.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import sqlite_utils
import git
import json
def iterate_file_versions(repo_path, filepaths, ref="main"):
    """Yield every committed version of a file, oldest commit first.

    Args:
        repo_path: Path of the Git repository to open.
        filepaths: A file name, or a collection of file names, to follow.
            Several names may be given to track one file across renames.
            Names are compared against the blobs at the top level of each
            commit's tree.
        ref: Revision whose history is walked. Defaults to ``"main"``.

    Yields:
        ``(committed_datetime, hexsha, content)`` tuples, where ``content``
        is whatever ``blob.data_stream.read()`` returns for the first
        matching blob in that commit.
    """
    if isinstance(filepaths, str):
        # A bare string would turn the ``in`` test below into a substring
        # match ("json" in "states.json"), so wrap it in a tuple.
        filepaths = (filepaths,)
    repo = git.Repo(repo_path, odbt=git.GitDB)
    # The commit list is reversed so callers replay history chronologically.
    commits = reversed(list(repo.iter_commits(ref, paths=filepaths)))
    for commit in commits:
        blob = next((b for b in commit.tree.blobs if b.name in filepaths), None)
        if blob is None:
            # The commit touched one of the paths but its tree no longer
            # contains the file (e.g. a deletion) - there is no version to
            # yield, so skip it instead of failing with IndexError.
            continue
        yield commit.committed_datetime, commit.hexsha, blob.data_stream.read()
def _import_history(db, table_name, filepaths, data_key, id_fields):
    """Replay every committed version of a JSON file into ``db[table_name]``.

    Args:
        db: The ``sqlite_utils.Database`` to write to.
        table_name: Name of the destination table.
        filepaths: Tuple of file names passed to ``iterate_file_versions``.
        data_key: Top-level key of the JSON document that holds the records.
        id_fields: Record keys whose values are joined with "-" to build the
            primary key stored in the ``id`` column.

    Rows are inserted with ``replace=True``, so a record that shows up again
    in a later commit overwrites the earlier row with the same ``id``.
    Versions that are not valid JSON are skipped.
    """
    table = db[table_name]
    for _when, _sha, content in iterate_file_versions(".", filepaths):
        try:
            records = json.loads(content)[data_key]
        except ValueError:
            # Bad JSON
            continue
        for record in records:
            row_id = "-".join(record[field] for field in id_fields)
            table.insert(
                dict(record, id=row_id), pk="id", alter=True, replace=True
            )


def main():
    """Build cdc.db from the Git history of the JSON files in this repo."""
    db = sqlite_utils.Database("cdc.db")
    # File was originally called incidents.json, later renamed to states.json
    _import_history(
        db,
        "daily_reports",
        ("states.json", "incidents.json"),
        "vaccination_data",
        ("Location", "Date"),
    )
    _import_history(
        db,
        "daily_reports_counties",
        ("counties.json",),
        "vaccination_county_condensed_data",
        ("FIPS", "Date"),
    )
    # Create some indexes. if_not_exists=True keeps a re-run against an
    # existing cdc.db from failing on indexes created by an earlier run.
    db["daily_reports_counties"].create_index(
        ["StateName", "County"], if_not_exists=True
    )
    db["daily_reports_counties"].create_index(["FIPS"], if_not_exists=True)
    db["daily_reports"].create_index(["Location"], if_not_exists=True)
    db["daily_reports"].create_index(["ShortName"], if_not_exists=True)
    db["daily_reports"].create_index(["LongName"], if_not_exists=True)


if __name__ == "__main__":
    main()