forked from LuciMoore/sql_workshop
-
Notifications
You must be signed in to change notification settings - Fork 0
/
make_database.py
151 lines (128 loc) · 5.26 KB
/
make_database.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jun 11 14:21:11 2019
@author: danielyaeger
This is code to create the database used in the sql_workshop Jupyter notebook.
To create the database, make sure the file 'NCHS_-_Leading_Causes_of_Death__United_States.csv'
is in the same directory as this script, navigae to the directory,
and then open a terminal and type:
python make_database
"""
import numpy as np
import re
import sqlite3
import pandas as pd
def clean_names(df):
L = []
for col in df.columns:
L.append(re.sub(r"\s+|-", '_', col))
L[1] = 'Cause_Description'
df.columns = L
def make_db(data):
conn = sqlite3.connect("leading_cases_of_death.sqlite")
cur = conn.cursor()
# Get unique years
yr_unique = np.sort(data.Year.unique()).tolist()
# Make year table
sql_statement1 = ("DROP TABLE IF EXISTS Year")
sql_statement2 = '''CREATE TABLE Year(
YearID INTEGER PRIMARY KEY,
Year INTEGER NOT NULL
)'''
cur.execute(sql_statement1)
cur.execute(sql_statement2)
# Insert data into year table
for yr in yr_unique:
cur.execute("INSERT INTO Year (Year) VALUES (?)", (yr,))
conn.commit()
# Create list of unique causes
cause_unique = data.Cause_Name.unique().tolist()
maxLengthCause = max([len(item) for item in cause_unique])
cause_desc_unique = data.Cause_Description.unique().tolist()
maxLengthDesc = max([len(item) for item in cause_desc_unique])
# Create Cause table
sql_statement1 = ("DROP TABLE IF EXISTS Cause")
sql_statement2 = '''CREATE TABLE Cause(
CauseID INTEGER PRIMARY KEY,
Cause_Name VARCHAR({0}),
Cause_Description VARCHAR({1})
)'''.format(maxLengthCause,maxLengthDesc)
cur.execute(sql_statement1)
cur.execute(sql_statement2)
# Insert data into Cause table
for i in range(len(cause_unique)):
cur.execute("INSERT INTO Cause (Cause_Name,Cause_Description) VALUES (?,?)",
(cause_unique[i], cause_desc_unique[i]))
conn.commit()
# Sort list of states
state_unique = np.sort(data.State.unique()).tolist()
maxLength = max([len(item) for item in state_unique])
# Make State table
sql_statement1 = ("DROP TABLE IF EXISTS State")
sql_statement2 = '''CREATE TABLE State(
StateID INTEGER PRIMARY KEY,
State VARCHAR({0})
)'''.format(maxLength,)
cur.execute(sql_statement1)
cur.execute(sql_statement2)
# Insert data into State
for state in state_unique:
cur.execute("INSERT INTO State (State) VALUES (?)", (state,))
conn.commit()
# Get dataframes from each table to merge
df_state = pd.read_sql_query("SELECT * FROM State", conn)
df_cause = pd.read_sql_query("SELECT * FROM Cause", conn)
df_year = pd.read_sql_query("SELECT * FROM Year", conn)
# Merge ID values to dataframe
merged = pd.merge(data,df_state,how='outer',on=['State'])
merged = pd.merge(merged,df_cause,how='outer',on=['Cause_Name'])
merged = pd.merge(merged, df_year, how ='outer',on=['Year'])
# Create Deaths table
sql_statement1 = ("DROP TABLE IF EXISTS Deaths")
sql_statement2 = '''CREATE TABLE Deaths (
ID INTEGER PRIMARY KEY,
YearID INTEGER,
CauseID INTEGER,
StateID INTEGER,
Deaths INTEGER,
Age_adjusted_Death_Rate FLOAT,
FOREIGN KEY (CauseID) REFERENCES Cause(CauseID),
FOREIGN KEY (YearID) REFERENCES Year(YearID),
FOREIGN KEY (StateID) REFERENCES State(StateID)
);'''
cur.execute(sql_statement1)
cur.execute(sql_statement2)
# Get merged data into right format
deaths = merged['Deaths'].values.tolist()
age_adj_rate = merged['Age_adjusted_Death_Rate'].values.tolist()
causeID = merged['CauseID'].values.tolist()
yearID = merged['YearID'].values.tolist()
stateID = merged['StateID'].values.tolist()
# Insert data into Deaths table
for i in range(len(merged)):
cur.execute('''INSERT INTO Deaths
(Deaths,Age_adjusted_Death_Rate,CauseID,YearID,StateID)
VALUES (?,?,?,?,?)''',
(deaths[i],age_adj_rate[i],causeID[i],yearID[i],stateID[i]))
conn.commit()
# Close database
conn.close()
if __name__ == "__main__":
# This chunk of code says that if you call this script from the command line
# it will execute the code below
print("Importing data...")
data = pd.read_csv('NCHS_-_Leading_Causes_of_Death__United_States.csv',',')
print("Cleaning variable names...")
clean_names(data)
print("Making SQLite database...")
make_db(data)
try:
conn = sqlite3.connect("leading_cases_of_death.sqlite")
df_state = pd.read_sql_query("SELECT * FROM State", conn)
if len(df_state) > 0:
print("Database named leading_cases_of_death.sqlite successfully created")
else:
print("Problem creating database")
except:
print("Error creating database!")