-
Notifications
You must be signed in to change notification settings - Fork 0
/
heart_data.R
188 lines (140 loc) · 7.57 KB
/
heart_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
#heart health
library(mosaic)
library(haven)
library(car)
library(caret)
# description of each variable
## anaemia: Decrease of red blood cells or hemoglobin (boolean)
## creatinine_phosphokinase: Level of the CPK enzyme in the blood (mcg/L)
## diabetes: If the patient has diabetes (boolean)
## ejection_fraction: Percentage of blood leaving the heart at each contraction (percentage)
## high_blood_pressure: If the patient has hypertension (boolean)
## platelets: Platelets in the blood (kiloplatelets/mL)
## serum_creatinine: Level of serum creatinine in the blood (mg/dL)
## serum_sodium: Level of serum sodium in the blood (mEq/L)
## sex: Woman or man (binary)
## smoking: If the patient smokes or not (boolean)
## time: Follow-up period (days) #irrelevant
## DEATH_EVENT: If the patient deceased during the follow-up period (boolean)
#data dictionary:
# Sex - Gender of patient Male = 1, Female =0
# Age - Age of patient
# Diabetes - 0 = No, 1 = Yes
# Anaemia - 0 = No, 1 = Yes
# High_blood_pressure - 0 = No, 1 = Yes
# Smoking - 0 = No, 1 = Yes
# DEATH_EVENT - 0 = No, 1 = Yes
#testing which variables affect probability of death
# Summary statistics for the outcome (DEATH_EVENT is coded 0 = No, 1 = Yes).
favstats(
  ~ DEATH_EVENT,
  data = heart_failure_clinical_records_dataset
)
# Recorded output:
#  min Q1 median Q3 max      mean        sd   n missing
#    0  0      0  1   1 0.3210702 0.4676704 299       0

# Number of patients in each outcome category.
tally(
  ~ DEATH_EVENT,
  data = heart_failure_clinical_records_dataset
)
# Compare the mean of DEATH_EVENT between group 0 and group 1 of each binary
# condition. The output recorded by the author is kept under each call.

# Anaemia -- one-sided: H1 is (mean in group 0) - (mean in group 1) < 0.
t.test(
  DEATH_EVENT ~ anaemia,
  data = heart_failure_clinical_records_dataset,
  mu = 0,
  alternative = "less"
)
# Welch Two Sample t-test
# data: DEATH_EVENT by anaemia
# t = -1.1366, df = 268.17, p-value = 0.1284
# alternative hypothesis: true difference in means between group 0 and group 1 is less than 0
# 95 percent confidence interval:
#  -Inf 0.02824787
# sample estimates:
#  mean in group 0 mean in group 1
#  0.2941176 0.3565891

# Diabetes -- two-sided (no `alternative` supplied).
t.test(
  DEATH_EVENT ~ diabetes,
  data = heart_failure_clinical_records_dataset,
  mu = 0
)
# data: DEATH_EVENT by diabetes
# t = 0.033485, df = 267.34, p-value = 0.9733
# alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
# 95 percent confidence interval:
#  -0.1062960 0.1099742
# sample estimates:
#  mean in group 0 mean in group 1
#  0.3218391 0.3200000

# High blood pressure -- one-sided, same direction as the anaemia test.
t.test(
  DEATH_EVENT ~ high_blood_pressure,
  data = heart_failure_clinical_records_dataset,
  mu = 0,
  alternative = "less"
)
# data: DEATH_EVENT by high_blood_pressure
# t = -1.347, df = 202.44, p-value = 0.08974
# alternative hypothesis: true difference in means between group 0 and group 1 is less than 0
# 95 percent confidence interval:
#  -Inf 0.01759657
# sample estimates:
#  mean in group 0 mean in group 1
#  0.2938144 0.3714286

# Smoking -- one-sided, same direction as the anaemia test.
t.test(
  DEATH_EVENT ~ smoking,
  data = heart_failure_clinical_records_dataset,
  mu = 0,
  alternative = "less"
)
# data: DEATH_EVENT by smoking
# t = 0.21817, df = 187.8, p-value = 0.5862
# alternative hypothesis: true difference in means between group 0 and group 1 is less than 0
# 95 percent confidence interval:
#  -Inf 0.1082656
# sample estimates:
#  mean in group 0 mean in group 1
#  0.3251232 0.3125000
## scientific inquiry: which compound's levels are associated with the highest death rate
# Compare each measurement between DEATH_EVENT group 0 and group 1.
# "less" tests (mean in group 0) - (mean in group 1) < 0; "greater" tests > 0.

# Creatinine phosphokinase.
t.test(
  creatinine_phosphokinase ~ DEATH_EVENT,
  data = heart_failure_clinical_records_dataset,
  mu = 0,
  alternative = "less"
)
# data: creatinine_phosphokinase by DEATH_EVENT
# t = -0.90119, df = 125.32, p-value = 0.1846
# alternative hypothesis: true difference in means between group 0 and group 1 is less than 0
# 95 percent confidence interval:
#  -Inf 109.1639
# sample estimates:
#  mean in group 0 mean in group 1
#  540.0542 670.1979

# Platelets.
t.test(
  platelets ~ DEATH_EVENT,
  data = heart_failure_clinical_records_dataset,
  mu = 0,
  alternative = "greater"
)
# data: platelets by DEATH_EVENT
# t = 0.84479, df = 184.79, p-value = 0.1997
# alternative hypothesis: true difference in means between group 0 and group 1 is greater than 0
# 95 percent confidence interval:
#  -9833.3 Inf
# sample estimates:
#  mean in group 0 mean in group 1
#  266657.5 256381.0

# Serum creatinine (author's note: serum_creatinine and death event has a
# great p-value).
t.test(
  serum_creatinine ~ DEATH_EVENT,
  data = heart_failure_clinical_records_dataset,
  mu = 0,
  alternative = "less"
)

# Serum sodium (author's note: good p-value).
t.test(
  serum_sodium ~ DEATH_EVENT,
  data = heart_failure_clinical_records_dataset,
  mu = 0,
  alternative = "greater"
)

##### severity of the heart attack
# Ejection fraction (author's note: great p-value).
t.test(
  ejection_fraction ~ DEATH_EVENT,
  data = heart_failure_clinical_records_dataset,
  mu = 0,
  alternative = "greater"
)

# Age -- two-sided (no `alternative` supplied).
t.test(
  age ~ DEATH_EVENT,
  data = heart_failure_clinical_records_dataset,
  mu = 0
)
# Check whether sex is related to diabetes / high blood pressure.
# Both tests are one-sided: H1 is (mean in sex group 0) - (mean in sex
# group 1) > 0, where the data dictionary codes Female = 0 and Male = 1.
t.test(
  diabetes ~ sex,
  data = heart_failure_clinical_records_dataset,
  mu = 0,
  alternative = "greater"
)
t.test(
  high_blood_pressure ~ sex,
  data = heart_failure_clinical_records_dataset,
  mu = 0,
  alternative = "greater"
)
# Regression models ----

# Scatter plot of serum creatinine against age (semi-transparent points)
# with a fitted straight line overlaid.
gf_point(
  serum_creatinine ~ age,
  data = heart_failure_clinical_records_dataset,
  alpha = 0.5
) %>%
  gf_lm()

# Linear model of serum creatinine on age, death event and serum sodium.
model1 <- lm(
  serum_creatinine ~ age + DEATH_EVENT + serum_sodium,
  data = heart_failure_clinical_records_dataset
)
msummary(model1)
# Logistic regressions for the binary outcome DEATH_EVENT ----
# `family = binomial` is set explicitly: glm() defaults to family = gaussian,
# which fits an ordinary linear model to the 0/1 outcome instead of a
# logistic regression.

# Age only.
binarymodel <- glm(
  DEATH_EVENT ~ age,
  data = heart_failure_clinical_records_dataset,
  family = binomial
)
msummary(binarymodel)

# Age plus serum creatinine, serum sodium and ejection fraction.
# Flagged by the original author as "the best regression model".
# NOTE(review): that judgement was made on the fits without
# family = binomial -- re-check it against the logistic fits.
binarymodel2 <- glm(
  DEATH_EVENT ~ age + serum_creatinine + serum_sodium + ejection_fraction,
  data = heart_failure_clinical_records_dataset,
  family = binomial
)
msummary(binarymodel2)

# Same as binarymodel2 with creatinine phosphokinase added.
binarymodel3 <- glm(
  DEATH_EVENT ~ age + serum_creatinine + serum_sodium + ejection_fraction +
    creatinine_phosphokinase,
  data = heart_failure_clinical_records_dataset,
  family = binomial
)
msummary(binarymodel3)
# k-nearest-neighbours classifier for DEATH_EVENT ----
#
# NOTE(review): this section was a partly filled-in template that could not
# run. It referred to objects never created in this script
# (kNNMortalityRateModel, kNNMortalityRate, NewDataName, knnmodel1, NS1,
# TrainPredictName), applied `$X1` to the DEATH_EVENT column, contained stray
# prompt characters that are not valid R, and relied on a `training` column
# that this script never creates. It is rewritten below as one runnable
# workflow. The names heart_failure_clinical_records_dataset1, kNNModelName,
# pred_MortalityRate, heartfailure_2, TrainActualName, TrainPredictName,
# TestActualName and TestPredictName are kept.

# caret needs the outcome as a factor whose levels are syntactically valid
# names, so 0/1 are turned into factor levels and passed through make.names().
heart_failure_clinical_records_dataset1 <- mutate(
  heart_failure_clinical_records_dataset,
  DEATH_EVENT = as.factor(DEATH_EVENT)
)
levels(heart_failure_clinical_records_dataset1$DEATH_EVENT) <-
  make.names(levels(heart_failure_clinical_records_dataset1$DEATH_EVENT))

# Train/test flag (1 = training row, 0 = test row). An existing `training`
# column is respected; otherwise a split stratified on the outcome is drawn.
# NOTE(review): the 70/30 proportion and the seed are placeholders -- the
# original script did not say how the split was meant to be made.
if (!"training" %in% names(heart_failure_clinical_records_dataset1)) {
  set.seed(1234)
  train_rows <- createDataPartition(
    heart_failure_clinical_records_dataset1$DEATH_EVENT,
    p = 0.7,
    list = FALSE
  )
  heart_failure_clinical_records_dataset1$training <- as.integer(
    seq_len(nrow(heart_failure_clinical_records_dataset1)) %in% train_rows
  )
}
is_training <- heart_failure_clinical_records_dataset1$training %in% 1

# Fit on the training rows only. The `training` flag is dropped from the data
# by name (instead of the original positional `1:12` subset combined with
# `. - training`) so it cannot be used as a predictor and the outcome column
# is guaranteed to be present.
knn_columns <- setdiff(
  names(heart_failure_clinical_records_dataset1),
  "training"
)
kNNModelName <- train(
  DEATH_EVENT ~ .,
  data = heart_failure_clinical_records_dataset1[is_training, knn_columns],
  method = "knn",
  preProcess = c("center", "scale"),
  tuneLength = 15
)
plot(kNNModelName)

# Predicted class label for every row. Class labels are used rather than
# rounded probabilities: rounded probabilities are the numbers 0/1, which
# never compare equal to the factor labels produced by make.names() above.
pred_MortalityRate <- predict(
  kNNModelName,
  newdata = heart_failure_clinical_records_dataset1
)
heartfailure_2 <- mutate(
  heart_failure_clinical_records_dataset1,
  predicted = pred_MortalityRate
)

# Share of training rows classified correctly.
TrainActualName <- heartfailure_2$DEATH_EVENT[is_training]
TrainPredictName <- heartfailure_2$predicted[is_training]
mean(as.character(TrainActualName) == as.character(TrainPredictName))

# Share of test rows classified correctly.
TestActualName <- heartfailure_2$DEATH_EVENT[!is_training]
TestPredictName <- heartfailure_2$predicted[!is_training]
mean(as.character(TestActualName) == as.character(TestPredictName))
# Stepwise selection of a logistic model for DEATH_EVENT.
# The search starts from the intercept-only model; the model containing every
# other column of the data set is supplied as the upper bound of the scope.
nullmodel <- glm(
  DEATH_EVENT ~ 1,
  family = binomial,
  data = heart_failure_clinical_records_dataset
)
fullmodel <- glm(
  DEATH_EVENT ~ .,
  family = binomial,
  data = heart_failure_clinical_records_dataset
)
stepmodel <- step(
  nullmodel,
  scope = list(upper = fullmodel)
)
msummary(stepmodel)
# Intercept-only (null) logistic model for DEATH_EVENT on the training rows.
#
# The original call selected rows with `$training == 1` and columns by
# position (`1:12`). This script never creates a `training` column, so that
# row filter selects nothing when the column is absent, and a positional
# column range is not guaranteed to keep the response DEATH_EVENT. The
# response is therefore selected by name, and all rows are used (with a
# message) when there is no `training` flag.
null_train_rows <- if (
  "training" %in% names(heart_failure_clinical_records_dataset)
) {
  # %in% (rather than ==) so that a missing flag value counts as "not training"
  heart_failure_clinical_records_dataset$training %in% 1
} else {
  message("No `training` column found; fitting NullDeathPred on all rows.")
  rep(TRUE, nrow(heart_failure_clinical_records_dataset))
}
null_train_data <- heart_failure_clinical_records_dataset[
  null_train_rows,
  "DEATH_EVENT",
  drop = FALSE
]
NullDeathPred <- glm(
  DEATH_EVENT ~ 1,
  data = null_train_data,
  family = binomial
)