-
Notifications
You must be signed in to change notification settings - Fork 0
/
Project 3 New Data Final.Rmd
91 lines (74 loc) · 2.25 KB
/
Project 3 New Data Final.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
---
title: "Project 3 New Data"
author: "Samantha Tinor"
date: "4/18/2020"
output: html_document
---
```{r setup, include=FALSE}
# Set the document-wide knitr default so every chunk's source code is echoed
# in the rendered output unless a chunk overrides it.
knitr::opts_chunk$set(echo = TRUE)
```
---
title: "Project 3 New Data"
author: "Samantha Tinor"
date: "4/17/2020"
output: html_document
---
```{r, include=FALSE}
# Repeated copy of the knitr defaults. The chunk is deliberately left
# unlabeled: the label "setup" is already used earlier in this document and
# knitr stops with a "duplicate label" error when two chunks share a label.
knitr::opts_chunk$set(echo = TRUE)
```
```{r}
library(tidyverse)
library(dplyr)
library(caret)
library(MASS)
library(BART)
library(kernlab)
library(class)
# Read the absenteeism records (path is relative to the knitting directory)
# into the data frame that every later chunk works on.
absent <- read.csv(file = "Absenteeism_at_work2.csv")
```
# Cleaning Data & Other Prep Work
```{r}
# Prepare `absent` for modelling: encode the categorical predictors that are
# kept, derive two 0/1 indicator features, drop the columns that are not
# used, and fit a baseline linear model on what remains.

# Categorical codes that stay in the data are stored as factors.
absent$Reason.for.absence <- as.factor(absent$Reason.for.absence)
absent$Education <- as.factor(absent$Education)
absent$Social.smoker <- as.factor(absent$Social.smoker)

# NOTE(review): if read.csv() delivered this column as a factor, as.numeric()
# returns the internal level codes rather than the printed values -- confirm
# the column type on the machine that knits this document.
absent$Work.load.Average.day <- as.numeric(absent$Work.load.Average.day)

# Indicator features: 1 when the day-of-week code is 5 / the season code is 4,
# otherwise 0. The full column names are spelled out so nothing depends on
# `$` partial matching.
absent$is_Thur <- if_else(absent$Day.of.the.week == 5, 1, 0)
absent$is_Seas4 <- if_else(absent$Seasons == 4, 1, 0)

# The identifier column is named "ID", "ï..ID" or "X...ID" depending on how
# the file's byte-order mark is decoded, so it is matched by its suffix
# instead of by one platform-specific spelling.
absent <- absent[!grepl("(^|\\.)ID$", names(absent))]

# Columns that are not used as predictors (the two source columns of the
# indicators above are included here).
unused_cols <- c(
  "Month.of.absence", "Weight", "Day.of.the.week", "Seasons",
  "Transportation.expense", "Social.drinker", "Disciplinary.failure"
)
absent <- absent[setdiff(names(absent), unused_cols)]

# Baseline: regress absence hours on every remaining column.
lm.model <- lm(Absenteeism.time.in.hours ~ ., data = absent)
```
# Dividing Absence Hours into Categories
```{r}
# Replace the raw hours column with a three-level factor `timecat`:
#   exactly 0 hours   -> "not.absent"
#   below 56 hours    -> "less.than.week"
#   56 hours or more  -> "more.than.week"
# Missing hours stay missing.
absent$timecat <- local({
  hrs <- absent$Absenteeism.time.in.hours
  bucket <- rep(NA_character_, length(hrs))
  bucket[which(hrs >= 56)] <- "more.than.week"
  bucket[which(hrs < 56)] <- "less.than.week"
  # Assigned last so that zero overrides the "< 56" label.
  bucket[which(hrs == 0)] <- "not.absent"
  as.factor(bucket)
})
absent[["Absenteeism.time.in.hours"]] <- NULL
```
# Randomize, Dummy-Code and Normalize
```{r}
# Shuffle the rows; the fixed seed makes the ordering reproducible.
set.seed(65)
absent_r <- absent[sample(nrow(absent)), ]
absent_r
# Expand every factor predictor into 0/1 indicator columns (no intercept).
# The outcome `timecat` is excluded by name rather than by position, so the
# right column is dropped even if the number of predictors changes.
absent_mm <- as.data.frame(
  model.matrix(~ . - 1, absent_r[names(absent_r) != "timecat"])
)
# Min-max rescale a numeric vector so its smallest value maps to 0 and its
# largest to 1.
#
# x: numeric vector.
# Returns a numeric vector of the same length. A vector whose values are all
# equal is returned as all zeros (dividing by its zero range would otherwise
# turn every element into NaN). Missing values are not removed, so any NA in
# `x` makes the whole result NA.
normalize <- function(x) {
  lo <- min(x)
  span <- max(x) - lo
  if (!is.na(span) && span == 0) {
    # Every element equals `lo`, so `x - lo` is already the zero vector.
    return(x - lo)
  }
  (x - lo) / span
}
# Min-max normalize every predictor column (the outcome is not in absent_mm).
absent_n <- as.data.frame(lapply(absent_mm, normalize))
# Prepend one 0/1 indicator column per outcome class to the normalized
# predictors. The labels are taken from the shuffled frame `absent_r` because
# `absent_n` is derived from it; taking them from the unshuffled `absent`
# would pair each row's features with another row's label.
absent_n_output <- cbind(class.ind(as.factor(absent_r$timecat)), absent_n)
```