ShotAnalysis.Rmd

---
title: "ShotAnalysis"
output: html_document
---

```{r setup}
library(tidyverse)
library(caret)
library(dummies)
library(xgboost)
library(data.table)
library(Matrix)

# Load the raw shot log; the first row of data.csv supplies the column names
shots <- read.csv(file = "data.csv", header = TRUE)
```

```{r}
# Inspect the column names, column types and a preview of the values
shots %>% str()
```

```{r}
# Turn every character column into a factor; other columns are left untouched
is_chr <- vapply(shots, is.character, logical(1))
shots[is_chr] <- lapply(shots[is_chr], as.factor)

# Re-inspect the structure to confirm the conversion
shots %>% str()
```

```{r}
# Scatter plot of every shot by longitude (x) and latitude (y)
shots %>%
  ggplot(aes(lon, lat)) +
  geom_point()
```

```{r}
# Scatter plot of every shot by its loc_x / loc_y coordinates
shots %>%
  ggplot(aes(loc_x, loc_y)) +
  geom_point()
```

```{r}
# Converting loc_x and loc_y from Cartesian coordinates to Polar coordinates
# loc_r is the Euclidean distance of (loc_x, loc_y) from (0, 0)
shots$loc_r <- sqrt((shots$loc_x)^2 + (shots$loc_y)^2)
# atan2() keeps the quadrant of (loc_x, loc_y). atan(loc_y / loc_x) maps
# diagonally opposite points to the same angle and is NaN only for 0 / 0;
# a zero loc_x with non-zero loc_y already gives +/- pi / 2.
shots$loc_theta <- atan2(shots$loc_y, shots$loc_x)
# The angle is undefined at the origin; keep the previous convention of
# pi / 2 radians (90 degrees) for those shots
shots$loc_theta[shots$loc_x == 0 & shots$loc_y == 0] <- pi / 2
```

```{r}
# Collapse minutes_remaining and seconds_remaining into a single column:
# time_remaining = total seconds remaining
shots$time_remaining <- with(shots, minutes_remaining * 60 + seconds_remaining)
```

```{r}
# Selecting only the last 2 digits of the season variable
# The pattern keeps the two digits that directly follow a punctuation character.
# str_extract() is vectorised, so one call over the whole column replaces the
# per-element sapply(); as.character() makes the input type explicit because
# season is a factor at this point. The result is a plain character vector.
shots$season <- str_extract(
  as.character(shots$season),
  "(?<=[:punct:])[:digit:]{2}"
)

# Don't know whether things will change if we leave it as a factor or convert it to an integer
```

```{r}
# Irrelevant variables: list the distinct values of each team column
unique(shots[["team_id"]])
unique(shots[["team_name"]])
```

```{r}
# Derive 1/0 indicators from the matchup text:
# away = 1 when it contains "@", home = 1 when it contains "vs."
shots <- shots %>%
  mutate(
    away = as.numeric(grepl("@", matchup, fixed = TRUE)),
    home = as.numeric(grepl("vs.", matchup, fixed = TRUE))
  )
```

```{r}
# Flag (1/0) shots taken with at most 180 seconds (3 minutes) left in the period
shots$lastminutes <- as.numeric(shots$time_remaining <= 180)
```

```{r}
# Numbering each of Kobe's games
# factor() sorts the distinct date strings, so the level code is the 1-based
# rank of each game date. Building the factor explicitly gives the same codes
# as as.numeric() on an already-factor column, but also works if game_date is
# still character (as.numeric() alone would return NA there).
# NOTE(review): the rank is chronological only if the date strings sort
# chronologically as text (e.g. yyyy-mm-dd) - confirm against data.csv.
shots$game_num <- as.numeric(factor(as.character(shots$game_date)))

# Ordering the data by game_num
shots <- shots %>% arrange(game_num)
```

```{r}
# Career-milestone indicators (1/0), all derived from game_num ranges.
# Columns are added in this order: postachilles, first_team, scoring_leader,
# mvp, finals_mvp.
shots <- shots %>%
  mutate(
    # Kobe post Achilles tear: every game after game 1452
    postachilles = as.numeric(game_num > 1452),

    # 1st team All NBA
    # 2001-02 Regular Season => 395-474
    # 2002-03 Regular Season =>
    # 2003-04 Regular Season =>
    # 2005-06 Regular Season => 740-819
    # 2006-07 Regular Season => 827-903
    # 2007-08 Regular Season => 909-990
    # 2008-09 Regular Season =>
    # 2009-10 Regular Season =>
    # 2010-11 Regular Season =>
    # 2011-12 Regular Season =>
    # 2012-13 Regular Season =>
    # The two ranges below are contiguous, so they also cover game numbers
    # that fall between the listed regular seasons (e.g. 820-826).
    first_team = as.numeric(
      (game_num >= 395 & game_num <= 673) |
        (game_num >= 740 & game_num <= 1452)
    ),

    # Scoring Leader
    # 2005-06 Regular Season => 740-819
    # 2006-07 Regular Season => 827-903
    scoring_leader = as.numeric(
      (game_num >= 740 & game_num <= 819) |
        (game_num >= 827 & game_num <= 903)
    ),

    # MVP
    # 2007-08 Regular Season => 909-990
    mvp = as.numeric(game_num >= 909 & game_num <= 990),

    # Finals MVP
    # 2008-09 Finals => 1112-1116
    # 2009-10 Finals => 1206-1212
    finals_mvp = as.numeric(
      (game_num >= 1112 & game_num <= 1116) |
        (game_num >= 1206 & game_num <= 1212)
    )
  )

# Ideas not implemented yet:
# num_rings
# with shaq
# with pau
```

```{r}
# loc_r (radius of the polar coordinates) plotted against shot_distance to show
# that the two variables are proportional
shots %>%
  ggplot(aes(loc_r, shot_distance)) +
  geom_point(colour = "blue")
```

```{r}
# Removing unneeded variables
# Commented-out names are columns that are deliberately kept for now
drop_cols <- c(
  "shot_id", "team_id", "team_name",
  # "shot_zone_area", "shot_zone_range", "shot_zone_basic",
  "matchup", "lon", "lat", "seconds_remaining", "minutes_remaining", # "shot_distance",
  "loc_x", "loc_y", "game_event_id", "game_id", "game_date"
)
shots_filtered <- shots %>% select(-all_of(drop_cols))
```

```{r}
# Create test and training set

# Rows with a missing shot_made_flag form the test set (the 5000 shots where
# we don't know if it went in or not); all labelled rows form the training set
shots_filtered_train <- filter(shots_filtered, !is.na(shot_made_flag))

# Setting the response variable (taken before the column is dropped)
train.y <- pull(shots_filtered_train, shot_made_flag)

# Dropping the response variable from the training and test set
shots_filtered_train <- select(shots_filtered_train, -shot_made_flag)
shots_filtered_test <- shots_filtered %>%
  filter(is.na(shot_made_flag)) %>%
  select(-shot_made_flag)
```

```{r}
# Numeric matrix of the training features
trainM <- shots_filtered_train %>% data.matrix(rownames.force = NA)

# Creating DMatrix for xgboost (features + labels; NaN marks missing values)
dtrain <- xgb.DMatrix(trainM, label = train.y, missing = NaN)

# Named evaluation set passed to the training calls further down
watchlist <- list(trainM = dtrain)
```

```{r}
# Booster hyperparameters shared by the cross-validation here and the final
# xgb.train() fit; trailing comments are alternative values noted by the author
param <- list(
  objective = "binary:logistic",
  booster = "gbtree",
  eval_metric = "logloss",
  eta = .035, # .25
  max_depth = 3, # 3
  subsample = .75, # .75
  colsample_bytree = .6 # .6
)

# 5-fold cross-validation with early stopping to choose the number of rounds.
# `watchlist` is not an xgb.cv() argument (the folds are the evaluation sets);
# passing it only forwarded a bogus entry into the booster parameters, so it
# is no longer supplied here.
clf <- xgb.cv(
  params = param,
  data = dtrain,
  nrounds = 1500, # 150
  nfold = 5, # 10
  early_stopping_rounds = 20,
  maximize = FALSE, # logloss: lower is better
  verbose = 1,
  print_every_n = 10
)
```

```{r}
# Best round: the iteration selected by early stopping in the CV run
bestRound <- clf[["best_iteration"]]
bestRound %>% print()
```

```{r}
# Best result: the evaluation-log row of the best iteration
best_row <- clf$evaluation_log[clf$best_iteration, ]
print(best_row)
```

```{r}
# Fit the final booster on the full training DMatrix, using the number of
# rounds chosen by cross-validation
xgb.model <- xgb.train(
  data = dtrain,
  params = param,
  nrounds = bestRound,
  watchlist = watchlist,
  maximize = FALSE,
  verbose = 1
)
```

```{r}
# Creating a submission file
# shot_id was dropped from shots_filtered, so the ids of the unlabelled shots
# are recovered from `shots`, which has the same row order. test.id was
# previously used here before it had been defined anywhere in the document.
test.id <- shots %>% filter(is.na(shot_made_flag)) %>% select(shot_id)
testM <- data.matrix(shots_filtered_test, rownames.force = NA)
# Predicted probability that each test shot was made
preds <- predict(xgb.model, testM)
submission <- data.frame(shot_id = test.id, shot_made_flag = preds)
write.csv(submission, "XGBoost-preds.csv", row.names = FALSE)
#.60374 => #306 out of 1117 on the Kaggle leaderboard ~72nd %tile

# Including the shot_area and shot_distance variables (4)
# .60321 => #273 out of 1117 on the Kaggle leaderboard ~75th %tile
```

```{r}
# Creating dummy variables for the factor variables that remain
# Each call below expands one column of shots_filtered with dummy() (sep = "_")
# and wraps the result in a data.frame so it can be column-bound later.
# NOTE(review): dummies::dummy() is believed to build its column-name prefix from
# the text of the call expression - confirm before wrapping these calls in a
# loop or helper, since that could change the resulting column names.
action_type_dummies <- data.frame(dummy(shots_filtered$action_type, sep = "_"))

combined_shot_type_dummies <- data.frame(dummy(shots_filtered$combined_shot_type, sep = "_"))

shot_type_dummies <- data.frame(dummy(shots_filtered$shot_type, sep = "_"))

opponent_dummies <- data.frame(dummy(shots_filtered$opponent, sep = "_"))

period_dummies <- data.frame(dummy(shots_filtered$period, sep = "_"))

season_dummies <- data.frame(dummy(shots_filtered$season, sep = "_"))
```

```{r}
# Adding in the dummy variables
# Columns replaced by their dummy encodings, plus the shot distance / zone
# columns that are left out of this second feature set
replaced_cols <- c(
  "action_type", "combined_shot_type", "shot_type", "opponent", "period",
  "season", "shot_distance", "shot_zone_area", "shot_zone_basic",
  "shot_zone_range"
)
shots_dummies <- shots_filtered %>%
  select(-all_of(replaced_cols)) %>%
  cbind(
    action_type_dummies,
    combined_shot_type_dummies,
    shot_type_dummies,
    opponent_dummies,
    period_dummies,
    season_dummies
  )
```

```{r}
# Create test and training set

# Labelled rows are the training set; rows with a missing shot_made_flag are
# the test set (5000 shots where we don't know if it went in or not)
shots_train <- filter(shots_dummies, !is.na(shot_made_flag))
shots_test <- filter(shots_dummies, is.na(shot_made_flag))

# Setting the response variable
train.y <- shots_train[["shot_made_flag"]]

# Dropping the response variable from the training and test set
shots_train[["shot_made_flag"]] <- NULL
shots_test[["shot_made_flag"]] <- NULL
```

```{r}
# Numeric matrix of the dummy-encoded training features
trainM <- shots_train %>% data.matrix(rownames.force = NA)

# Creating DMatrix for xgboost (features + labels; NaN marks missing values)
dtrain <- xgb.DMatrix(trainM, label = train.y, missing = NaN)

# Named evaluation set passed to the training calls further down
watchlist <- list(trainM = dtrain)
```

```{r}
# Booster hyperparameters for the dummy-encoded feature set; trailing comments
# are alternative values noted by the author
param <- list(
  objective = "binary:logistic",
  booster = "gbtree",
  eval_metric = "logloss",
  eta = .035, # .25
  max_depth = 4, # 3
  subsample = .8, # .75
  colsample_bytree = .8 # .6
)

# 5-fold cross-validation with early stopping to choose the number of rounds.
# `watchlist` is not an xgb.cv() argument (the folds are the evaluation sets);
# passing it only forwarded a bogus entry into the booster parameters, so it
# is no longer supplied here.
clf <- xgb.cv(
  params = param,
  data = dtrain,
  nrounds = 1500, # 150
  nfold = 5, # 10
  early_stopping_rounds = 20,
  maximize = FALSE, # logloss: lower is better
  verbose = 1,
  print_every_n = 10
)
```

```{r}
# Best round: the iteration selected by early stopping in the CV run
bestRound <- clf[["best_iteration"]]
bestRound %>% print()
```

```{r}
# Best result: the evaluation-log row of the best iteration
best_row <- clf$evaluation_log[clf$best_iteration, ]
print(best_row)
```

```{r}
# Fit the second booster on the dummy-encoded training DMatrix, using the
# number of rounds chosen by cross-validation
xgb.model2 <- xgb.train(
  data = dtrain,
  params = param,
  nrounds = bestRound,
  watchlist = watchlist,
  maximize = FALSE,
  verbose = 1
)
```

```{r}
# Creating a submission file
# shot_id is not a column of shots_test, so the ids of the unlabelled shots are
# recovered from `shots`, which has the same row order. test.id was previously
# used here before it had been defined anywhere in the document.
test.id <- shots %>% filter(is.na(shot_made_flag)) %>% select(shot_id)
testM <- data.matrix(shots_test, rownames.force = NA)
# Predicted probability that each test shot was made
preds <- predict(xgb.model2, testM)
submission <- data.frame(shot_id = test.id, shot_made_flag = preds)
write.csv(submission, "XGBoost-preds2.csv", row.names = FALSE)
# Not including the shot_area and shot_distance variables (4)
# .60277 => #258 out of 1117 on the Kaggle leaderboard ~77th %tile
```

```{r}
# With classProbs = TRUE, the response variable cannot be 1 and 0
# shot_made_flag was removed from shots_train when the response was split off
# into train.y, so mutate() on that column fails with "object not found".
# Rebuild the column from train.y, which is row-aligned with shots_train.
shots_train$shot_made_flag <- ifelse(train.y == 1, "Yes", "No")
```

```{r}
# Building first model

# Resampling scheme: standard 10-fold cross-validation, scored by multinomial
# log loss (which needs class probabilities)
myControl <- trainControl(
  method = "cv",
  number = 10,
  summaryFunction = mnLogLoss,
  classProbs = TRUE
)

# Single fixed hyperparameter combination (no search)
tunegrid <- expand.grid(
  nrounds = 60,
  eta = .25,
  max_depth = 3,
  gamma = 0,
  min_child_weight = 0,
  subsample = .75,
  colsample_bytree = .6
)

# Gradient-boosted trees via caret; the response is every other column's target
xgbTree.model <- train(
  as.factor(shot_made_flag) ~ .,
  data = shots_train,
  method = "xgbTree",
  metric = "logLoss",
  tuneGrid = tunegrid,
  trControl = myControl
)

print(xgbTree.model)
# reported logLoss of ~ .60345 on training data
# Audible notification that training has finished
beepr::beep(sound = 2)
```

```{r}
# Score when hard 0/1 class labels were submitted: 10.9903
# The models are scored on log loss, so submit the predicted probability of
# the "Yes" class rather than the predicted class; type = "prob" returns one
# probability column per class level.
model.probs <- predict(xgbTree.model, shots_test, type = "prob")
model.preds <- model.probs[["Yes"]]
# Ids of the unlabelled shots, in the same row order as shots_test
test.id <- shots %>% filter(is.na(shot_made_flag)) %>% select(shot_id)
xgbTree <- data.frame(shot_id = test.id, shot_made_flag = model.preds)
write_csv(xgbTree, "xgbTree-preds.csv")
```

```{r}
# Single fixed hyperparameter combination for the random forest (no search)
tunegrid <- expand.grid(
  mtry = 54,
  min.node.size = 1,
  splitrule = "gini"
)

# Random forest via caret, reusing the resampling scheme in myControl
ranger.model <- train(
  as.factor(shot_made_flag) ~ .,
  data = shots_train,
  method = "ranger",
  metric = "logLoss",
  tuneGrid = tunegrid,
  trControl = myControl
)

print(ranger.model)
# reported logLoss of ~ .6255 on training data
# Audible notification that training has finished
beepr::beep(sound = 2)
```

```{r}
# Score when hard 0/1 class labels were submitted: 11.32879
# The models are scored on log loss, so submit the predicted probability of
# the "Yes" class rather than the predicted class; type = "prob" returns one
# probability column per class level.
ranger.probs <- predict(ranger.model, shots_test, type = "prob")
ranger.preds <- ranger.probs[["Yes"]]
# Ids of the unlabelled shots, in the same row order as shots_test
test.id <- shots %>% filter(is.na(shot_made_flag)) %>% select(shot_id)
ranger <- data.frame(shot_id = test.id, shot_made_flag = ranger.preds)
write_csv(ranger, "ranger-preds.csv")
```