1) Overview

Google Cloud and NCAA® have teamed up to bring you this year’s version of the Kaggle machine learning competition. In the first stage of the competition, Kagglers will rely on results of past tournaments to build and test models. In the second stage, competitors will forecast outcomes of all possible match-ups in the 2018 NCAA Division I Men’s Championships. To download data click here.

2) Preprocessing

2.1) Loading libraries

library(stringr)
library(dplyr)
library(reshape)
library(caret)

2.2) Loading data for all past seasons

# Load every CSV file in the data directory into a variable named after the
# file (the file name without its ".csv" extension).
# NOTE(review): the hard-coded setwd() ties the script to one machine; it is
# kept because relative paths used later resolve against this directory.
setwd("C:\\Users\\parki\\Documents\\R Scripts\\March Madness Submission Data")
# list.files() takes a regular expression, not a glob, so anchor on the
# literal ".csv" suffix to pick up CSV files only.
temp <- list.files(pattern = "\\.csv$")
# seq_along() skips the loop when no file matches; 1:length(temp) would
# iterate over c(1, 0) and try to read a missing file.
for (i in seq_along(temp)) {
  assign(sub("\\.csv$", "", temp[i]), read.csv(temp[i]))
}

3) Processing Data

3.1) Averaging the Massey ordinal rankings (available from 2003 onward) to get a mean rank for each team in each season.

# Stack the two Massey ordinal tables. filter() is called without a predicate,
# so every row of the first table is kept.
massey_ordinals_train <- rbind(
  filter(MasseyOrdinals_thruSeason2018_Day128),
  MasseyOrdinals_2018_133_only_43Systems
)
# Mean OrdinalRank for every (TeamID, Season) combination.
avg_rank_train <- aggregate(
  OrdinalRank ~ TeamID + Season,
  data = massey_ordinals_train,
  FUN = mean
)

3.2) Filtering the data to include only games from 2003 onward, since our Elo ratings and rankings are only available from 2003 to the present.

# Tournament games from the 2003 season onward, reduced to the season and the
# winning / losing team IDs.
ncaa_tourney <- select(
  filter(NCAATourneyCompactResults, Season >= 2003),
  Season, WTeamID, LTeamID
)

3.3) Joining Massey rankings and past tournaments

# Attach the season-average rank twice: first keyed on the winning team, then
# on the losing team. The second join suffixes the clashing columns, giving
# OrdinalRank.x (winner) and OrdinalRank.y (loser).
ncaa_tourney <- left_join(
  ncaa_tourney, avg_rank_train,
  by = c("Season", "WTeamID" = "TeamID")
)
ncaa_tourney <- left_join(
  ncaa_tourney, avg_rank_train,
  by = c("Season", "LTeamID" = "TeamID")
)
# Winner's average rank minus loser's average rank.
ncaa_tourney$RankMargin <- with(ncaa_tourney, OrdinalRank.x - OrdinalRank.y)

3.4) Joining Elo Ratings and past tournaments. Click here to see Elo Rating model using python.

# Rename the Elo table's columns by position.
# NOTE(review): assumes elos_season has exactly these three columns in this
# order -- confirm against the file that produces it.
colnames(elos_season) <- c("Season", "SeasonElo", "TeamID")

# Attach the season Elo rating for the winner (SeasonElo.x) and then for the
# loser (SeasonElo.y); the suffixes come from the second join.
ncaa_tourney <- left_join(
  ncaa_tourney, elos_season,
  by = c("Season", "WTeamID" = "TeamID")
)
ncaa_tourney <- left_join(
  ncaa_tourney, elos_season,
  by = c("Season", "LTeamID" = "TeamID")
)
# Winner's Elo minus loser's Elo.
ncaa_tourney$EloMargin <- with(ncaa_tourney, SeasonElo.x - SeasonElo.y)

3.5) Creating the training set, doubling the observations by negating both margins and flipping the outcome.

# Build the training set from the two margin features. Every game appears
# twice: once as recorded, winner minus loser (Result = 1), and once with both
# margins negated, i.e. loser minus winner (Result = 0).
train_w <- select(ncaa_tourney, RankMargin, EloMargin)
train_l <- -train_w
train_w$Result <- 1
train_l$Result <- 0
train <- rbind(train_w, train_l)
# Shuffle the rows.
# NOTE(review): no seed is set, so the row order differs between runs.
train <- train[sample(nrow(train)), ]

# Recode the outcome as a factor with the labels "One" / "Zero". ifelse() is
# already vectorised, so it is applied to the whole column at once instead of
# being called element by element through mapply().
train$Result <- factor(ifelse(train$Result == 1, "One", "Zero"))

4) Modelling

4.1) Setting up the environment for stochastic gradient boosting, using cross-validation with log loss as the error function. The data are centered and scaled first, and a grid is set up for the hyperparameter search.

# 5-fold cross-validation; class probabilities are requested so that each
# candidate can be scored with mnLogLoss.
control <- trainControl(
  method = "cv",
  number = 5,
  classProbs = TRUE,
  summaryFunction = mnLogLoss
)

# Full-factorial search grid: 3 depths x 10 tree counts x 7 shrinkage values
# x 11 minimum node sizes.
gbm_grid <- expand.grid(
  interaction.depth = c(1, 2, 3),
  n.trees = seq(10, 100, by = 10),
  shrinkage = c(0.05, 0.1, 0.15, 0.20, 0.25, 0.30, 0.35),
  n.minobsinnode = seq.int(5, 15)
)

# Fit gbm on the two margin features, centering and scaling them first, and
# tune over gbm_grid using the "logLoss" metric.
fit_gbm <- caret::train(
  Result ~ RankMargin + EloMargin,
  data = train,
  method = "gbm",
  metric = "logLoss",
  preProcess = c("center", "scale"),
  tuneGrid = gbm_grid,
  trControl = control,
  verbose = FALSE
)

5) Predicting

5.1) Creating all possible tournament match-ups, and combining the Massey Rankings and Elo Ratings for the final predictions.

# Split each submission ID into its three integer parts by fixed character
# position: characters 1-4 (Season), 6-9 (Team1) and 11-14 (Team2).
# NOTE(review): presumably IDs look like "SSSS_TTTT_TTTT" -- verify against
# the sample submission file.
sample_submission <- data.frame(
  Season = as.integer(substring(SampleSubmissionStage2$ID, 1, 4)),
  Team1 = as.integer(substring(SampleSubmissionStage2$ID, 6, 9)),
  Team2 = as.integer(substring(SampleSubmissionStage2$ID, 11, 14))
)

# Average rank for Team1 (OrdinalRank.x) and Team2 (OrdinalRank.y); the
# suffixes are added by the second join.
sample_submission <- left_join(
  sample_submission, avg_rank_train,
  by = c("Season", "Team1" = "TeamID")
)
sample_submission <- left_join(
  sample_submission, avg_rank_train,
  by = c("Season", "Team2" = "TeamID")
)
sample_submission$RankMargin <-
  with(sample_submission, OrdinalRank.x - OrdinalRank.y)

# Season Elo for Team1 (SeasonElo.x) and Team2 (SeasonElo.y).
sample_submission <- left_join(
  sample_submission, elos_season,
  by = c("Season", "Team1" = "TeamID")
)
sample_submission <- left_join(
  sample_submission, elos_season,
  by = c("Season", "Team2" = "TeamID")
)
sample_submission$EloMargin <-
  with(sample_submission, SeasonElo.x - SeasonElo.y)

5.2) Using the trained model to predict each match-up's win probability

# Score every match-up on the two model features and keep the predicted
# probability of class "One".
test <- select(sample_submission, RankMargin, EloMargin)
test$Pred <- predict(fit_gbm, test, type = "prob")$One

# Pair the original submission IDs with the predictions, relying on row order.
submission <- select(SampleSubmissionStage2, ID)
submission$Pred <- test$Pred

5.3) Creating csv file for final probabilities of all possible games.

# Write the ID / Pred table to the working directory, without row names.
write.csv(
  x = submission,
  file = "SubmissionStage2_2.csv",
  row.names = FALSE
)