Google Cloud and NCAA® have teamed up to bring you this year’s version of the Kaggle machine learning competition. In the first stage of the competition, Kagglers will rely on the results of past tournaments to build and test models. In the second stage, competitors will forecast the outcomes of all possible match-ups in the 2018 NCAA Division I Men’s Championships. The data can be downloaded from the competition page on Kaggle.
library(stringr)
library(dplyr)
library(reshape)
library(caret)
# Load every CSV in the data directory into the global environment, one data
# frame per file, named after the file with its ".csv" extension removed.
# NOTE(review): the path is machine-specific; setwd() is kept so that every
# relative read and write in this script resolves where it did before.
setwd("C:\\Users\\parki\\Documents\\R Scripts\\March Madness Submission Data")
# `pattern` is a regular expression, not a glob: anchor on a literal ".csv"
# suffix so names that merely contain "csv" are not picked up.
temp <- list.files(pattern = "\\.csv$")
# seq_along() yields an empty sequence when no file matches, whereas
# 1:length(temp) would iterate over c(1, 0) and fail inside read.csv().
for (i in seq_along(temp)) {
  assign(tools::file_path_sans_ext(temp[i]), read.csv(temp[i]))
}
# Stack the two Massey ordinal tables into one table of rankings.
massey_ordinals_train <- rbind(
  MasseyOrdinals_thruSeason2018_Day128,
  MasseyOrdinals_2018_133_only_43Systems
)
# Mean OrdinalRank per (TeamID, Season), taken over every row of the stacked
# table.
avg_rank_train <- aggregate(
  OrdinalRank ~ TeamID + Season,
  data = massey_ordinals_train,
  FUN = mean
)
# Tournament games from season 2003 onwards, with the average rank of the
# winner (.x) and the loser (.y) attached. RankMargin is winner minus loser.
ncaa_tourney <- NCAATourneyCompactResults %>%
  select(Season, WTeamID, LTeamID) %>%
  filter(Season >= 2003) %>%
  left_join(avg_rank_train, by = c("Season", "WTeamID" = "TeamID")) %>%
  left_join(avg_rank_train, by = c("Season", "LTeamID" = "TeamID")) %>%
  mutate(RankMargin = OrdinalRank.x - OrdinalRank.y)

# Rename the Elo table's columns by position so it can be joined on
# Season/TeamID. NOTE(review): assumes elos_season has exactly these three
# columns in this order -- confirm against the source CSV.
names(elos_season) <- c("Season", "SeasonElo", "TeamID")

# Same pattern for Elo: winner (.x) and loser (.y) ratings, then
# EloMargin as winner minus loser.
ncaa_tourney <- ncaa_tourney %>%
  left_join(elos_season, by = c("Season", "WTeamID" = "TeamID")) %>%
  left_join(elos_season, by = c("Season", "LTeamID" = "TeamID")) %>%
  mutate(EloMargin = SeasonElo.x - SeasonElo.y)
# Build the training set. Each tournament game contributes two rows:
#   * the winner-minus-loser margins, labelled as a win, and
#   * the same margins negated (the loser's point of view), labelled as a loss.
train_w <- select(ncaa_tourney, RankMargin, EloMargin)
train_l <- -train_w
train_w$Result <- 1
train_l$Result <- 0
train <- rbind(train_w, train_l)
# Shuffle the rows so wins and losses are interleaved.
train <- train[sample(nrow(train)), ]
# Recode the 1/0 outcome as a two-level factor with a single vectorised
# ifelse() call. The labels are words rather than digits because caret needs
# class levels that are valid R names when class probabilities are requested.
train$Result <- factor(
  ifelse(train$Result == 1, "One", "Zero"),
  levels = c("One", "Zero")
)
# 5-fold cross-validation scored with mnLogLoss; classProbs = TRUE makes
# caret produce the class probabilities that the summary function reads.
control <- trainControl(
  method = "cv",
  number = 5,
  classProbs = TRUE,
  summaryFunction = mnLogLoss
)

# Full-factorial tuning grid: 3 depths x 10 tree counts x 7 shrinkage values
# x 11 minimum node sizes = 2310 candidate models.
gbm_grid <- expand.grid(
  interaction.depth = c(1, 2, 3),
  n.trees = seq(10, 100, by = 10),
  shrinkage = c(0.05, 0.1, 0.15, 0.20, 0.25, 0.30, 0.35),
  n.minobsinnode = 5:15
)

# Fit a gradient boosting classifier on the two margin features, centring
# and scaling them first, and select the grid point by the "logLoss" metric.
fit_gbm <- caret::train(
  Result ~ RankMargin + EloMargin,
  data = train,
  method = "gbm",
  metric = "logLoss",
  trControl = control,
  tuneGrid = gbm_grid,
  preProcess = c("center", "scale"),
  verbose = FALSE
)
# Split each submission ID by fixed character position: characters 1-4 are
# the season, 6-9 the first team id and 11-14 the second team id.
sample_submission <- with(
  SampleSubmissionStage2,
  data.frame(
    Season = as.integer(substring(ID, 1, 4)),
    Team1 = as.integer(substring(ID, 6, 9)),
    Team2 = as.integer(substring(ID, 11, 14))
  )
)

# Attach the same features used for training: average rank and season Elo
# for Team1 (.x) and Team2 (.y), with each margin taken as Team1 minus Team2.
sample_submission <- sample_submission %>%
  left_join(avg_rank_train, by = c("Season", "Team1" = "TeamID")) %>%
  left_join(avg_rank_train, by = c("Season", "Team2" = "TeamID")) %>%
  mutate(RankMargin = OrdinalRank.x - OrdinalRank.y) %>%
  left_join(elos_season, by = c("Season", "Team1" = "TeamID")) %>%
  left_join(elos_season, by = c("Season", "Team2" = "TeamID")) %>%
  mutate(EloMargin = SeasonElo.x - SeasonElo.y)
# Score every requested match-up with the fitted model. Only the two margin
# features are passed in, and the probability of class "One" is kept as the
# prediction.
test <- sample_submission %>%
  select(RankMargin, EloMargin)
test$Pred <- predict(fit_gbm, test, type = "prob")$One

# Pair each original submission ID with its prediction and write the file
# into the current working directory.
submission <- SampleSubmissionStage2 %>%
  select(ID)
submission$Pred <- test$Pred
write.csv(submission, "SubmissionStage2_2.csv", row.names = FALSE)