Random Forest CV
fn.cv.rf = function(x){
  library(randomForest)
  set.seed(12345)
  # 75/25 train / validation split
  s = sample(nrow(x), size = floor(nrow(x) * 0.75), replace = FALSE)
  train = x[s, ]
  valid = x[-s, ]
  cv = NULL
  n = names(train)
  # TARGET ~ every column except TARGET and the row identifier INDEX
  formula <- as.formula(paste("TARGET ~", paste(n[!n %in% c('TARGET','INDEX')], collapse = " + ")))
  for(i in seq(100, 500, by = 50)){
    # the formula already excludes INDEX, so the full train frame can be passed
    model.rf = randomForest(formula, data = train,
                            ntree = i, importance = TRUE)
    # predict() with no newdata returns the out-of-bag predictions for the training rows
    train$P_RF = predict(model.rf)
    error = abs(train$P_RF - train$TARGET)
    error_train = sum(error); mean_error_train = mean(error)
    valid$P_RF = predict(model.rf, valid)
    error = abs(valid$P_RF - valid$TARGET)
    error_valid = sum(error); mean_error_valid = mean(error)
    cv = rbind(cv, data.frame(i, error_train, mean_error_train,
                              error_valid, mean_error_valid))
  }
  rm(train, valid)
  return(cv) # return(c(cv, model.rf))
}
cv = fn.cv.rf(cluster)
# https://www.kaggle.com/c/the-analytics-edge-mit-15-071x/forums/t/8082/reading-and-interpreting-random-forest-models
# http://stats.stackexchange.com/questions/21152/obtaining-knowledge-from-a-random-forest
# https://www.youtube.com/watch?v=-nai4NBx5zI
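The links above are about reading a fitted forest. A minimal sketch of that inspection follows; it assumes the forest is refit once outside fn.cv.rf with an ntree picked from the cv table (the function itself only returns the error table, as the commented-out return hints), and the example ntree value is arbitrary.
# Sketch (assumption): refit one forest on the full data with a chosen ntree,
# then inspect variable importance.
library(randomForest)
n = names(cluster)
formula = as.formula(paste("TARGET ~", paste(n[!n %in% c('TARGET','INDEX')], collapse = " + ")))
model.rf = randomForest(formula, data = cluster, ntree = 300, importance = TRUE)
importance(model.rf)  # importance measures per predictor
varImpPlot(model.rf)  # dot chart of the same measures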
library(ggplot2)
ggplot(cv, aes(i)) +
geom_line(aes(y = mean_error_train, colour = "train error")) +
geom_line(aes(y = mean_error_valid, colour = "valid error"))
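One way to read a candidate ntree off the cv table, assuming lowest mean validation error is the selection rule:
# candidate ntree = grid value with the lowest mean validation error
best_ntree = cv$i[which.min(cv$mean_error_valid)]
best_ntree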
GBM CV
library(gbm)
# x = dataset
# kf = number of cv folds
# ntree = maximum number of trees (the grid runs from 50 to ntree in steps of 50)
# a = step size of the shrinkage grid, seq(0.01, 0.04, by = a)
fn.cv.gbm = function(x, kf, ntree, a){
  set.seed(12345)
  # random fold assignment 1..kf
  x$s = ceiling(runif(nrow(x), 0, kf))
  cv = NULL
  n = names(x)
  # TARGET ~ every column except TARGET, the row identifier INDEX and the fold label s
  formula <- as.formula(paste("TARGET ~", paste(n[!n %in% c('TARGET','INDEX','s')], collapse = " + ")))
  for (kf_i in 1:kf){
    train = x[x$s != kf_i, ]
    valid = x[x$s == kf_i, ]
    for(ntree_i in seq(50, ntree, by = 50)){
      for(shrink_i in seq(0.01, 0.04, by = a)){
        model.gbm = gbm(formula = formula, data = train,
                        distribution = "poisson",
                        # alternatives: "gaussian" for squared-error regression, "adaboost" for 0/1 outcomes
                        n.trees = ntree_i,
                        shrinkage = shrink_i, # 0.01
                        # smaller shrinkage values typically give slightly better performance;
                        # the cost is that the model takes longer to run
                        interaction.depth = 2,
                        # use CV to choose interaction.depth
                        n.minobsinnode = 100,
                        # n.minobsinnode has an important effect on overfitting:
                        # decreasing it can lead to overfitting
                        bag.fraction = 0.5,
                        train.fraction = 0.99,
                        ### DO NOT USE CV.FOLDS!!!
                        # (cv.folds is an argument of gbm(), not gbm.fit(); this wrapper's loop does the CV instead)
                        # var.monotone = c(),
                        # can help with overfitting; will smooth bumpy curves
                        verbose = TRUE)
        # distribution = "poisson" predicts on the log scale by default,
        # so use type = "response" to compare against TARGET counts
        train$P_GBM = predict(model.gbm, train, n.trees = ntree_i, type = "response")
        error = abs(train$P_GBM - train$TARGET)
        error_train = sum(error); mean_error_train = mean(error)
        n_train = nrow(train)
        valid$P_GBM = predict(model.gbm, valid, n.trees = ntree_i, type = "response")
        error = abs(valid$P_GBM - valid$TARGET)
        error_valid = sum(error); mean_error_valid = mean(error)
        n_valid = nrow(valid)
        cv = rbind(cv, data.frame(kf_i, ntree_i, shrink_i,
                                  n_train, error_train, mean_error_train,
                                  n_valid, error_valid, mean_error_valid))
      }
    }
  }
  return(cv)
}
#function(x,kf,ntree,a)
cv = fn.cv.gbm(cluster, 4, 500, 0.003)
a = aggregate(cv$mean_error_valid, list(cv$ntree_i, cv$shrink_i), mean)
# Group.1 = n.trees, Group.2 = shrinkage, x = mean validation error across folds
a$id = paste(a$Group.1, a$Group.2, sep = '_')
plot(seq_len(nrow(a)), a$x, type = 'p', col = as.factor(a$Group.1))
library(ggplot2)
ggplot(a, aes(x = Group.2, y = x)) +
  geom_point() +
  facet_grid(. ~ Group.1) +
  ggtitle("GBM 4 fold cv") + labs(x = 'Shrinkage (panels = n.trees)', y = 'mean cv error')
#https://www.kaggle.com/c/15-071x-the-analytics-edge-competition-spring-2015/forums/t/13749/gbm-output
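A similar read-off for the GBM grid, assuming the n.trees / shrinkage pair with the lowest mean cv error in the aggregated table a is the one to keep:
# best n.trees / shrinkage pair by mean cv error (column names as produced by aggregate above)
best = a[which.min(a$x), ]
best$Group.1  # n.trees
best$Group.2  # shrinkage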