Sunday, February 28, 2016

Random Forest & GBM Cross Validation Loop


Random Forest CV



fn.cv.rf = function(x){
  library(randomForest)

  # 75/25 train/validation split
  set.seed(12345)
  s = sample(nrow(x), size = floor(nrow(x) * 0.75), replace = FALSE)

  train = x[s, ]
  valid = x[-s, ]
  cv = NULL
  n = names(train)
  # every column except TARGET and INDEX is a predictor
  formula <- as.formula(paste("TARGET ~", paste(n[!n %in% c('TARGET','INDEX')], collapse = " + ")))

  # grid over the number of trees
  for(i in seq(100, 500, by = 50)){
    model.rf =
      randomForest(formula,
                   data = train,  # the formula already excludes INDEX
                   ntree = i, importance = TRUE)

    # predict() with no newdata returns the out-of-bag predictions for train
    train$P_RF = predict(model.rf)
    error = abs(train$P_RF - train$TARGET)
    error_train = sum(error); mean_error_train = mean(error)

    valid$P_RF = predict(model.rf, valid)
    error = abs(valid$P_RF - valid$TARGET)
    error_valid = sum(error); mean_error_valid = mean(error)

    cv = rbind(cv, data.frame(i, error_train, mean_error_train,
                              error_valid, mean_error_valid))
  }
  return(cv)   # return(list(cv, model.rf)) to also hand back the last model
}
cv = fn.cv.rf(cluster)
cv = fn.cv.rf(cluster)


# https://www.kaggle.com/c/the-analytics-edge-mit-15-071x/forums/t/8082/reading-and-interpreting-random-forest-models
# http://stats.stackexchange.com/questions/21152/obtaining-knowledge-from-a-random-forest
# https://www.youtube.com/watch?v=-nai4NBx5zI
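The links above cover how to read a fitted forest. As a companion, here is a minimal sketch of pulling variable importance from a final model refit on the full data; fn.cv.rf() above only returns the error table, not the fitted models, and the ntree value of 300 is just a placeholder to be replaced with the winner from the CV plot below.

library(randomForest)

# refit once on the full data so the model object is available to inspect
model.rf = randomForest(TARGET ~ . - INDEX, data = cluster,
                        ntree = 300, importance = TRUE)

importance(model.rf)   # %IncMSE / IncNodePurity per predictor
varImpPlot(model.rf)   # the same numbers as a dot chart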


library(ggplot2)
ggplot(cv, aes(i)) +
    geom_line(aes(y = mean_error_train, colour = "train error")) +
    geom_line(aes(y = mean_error_valid, colour = "valid error")) +
    labs(x = 'ntree', y = 'mean absolute error')
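The tree count can also be read straight off the table rather than from the plot (picks the ntree value with the lowest mean validation error):

cv$i[which.min(cv$mean_error_valid)]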

GBM CV


library(gbm)
# x     = dataset
# kf    = number of CV folds
# ntree = maximum n.trees (grid runs 50..ntree by 50)
# a     = step size of the shrinkage / learning rate grid (0.01..0.04)


fn.cv.gbm = function(x, kf, ntree, a){
  set.seed(12345)
  # randomly assign each row to a fold 1..kf
  x$s = ceiling(runif(nrow(x), 0, kf))
  cv = NULL
  n = names(x)
  # drop TARGET, INDEX and the fold indicator s from the predictor list
  formula <- as.formula(paste("TARGET ~", paste(n[!n %in% c('TARGET','INDEX','s')], collapse = " + ")))

  for (kf_i in 1:kf){
    train = x[!x$s == kf_i, ]
    valid = x[x$s == kf_i, ]

    for(ntree_i in seq(50, ntree, by = 50)){

      for(shrink_i in seq(0.01, 0.04, by = a)){
        model.gbm = gbm(  formula = formula, data = train,
                          distribution = "poisson",
                          # use "gaussian" for squared-error regression, "adaboost" for 0/1 outcomes
                          n.trees = ntree_i,
                          shrinkage = shrink_i, #0.01
                          # smaller shrinkage values typically give slightly better performance;
                          # the cost is that the model takes longer to run
                          interaction.depth = 2,
                          # use CV to choose interaction.depth
                          n.minobsinnode = 100,
                          # n.minobsinnode has an important effect on overfitting!
                          # decreasing it may result in overfitting
                          bag.fraction = 0.5,
                          train.fraction = 0.99,
                          ### DO NOT USE CV.FOLDS!!!
                          # cv.folds would make gbm() run its own CV (one extra fit per fold);
                          # this loop already cross-validates via the fold column s
                          # var.monotone=c(),
                          # can help with overfitting; will smooth bumpy curves
                          verbose = TRUE)

        # predict.gbm needs newdata and n.trees; type = "response" puts the
        # Poisson predictions back on the scale of TARGET
        train$P_GBM = predict(model.gbm, newdata = train, n.trees = ntree_i, type = "response")
        error = abs(train$P_GBM - train$TARGET)
        error_train = sum(error); mean_error_train = mean(error)
        n_train = nrow(train)

        valid$P_GBM = predict(model.gbm, newdata = valid, n.trees = ntree_i, type = "response")
        error = abs(valid$P_GBM - valid$TARGET)
        error_valid = sum(error); mean_error_valid = mean(error)
        n_valid = nrow(valid)

        cv = rbind(cv, data.frame(kf_i, ntree_i, shrink_i,
                                  n_train, error_train, mean_error_train,
                                  n_valid, error_valid, mean_error_valid))
      }
    }
  }
  return(cv)
}
# argument order: fn.cv.gbm(x, kf, ntree, a)
cv = fn.cv.gbm(cluster, 4, 500, 0.003)
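As an alternative to looping over n.trees, gbm() can score a held-out slice itself: fit once with a larger train.fraction holdout and let gbm.perf() pick the iteration with the lowest test error. A minimal sketch; the 0.02 shrinkage and the 80/20 split are placeholders, and the rows are shuffled first because train.fraction takes the first rows as the training set.

# single fit; gbm() scores the last 20% of rows as trees are added
set.seed(12345)
shuffled = cluster[sample(nrow(cluster)), !names(cluster) %in% c('INDEX')]
model.gbm = gbm(TARGET ~ ., data = shuffled,
                distribution = "poisson",
                n.trees = 500, shrinkage = 0.02,
                interaction.depth = 2, n.minobsinnode = 100,
                bag.fraction = 0.5, train.fraction = 0.8)

# iteration with the lowest held-out error (also plots train vs test deviance)
best_iter = gbm.perf(model.gbm, method = "test")
p = predict(model.gbm, shuffled, n.trees = best_iter, type = "response")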

# average validation error over the folds for each (n.trees, shrinkage) pair
a = aggregate(cv$mean_error_valid, list(ntree = cv$ntree_i, shrinkage = cv$shrink_i), mean)
a$id = paste(a$ntree, a$shrinkage, sep = '_')
plot(seq_len(nrow(a)), a$x, type = 'p', col = as.factor(a$ntree))

library(ggplot2)
ggplot(a, aes(x = shrinkage, y = x)) +
  geom_point() +
  facet_grid(. ~ ntree) +
    ggtitle("GBM 4 fold cv") + labs(x = 'shrinkage (panels: n.trees)', y = 'mean cv error')
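Rather than eyeballing the facets, the winning (n.trees, shrinkage) pair can be read off the aggregated grid directly:

# grid row with the lowest mean validation error
a[which.min(a$x), ]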



#https://www.kaggle.com/c/15-071x-the-analytics-edge-competition-spring-2015/forums/t/13749/gbm-output
