Sunday, February 28, 2016

Neural Network (neuralnet) code


library(neuralnet)

length(names(train))
n <- names(train)
formula <- as.formula(paste("TARGET ~", paste(n[!n %in% c('TARGET','INDEX')], collapse = " + ")))


model.nn = neuralnet(formula, data=train, hidden=4,
                     threshold=0.05, rep=1,
                     linear.output=TRUE, err.fct='sse', act.fct='logistic',
                     algorithm='rprop+',
                     lifesign = 'minimal', lifesign.step=10000, stepmax=1e6)


model.nn$result.matrix

plot(model.nn)
prediction(model.nn); ?prediction
P_nn <- compute(model.nn, train[,3:24])
length(P_nn[[2]])

par(mfrow=c(1,2))
hist(P_nn[[2]])
plot(density(P_nn[[2]]))

error = abs(train$TARGET - P_nn[[2]])
plot(density(error))
mean(error) #nn pred on ctree data 0.99497

print(model.nn)

## save model
save(model.nn, file = "model_nn_raw_ctree_input.rda")

## load the model
load("model_nn_raw_ctree_input.rda")
 ## predict for the new `x`s in `newdf` (neuralnet uses compute() rather than predict())
compute(model.nn, newdf)$net.result
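
## Not shown above: neuralnet generally trains far more reliably on scaled inputs (this is the
## approach in the first r-bloggers link below). A minimal sketch, assuming every column of
## train is numeric and that the column positions match the compute() call above; the commented
## lines show where the refit and un-scaling would go.
maxs <- apply(train, 2, max)
mins <- apply(train, 2, min)
scaled <- as.data.frame(scale(train, center = mins, scale = maxs - mins))

#model.nn.scaled <- neuralnet(formula, data=scaled, hidden=4, linear.output=TRUE)
#P_scaled   <- compute(model.nn.scaled, scaled[,3:24])$net.result
#P_unscaled <- P_scaled * (maxs['TARGET'] - mins['TARGET']) + mins['TARGET']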




## http://www.r-bloggers.com/fitting-a-neural-network-in-r-neuralnet-package/
## http://www.r-bloggers.com/using-neural-networks-for-credit-scoring-a-simple-example/
## https://journal.r-project.org/archive/2010-1/RJournal_2010-1_Guenther+Fritsch.pdf

Random Forest & GBM Cross Validation Loop


Random Forest CV



fn.cv.rf = function(x){
  library(randomForest)

  set.seed(12345)
  s = sample(nrow(x), size = floor(nrow(x) * 0.75), replace = FALSE)

  train = x[s,]
  valid = x[-s,]
  cv = NULL
  n = names(train)
  formula <- as.formula(paste("TARGET ~", paste(n[!n %in% c('TARGET','INDEX')], collapse = " + ")))

  for(i in seq(100, 500, by=50)){
    model.rf =
      randomForest(formula,
                   data=train[, !names(x) %in% c('INDEX')],
                   ntree=i, importance=TRUE)
 
    train$P_RF = predict(model.rf)   # no newdata: randomForest returns out-of-bag predictions here
    error = abs(train$P_RF - train$TARGET)
    error_train = sum(error); mean_error_train = mean(error)
 
    valid$P_RF = predict(model.rf, valid)
    error = abs(valid$P_RF - valid$TARGET)
    error_valid = sum(error); mean_error_valid = mean(error)
 
    cv=rbind(cv, data.frame(i, error_train, mean_error_train,
                            error_valid, mean_error_valid))
  }
  rm(train, valid)
  return(cv)   #return(c(cv, model.rf))
}
cv = fn.cv.rf(cluster)
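
# fn.cv.rf sets importance=TRUE but does not return the fitted forest, so one way to inspect
# variable importance is to refit a single model at the ntree value chosen from the cv table.
# A minimal sketch; ntree=300 is just a placeholder, not a value from the original run.
library(randomForest)
set.seed(12345)
n <- names(cluster)
formula <- as.formula(paste("TARGET ~", paste(n[!n %in% c('TARGET','INDEX')], collapse = " + ")))
model.rf <- randomForest(formula, data=cluster, ntree=300, importance=TRUE)
importance(model.rf)   # %IncMSE and IncNodePurity for each predictor
varImpPlot(model.rf)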


# https://www.kaggle.com/c/the-analytics-edge-mit-15-071x/forums/t/8082/reading-and-interpreting-random-forest-models
# http://stats.stackexchange.com/questions/21152/obtaining-knowledge-from-a-random-forest
# https://www.youtube.com/watch?v=-nai4NBx5zI


library(ggplot2)
ggplot(cv, aes(i)) +
    geom_line(aes(y = mean_error_train, colour = "train error")) +
    geom_line(aes(y = mean_error_valid, colour = "valid error"))














GBM CV


library(gbm)
# x  = dataset
# kf = k-fold cv
# n  = n.tree
# a  = shrinkage / learning rate


fn.cv.gbm = function(x,kf,ntree,a){
  set.seed(12345)
  x$s = ceiling(runif(nrow(x), 0, kf))
  cv = NULL
  n = names(x)
  formula <- as.formula(paste("TARGET ~", paste(n[!n %in% c('TARGET','INDEX')], collapse = " + ")))


  for (kf_i in 1:kf){
    train = x[!x$s==kf_i,]
    valid = x[x$s==kf_i,]
 
    for(ntree_i in seq(50, ntree, by=50)){
   
      for(shrink_i in seq(0.01, 0.04, by=a)){
        model.gbm = gbm(  formula=formula, data=train,
                          distribution = "poisson",
                          # gaussian for GBM regression or adaboost
                          n.trees=ntree_i,
                          shrinkage=shrink_i, #0.01
                          # smaller values of shrinkage typically give slightly better performance
                          # the cost is that the model takes longer to run for smaller values
                          interaction.depth=2,
                          # use CV to choose interaction depth
                          n.minobsinnode=100,
                          # n.minobsinnode has an important effect on overfitting!
                          # decreasing this number can lead to overfitting
                          bag.fraction=0.5,
                          train.fraction=0.99,
                          ### DO NOT USE CV.FOLDS!!!
                          # use this for CV. This option only works for gbm.fit(), not gbm()
                          # var.monotone=c(),
                          # can help with overfitting, will smooth bumpy curves
                          verbose=TRUE)
     
        # predict on the training rows explicitly; type="response" returns poisson
        # predictions on the count scale rather than the log link
        train$P_GBM = predict(model.gbm, train, n.trees=ntree_i, type="response")
        error = abs(train$P_GBM - train$TARGET)
        error_train = sum(error); mean_error_train = mean(error)
        n_train = nrow(train)
     
     
        valid$P_GBM = predict(model.gbm, valid, n.trees=ntree_i, type="response")
        error = abs(valid$P_GBM - valid$TARGET)
        error_valid = sum(error); mean_error_valid = mean(error)
        n_valid = nrow(valid)
     
        cv = rbind(cv, data.frame(kf_i, ntree_i, shrink_i,
                                  n_train, error_train, mean_error_train,
                                  n_valid, error_valid, mean_error_valid))
      }
    }
  }
  return(cv)
}
#function(x,kf,ntree,a)
cv = fn.cv.gbm(cluster, 4, 500, 0.003)

a=aggregate(cv$mean_error_valid, list(cv$ntree_i, cv$shrink_i), mean)
a$id=paste(a$Group.1, a$Group.2, sep='_')
plot(as.numeric(row.names(a)), a$x, type='p', col=a$Group.1)

library(ggplot2)
ggplot(a, aes(x=Group.2, y=x)) +
  geom_point() +
  facet_grid(.~Group.1) +
    ggtitle("GBM 4 fold cv") + labs(x='n.tree / Shrinkage', y='mean cv error')



#https://www.kaggle.com/c/15-071x-the-analytics-edge-competition-spring-2015/forums/t/13749/gbm-output

PCA & Clustering


names(train)
pca <- prcomp(train[,-c(1,2)], scale.=TRUE)

#names(pca); pca$sdev; pca$rotation; pca$center; pca$scale; pca$x
#summary(pca); biplot(pca, scale=0); head(pca$x)

###############################################
### Select number of PCA
###############################################
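# Before fixing the number of components, the cumulative proportion of variance explained gives a
# quick guide; a minimal sketch (the 90% reference line is just an example threshold).
pve <- pca$sdev^2 / sum(pca$sdev^2)   # proportion of variance explained per PC
plot(cumsum(pve), type='b', xlab='principal component', ylab='cumulative proportion of variance')
abline(h=0.9, lty=2)
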
cluster = cbind(train[,c(1,2)],pca$x[,1:10]) #head(cluster)



fn.cv.kmean = function(x,y){
  set.seed(12345)
  error = NULL
  for(i in 1:y){
    km=kmeans(x[,!names(x) %in% c('INDEX', 'TARGET')], i, iter.max=1e6, nstart=50, algorithm='Lloyd')
    error = rbind(error, data.frame(i, km$tot.withinss, km$totss))
    #table(km$cluster, x$TARGET)
    #plot(x[,3], x[,4], col=km$cluster)
  }
  return(error)
}
cv=fn.cv.kmean(cluster,12)
plot(cv$i, cv$km.tot.withinss, type='l')
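
# Since fn.cv.kmean also stores km$totss, the same table can be read as the share of variance
# explained between clusters, which sometimes makes the elbow easier to spot; a small sketch.
plot(cv$i, 1 - cv$km.tot.withinss/cv$km.totss, type='l', xlab='k', ylab='between_SS / total_SS')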



set.seed(12345)
km=kmeans(cluster[,!names(cluster) %in% c('INDEX', 'TARGET')], 6, iter.max=1e6, nstart=50, algorithm='Lloyd')
cluster$kmean = km$cluster
table(km$cluster, cluster$TARGET)
#summary(km); head(cluster)
#names(km)


plot(table(km$cluster, cluster$TARGET))
plot(km$cluster, cluster$TARGET)





library(scatterplot3d)
scatterplot3d(cluster$PC1, cluster$PC2, cluster$kmean, main='',
              highlight.3d=TRUE, color='green', col.grid='lightblue', col.axis = 'blue')

scatterplot3d(cluster$PC1, cluster$PC2, cluster$TARGET,
              highlight.3d=TRUE, color='green', col.grid='lightblue', col.axis = 'blue')

scatterplot3d(cluster$PC1, cluster$PC2, cluster$kmean, pch=1,
              color=cluster$TARGET, col.grid='lightblue', col.axis='blue')

scatterplot3d(cluster$PC1, cluster$PC2, cluster$TARGET,
              color=cluster$kmean, col.grid='lightblue', col.axis='blue')

error = abs(cluster$kmean - cluster$TARGET)
mean(error)



############################
############################

hc.complete = hclust(dist(cluster[,-c(1,2)]), method='complete')
hc.average  = hclust(dist(cluster[,-c(1,2)]), method='average')
hc.single   = hclust(dist(cluster[,-c(1,2)]), method='single')
#summary(hc.average); names(hc.average)


plot(hc.complete)
plot(hc.average)
plot(hc.single)
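
# To compare the dendrograms with the k-means result, cutree() converts a tree into hard cluster
# labels; a minimal sketch, where k=6 simply mirrors the k-means run above.
hc.clusters = cutree(hc.complete, k=6)
table(hc.clusters, km$cluster)       # cross-tab against the k-means labels
table(hc.clusters, cluster$TARGET)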








Tree based imputation Part 2 of 2


Conditional tree based imputation
Note that the formula-building line explicitly casts the pasted string with as.formula(): ctree() requires a formula object, whereas the rpart() version in Part 1 accepted the plain string.
Generally, ctree performed better than rpart here.
GBM can also handle missing values natively, so it could perhaps be used for imputation as well; a rough sketch of that idea follows the ctree loop below.

rm(list=ls())
library("partykit") #for ctree

setwd ('C:\\Users\\Ted\\Documents\\MSPA\\2016 Winter\\Pred 411\\Unit_03')
train = read.csv('wine.csv')
str(train); class(train)
colnames(train)[1] = 'INDEX'


apply(apply(train,2,is.na),2,sum)
missCol = names(which(apply(is.na(train),2,any)))


for (i in 1:length(missCol)) {
  var     = missCol[i]
  MIFlag  = paste0(var,'_MIFLAG')
  formula = as.formula(paste0(var, '~.'))

  model.ctree =
    ctree(formula,
          data=train[!is.na(train[,missCol[i]]),
                     !names(train) %in% c('INDEX', 'TARGET_FLAG', 'TARGET_AMT')]
    )

  #assign(paste0('model.ctree.',var), model.ctree)

  train[[MIFlag]] = ifelse(is.na(train[[var]]), 1, 0)
  train[,var]     =
    ifelse(is.na(train[,var]),
           predict(model.ctree, newdata = train),  # predict all rows so positions line up with is.na()
           train[,var])
  rm(model.ctree, formula, i, MIFlag, var)
}

apply(apply(train,2,is.na),2,sum)
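
# As noted above, gbm handles missing predictor values natively, so an analogous imputation loop
# could be built around it. The sketch below is untested; distribution='gaussian', n.trees=200,
# and the excluded columns are assumptions, not part of the original post.
library(gbm)
for (var in missCol) {
  formula = as.formula(paste0(var, ' ~ .'))
  model.gbm.imp = gbm(formula,
                      data=train[!is.na(train[[var]]),
                                 !names(train) %in% c('INDEX', 'TARGET')],
                      distribution='gaussian', n.trees=200, interaction.depth=2)
  pred = predict(model.gbm.imp, newdata=train, n.trees=200)
  train[[var]] = ifelse(is.na(train[[var]]), pred, train[[var]])
}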

Tree based imputation Part 1 of 2


Missing value imputation via rpart

rm(list=ls())
library(rpart)

setwd ('C:\\Users\\Ted\\Documents\\MSPA\\2016 Winter\\Pred 411\\Unit_03')
train = read.csv('wine.csv')
str(train); class(train)
colnames(train)[1] = 'INDEX'


apply(apply(train,2,is.na),2,sum)
missCol = names(which(apply(is.na(train),2,any)))
# "ResidualSugar"      "Chlorides"          "FreeSulfurDioxide"
# "TotalSulfurDioxide" "pH"                 "Sulphates"        
# "Alcohol"            "STARS"



for (i in 1:length(missCol)) {
  var     = missCol[i]
  MIFlag  = paste0(var,'_MIFLAG')
  formula = paste(var, '~ .')
 
  model.rpart =
    rpart(formula,
          data=train[!is.na(train[,missCol[i]]),
                     !names(train) %in% c('INDEX', 'TARGET')]      
    )
  opt<-which.min(model.rpart$cptable[,'xerror'])
  cp<-model.rpart$cptable[opt,'CP']
  model.rpart.prune <- prune(model.rpart, cp = cp)
  #assign(paste0('model.rpart.prune.',var), model.rpart.prune)
 
  train[[MIFlag]] = ifelse(is.na(train[[var]]), 1, 0)
  train[,var]   =
    ifelse(is.na(train[,var]),
           predict(model.rpart.prune, newdata = train),  # predict all rows so positions line up with is.na()
           train[,var])
}

#check if the missing values have been imputed
apply(apply(train,2,is.na),2,sum) #apply(is.na(train),2,any)

write.table(train, "wine_train.csv", sep=",", row.names=FALSE)

library(rattle)
# the per-variable objects below exist only if the assign() line inside the loop is uncommented
asRules(model.rpart.prune.STARS)
asRules(model.rpart.prune.ResidualSugar)

## http://stats.stackexchange.com/questions/72251/an-example-lasso-regression-using-glmnet-for-binary-outcome
## http://stats.stackexchange.com/questions/77546/how-to-interpret-glmnet