Saturday, February 4, 2017

xgboost and impact coding

rm(list=ls())
setwd('C:\\Users\\Ted\\Documents\\Kaggle\\RedHat\\Data')

library(data.table)
library(ggplot2)
library(randomForest)
library(lattice)
library(caret)


people = fread("people.csv")
p_logi <- names(people)[which(sapply(people, is.logical))]

for (col in p_logi) set(people, j = col,
                        value = as.integer(people[[col]]))

train  = fread("act_train.csv")
dt     = merge(x=people, y=train, by = "people_id", all.x = T)
#dt$outcome = as.factor(dt$outcome)

rm(people);rm(train)
set.seed(12345)
s = sample(nrow(dt), replace=FALSE, floor(nrow(dt)*0.1))
dt = dt[s,]; rm(s)
dt = dt[!is.na(outcome),]

#############################################################
#############################################################
### one-hot encoding can be implemented instead of impact coding ###


dt.level=sapply(lapply(dt, unique), length)
var = names(dt.level)[!names(dt.level) %in% 
                          c('people_id','date.x','date.y','activity_id','char_10.y','group_1','outcome')]

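# chi-square screen: cross-tab each candidate variable against outcome, keeping only levels with >100 rows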
pvalue = NULL
for (i in var) {
  a = table(dt[[i]])
  a = a[a > 100]
  b = chisq.test(table(
        dt[get(i) %in% names(a) & !is.na(outcome), .(get(i), outcome)]
        )
      )
  isNum = ifelse(is.numeric(dt[,get(i)]),'numeric','character')
  level = length(unique(dt[,get(i)]))
  pvalue = rbind(pvalue, data.frame(i, b$p.value, isNum, level))
}
pvalue
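
# If the table is meant for screening (not just inspection), a simple cut like the one
# below could flag the weakest predictors; the 0.05 threshold is only illustrative.
subset(pvalue, b.p.value < 0.05)$i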


# return a model of the conditional probability 
# of dependent variable (depvar) by level 
# assumes outcome is logical and not null
impactModel = function(xcol, depvar) {
  n = length(depvar)
  p = sum(as.numeric(depvar)) / n   # grand average of the outcome (assumes 0/1 or logical)
  # duplicate output for NA (average NA towards grand uniform average) 
  x = c(xcol,xcol)
  y = c(depvar, depvar)
  x[(1+n):(2*n)] = NA
  levelcounts = table(x, y, useNA="always")
  condprobmodel = (levelcounts[,2]+p)/(levelcounts[,1]+levelcounts[,2]+1.0) 
  # apply model example: applyImpactModel(condprobmodel,data[,varname])
  condprobmodel
}  # impact_char_1.y = impactModel(dt$char_1.y, dt$outcome)



# apply model to column to essentially return condprobmodel[rawx]
# both NA's and new levels are smoothed to original grand average 
applyImpactModel = function(condprobmodel, xcol) {
  naval = condprobmodel[is.na(names(condprobmodel))]
  dim = length(xcol)
  condprobvec = numeric(dim) + naval
  for(nm in names(condprobmodel)) {
    if(!is.na(nm)) {
      condprobvec[xcol==nm] = condprobmodel[nm]
    }
  }
  condprobvec
} # dt$impact_char_1.y = applyImpactModel(impact_char_1.y, dt$char_1.y)

# Original pattern - convert a categorical variable to its impact-model value
# impact_char_1.y = impactModel(dt$char_1.y, dt$outcome)
# dt$impact_char_1.y = applyImpactModel(impact_char_1.y, dt$char_1.y)
# debugonce(function_name); function_name()
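
# A tiny toy check of the two helpers (hypothetical values, not from the Red Hat data):
# unseen levels and NAs fall back to the smoothed grand average.
toy_x = c('a','a','b','b','b','c')
toy_y = c(1, 0, 1, 1, 0, 1)
toy_m = impactModel(toy_x, toy_y)          # named vector of smoothed P(outcome = 1 | level)
applyImpactModel(toy_m, c('a','b','z'))    # 'z' was never seen, so it gets the grand average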


var1 = subset(pvalue, isNum =='character' & level > 6 & level < 2000)
var1 = var1$i

for (i in var1){
  a = paste0('impact_', i)
  print(a)
  dt[[a]] = applyImpactModel(impactModel(dt[[i]], dt$outcome),
                             dt[[i]])
}

# a = 'char_8.y'
# b = paste0('impact_', a)
# dt[[b]] = applyImpactModel(impactModel(dt[[a]], dt$outcome),
#                            dt[[a]])


var2 = subset(pvalue, isNum =='character' & level <=6)
var2 = var2$i

a = paste('~', paste(var2, collapse=' + '))

dmy = dummyVars(a, data=dt, fullRank=TRUE); # trsf = data.table(predict(dmy, newdata=dt))
var2 = names(data.table(predict(dmy, newdata=dt)))
dt = cbind(dt, data.table(predict(dmy, newdata=dt)))
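
# Optional sanity check (caret is already loaded): flag dummy columns that are nearly
# constant; cutoffs are caret's defaults and the check is purely informational here.
nzv = nearZeroVar(dt[, var2, with=FALSE], saveMetrics=TRUE)
nzv[nzv$nzv, ]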


fn_remove_na = function(DT) {
# either of the following for loops
# by name :
#   for (j in names(DT))
#     set(DT,which(is.na(DT[[j]])),j,0)
  
# or by number (slightly faster than by name) :
  for (j in seq_len(ncol(DT)))
    set(DT,which(is.na(DT[[j]])),j,0)
}
fn_remove_na(dt)





#############################################################
#############################################################

library(xgboost)
library(Matrix)

fn.cv.xgb = function(x,kfold,ntree,depth,eta){
  set.seed(12345)
  x$s = ceiling(runif(nrow(x), 0, kfold))
  cv = NULL
  n = names(x)
  # formula <- as.formula(paste("TARGET ~", paste(n[!n %in% c('TARGET','INDEX')], collapse = " + ")))
  
  
  for (kfold_idx in 1:kfold){
    train = x[!x$s==kfold_idx,] 
    valid = x[x$s==kfold_idx,]
    Ytrain = train[['outcome']]
    Yvalid = valid[['outcome']]
    train = sparse.model.matrix(~ . - 1, data=train[,!colnames(train) %in% c('s','outcome'), with=F])
    valid = sparse.model.matrix(~ . - 1, data=valid[,!colnames(valid) %in% c('s','outcome'), with=F])
    
    for(ntree_idx in seq(10, ntree, by=10)){
      
      for(depth_idx in seq(1, depth, by=1)){
        
        for(eta_idx in seq(0.2, eta, by=0.1)){
          
          # sparse_matrix <- sparse.model.matrix(Improved~.-1, data = df)
          model.xgb <- xgboost(data=train, label=Ytrain,
                               max.depth=depth_idx, nround=ntree_idx, eta=eta_idx,
                               objective = "binary:logistic", verbose=0)
          
          
          P_xgb =predict(model.xgb, train)
          error = (P_xgb - Ytrain)^2
          error_train = sum(error); mean_error_train = mean(error)
          n_train = nrow(train)
          
          
          P_xgb = predict(model.xgb, valid)
          error = (P_xgb - Yvalid)^2
          error_valid = sum(error); mean_error_valid = mean(error)
          n_valid = nrow(valid)
          
          cv = rbind(cv, data.frame(kfold_idx, ntree_idx, depth_idx, eta_idx,
                                    n_train, error_train, mean_error_train,
                                    n_valid, error_valid, mean_error_valid))
        }
      }
    }
  }
  return(cv)
}
#function(x,kfold,ntree,depth,eta)
# cv=fn.cv.xgb(dt, 5, 50, 6, 1.2)

cv=fn.cv.xgb(dt, 5, 50, 4, 0.4)
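
# The grid results can be averaged across folds the same way as in the GBM section below;
# a minimal summary sketch over the columns fn.cv.xgb returns.
agg = aggregate(mean_error_valid ~ ntree_idx + depth_idx + eta_idx, data=cv, FUN=mean)
head(agg[order(agg$mean_error_valid), ])   # best parameter combinations first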




Sunday, February 28, 2016

Neural Network (neuralnet) code


library(neuralnet)

length(names(train))
n <- names(train)
formula <- as.formula(paste("TARGET ~", paste(n[!n %in% c('TARGET','INDEX')], collapse = " + ")))
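
# neuralnet tends to converge faster when inputs are on a comparable scale; a minimal
# min-max rescaling sketch (assumes train is a data.frame with numeric predictors - adjust as needed).
pred_cols = setdiff(names(train), c('TARGET', 'INDEX'))
rng = sapply(train[, pred_cols], range, na.rm=TRUE)
train_scaled = train
train_scaled[, pred_cols] = sweep(sweep(train[, pred_cols], 2, rng[1, ], '-'),
                                  2, rng[2, ] - rng[1, ], '/')
# the model below could then be fit on train_scaled instead of train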


model.nn = neuralnet(formula, data=train, hidden=4,
                     threshold=0.05, rep=1,
                     linear.output=TRUE, err.fct='sse', act.fct='logistic',
                     algorithm='rprop+',
                     lifesign = 'minimal', lifesign.step=10000, stepmax=1e6)


model.nn$result.matrix

plot(model.nn)
prediction(model.nn); ?prediction
P_nn <- compute(model.nn, train[,3:24])
length(P_nn[[2]])

par(mfrow=c(1,2))
hist(P_nn[[2]])
plot(density(P_nn[[2]]))

error = abs(train$TARGET - P_nn[[2]])
plot(density(error))
mean(error) #nn pred on ctree data 0.99497

print(model.nn)

## save model
save(model.nn, file = "model_nn_raw_ctree_input.rda")

## load the model
load("model_nn_raw_ctree_input.rda")
 ## predict for the new `x`s in `newdf` (compute() is neuralnet's prediction function)
compute(model.nn, newdf)$net.result




## http://www.r-bloggers.com/fitting-a-neural-network-in-r-neuralnet-package/
## http://www.r-bloggers.com/using-neural-networks-for-credit-scoring-a-simple-example/
## https://journal.r-project.org/archive/2010-1/RJournal_2010-1_Guenther+Fritsch.pdf

Random Forest & GBM Cross Validation Loop


Random Forest CV



fn.cv.rf = function(x){
  library(randomForest)

  set.seed(12345)
  s = sample(nrow(x), replace=FALSE, nrow(x)*0.75)

  train = x[s,]
  valid = x[-s,]
  cv = NULL
  n = names(train)
  formula <- as.formula(paste("TARGET ~", paste(n[!n %in% c('TARGET','INDEX')], collapse = " + ")))

  for(i in seq(100, 500, by=50)){
    model.rf =
      randomForest(formula,
                   data=train[, !names(train) %in% c('INDEX')],
                   ntree=i, importance=TRUE)
 
    train$P_RF = predict(model.rf)
    error = abs(train$P_RF - train$TARGET)
    error_train = sum(error); mean_error_train = mean(error)
 
    valid$P_RF = predict(model.rf, valid)
    error = abs(valid$P_RF - valid$TARGET)
    error_valid = sum(error); mean_error_valid = mean(error)
 
    cv=rbind(cv, data.frame(i, error_train, mean_error_train,
                            error_valid, mean_error_valid))
  }
  rm(train, valid)
  return(cv)   #return(c(cv, model.rf))
}
cv = fn.cv.rf(cluster)


# https://www.kaggle.com/c/the-analytics-edge-mit-15-071x/forums/t/8082/reading-and-interpreting-random-forest-models
# http://stats.stackexchange.com/questions/21152/obtaining-knowledge-from-a-random-forest
# https://www.youtube.com/watch?v=-nai4NBx5zI
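
# fn.cv.rf only returns the error table, so for interpretation (per the links above) one
# model can be refit at a chosen ntree and its importance inspected; ntree=300 is illustrative.
model.rf = randomForest(TARGET ~ ., data=cluster[, !names(cluster) %in% c('INDEX')],
                        ntree=300, importance=TRUE)
importance(model.rf)   # %IncMSE / IncNodePurity per predictor
varImpPlot(model.rf)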


library(ggplot2)
ggplot(cv, aes(i)) +
    geom_line(aes(y = mean_error_train, colour = "train error")) +
    geom_line(aes(y = mean_error_valid, colour = "valid error"))


GBM CV


library(gbm)
# x  = dataset
# kf = k-fold cv
# n  = n.tree
# a  = step size for the shrinkage (learning-rate) grid


fn.cv.gbm = function(x,kf,ntree,a){
  set.seed(12345)
  x$s = ceiling(runif(nrow(x), 0, kf))
  cv = NULL
  n = names(x)
  formula <- as.formula(paste("TARGET ~", paste(n[!n %in% c('TARGET','INDEX')], collapse = " + ")))


  for (kf_i in 1:kf){
    train = x[!x$s==kf_i,]
    valid = x[x$s==kf_i,]
 
    for(ntree_i in seq(50, ntree, by=50)){
   
      for(shrink_i in seq(0.01, 0.04, by=a)){
        model.gbm = gbm(  formula=formula, data=train,
                          distribution = "poisson",
                          # gaussian for GBM regression or adaboost
                          n.trees=ntree_i,
                          shrinkage=shrink_i, #0.01
                          # smaller values of shrinkage typically give slightly better performance,
                          # at the cost of longer run times
                          interaction.depth=2,
                          #use CV to choose interaction depth
                          n.minobsinnode=100,
                          # n.minobsinnode has an important effect on overfitting:
                          # decreasing it may lead to overfitting
                          bag.fraction=0.5,
                          train.fraction=0.99,
                          ### DO NOT USE CV.FOLDS here - the manual k-fold loop handles CV
                          # (cv.folds is an option of gbm(), not gbm.fit())
                          # var.monotone=c(),
                          # can help with overfitting, will smooth bumpy curves
                          verbose=TRUE)
     
        train$P_GBM = predict(model.gbm, train, n.trees=ntree_i, type="response")
        error = abs(train$P_GBM - train$TARGET)
        error_train = sum(error); mean_error_train = mean(error)
        n_train = nrow(train)
     
     
        valid$P_GBM = predict(model.gbm, valid, n.trees=ntree_i, type="response")
        error = abs(valid$P_GBM - valid$TARGET)
        error_valid = sum(error); mean_error_valid = mean(error)
        n_valid = nrow(valid)
     
        cv = rbind(cv, data.frame(kf_i, ntree_i, shrink_i,
                                  n_train, error_train, mean_error_train,
                                  n_valid, error_valid, mean_error_valid))
      }
    }
  }
  return(cv)
}
#function(x,kf,ntree,a)
cv = fn.cv.gbm(cluster, 4, 500, 0.003)

a=aggregate(cv$mean_error_valid, list(cv$ntree_i, cv$shrink_i), mean)
a$id=paste(a$Group.1, a$Group.2, sep='_')
plot(seq_len(nrow(a)), a$x, type='p', col=a$Group.1)

library(ggplot2)
ggplot(a, aes(x=Group.2, y=x)) +
  geom_point() +
  facet_grid(.~Group.1) +
    ggtitle("GBM 4-fold CV") + labs(x='shrinkage (panels = n.tree)', y='mean cv error')
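
# gbm's own estimate of the best iteration complements the manual grid; a sketch that refits
# one model outside the CV function (settings illustrative) and reads off the "test" estimate,
# which applies because train.fraction < 1.
n = names(cluster)
formula = as.formula(paste("TARGET ~", paste(n[!n %in% c('TARGET','INDEX')], collapse=" + ")))
model.gbm = gbm(formula, data=cluster, distribution="poisson",
                n.trees=500, shrinkage=0.01, interaction.depth=2,
                n.minobsinnode=100, bag.fraction=0.5, train.fraction=0.8)
best_iter = gbm.perf(model.gbm, method="test")   # plots train/test deviance, returns best n.trees
best_iter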



#https://www.kaggle.com/c/15-071x-the-analytics-edge-competition-spring-2015/forums/t/13749/gbm-output

PCA & Clustering


names(train)
pca <- prcomp(train[,-c(1,2)], scale=TRUE)

#names(pca); pca$sdev; pca$rotation; pca$center; pca$scale; pca$x
#summary(pca); biplot(pca, scale=0); head(pca$x)

###############################################
### Select number of PCA
###############################################
cluster = cbind(train[,c(1,2)],pca$x[,1:10]) #head(cluster)
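
# The choice of 10 components can be checked against the variance explained; a short sketch.
pve = pca$sdev^2 / sum(pca$sdev^2)
plot(cumsum(pve), type='b', xlab='principal component', ylab='cumulative proportion of variance')
which(cumsum(pve) >= 0.9)[1]   # smallest number of PCs covering ~90% of the variance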



fn.cv.kmean = function(x,y){
  set.seed(12345)
  error = NULL
  for(i in 1:y){
    km=kmeans(x[,!names(x) %in% c('INDEX', 'TARGET')], i, iter.max=1e6, nstart=50, algorithm='Lloyd')
    error = rbind(error, data.frame(i, km$tot.withinss, km$totss))
    #table(km$cluster, x$TARGET)
    #plot(x[,3], x[,4], col=km$cluster)
  }
  return(error)
}
cv=fn.cv.kmean(cluster,12)
plot(cv$i, cv$km.tot.withinss, type='l')



set.seed(12345)
km=kmeans(cluster[,!names(cluster) %in% c('INDEX', 'TARGET')], 6, iter.max=1e6, nstart=50, algorithm='Lloyd')
cluster$kmean = km$cluster
table(km$cluster, cluster$TARGET)
#summary(km); head(cluster)
#names(km)


plot(table(km$cluster, cluster$TARGET))
plot(km$cluster, cluster$TARGET)





library(scatterplot3d)
scatterplot3d(cluster$PC1, cluster$PC2, cluster$kmean, main='',
              highlight.3d=TRUE, color='green', col.grid='lightblue', col.axis = 'blue')

scatterplot3d(cluster$PC1, cluster$PC2, cluster$TARGET,
              highlight.3d=TRUE, color='green', col.grid='lightblue', col.axis = 'blue')

scatterplot3d(cluster$PC1, cluster$PC2, cluster$kmean, pch=1,
              color=cluster$TARGET, col.grid='lightblue', col.axis='blue')

scatterplot3d(cluster$PC1, cluster$PC2, cluster$TARGET,
              color=cluster$kmean, col.grid='lightblue', col.axis='blue')

error = abs(cluster$kmean - cluster$TARGET)
mean(error)







############################
############################

hc.complete = hclust(dist(cluster[,-c(1,2)]), method='complete')
hc.average  = hclust(dist(cluster[,-c(1,2)]), method='average')
hc.single   = hclust(dist(cluster[,-c(1,2)]), method='single')
#summary(hc.average); names(hc.average)


plot(hc.complete)
plot(hc.average)
plot(hc.single)
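
# To compare a hierarchical solution with TARGET the same way as k-means above,
# cut the dendrogram into the same number of groups (6, matching the k-means run).
hc.groups = cutree(hc.complete, k=6)
table(hc.groups, cluster$TARGET)
plot(table(hc.groups, cluster$TARGET))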








Tree based imputation Part 2 of 2


Conditional tree based imputation.
Note the formula-building step: the pasted string must be cast to a formula object with as.formula().
Generally, ctree performs better than rpart.
GBM can handle missing values natively, so it could also be used to impute; see the sketch at the end of this post.

rm(list=ls())
library("partykit") #for ctree

setwd ('C:\\Users\\Ted\\Documents\\MSPA\\2016 Winter\\Pred 411\\Unit_03')
train = read.csv('wine.csv')
str(train); class(train)
colnames(train)[1] = 'INDEX'


apply(apply(train,2,is.na),2,sum)
missCol = names(which(apply(is.na(train),2,any)))


for (i in 1:length(missCol)) {
  var     = missCol[i]
  MIFlag  = paste0(var,'_MIFLAG')
  formula = as.formula(paste0(var, '~.'))

  model.ctree =
    ctree(formula,
          data=train[!is.na(train[,missCol[i]]),
                     !names(train) %in% c('INDEX', 'TARGET')]
    )

  #assign(paste0('model.ctree.',var), model.ctree)

  miss_idx        = is.na(train[[var]])
  train[[MIFlag]] = ifelse(miss_idx, 1, 0)
  # predict only for the rows where the value is missing
  train[miss_idx, var] = predict(model.ctree, newdata = train[miss_idx, ])
  rm(model.ctree, formula, i, MIFlag, var, miss_idx)
}

apply(apply(train,2,is.na),2,sum)
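
# Per the note above, gbm handles missing predictor values natively, so it could be used in
# place of the ctree loop; an untuned sketch for a single column (settings are illustrative).
library(gbm)
var = missCol[1]
model.gbm.imp = gbm(as.formula(paste(var, '~ .')),
                    data=train[!is.na(train[[var]]),
                               !names(train) %in% c('INDEX', 'TARGET')],
                    distribution='gaussian', n.trees=200, interaction.depth=2,
                    shrinkage=0.05, n.minobsinnode=50)
miss_idx = is.na(train[[var]])
train[miss_idx, var] = predict(model.gbm.imp, newdata=train[miss_idx, ], n.trees=200)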

Tree based imputation Part 1 of 2


Missing value imputation via rpart

rm(list=ls())
library(rpart)

setwd ('C:\\Users\\Ted\\Documents\\MSPA\\2016 Winter\\Pred 411\\Unit_03')
train = read.csv('wine.csv')
str(train); class(train)
colnames(train)[1] = 'INDEX'


apply(apply(train,2,is.na),2,sum)
missCol = names(which(apply(is.na(train),2,any)))
# "ResidualSugar"      "Chlorides"          "FreeSulfurDioxide"
# "TotalSulfurDioxide" "pH"                 "Sulphates"        
# "Alcohol"            "STARS"



for (i in 1:length(missCol)) {
  var     = missCol[i]
  MIFlag  = paste0(var,'_MIFLAG')
  formula = as.formula(paste(var, '~ .'))
 
  model.rpart =
    rpart(formula,
          data=train[!is.na(train[,missCol[i]]),
                     !names(train) %in% c('INDEX', 'TARGET')]      
    )
  opt<-which.min(model.rpart$cptable[,'xerror'])
  cp<-model.rpart$cptable[opt,'CP']
  model.rpart.prune <- prune(model.rpart, cp = cp)
  #assign(paste0('model.rpart.prune.',var), model.rpart.prune)
 
  miss_idx        = is.na(train[[var]])
  train[[MIFlag]] = ifelse(miss_idx, 1, 0)
  # predict only for the rows where the value is missing
  train[miss_idx, var] = predict(model.rpart.prune, newdata = train[miss_idx, ])
}

#check if the missing values have been imputed
apply(apply(train,2,is.na),2,sum) #apply(is.na(train),2,any)

write.table(train, "wine_train.csv", sep=",", row.names=FALSE)

library(rattle)
# the per-variable models below exist only if the assign() line inside the loop is uncommented
asRules(model.rpart.prune.STARS)
asRules(model.rpart.prune.ResidualSugar)

## http://stats.stackexchange.com/questions/72251/an-example-lasso-regression-using-glmnet-for-binary-outcome
## http://stats.stackexchange.com/questions/77546/how-to-interpret-glmnet

Sunday, August 23, 2015

Google Compute Engine - Cloud Computing Setup

Google Compute Engine cloud computing setup:
Unix + RStudio + web interface

gcloud compute images create IMAGE_NAME --source-uri URI

gcloud compute images create rstudio-image --source-uri http://storage.googleapis.com/rstudio-image/rstudio-image.image.tar.gz

gcloud compute images create rstudioz --source-uri http://storage.googleapis.com/rstudio-image/rstudio-image.image.tar.gz


gcloud compute images create IMAGE_NAME --source-uri gs://BUCKET_NAME/IMAGE_NAME.image.tar.gz

gcloud compute images create rstudio-image --source-uri gs://rstudio-image/rstudio-image2.image.tar.tar
gcloud compute images create rstudio-image --source-uri http://storage.googleapis.com/rstudio-image/rstudio-image.tar.gz
gcloud compute images create rstudio-image --source-uri http://storage.googleapis.com/r-gce-sunholo/98789ee2fa89942875b1409a36a3fb8e1e719aae.image.tar.gz

http://storage.googleapis.com/rstudio-image/rstudio-image.tar.gz

==================
http://storage.googleapis.com/r-gce-sunholo/98789ee2fa89942875b1409a36a3fb8e1e719aae.image.tar.gz
================
gcloud compute instances create example-instance \
         --image https://www.googleapis.com/compute/v1/projects/debian-cloud/global/images/debian-7-wheezy-vYYYMMDD

gcloud compute instances create rstudio-image \
         --image https://storage.googleapis.com/rstudio-image/rstudio-image.tar.gz
================

tar -Sczf rstudio-image.tar.gz disk.raw




If you don't know your username, try this command using gcloud to see your user details:
$ gcloud auth login
Any user you add to the Debian system running on the instance will also get an RStudio login. To SSH into the instance and add new users, see below:
$ ## ssh into the running instance
$ gcloud compute ssh <your-username>@new-instance-name
$ #### It should now tell you that you are logged into your instance #####
$ #### Once logged in, add a user: example with jsmith
$ sudo useradd jsmith
$ sudo passwd jsmith
$ ## give the new user a directory and change ownership to them
$ sudo mkdir /home/jsmith
$ sudo chown jsmith:users /home/jsmith