Saturday, February 4, 2017

xgboost and impact coding

rm(list=ls())
setwd('C:\\Users\\Ted\\Documents\\Kaggle\\RedHat\\Data')

library(data.table)
library(ggplot2)
library(randomForest)
library(lattice)
library(caret)


people = fread("people.csv")
p_logi <- names(people)[which(sapply(people, is.logical))]

for (col in p_logi) set(people, j = col,
                        value = as.integer(people[[col]]))

train  = fread("act_train.csv")
dt     = merge(x=people, y=train, by = "people_id", all.x = T)
#dt$outcome = as.factor(dt$outcome)

rm(people);rm(train)
set.seed(12345)
s = sample(nrow(dt), replace=FALSE, floor(nrow(dt)*0.1))
dt = dt[s,]; rm(s)
dt = dt[!is.na(outcome),]

#############################################################
#############################################################
### one-hot encoding can be implemented instead of impact coding ###


dt.level=sapply(lapply(dt, unique), length)
var = names(dt.level)[!names(dt.level) %in% 
                          c('people_id','date.x','date.y','activity_id','char_10.y','group_1','outcome')]

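# chi-square screen: cross-tab each candidate variable against outcome, keeping only levels with >100 rows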
pvalue = NULL
for (i in var) {
  a = table(dt[[i]])
  a = a[a > 100]
  b = chisq.test(table(
        dt[get(i) %in% names(a) & !is.na(outcome), .(get(i), outcome)]
        )
      )
  isNum = ifelse(is.numeric(dt[,get(i)]),'numeric','character')
  level = length(unique(dt[,get(i)]))
  pvalue = rbind(pvalue, data.frame(i, b$p.value, isNum, level))
}
pvalue
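
# If the table is meant for screening (not just inspection), a simple cut like the one
# below could flag the weakest predictors; the 0.05 threshold is only illustrative.
subset(pvalue, b.p.value < 0.05)$i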


# return a model of the conditional probability 
# of dependent variable (depvar) by level 
# assumes outcome is logical and not null
impactModel = function(xcol, depvar) {
  n = length(depvar)
  p = sum(as.numeric(depvar)) / n   # grand average of the outcome (assumes 0/1 or logical)
  # duplicate output for NA (average NA towards grand uniform average) 
  x = c(xcol,xcol)
  y = c(depvar, depvar)
  x[(1+n):(2*n)] = NA
  levelcounts = table(x, y, useNA="always")
  condprobmodel = (levelcounts[,2]+p)/(levelcounts[,1]+levelcounts[,2]+1.0) 
  # apply model example: applyImpactModel(condprobmodel,data[,varname])
  condprobmodel
}  # impact_char_1.y = impactModel(dt$char_1.y, dt$outcome)



# apply model to column to essentially return condprobmodel[rawx]
# both NA's and new levels are smoothed to original grand average 
applyImpactModel = function(condprobmodel, xcol) {
  naval = condprobmodel[is.na(names(condprobmodel))]
  dim = length(xcol)
  condprobvec = numeric(dim) + naval
  for(nm in names(condprobmodel)) {
    if(!is.na(nm)) {
      condprobvec[xcol==nm] = condprobmodel[nm]
    }
  }
  condprobvec
} # dt$impact_char_1.y = applyImpactModel(impact_char_1.y, dt$char_1.y)

# Original pattern - convert a categorical variable to its impact-model value
# impact_char_1.y = impactModel(dt$char_1.y, dt$outcome)
# dt$impact_char_1.y = applyImpactModel(impact_char_1.y, dt$char_1.y)
# debugonce(function_name); function_name()
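
# A tiny toy check of the two helpers (hypothetical values, not from the Red Hat data):
# unseen levels and NAs fall back to the smoothed grand average.
toy_x = c('a','a','b','b','b','c')
toy_y = c(1, 0, 1, 1, 0, 1)
toy_m = impactModel(toy_x, toy_y)          # named vector of smoothed P(outcome = 1 | level)
applyImpactModel(toy_m, c('a','b','z'))    # 'z' was never seen, so it gets the grand average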


var1 = subset(pvalue, isNum =='character' & level > 6 & level < 2000)
var1 = var1$i

for (i in var1){
  a = paste0('impact_', i)
  print(a)
  dt[[a]] = applyImpactModel(impactModel(dt[[i]], dt$outcome),
                             dt[[i]])
}

# a = 'char_8.y'
# b = paste0('impact_', a)
# dt[[b]] = applyImpactModel(impactModel(dt[[a]], dt$outcome),
#                            dt[[a]])


var2 = subset(pvalue, isNum =='character' & level <=6)
var2 = var2$i

a = paste('~', paste(var2, collapse=' + '))

dmy = dummyVars(a, data=dt, fullRank=TRUE); # trsf = data.table(predict(dmy, newdata=dt))
var2 = names(data.table(predict(dmy, newdata=dt)))
dt = cbind(dt, data.table(predict(dmy, newdata=dt)))
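
# Optional sanity check (caret is already loaded): flag dummy columns that are nearly
# constant; cutoffs are caret's defaults and the check is purely informational here.
nzv = nearZeroVar(dt[, var2, with=FALSE], saveMetrics=TRUE)
nzv[nzv$nzv, ]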


fn_remove_na = function(DT) {
# either of the following for loops
# by name :
#   for (j in names(DT))
#     set(DT,which(is.na(DT[[j]])),j,0)
  
# or by number (slightly faster than by name) :
  for (j in seq_len(ncol(DT)))
    set(DT,which(is.na(DT[[j]])),j,0)
}
fn_remove_na(dt)





#############################################################
#############################################################

library(xgboost)
library(Matrix)

fn.cv.xgb = function(x,kfold,ntree,depth,eta){
  set.seed(12345)
  x$s = ceiling(runif(nrow(x), 0, kfold))
  cv = NULL
  n = names(x)
  # formula <- as.formula(paste("TARGET ~", paste(n[!n %in% c('TARGET','INDEX')], collapse = " + ")))
  
  
  for (kfold_idx in 1:kfold){
    train = x[!x$s==kfold_idx,] 
    valid = x[x$s==kfold_idx,]
    Ytrain = train[['outcome']]
    Yvalid = valid[['outcome']]
    train = sparse.model.matrix(~ . - 1, data=train[,!colnames(train) %in% c('s','outcome'), with=F])
    valid = sparse.model.matrix(~ . - 1, data=valid[,!colnames(valid) %in% c('s','outcome'), with=F])
    
    for(ntree_idx in seq(10, ntree, by=10)){
      
      for(depth_idx in seq(1, depth, by=1)){
        
        for(eta_idx in seq(0.2, eta, by=0.1)){
          
          # sparse_matrix <- sparse.model.matrix(Improved~.-1, data = df)
          model.xgb <- xgboost(data=train, label=Ytrain,
                               max.depth=depth_idx, nround=ntree_idx, eta=eta_idx,
                               objective = "binary:logistic", verbose=0)
          
          
          P_xgb =predict(model.xgb, train)
          error = (P_xgb - Ytrain)^2
          error_train = sum(error); mean_error_train = mean(error)
          n_train = nrow(train)
          
          
          P_xgb = predict(model.xgb, valid)
          error = (P_xgb - Yvalid)^2
          error_valid = sum(error); mean_error_valid = mean(error)
          n_valid = nrow(valid)
          
          cv = rbind(cv, data.frame(kfold_idx, ntree_idx, depth_idx, eta_idx,
                                    n_train, error_train, mean_error_train,
                                    n_valid, error_valid, mean_error_valid))
        }
      }
    }
  }
  return(cv)
}
#function(x,kfold,ntree,depth,eta)
# cv=fn.cv.xgb(dt, 5, 50, 6, 1.2)

cv=fn.cv.xgb(dt, 5, 50, 4, 0.4)
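
# The grid results can be averaged across folds the same way as in the GBM section below;
# a minimal summary sketch over the columns fn.cv.xgb returns.
agg = aggregate(mean_error_valid ~ ntree_idx + depth_idx + eta_idx, data=cv, FUN=mean)
head(agg[order(agg$mean_error_valid), ])   # best parameter combinations first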




Sunday, February 28, 2016

Neural Network (neuralnet) code


library(neuralnet)

length(names(train))
n <- names(train)
formula <- as.formula(paste("TARGET ~", paste(n[!n %in% c('TARGET','INDEX')], collapse = " + ")))
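
# neuralnet tends to converge faster when inputs are on a comparable scale; a minimal
# min-max rescaling sketch (assumes train is a data.frame with numeric predictors - adjust as needed).
pred_cols = setdiff(names(train), c('TARGET', 'INDEX'))
rng = sapply(train[, pred_cols], range, na.rm=TRUE)
train_scaled = train
train_scaled[, pred_cols] = sweep(sweep(train[, pred_cols], 2, rng[1, ], '-'),
                                  2, rng[2, ] - rng[1, ], '/')
# the model below could then be fit on train_scaled instead of train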


model.nn = neuralnet(formula, data=train, hidden=4,
                     threshold=0.05, rep=1,
                     linear.output=TRUE, err.fct='sse', act.fct='logistic',
                     algorithm='rprop+',
                     lifesign = 'minimal', lifesign.step=10000, stepmax=1e6)


model.nn$result.matrix

plot(model.nn)
prediction(model.nn); ?prediction
P_nn <- compute(model.nn, train[,3:24])
length(P_nn[[2]])

par(mfrow=c(1,2))
hist(P_nn[[2]])
plot(density(P_nn[[2]]))

error = abs(train$TARGET - P_nn[[2]])
plot(density(error))
mean(error) #nn pred on ctree data 0.99497

print(model.nn)

## save model
save(model.nn, file = "model_nn_raw_ctree_input.rda")

## load the model
load("model_nn_raw_ctree_input.rda")
 ## predict for the new `x`s in `newdf` (compute() is neuralnet's prediction function)
compute(model.nn, newdf)$net.result




## http://www.r-bloggers.com/fitting-a-neural-network-in-r-neuralnet-package/
## http://www.r-bloggers.com/using-neural-networks-for-credit-scoring-a-simple-example/
## https://journal.r-project.org/archive/2010-1/RJournal_2010-1_Guenther+Fritsch.pdf

Random Forest & GBM Cross Validation Loop


Random Forest CV



fn.cv.rf = function(x){
  library(randomForest)

  set.seed(12345)
  s = sample(nrow(x), replace=FALSE, nrow(x)*0.75)

  train = x[s,]
  valid = x[-s,]
  cv = NULL
  n = names(train)
  formula <- as.formula(paste("TARGET ~", paste(n[!n %in% c('TARGET','INDEX')], collapse = " + ")))

  for(i in seq(100, 500, by=50)){
    model.rf =
      randomForest(formula,
                   data=train[, !names(train) %in% c('INDEX')],
                   ntree=i, importance=TRUE)
 
    train$P_RF = predict(model.rf)
    error = abs(train$P_RF - train$TARGET)
    error_train = sum(error); mean_error_train = mean(error)
 
    valid$P_RF = predict(model.rf, valid)
    error = abs(valid$P_RF - valid$TARGET)
    error_valid = sum(error); mean_error_valid = mean(error)
 
    cv=rbind(cv, data.frame(i, error_train, mean_error_train,
                            error_valid, mean_error_valid))
  }
  rm(train, valid)
  return(cv)   #return(c(cv, model.rf))
}
cv = fn.cv.rf(cluster)


# https://www.kaggle.com/c/the-analytics-edge-mit-15-071x/forums/t/8082/reading-and-interpreting-random-forest-models
# http://stats.stackexchange.com/questions/21152/obtaining-knowledge-from-a-random-forest
# https://www.youtube.com/watch?v=-nai4NBx5zI
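
# fn.cv.rf only returns the error table, so for interpretation (per the links above) one
# model can be refit at a chosen ntree and its importance inspected; ntree=300 is illustrative.
model.rf = randomForest(TARGET ~ ., data=cluster[, !names(cluster) %in% c('INDEX')],
                        ntree=300, importance=TRUE)
importance(model.rf)   # %IncMSE / IncNodePurity per predictor
varImpPlot(model.rf)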


library(ggplot2)
ggplot(cv, aes(i)) +
    geom_line(aes(y = mean_error_train, colour = "train error")) +
    geom_line(aes(y = mean_error_valid, colour = "valid error"))


GBM CV


library(gbm)
# x  = dataset
# kf = k-fold cv
# n  = n.tree
# a  = step size for the shrinkage (learning-rate) grid


fn.cv.gbm = function(x,kf,ntree,a){
  set.seed(12345)
  x$s = ceiling(runif(nrow(x), 0, kf))
  cv = NULL
  n = names(x)
  formula <- as.formula(paste("TARGET ~", paste(n[!n %in% c('TARGET','INDEX')], collapse = " + ")))


  for (kf_i in 1:kf){
    train = x[!x$s==kf_i,]
    valid = x[x$s==kf_i,]
 
    for(ntree_i in seq(50, ntree, by=50)){
   
      for(shrink_i in seq(0.01, 0.04, by=a)){
        model.gbm = gbm(  formula=formula, data=train,
                          distribution = "poisson",
                          # gaussian for GBM regression or adaboost
                          n.trees=ntree_i,
                          shrinkage=shrink_i, #0.01
                          # smaller values of shrinkage typically give slightly better performance,
                          # at the cost of longer run times
                          interaction.depth=2,
                          #use CV to choose interaction depth
                          n.minobsinnode=100,
                          # n.minobsinnode has an important effect on overfitting:
                          # decreasing it may lead to overfitting
                          bag.fraction=0.5,
                          train.fraction=0.99,
                          ### DO NOT USE CV.FOLDS here - the manual k-fold loop handles CV
                          # (cv.folds is an option of gbm(), not gbm.fit())
                          # var.monotone=c(),
                          # can help with overfitting, will smooth bumpy curves
                          verbose=TRUE)
     
        train$P_GBM = predict(model.gbm, train, n.trees=ntree_i, type="response")
        error = abs(train$P_GBM - train$TARGET)
        error_train = sum(error); mean_error_train = mean(error)
        n_train = nrow(train)
     
     
        valid$P_GBM = predict(model.gbm, valid, n.trees=ntree_i, type="response")
        error = abs(valid$P_GBM - valid$TARGET)
        error_valid = sum(error); mean_error_valid = mean(error)
        n_valid = nrow(valid)
     
        cv = rbind(cv, data.frame(kf_i, ntree_i, shrink_i,
                                  n_train, error_train, mean_error_train,
                                  n_valid, error_valid, mean_error_valid))
      }
    }
  }
  return(cv)
}
#function(x,kf,ntree,a)
cv = fn.cv.gbm(cluster, 4, 500, 0.003)

a=aggregate(cv$mean_error_valid, list(cv$ntree_i, cv$shrink_i), mean)
a$id=paste(a$Group.1, a$Group.2, sep='_')
plot(seq_len(nrow(a)), a$x, type='p', col=a$Group.1)

library(ggplot2)
ggplot(a, aes(x=Group.2, y=x)) +
  geom_point() +
  facet_grid(.~Group.1) +
    ggtitle("GBM 4-fold CV") + labs(x='shrinkage (panels = n.tree)', y='mean cv error')
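
# gbm's own estimate of the best iteration complements the manual grid; a sketch that refits
# one model outside the CV function (settings illustrative) and reads off the "test" estimate,
# which applies because train.fraction < 1.
n = names(cluster)
formula = as.formula(paste("TARGET ~", paste(n[!n %in% c('TARGET','INDEX')], collapse=" + ")))
model.gbm = gbm(formula, data=cluster, distribution="poisson",
                n.trees=500, shrinkage=0.01, interaction.depth=2,
                n.minobsinnode=100, bag.fraction=0.5, train.fraction=0.8)
best_iter = gbm.perf(model.gbm, method="test")   # plots train/test deviance, returns best n.trees
best_iter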



#https://www.kaggle.com/c/15-071x-the-analytics-edge-competition-spring-2015/forums/t/13749/gbm-output

PCA & Clustering


names(train)
pca <- prcomp(train[,-c(1,2)], scale=TRUE)

#names(pca); pca$sdev; pca$rotation; pca$center; pca$scale; pca$x
#summary(pca); biplot(pca, scale=0); head(pca$x)

###############################################
### Select number of PCA
###############################################
cluster = cbind(train[,c(1,2)],pca$x[,1:10]) #head(cluster)
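
# The choice of 10 components can be checked against the variance explained; a short sketch.
pve = pca$sdev^2 / sum(pca$sdev^2)
plot(cumsum(pve), type='b', xlab='principal component', ylab='cumulative proportion of variance')
which(cumsum(pve) >= 0.9)[1]   # smallest number of PCs covering ~90% of the variance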



fn.cv.kmean = function(x,y){
  set.seed(12345)
  error = NULL
  for(i in 1:y){
    km=kmeans(x[,!names(x) %in% c('INDEX', 'TARGET')], i, iter.max=1e6, nstart=50, algorithm='Lloyd')
    error = rbind(error, data.frame(i, km$tot.withinss, km$totss))
    #table(km$cluster, x$TARGET)
    #plot(x[,3], x[,4], col=km$cluster)
  }
  return(error)
}
cv=fn.cv.kmean(cluster,12)
plot(cv$i, cv$km.tot.withinss, type='l')



set.seed(12345)
km=kmeans(cluster[,!names(cluster) %in% c('INDEX', 'TARGET')], 6, iter.max=1e6, nstart=50, algorithm='Lloyd')
cluster$kmean = km$cluster
table(km$cluster, cluster$TARGET)
#summary(km); head(cluster)
#names(km)


plot(table(km$cluster, cluster$TARGET))
plot(km$cluster, cluster$TARGET)





library(scatterplot3d)
scatterplot3d(cluster$PC1, cluster$PC2, cluster$kmean, main='',
              highlight.3d=TRUE, color='green', col.grid='lightblue', col.axis = 'blue')

scatterplot3d(cluster$PC1, cluster$PC2, cluster$TARGET,
              highlight.3d=TRUE, color='green', col.grid='lightblue', col.axis = 'blue')

scatterplot3d(cluster$PC1, cluster$PC2, cluster$kmean, pch=1,
              color=cluster$TARGET, col.grid='lightblue', col.axis='blue')

scatterplot3d(cluster$PC1, cluster$PC2, cluster$TARGET,
              color=cluster$kmean, col.grid='lightblue', col.axis='blue')

error = abs(cluster$kmean - cluster$TARGET)
mean(error)







############################
############################

hc.complete = hclust(dist(cluster[,-c(1,2)]), method='complete')
hc.average  = hclust(dist(cluster[,-c(1,2)]), method='average')
hc.single   = hclust(dist(cluster[,-c(1,2)]), method='single')
#summary(hc.average); names(hc.average)


plot(hc.complete)
plot(hc.average)
plot(hc.single)
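
# To compare a hierarchical solution with TARGET the same way as k-means above,
# cut the dendrogram into the same number of groups (6, matching the k-means run).
hc.groups = cutree(hc.complete, k=6)
table(hc.groups, cluster$TARGET)
plot(table(hc.groups, cluster$TARGET))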








Tree based imputation Part 2 of 2


Conditional tree based imputation.
Note the formula-building step: the pasted string must be cast to a formula object with as.formula().
Generally, ctree performs better than rpart.
GBM can handle missing values natively, so it could also be used to impute; see the sketch at the end of this post.

rm(list=ls())
library("partykit") #for ctree

setwd ('C:\\Users\\Ted\\Documents\\MSPA\\2016 Winter\\Pred 411\\Unit_03')
train = read.csv('wine.csv')
str(train); class(train)
colnames(train)[1] = 'INDEX'


apply(apply(train,2,is.na),2,sum)
missCol = names(which(apply(is.na(train),2,any)))


for (i in 1:length(missCol)) {
  var     = missCol[i]
  MIFlag  = paste0(var,'_MIFLAG')
  formula = as.formula(paste0(var, '~.'))

  model.ctree =
    ctree(formula,
          data=train[!is.na(train[,missCol[i]]),
                     !names(train) %in% c('INDEX', 'TARGET')]
    )

  #assign(paste0('model.ctree.',var), model.ctree)

  miss_idx        = is.na(train[[var]])
  train[[MIFlag]] = ifelse(miss_idx, 1, 0)
  # predict only for the rows where the value is missing
  train[miss_idx, var] = predict(model.ctree, newdata = train[miss_idx, ])
  rm(model.ctree, formula, i, MIFlag, var, miss_idx)
}

apply(apply(train,2,is.na),2,sum)
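
# Per the note above, gbm handles missing predictor values natively, so it could be used in
# place of the ctree loop; an untuned sketch for a single column (settings are illustrative).
library(gbm)
var = missCol[1]
model.gbm.imp = gbm(as.formula(paste(var, '~ .')),
                    data=train[!is.na(train[[var]]),
                               !names(train) %in% c('INDEX', 'TARGET')],
                    distribution='gaussian', n.trees=200, interaction.depth=2,
                    shrinkage=0.05, n.minobsinnode=50)
miss_idx = is.na(train[[var]])
train[miss_idx, var] = predict(model.gbm.imp, newdata=train[miss_idx, ], n.trees=200)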

Tree based imputation Part 1 of 2


Missing value imputation via rpart

rm(list=ls())
library(rpart)

setwd ('C:\\Users\\Ted\\Documents\\MSPA\\2016 Winter\\Pred 411\\Unit_03')
train = read.csv('wine.csv')
str(train); class(train)
colnames(train)[1] = 'INDEX'


apply(apply(train,2,is.na),2,sum)
missCol = names(which(apply(is.na(train),2,any)))
# "ResidualSugar"      "Chlorides"          "FreeSulfurDioxide"
# "TotalSulfurDioxide" "pH"                 "Sulphates"        
# "Alcohol"            "STARS"



for (i in 1:length(missCol)) {
  var     = missCol[i]
  MIFlag  = paste0(var,'_MIFLAG')
  formula = as.formula(paste(var, '~ .'))
 
  model.rpart =
    rpart(formula,
          data=train[!is.na(train[,missCol[i]]),
                     !names(train) %in% c('INDEX', 'TARGET')]      
    )
  opt<-which.min(model.rpart$cptable[,'xerror'])
  cp<-model.rpart$cptable[opt,'CP']
  model.rpart.prune <- prune(model.rpart, cp = cp)
  #assign(paste0('model.rpart.prune.',var), model.rpart.prune)
 
  miss_idx        = is.na(train[[var]])
  train[[MIFlag]] = ifelse(miss_idx, 1, 0)
  # predict only for the rows where the value is missing
  train[miss_idx, var] = predict(model.rpart.prune, newdata = train[miss_idx, ])
}

#check if the missing values have been imputed
apply(apply(train,2,is.na),2,sum) #apply(is.na(train),2,any)

write.table(train, "wine_train.csv", sep=",", row.names=FALSE)

library(rattle)
# the per-variable models below exist only if the assign() line inside the loop is uncommented
asRules(model.rpart.prune.STARS)
asRules(model.rpart.prune.ResidualSugar)

## http://stats.stackexchange.com/questions/72251/an-example-lasso-regression-using-glmnet-for-binary-outcome
## http://stats.stackexchange.com/questions/77546/how-to-interpret-glmnet

Sunday, August 23, 2015

Google Compute Engine - Cloud Computing Setup

Google Compute Engine cloud computing setup:
Unix + RStudio + web interface

gcloud compute images create IMAGE_NAME --source-uri URI

gcloud compute images create rstudio-image --source-uri http://storage.googleapis.com/rstudio-image/rstudio-image.image.tar.gz

gcloud compute images create rstudioz --source-uri http://storage.googleapis.com/rstudio-image/rstudio-image.image.tar.gz


gcloud compute images create IMAGE_NAME --source-uri gs://BUCKET_NAME/IMAGE_NAME.image.tar.gz

gcloud compute images create rstudio-image --source-uri gs://rstudio-image/rstudio-image2.image.tar.tar
gcloud compute images create rstudio-image --source-uri http://storage.googleapis.com/rstudio-image/rstudio-image.tar.gz
gcloud compute images create rstudio-image --source-uri http://storage.googleapis.com/r-gce-sunholo/98789ee2fa89942875b1409a36a3fb8e1e719aae.image.tar.gz

http://storage.googleapis.com/rstudio-image/rstudio-image.tar.gz

==================
http://storage.googleapis.com/r-gce-sunholo/98789ee2fa89942875b1409a36a3fb8e1e719aae.image.tar.gz
================
gcloud compute instances create example-instance \
         --image https://www.googleapis.com/compute/v1/projects/debian-cloud/global/images/debian-7-wheezy-vYYYMMDD

gcloud compute instances create rstudio-image \
         --image https://storage.googleapis.com/rstudio-image/rstudio-image.tar.gz
================

tar -Sczf rstudio-image.tar.gz disk.raw




If you don't know your username, try this command using gcloud to see your user details:
$ gcloud auth login
Any user you add to the Debian system running on the instance will also get an RStudio login. To SSH into the instance and add new users, see below:
$ ## ssh into the running instance
$ gcloud compute ssh <your-username>@new-instance-name
$ #### It should now tell you that you are logged into your instance #####
$ #### Once logged in, add a user: example with jsmith
$ sudo useradd jsmith
$ sudo passwd jsmith
$ ## give the new user a directory and change ownership to them
$ sudo mkdir /home/jsmith
$ sudo chown jsmith:users /home/jsmith