Blog: 2014

Wednesday, December 3, 2014

Avazu CTR Competition

Kaggle Avazu CTR

#This code attempts to utilize the following concepts:
#Gradient Boosting Model, Stochastic Simulation, Learning rate (shrinkage) optimization, bootstraping

rm(list=ls())
setwd("C:/Users/Ted/Documents/Kaggle/Avazu/Data")

library("ff")
library("ffbase")
library("doParallel")
library("data.table")
library(ggplot2)
library(lattice)

csvfile <- file.path(getwd(),'ff.train.pct.sample','train_5pct.csv')
train <- fread(csvfile, header=TRUE, stringsAsFactors=TRUE)

summary(train);str(train);head(train) #stringsASFactors doesn't work
rm(list=ls())

## train.1p[,site_id := as.factor(site_id)]
names_factors<- colnames(train)
for (col in names_factors) set(train, j=col, value=as.factor(train[[col]]))

#### get day & 3hr time zone ####

train$date <- paste('20',substring(train$hour, first = 1, last = 2),
substring(train$hour, first=3, last=8), sep='')

train$date <- paste(substring(train$date, first=1, last=4), '-',
substring(train$date, first=5, last=6), '-',
substring(train$date, first=7, last=8), ' ',
substring(train$date, first=9, last=19), sep='')

## train$date <- as.POSIXct(train, strptime(train$date, "%Y-%M-%D %H"))
## train$date <- as.Date(train$date, "%Y-%M-%D %H")

train$day <- weekdays(as.Date(train$date))

train$day1 <- train$day
train$day1 <- gsub("Monday" , 1 ,train$day1)
train$day1 <- gsub("Tuesday" , 2, train$day1)
train$day1 <- gsub("Wednesday", 3, train$day1)
train$day1 <- gsub("Thursday" , 4, train$day1)
train$day1 <- gsub("Friday" , 5, train$day1)
train$day1 <- gsub("Saturday" , 6, train$day1)
train$day1 <- gsub("Sunday" , 7, train$day1)

train$hour <- substring(train$hour, first=7, last=8)
train$hour <- gsub("00", "Q1", train$hour); train$hour <- gsub("01", "Q1", train$hour); train$hour <- gsub("02", "Q1", train$hour);
train$hour <- gsub("03", "Q1", train$hour); train$hour <- gsub("04", "Q1", train$hour); train$hour <- gsub("05", "Q1", train$hour);
train$hour <- gsub("06", "Q2", train$hour); train$hour <- gsub("07", "Q2", train$hour); train$hour <- gsub("08", "Q2", train$hour);
train$hour <- gsub("09", "Q2", train$hour); train$hour <- gsub("10", "Q2", train$hour); train$hour <- gsub("11", "Q2", train$hour);
train$hour <- gsub("12", "Q3", train$hour); train$hour <- gsub("13", "Q3", train$hour); train$hour <- gsub("14", "Q3", train$hour);
train$hour <- gsub("15", "Q3", train$hour); train$hour <- gsub("16", "Q3", train$hour); train$hour <- gsub("17", "Q3", train$hour);
train$hour <- gsub("18", "Q4", train$hour); train$hour <- gsub("19", "Q4", train$hour); train$hour <- gsub("20", "Q4", train$hour);
train$hour <- gsub("21", "Q4", train$hour); train$hour <- gsub("22", "Q4", train$hour); train$hour <- gsub("23", "Q4", train$hour);

names_factors<- colnames(train)
for (col in names_factors) set(train, j=col, value=as.factor(train[[col]]))

##final check
summary(train);str(train);head(train)

## save model feed data
csvfile <- file.path(getwd(),'ff.train.pct.modelfeed','train_5pct.csv')
system.time(
write.table(train, file=csvfile, row.names=FALSE, col.names=TRUE)
)

gc()
help(memory.size);memory.limit();memory.size()

############################################################################
############################################################################

rm(list=ls())

library(data.table)

## load model feed data
setwd("C:/Users/Ted/Documents/Kaggle/Avazu/Data")
csvfile <- file.path(getwd(),'ff.train.pct.modelfeed','train_3pct.csv')
train <- fread(csvfile, header=TRUE)

## find distinct levels in each columns
names_factors<- colnames(train)
for (col in names_factors) set(train, j=col, value=as.factor(train[[col]]))

lvl_cnt <- NULL
for (col in names_factors) { lvl_cnt[col] <- (length(levels(train[[col]]))) }
lvl_cnt <- as.data.frame(lvl_cnt); lvl_cnt$name <- rownames(lvl_cnt)
colnames(lvl_cnt)[1] <- 'count'; lvl_cnt[with(lvl_cnt, order(count)),]
##xtabs(C21 ~ click, data=train)

## imipact modeling
impactModel = function(xcol, ycol){
n = length(ycol)
p = sum(as.numeric(ycol))/n
#duplicate output for NA (average NA towards grand uniform average)
x = c(xcol, xcol)
y = c(ycol, ycol)
x[(1+n):(2*n)] = NA
levelcounts = table(x, y, useNA="always")
condprobmodel = (levelcounts[,2]+p)/(levelcounts[,1]+levelcounts[,2]+1.0)
# apply model example: applyImpactModel(condprobmodel, data[,varname])
condprobmodel
}

applyImpactModel = function(condprobmodel, xcol) {
naval = condprobmodel[is.na(names(condprobmodel))]
dim = length(xcol)
condprobvec = numeric(dim) + naval
for(nm in names(condprobmodel)) {
if(!is.na(nm)) {
condprobvec[xcol==nm] = condprobmodel[nm]
}
}
condprobvec
}

impact_c20 = impactModel(train$c20, train$click)
# train <- as.data.frame(train)
train$impact_c20 = applyImpactModel(impact_c20, train$c20)

## randomForest model
library(randomForest)
set.seed(12345)
formula <- click ~ hour + device_type + device_conn_type + C18 + C1 + banner_pos + C15 +
C16 + site_category + app_category + day ## C21 + C19

formula1 <- click ~ device_type + device_conn_type + C18 + C1 + banner_pos + C15 +
C16 + site_category * app_category + day + hour ## C21 + C19

system.time(
model.rf <- randomForest(formula = formula,
data = train,
ntree=150, importance=T))

## 1pct
## user system elapsed
## 249.31 4.13 253.62

print(model.rf)
attributes(model.rf)
importance(model.rf)
varImpPlot(model.rf)
plot(model.rf)
summary(model.rf)

## first formula
table(predict(model.rf), train.1p$click)
###### with hour x1 #######
# 0 1
# 0 334812 67290
# 1 801 1386
###### with hour x2 #######
# 0 1
# 0 334911 67361
# 1 702 1315

## second formula
table(predict(model.rf), train.1p$click)
# 0 1
# 0 334896 67421
# 1 717 1255

# test random forest using test data
irisPred <- predict(rf, newdata=testData)

# check the results
table(irisPred, testData$Species)
plot(margin(rf, testData$Species))

head(train)

names_factors<- colnames(test)
for (col in names_factors) set(test, j=col, value=as.factor(test[[col]]))

## predict
predict.rf <- predict(model.rf, newdata=test)

#fit the randomforest model
model <- randomForest(Sepal.Length~.,
data = training,
importance=TRUE,
keep.forest=TRUE
)
print(model)

#what are the important variables (via permutation)
varImpPlot(model, type=1)

#predict the outcome of the testing data
predicted <- predict(model, newdata=testing[ ,-1])

# what is the proportion variation explained in the outcome of the testing data?
# i.e., what is 1-(SSerror/SStotal)
actual <- testing$Sepal.Length
rsq <- 1-sum((actual-predicted)^2)/sum((actual-mean(actual))^2)
print(rsq)

# gbm model fitting

library(gbm)

library(dplyr)

library(data.table)

setwd("C:\\Users\\intrepid-honor-803\\Documents\\Kaggle\\Avazu\\Data")

csvfile <- file.path(getwd(),'ff.train.pct.modelfeed','train_50pct.csv')

train <- fread(csvfile, header=TRUE)

## click <- train$click

click <- as.numeric(train$click)

## train <- select(train, -click)

train <- select(train,

device_type, device_conn_type, C18, C1, banner_pos, C15,

C16, site_category, app_category, day, hour)

names_factors<- colnames(train)

for (col in names_factors) set(train, j=col, value=as.factor(train[[col]]))

?gbm

model.gbm = gbm.fit(x=train,

y=click,

distribution = "bernoulli",

# gaussian for GBM regression or adaboost

n.trees=10000,

shrinkage=0.05,

# smaller values of shrinkage typically give slighly better performance

# the cost is that the model takes longer to run for smaller values

interaction.depth=3,

#use CV to choose interaction delpth

n.minobsinnode=500,

# n.minobsinmode has an importnt effect on overfitting!

# decrease in this number may result the overfitting

nTrain=round(nrow(train) * 0.8),

# var.monotone=c(),

# can help with overfitting, will smooth bumpy curves

verbose=TRUE

)

summary(model.gbm)

gbm.perf(model.gbm) ## gbm.perf(model.gbm, method="test")

save(model.gbm, file="gbm_depth3_minobs500_n10000")

load("C:\\Users\\intrepid-honor-803\\Documents\\Kaggle\\Avazu\\Data\\predicted\\1gbm_depth3_minobs500_n5000\\gbm_depth3_minobs500_n5000.rda")

rm(train)

R packages

List of R packages updates for tracking

ggplot2 / ggvis (interactive)
dplyr
ff/bigglm
reshape2
data.table
gmb
doParallel
rattle/caret
stringr

http://www-bcf.usc.edu/~gareth/ISL/

http://adv-r.had.co.nz/

http://www.louisaslett.com/RStudio_AMI/

http://www.r-bloggers.com/in-depth-introduction-to-machine-learning-in-15-hours-of-expert-videos/

Thursday, October 2, 2014

Big data in R

Approaches and packages to handle big data in R
This post is largely credit to this blog.

Data Storage I/O

http://blog.revolutionanalytics.com/2009/12/r-tip-save-time-and-space-by-compressing-data-files.html
fread
data.table
http://stackoverflow.com/questions/1727772/quickly-reading-very-large-tables-as-dataframes-in-r
http://davetang.org/muse/2013/09/03/handling-big-data-in-r/

Data Manipulation

dplyr in plyr package

Data Visualization

bigvis
ggplot2

Memory

ffbase
http://www.slideshare.net/EdwindeJonge1/ffbase

Ensemble
http://www.r-bloggers.com/improve-predictive-performance-in-r-with-bagging/
http://vikparuchuri.com/blog/parallel-r-loops-for-windows-and-linux/

Parallelization

http://adv-r.had.co.nz/Profiling.html#parallelise
http://notjustmath.wordpress.com/2012/01/22/parallel-computing-with-r/
http://stackoverflow.com/questions/24335569/in-r-how-to-predict-with-svm-model-in-parallel-using-foreach-snow
http://topepo.github.io/caret/parallel.html
http://stackoverflow.com/questions/7782501/how-to-interpret-predict-result-of-svm-in-r?rq=1
http://www.r-bloggers.com/parallel-r-model-prediction-building-and-analytics/

Thursday, May 1, 2014

Neural Network - Single & Multiple output nodes

##### AllState Prediction Model using Neural Network Algorithm
##### Working in progress

rm(list=ls())

###### read in file
setwd('C:\\Users\\Ted\\Desktop\\Kaggle\\AllState')
dat1 <- read.csv(file="train.csv", header=T)
test1 <- read.csv(file="test_v2.csv", header=T)

##### overview of data
str(dat1);summary(dat1);nrow(dat1);head(dat1,2)
apply(apply(dat1,2,is.na),2,sum)

##### dat2 is for only purchase record
dat1$p <- apply(dat1[,c("A", "B", "C", "D", "E", "F", "G")],1,paste, collapse='')

##### convert factors into numbers #dat1$st <- NULL
a <- data.frame(sort(unique(dat1$state)), order(sort(unique(dat1$state))))
colnames(a) <- c("state","state_no")
b <- data.frame(sort(unique(dat1$p)), order(sort(unique(dat1$p))))
colnames(b) <- c("p","p_no")
dat1 <- merge(dat1,a, by='state');rm(a)
dat1 <- merge(dat1,b, by="p");rm(b)

##### binarize car value
cat <- levels(dat1$car_value)
cat[1] <- c("u")
#for (i in cat) {cat[i] <- paste("car_value",i, collapse=" ")};cat

binarize <- function(x) {return(dat1$car_value == x)}
newcols <- --sapply(cat, binarize)
colnames(newcols) <- cat
dat1 <- cbind(dat1, newcols)
rm(cat);rm(newcols);rm(binarize)

##### do the same manipulation on test set
apply(apply(test1,2,is.na),2,sum)
test1$p <- apply(test1[,c("A", "B", "C", "D", "E", "F", "G")], 1, paste, collapse='')

a <- data.frame(sort(unique(test1$state)), order(sort(unique(test1$state))))
colnames(a) <- c("state","state_no")
b <- data.frame(sort(unique(test1$p)), order(sort(unique(test1$p))))
colnames(b) <- c("p","p_no")
test1 <- merge(test1,a, by='state');rm(a)
test1 <- merge(test1,b, by="p");rm(b)

cat <- levels(test1$car_value)
cat[1] <- c("u")
binarize <- function(x) {return(test1$car_value == x)}
newcols <- --sapply(cat, binarize)
colnames(newcols) <- cat
test1 <- cbind(test1, newcols)
rm(cat);rm(newcols);rm(binarize)

##### final cut for record type = 1
dat2 <- dat1[dat1$record_type==1,]

##### graph data distribution
hist(dat2$p_no)

##### random forest model
library(randomForest)
formula <- p ~ day + state + location + group_size + homeowner + car_age + car_value + age_oldest +
age_youngest + married_couple + cost # + risk_factor + c_previous + duration_previous
model.rf <- randomForest(formula = formula, data=dat2, ntree=100, importance=T)

head(dat2)
##### neural network model
library(neuralnet); args(neuralnet)
model.nn <- neuralnet(p_no ~ day + state_no + location + group_size + homeowner + car_age + car_value + age_oldest +
age_youngest + married_couple + cost # + risk_factor + c_previous + duration_previous
, data=dat2, hidden=3, act.fct="logistic",rep = 3, linear.output = F)

m <- model.matrix( ~ p_no + day + state_no + location + group_size + homeowner + car_age +
u + a + b + c + d + e + f + g + h + i + age_oldest +
age_youngest + married_couple + cost # + risk_factor + c_previous + duration_previous
,data = dat2)

model.nn <- neuralnet(p_no ~ day + state_no + location + group_size + homeowner + car_age +
u + a + b + c + d + e + f + g + h + i + age_oldest +
age_youngest + married_couple + cost # + risk_factor + c_previous + duration_previous
,data=m , hidden = 2, threshold=0.01, linear.output=F)

pred.bin <- prediction(model.nn)
pred.bin$rep1
plot(model.nn, rep="best")

Monday, April 14, 2014

# Date: 4/10/2014
# Ted Kim
# Answer Financial Inc. Data Project
# This code tries to categorize customer base per their attributes to find out each cluster's propensity to
# purchase the products offered.
# Decision tree models were used for classifications and to validate the statistically significant divergence
# found in offered products' price standard deviation

############ get your data
setwd('C:\\Users\\Ted\\Desktop\\Kaggle\\Answer Financial'); getwd()
all.data <-read.csv(file='AFIDA_Data.csv', header=T)
############

############ find min, sd, and no. of product offered
all.data$MIN <- apply(all.data[,c(6,7,8)], 1, min, na.rm=T)
all.data$MIN[all.data$MIN==Inf] <- 0
all.data$OFFER <- 3-apply(apply(all.data,1,is.na),2,sum)
all.data$SD <- apply(all.data[, c(6,7,8)], 1, sd, na.rm=T)
all.data$SD[is.na(all.data$SD)] <- 0
############

#### transform categorical data to binary format
cat <- levels(all.data$C1);cat
binarize <- function(x) {return(all.data$C1 == x)}
newcols <- --sapply(cat, binarize)
colnames(newcols) <- cat
all.data <- cbind(all.data, newcols)
newcols[1:15,]; all.data[1:15,]

cat <- levels(all.data$C2)
binarize <- function(x) {return(all.data$C2 == x)}
newcols <- --sapply(cat, binarize)
colnames(newcols) <- cat
all.data <- cbind(all.data, newcols)

cat <- levels(all.data$C3)
binarize <- function(x) {return(all.data$C3 == x)}
newcols <- --sapply(cat, binarize)
colnames(newcols) <- cat
all.data <- cbind(all.data, newcols)

#### scale offer min price, and sd data
all.data.scale <- cbind(all.data, scale(all.data[,9:11]))
colnames(all.data.scale)[23] <- "S.MIN"
colnames(all.data.scale)[24] <- "S.OFFER"
colnames(all.data.scale)[25] <- "S.SD"

#### discretize SD into 5 buckets
segments <- 5
maxL <- max(all.data.scale$SD)
minL <- min(all.data.scale$SD)
theBreaks <- seq(minL, maxL, by=(maxL-minL)/segments)
all.data.scale$D.SD <- cut(all.data.scale$SD, breaks = theBreaks, include.lowest=F)

rm(newcols); rm(all.data)
rm(cat); rm(binarize)

#head(all.data.scale)
#all.data.scale <- subset(all.data.scale, select = -c(C11))

#### Convert categorical values to numeric values
all.data.scale$C11[all.data.scale$C1=="X"] <- 0
all.data.scale$C11[all.data.scale$C1=="Y"] <- 1
all.data.scale$C22[all.data.scale$C2=="A"] <- 0
all.data.scale$C22[all.data.scale$C2=="B"] <- 1
all.data.scale$C22[all.data.scale$C2=="C"] <- 2
all.data.scale$C22[all.data.scale$C2=="D"] <- 3
all.data.scale$C22[all.data.scale$C2=="E"] <- 4
all.data.scale$C22[all.data.scale$C2=="F"] <- 5
all.data.scale$C33[all.data.scale$C3=="G"] <- 0
all.data.scale$C33[all.data.scale$C3=="H"] <- 1
all.data.scale$C33[all.data.scale$C3=="I"] <- 2

CAT <-do.call(paste, c(all.data.scale[c("C1","C2","C3")], sep=""))
all.data.scale$CAT <- CAT
rm(CAT)

CATID <-do.call(paste, c(all.data.scale[c("CV","C1","C2","C3")], sep=""))
all.data.scale$CATID <- CATID
rm(CATID)

dat1 <- as.data.frame(all.data.scale[,c("CATID","CV","CAT")])

library(reshape)
dat1 <- cast(dat1, CAT ~ CV)
d1 <- as.data.frame(table(all.data.scale[,c(30,2)]))

library(lattice)
barchart( CAT ~ Freq, data = d1 , group = CV, stack = T)

### randomForest model
library(randomForest)
formula <- CV ~ C1+C2+C3
#formula <- CV ~ C11+C22+C33
rf.model <- randomForest(formula = formula #CV ~ C1+ C2 + C3 #+ MIN + OFFER + SD
,data = all.data.scale
,ntree = 100
,importance=T)
importance(rf.model)

### conditional tree model
library(partykit)
ctree.model <- ctree(CV ~ C1+ C2 + C3 + C1:C2 + C1:C3 + C2:C3 #+ MIN + OFFER + SD
,data = all.data.scale)

ctree.model <- ctree(CV ~ C1 + C2 + C3 #+ S.SD #+ C1:C2 + C1:C3 + C2:C3 #+ MIN + OFFER + SD
,data = all.data.scale)

ctree.model <- ctree(CV ~ C11+ C22 + C33 # + C1:C2 + C1:C3 + C2:C3 #+ MIN + OFFER + SD
,data = all.data.scale)

#plot(ctree.model)
plot(ctree.model, gp = gpar(fontsize = 10)
,inner_panel=node_inner
,ip_agrs=list(abbreviate = T, id = F)
)

#Prune tree methods
library(rpart)
library(rpart.plot)
library(RColorBrewer)
library(rattle)
library(partykit)

formula <- CV ~ C1 + C2 + C3
#formula <- CV ~ C11 + C22 + C33

model.rpart <-rpart(formula, data=all.data.scale) #print(model.rpart$cptable)

# here we prune back the large initial tree:

opt<-which.min(model.rpart$cptable[,'xerror'])
cp<-model.rpart$cptable[opt,'CP']

model.rpart.prune <- prune(model.rpart, cp = cp)

plot(as.party(model.rpart.prune),
tp_args = list(id = FALSE))

fancyRpartPlot(model.rpart.prune)

# summary(model.rpart.prune)
# test_pred <- predict(model.rpart.prune, all.data.scale, type = "class")
# model.rpart.prune$frame
# model.rpart.prune$where

all.data.scale$NODE <- 0
all.data.scale[as.vector(model.rpart.prune$where==4),]$NODE <- 4
all.data.scale[as.vector(model.rpart.prune$where==5),]$NODE <- 5
all.data.scale[as.vector(model.rpart.prune$where==2),]$NODE <- 2

####################################
# find the no of split distribution by cross validations (25)

no.split <- vector(mode = 'integer', length=25)

for ( i in 1:length(no.split)) {
cp <- rpart(formula
, data = all.data.scale)$cptable
no.split[i] <- cp[which.min(cp[,"xerror"]), "nsplit"]
}
table(no.split)
#####################################

all.data.scale[as.vector(model.rpart.prune$where==2),] # node 1
all.data.scale[as.vector(model.rpart.prune$where==4),] # node 4
head(all.data.scale[as.vector(model.rpart.prune$where==5),]) # node 5

library(lattice)
### breakdown of characterisics
barchart( CAT ~ Freq | NODE, data = d1 , group = CV, stack = T)

splom(~all.data.scale[as.vector(model.rpart.prune$where==5),c("OFFER","S.SD","CV","S.MIN")]
,pscale = 0, type = c("g", "p", "smooth"))
splom(~all.data.scale[as.vector(model.rpart.prune$where==5),c("OFFER","S.SD","CV","S.MIN")])

splom(~all.data.scale[as.vector(model.rpart.prune$where==4),c("OFFER","S.SD","CV","S.MIN")]
,pscale = 0, type = c("g", "p", "smooth"))
splom(~all.data.scale[as.vector(model.rpart.prune$where==4),c("OFFER","S.SD","CV","S.MIN")])

splom(~all.data.scale[as.vector(model.rpart.prune$where==2),c("OFFER","S.SD","CV","S.MIN")]
,pscale = 0, type = c("g", "p", "smooth"))
splom(~all.data.scale[as.vector(model.rpart.prune$where==2),c("OFFER","S.SD","CV","S.MIN")])

### customer characteristic distribution for node = 5
barchart( ~ CV | C1 + C2 + C3
,data = all.data.scale[as.vector(model.rpart.prune$where==5)
,c("OFFER","D.SD","CV","C1","C2","C3")]
,group = OFFER, stack = T)

### SD on price is different from different nodes
histogram( CV ~ SD | C1+C2+C3
,data = all.data.scale[as.vector(model.rpart.prune$where==5)
,c("OFFER","MIN","SD","D.SD","CV","C1","C2","C3")])

#Price distribution of each node group
histogram( CV ~ MIN | C1 + C2 + C3
,data = all.data.scale[as.vector(model.rpart.prune$where==4)
,c("OFFER","MIN","D.SD","CV","C1","C2","C3")]
)

histogram( CV ~ MIN | C1 + C2 + C3
,data = all.data.scale[as.vector(model.rpart.prune$where==4)&all.data.scale$CV==1,c("OFFER","MIN","D.SD","CV","C1","C2","C3")]
)

### distribution of SD
histogram( ~ SD | CV
,data = all.data.scale[as.vector(model.rpart.prune$where==5)
,c("OFFER","D.SD","SD","CV","C1","C2","C3")])

bwplot(CV~ SD|C1+C2+C3
,data = all.data.scale[as.vector(model.rpart.prune$where==5)
,c("OFFER","D.SD","SD","CV","C1","C2","C3")])

### no of offer affecting CV. Not much affects.
histogram( ~ OFFER | CV
,data = all.data.scale[as.vector(model.rpart.prune$where==4)
,c("OFFER","D.SD","CV","C1","C2","C3")])
histogram(~CV | OFFER
,data = all.data.scale[as.vector(model.rpart.prune$where==5),c("OFFER","D.SD","SD","CV","C1","C2","C3")])

histogram( ~SD | C1+C2+C3
,data = all.data.scale[as.vector(model.rpart.prune$where==4)&all.data.scale$CV==0, c("OFFER","D.SD","SD","CV","C1","C2","C3")])

#################################################################
#################################################################
# build decision tree including SD feature

library(rpart)
library("partykit")

formula <- CV ~ S.OFFER + S.SD
formula <- CV ~ C11 + C22 + C33 + S.OFFER + S.SD
#formula <- CV ~ X + Y + A + B + C + D + E + F + G + H + I + S.OFFER + S.SD

model.rpart <-rpart(formula, data=all.data.scale)

#print(model.rpart$cptable)
# here we prune back the large initial tree:

opt<-which.min(model.rpart$cptable[,'xerror'])
cp<-model.rpart$cptable[opt,'CP']

model.rpart.prune <- prune(model.rpart, cp = cp)

plot(as.party(model.rpart.prune),
tp_args = list(id = FALSE))
fancyRpartPlot(model.rpart.prune)

####################################
# find the no of split distribution by cross validations (25)

no.split <- vector(mode = 'integer', length=25)

for ( i in 1:length(no.split)) {
cp <- rpart(formula, data = all.data.scale)$cptable
no.split[i] <- cp[which.min(cp[,"xerror"]), "nsplit"]
}
table(no.split)
#####################################