Wednesday, December 3, 2014

Avazu CTR Competition

Kaggle Avazu CTR

# This code attempts to utilize the following concepts:
# Gradient Boosting Model, Stochastic Simulation, Learning rate (shrinkage) optimization, bootstrapping

rm(list=ls())
setwd("C:/Users/Ted/Documents/Kaggle/Avazu/Data")

library("ff")
library("ffbase")
library("doParallel")
library("data.table")
library(ggplot2)
library(lattice)



csvfile <- file.path(getwd(),'ff.train.pct.sample','train_5pct.csv')
train <- fread(csvfile, header=TRUE, stringsAsFactors=TRUE)


summary(train);str(train);head(train) # stringsAsFactors doesn't work with fread here
## rm(list=ls())  # do not clear the workspace here; it would drop the freshly loaded train table


## train.1p[,site_id := as.factor(site_id)]
## convert every column to a factor
names_factors<- colnames(train)
for (col in names_factors) set(train, j=col, value=as.factor(train[[col]]))



#### derive date, day of week, and 6-hour time-of-day bucket ####


train$date <- paste('20',substring(train$hour, first = 1, last = 2),
                       substring(train$hour, first=3, last=8), sep='')

train$date <- paste(substring(train$date, first=1, last=4), '-',
                       substring(train$date, first=5, last=6), '-',
                       substring(train$date, first=7, last=8), ' ',
                       substring(train$date, first=9, last=19), sep='')

## train$date <- as.POSIXct(train, strptime(train$date, "%Y-%M-%D %H"))
## train$date <- as.Date(train$date, "%Y-%M-%D %H")

train$day <- weekdays(as.Date(train$date))

train$day1 <- train$day
train$day1 <- gsub("Monday"   , 1 ,train$day1)
train$day1 <- gsub("Tuesday"  , 2, train$day1)
train$day1 <- gsub("Wednesday", 3, train$day1)
train$day1 <- gsub("Thursday" , 4, train$day1)
train$day1 <- gsub("Friday"   , 5, train$day1)
train$day1 <- gsub("Saturday" , 6, train$day1)
train$day1 <- gsub("Sunday"   , 7, train$day1)


## bucket the 2-digit hour (00-23) into four 6-hour blocks Q1-Q4
hr <- as.integer(substring(train$hour, first=7, last=8))
train$hour <- paste0("Q", hr %/% 6 + 1)
rm(hr)


names_factors<- colnames(train)
for (col in names_factors) set(train, j=col, value=as.factor(train[[col]]))


##final check
summary(train);str(train);head(train)


## save model feed data
csvfile <- file.path(getwd(),'ff.train.pct.modelfeed','train_5pct.csv')
system.time(
  write.table(train, file=csvfile, sep=",", row.names=FALSE, col.names=TRUE)
)

gc()
help(memory.size);memory.limit();memory.size()


############################################################################
############################################################################

rm(list=ls())

library(data.table)

## load model feed data
setwd("C:/Users/Ted/Documents/Kaggle/Avazu/Data")
csvfile <- file.path(getwd(),'ff.train.pct.modelfeed','train_3pct.csv')
train <- fread(csvfile, header=TRUE)


## find the number of distinct levels in each column
names_factors<- colnames(train)
for (col in names_factors) set(train, j=col, value=as.factor(train[[col]]))

lvl_cnt <- NULL
for (col in names_factors)  { lvl_cnt[col] <- (length(levels(train[[col]]))) }
lvl_cnt <- as.data.frame(lvl_cnt); lvl_cnt$name <- rownames(lvl_cnt)
colnames(lvl_cnt)[1] <- 'count'; lvl_cnt[with(lvl_cnt, order(count)),]
##xtabs(C21 ~ click, data=train)


## impact modeling: encode a categorical variable as the smoothed conditional click probability of each level
impactModel = function(xcol, ycol){
  n = length(ycol)
  p = sum(as.numeric(ycol))/n
  #duplicate output for NA (average NA towards grand uniform average)
  x = c(xcol, xcol)
  y = c(ycol, ycol)
  x[(1+n):(2*n)] = NA
  levelcounts = table(x, y, useNA="always")
  condprobmodel = (levelcounts[,2]+p)/(levelcounts[,1]+levelcounts[,2]+1.0)
  # apply model example: applyImpactModel(condprobmodel, data[,varname])
  condprobmodel
}

applyImpactModel = function(condprobmodel, xcol) {
  naval = condprobmodel[is.na(names(condprobmodel))]
  dim = length(xcol)
  condprobvec = numeric(dim) + naval
  for(nm in names(condprobmodel)) {
    if(!is.na(nm)) {
      condprobvec[xcol==nm] = condprobmodel[nm]
    }
  }
  condprobvec
}
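
## quick sanity check of the two helpers on a toy vector (toy values only, not from the competition data)
toy_x <- c("a", "a", "b", "b", "b", "c")
toy_y <- c(1, 0, 1, 1, 0, 0)
toy_model <- impactModel(toy_x, toy_y)
toy_model                                      # smoothed P(y = 1) per level, plus an NA fallback
applyImpactModel(toy_model, c("a", "c", "d"))  # the unseen level "d" gets the NA fallback value
rm(toy_x, toy_y, toy_model)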


## impact-code C20 (pass click as numeric 0/1, since it was converted to a factor above)
impact_c20 = impactModel(train$C20, as.numeric(as.character(train$click)))
# train <- as.data.frame(train)
train$impact_c20 = applyImpactModel(impact_c20, train$C20)



## randomForest model
library(randomForest)
set.seed(12345)
formula <- click ~ hour + device_type + device_conn_type + C18 + C1 + banner_pos + C15 +
                   C16 + site_category + app_category  + day  ## C21 + C19

formula1 <- click ~ device_type + device_conn_type + C18 + C1 + banner_pos + C15 +
  C16 + site_category * app_category  + day + hour ## C21 + C19


system.time(
model.rf <- randomForest(formula = formula,
                         data = train,
                         ntree=150, importance=T))

##   1pct
##   user  system elapsed
## 249.31    4.13  253.62



print(model.rf)
attributes(model.rf)
importance(model.rf)
varImpPlot(model.rf)
plot(model.rf)
summary(model.rf)

## first formula
table(predict(model.rf), train$click)
###### with hour x1 #######
# 0      1
# 0 334812  67290
# 1    801   1386
###### with hour x2 #######
# 0      1
# 0 334911  67361
# 1    702   1315


## second formula
table(predict(model.rf), train$click)
# 0      1
# 0 334896  67421
# 1    717   1255


# reference template (iris example): test a random forest on held-out data
irisPred <- predict(rf, newdata=testData)

# check the results
table(irisPred, testData$Species)
plot(margin(rf, testData$Species))


head(train)


## convert the test set columns to factors (assumes the Avazu test file was already loaded into `test`)
names_factors<- colnames(test)
for (col in names_factors) set(test, j=col, value=as.factor(test[[col]]))


## predict
predict.rf <- predict(model.rf, newdata=test)


# reference template (iris example): fit a random forest regression model
model <- randomForest(Sepal.Length~.,
                      data = training,
                      importance=TRUE,
                      keep.forest=TRUE
)
print(model)

#what are the important variables (via permutation)
varImpPlot(model, type=1)

#predict the outcome of the testing data
predicted <- predict(model, newdata=testing[ ,-1])

# what is the proportion variation explained in the outcome of the testing data?
# i.e., what is 1-(SSerror/SStotal)
actual <- testing$Sepal.Length
rsq <- 1-sum((actual-predicted)^2)/sum((actual-mean(actual))^2)
print(rsq)



# gbm model fitting

library(gbm)
library(dplyr)
library(data.table)

setwd("C:\\Users\\intrepid-honor-803\\Documents\\Kaggle\\Avazu\\Data")
csvfile <- file.path(getwd(),'ff.train.pct.modelfeed','train_50pct.csv')
train <- fread(csvfile, header=TRUE)



## click <- train$click 
click <- as.numeric(train$click)

## train <- select(train, -click)
train <- select(train, 
                device_type, device_conn_type, C18, C1, banner_pos, C15, 
                C16, site_category, app_category, day, hour)


names_factors<- colnames(train)
for (col in names_factors) set(train, j=col, value=as.factor(train[[col]]))

?gbm
model.gbm = gbm.fit(x=train,
                    y=click,      
                    distribution = "bernoulli", 
                    # use "gaussian" for GBM regression, or "adaboost"
                    n.trees=10000,
                    shrinkage=0.05, 
                    # smaller values of shrinkage typically give slightly better performance,
                    # but the cost is that the model takes longer to run
                    interaction.depth=3,
                    # use CV to choose interaction depth
                    n.minobsinnode=500,
                    # n.minobsinnode has an important effect on overfitting!
                    # decreasing this number may result in overfitting
                    nTrain=round(nrow(train) * 0.8),
                    # var.monotone=c(),
                    # can help with overfitting, will smooth bumpy curves
                    verbose=TRUE
)

summary(model.gbm)
gbm.perf(model.gbm) ## gbm.perf(model.gbm, method="test")
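
## sketch (assumption: `train` and `click` from the fit above are still in memory):
## score the 20% holdout that nTrain set aside and compute the log loss
best.iter <- gbm.perf(model.gbm, method="test")
valid.idx <- (round(nrow(train) * 0.8) + 1):nrow(train)
pred <- predict(model.gbm, newdata=train[valid.idx, ], n.trees=best.iter, type="response")
actual <- click[valid.idx]   # click is numeric 0/1 here
logloss <- -mean(actual * log(pred) + (1 - actual) * log(1 - pred))
logloss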
save(model.gbm, file="gbm_depth3_minobs500_n10000")
load("C:\\Users\\intrepid-honor-803\\Documents\\Kaggle\\Avazu\\Data\\predicted\\1gbm_depth3_minobs500_n5000\\gbm_depth3_minobs500_n5000.rda")
rm(train)

R packages


List of R packages to track for updates

    • ggplot2 / ggvis (interactive)
    • dplyr
    • ff/bigglm
    • reshape2
    • data.table
    • gbm
    • doParallel
    • rattle/caret
    • stringr

http://www-bcf.usc.edu/~gareth/ISL/
http://adv-r.had.co.nz/
http://www.louisaslett.com/RStudio_AMI/
http://www.r-bloggers.com/in-depth-introduction-to-machine-learning-in-15-hours-of-expert-videos/

Thursday, October 2, 2014

Big data in R

Approaches and packages to handle big data in R
This post largely draws on this blog.


Data Storage I/O

  • http://blog.revolutionanalytics.com/2009/12/r-tip-save-time-and-space-by-compressing-data-files.html
  • fread (see the sketch after this list)
  • data.table
  • http://stackoverflow.com/questions/1727772/quickly-reading-very-large-tables-as-dataframes-in-r
  • http://davetang.org/muse/2013/09/03/handling-big-data-in-r/
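
A minimal sketch of the fread approach listed above (the file name is a placeholder):

library(data.table)
# fread auto-detects the separator and column types, and is typically much faster than read.csv
system.time(dt <- fread("train_sample.csv", header=TRUE))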


Data Manipulation

  • dplyr (successor to the plyr package; see the sketch below)
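
A minimal dplyr sketch, assuming the Avazu `train` table from the post above is loaded and `click` is numeric 0/1:

library(dplyr)
train %>%
  group_by(site_category) %>%
  summarise(impressions = n(), ctr = mean(click)) %>%   # click-through rate per site category
  arrange(desc(ctr))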


Data Visualization

  • bigvis
  • ggplot2
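
A small ggplot2 sketch, assuming a data frame `train` with the `hour` bucket and a 0/1 `click` column as in the post above:

library(ggplot2)
# share of clicks vs. non-clicks within each time-of-day bucket
ggplot(train, aes(x = hour, fill = factor(click))) + geom_bar(position = "fill")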

Memory

  • ffbase
  • http://www.slideshare.net/EdwindeJonge1/ffbase
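
A minimal sketch of reading a large CSV into a disk-backed ff data frame (the file name is a placeholder):

library(ff)
library(ffbase)
# rows are read in chunks and stored on disk rather than held in RAM
train.ff <- read.csv.ffdf(file="train.csv", header=TRUE, first.rows=100000, next.rows=500000)
dim(train.ff)
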
Ensemble

  • http://www.r-bloggers.com/improve-predictive-performance-in-r-with-bagging/
  • http://vikparuchuri.com/blog/parallel-r-loops-for-windows-and-linux/
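
A minimal bagging sketch in the spirit of the first link (iris is used purely as a stand-in dataset):

library(rpart)
set.seed(1)
# fit 25 trees on bootstrap resamples and average their predictions
preds <- replicate(25, {
  idx <- sample(nrow(iris), replace=TRUE)
  fit <- rpart(Sepal.Length ~ ., data=iris[idx, ])
  predict(fit, newdata=iris)
})
bagged <- rowMeans(preds)   # bagged prediction for each row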


Parallelization

  • http://adv-r.had.co.nz/Profiling.html#parallelise
  • http://notjustmath.wordpress.com/2012/01/22/parallel-computing-with-r/
  • http://stackoverflow.com/questions/24335569/in-r-how-to-predict-with-svm-model-in-parallel-using-foreach-snow
  • http://topepo.github.io/caret/parallel.html
  • http://stackoverflow.com/questions/7782501/how-to-interpret-predict-result-of-svm-in-r?rq=1
  • http://www.r-bloggers.com/parallel-r-model-prediction-building-and-analytics/
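
A minimal doParallel / foreach sketch along the lines of the links above:

library(doParallel)
cl <- makeCluster(2)          # two worker processes
registerDoParallel(cl)
res <- foreach(i = 1:4, .combine = c) %dopar% {
  sqrt(i)                     # stand-in for a per-chunk model fit or prediction
}
stopCluster(cl)
res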

Thursday, May 1, 2014

Neural Network - Single & Multiple output nodes

##### AllState Prediction Model using a Neural Network Algorithm
##### Work in progress

rm(list=ls())

###### read in file
setwd('C:\\Users\\Ted\\Desktop\\Kaggle\\AllState')
dat1 <- read.csv(file="train.csv", header=T)
test1 <- read.csv(file="test_v2.csv", header=T)

##### overview of data
str(dat1);summary(dat1);nrow(dat1);head(dat1,2)
apply(apply(dat1,2,is.na),2,sum)


##### concatenate the seven coverage options (A-G) into a single plan string p
dat1$p <- apply(dat1[,c("A", "B", "C", "D", "E", "F", "G")],1,paste, collapse='')

##### convert factors into numbers #dat1$st <- NULL
a <- data.frame(sort(unique(dat1$state)), order(sort(unique(dat1$state))))
colnames(a) <- c("state","state_no")
b <- data.frame(sort(unique(dat1$p)), order(sort(unique(dat1$p))))
colnames(b) <- c("p","p_no")
dat1 <- merge(dat1,a, by='state');rm(a)
dat1 <- merge(dat1,b, by="p");rm(b)


##### binarize car value
cat <- levels(dat1$car_value)
cat[1] <- c("u")
#for (i in cat) {cat[i] <- paste("car_value",i, collapse=" ")};cat

binarize <- function(x) {return(dat1$car_value == x)}
newcols <- --sapply(cat, binarize)   # double unary minus coerces the logical matrix to 0/1
colnames(newcols) <- cat
dat1 <- cbind(dat1, newcols)
rm(cat);rm(newcols);rm(binarize)



##### do the same manipulation on test set
apply(apply(test1,2,is.na),2,sum)
test1$p <- apply(test1[,c("A", "B", "C", "D", "E", "F", "G")], 1, paste, collapse='')

a <- data.frame(sort(unique(test1$state)), order(sort(unique(test1$state))))
colnames(a) <- c("state","state_no")
b <- data.frame(sort(unique(test1$p)), order(sort(unique(test1$p))))
colnames(b) <- c("p","p_no")
test1 <- merge(test1,a, by='state');rm(a)
test1 <- merge(test1,b, by="p");rm(b)

cat <- levels(test1$car_value)
cat[1] <- c("u")
binarize <- function(x) {return(test1$car_value == x)}
newcols <- --sapply(cat, binarize)
colnames(newcols) <- cat
test1 <- cbind(test1, newcols)
rm(cat);rm(newcols);rm(binarize)






##### keep only purchase records (record_type == 1)
dat2 <- dat1[dat1$record_type==1,]






##### graph data distribution
hist(dat2$p_no)





##### random forest model
library(randomForest)
formula <- p ~ day + state + location + group_size + homeowner + car_age + car_value + age_oldest +
               age_youngest + married_couple + cost # + risk_factor +  c_previous + duration_previous
model.rf <- randomForest(formula = formula, data=dat2, ntree=100, importance=T)


head(dat2)
##### neural network model
library(neuralnet); args(neuralnet)
model.nn <- neuralnet(p_no ~ day + state_no + location + group_size + homeowner + car_age + car_value + age_oldest +
                        age_youngest + married_couple + cost # + risk_factor +  c_previous + duration_previous
                      , data=dat2, hidden=3, act.fct="logistic",rep = 3, linear.output = F)


m <- model.matrix( ~ p_no + day + state_no + location + group_size + homeowner + car_age +
                     u + a + b + c + d + e + f + g + h + i + age_oldest +
                    age_youngest + married_couple + cost # + risk_factor +  c_previous + duration_previous
                   ,data = dat2)

model.nn <- neuralnet(p_no ~ day + state_no + location + group_size + homeowner + car_age +
                        u + a + b + c + d + e + f + g + h + i + age_oldest +
                        age_youngest + married_couple + cost # + risk_factor +  c_previous + duration_previous
                      ,data=m , hidden = 2, threshold=0.01, linear.output=F)


pred.bin <- prediction(model.nn)
pred.bin$rep1
plot(model.nn, rep="best")
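
## quick scoring sketch: neuralnet::compute expects the covariate columns
## in the same order as the training formula (here taken from the model matrix m)
covars <- c("day","state_no","location","group_size","homeowner","car_age",
            "u","a","b","c","d","e","f","g","h","i",
            "age_oldest","age_youngest","married_couple","cost")
nn.out <- compute(model.nn, m[, covars])
head(nn.out$net.result)   # predicted p_no for each row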





Monday, April 14, 2014



# Date: 4/10/2014
# Ted Kim
# Answer Financial Inc. Data Project
# This code tries to segment the customer base by their attributes to find each cluster's propensity to
# purchase the products offered.
# Decision tree models were used for classification and to validate the statistically significant divergence
# found in the standard deviation of the offered products' prices.


############ get your data
setwd('C:\\Users\\Ted\\Desktop\\Kaggle\\Answer Financial'); getwd()
all.data <-read.csv(file='AFIDA_Data.csv', header=T)
############


############ find min price, SD, and number of products offered
all.data$MIN <- apply(all.data[,c(6,7,8)], 1, min, na.rm=T)
all.data$MIN[all.data$MIN==Inf] <- 0
all.data$OFFER <- 3-apply(apply(all.data,1,is.na),2,sum)
all.data$SD <- apply(all.data[, c(6,7,8)], 1, sd, na.rm=T)
all.data$SD[is.na(all.data$SD)] <- 0
############


#### transform categorical data to binary format
cat <- levels(all.data$C1);cat
binarize <- function(x) {return(all.data$C1 == x)}
newcols <- --sapply(cat, binarize)
colnames(newcols) <- cat
all.data <- cbind(all.data, newcols)
newcols[1:15,]; all.data[1:15,]

cat <- levels(all.data$C2)
binarize <- function(x) {return(all.data$C2 == x)}
newcols <- --sapply(cat, binarize)
colnames(newcols) <- cat
all.data <- cbind(all.data, newcols)

cat <- levels(all.data$C3)
binarize <- function(x) {return(all.data$C3 == x)}
newcols <- --sapply(cat, binarize)
colnames(newcols) <- cat
all.data <- cbind(all.data, newcols)


#### scale the MIN, OFFER, and SD columns
all.data.scale <- cbind(all.data, scale(all.data[,9:11]))
colnames(all.data.scale)[23] <- "S.MIN"
colnames(all.data.scale)[24] <- "S.OFFER"
colnames(all.data.scale)[25] <- "S.SD"


#### discretize SD into 5 buckets
segments <- 5
maxL <- max(all.data.scale$SD)
minL <- min(all.data.scale$SD)
theBreaks <- seq(minL, maxL, by=(maxL-minL)/segments)
all.data.scale$D.SD <- cut(all.data.scale$SD, breaks = theBreaks, include.lowest=F)


rm(newcols); rm(all.data)
rm(cat); rm(binarize)

#head(all.data.scale)
#all.data.scale <- subset(all.data.scale, select = -c(C11))


#### Convert categorical values to numeric values
all.data.scale$C11[all.data.scale$C1=="X"] <- 0
all.data.scale$C11[all.data.scale$C1=="Y"] <- 1
all.data.scale$C22[all.data.scale$C2=="A"] <- 0
all.data.scale$C22[all.data.scale$C2=="B"] <- 1
all.data.scale$C22[all.data.scale$C2=="C"] <- 2
all.data.scale$C22[all.data.scale$C2=="D"] <- 3
all.data.scale$C22[all.data.scale$C2=="E"] <- 4
all.data.scale$C22[all.data.scale$C2=="F"] <- 5
all.data.scale$C33[all.data.scale$C3=="G"] <- 0
all.data.scale$C33[all.data.scale$C3=="H"] <- 1
all.data.scale$C33[all.data.scale$C3=="I"] <- 2

CAT <-do.call(paste, c(all.data.scale[c("C1","C2","C3")], sep=""))
all.data.scale$CAT <- CAT
rm(CAT)


CATID <-do.call(paste, c(all.data.scale[c("CV","C1","C2","C3")], sep=""))
all.data.scale$CATID <- CATID
rm(CATID)

dat1 <- as.data.frame(all.data.scale[,c("CATID","CV","CAT")])


library(reshape)
dat1 <- cast(dat1, CAT ~ CV)
d1 <- as.data.frame(table(all.data.scale[,c(30,2)]))


library(lattice)
barchart( CAT ~ Freq, data = d1 , group = CV, stack = T)






### randomForest model
library(randomForest)
formula <- CV ~ C1+C2+C3
#formula <- CV ~ C11+C22+C33
rf.model <-  randomForest(formula = formula #CV ~ C1+ C2 + C3 #+ MIN + OFFER + SD
                          ,data = all.data.scale
                          ,ntree = 100
                          ,importance=T)
importance(rf.model)



### conditional tree model
library(partykit)
ctree.model <- ctree(CV ~ C1+ C2 + C3 + C1:C2 + C1:C3 + C2:C3 #+ MIN + OFFER + SD
                     ,data = all.data.scale)

ctree.model <- ctree(CV ~ C1 + C2 + C3 #+ S.SD #+ C1:C2 + C1:C3 + C2:C3 #+ MIN + OFFER + SD
                     ,data = all.data.scale)

ctree.model <- ctree(CV ~ C11+ C22 + C33 # + C1:C2 + C1:C3 + C2:C3 #+ MIN + OFFER + SD
                     ,data = all.data.scale)

#plot(ctree.model)
plot(ctree.model, gp = gpar(fontsize = 10)
     ,inner_panel=node_inner
     ,ip_args=list(abbreviate = T, id = F)
    )










# Tree pruning methods with rpart
library(rpart)
library(rpart.plot)
library(RColorBrewer)
library(rattle)
library(partykit)


formula <- CV ~ C1 + C2 + C3
#formula <- CV ~ C11 + C22 + C33

model.rpart <-rpart(formula, data=all.data.scale) #print(model.rpart$cptable)

# here we prune back the large initial tree:

opt<-which.min(model.rpart$cptable[,'xerror'])
cp<-model.rpart$cptable[opt,'CP']

model.rpart.prune <- prune(model.rpart, cp = cp)

plot(as.party(model.rpart.prune),
     tp_args = list(id = FALSE))


fancyRpartPlot(model.rpart.prune)


# summary(model.rpart.prune)
# test_pred <- predict(model.rpart.prune, all.data.scale, type = "class")
# model.rpart.prune$frame
# model.rpart.prune$where


all.data.scale$NODE <- 0
all.data.scale[as.vector(model.rpart.prune$where==4),]$NODE <- 4
all.data.scale[as.vector(model.rpart.prune$where==5),]$NODE <- 5
all.data.scale[as.vector(model.rpart.prune$where==2),]$NODE <- 2


####################################
# find the distribution of the number of splits over 25 cross-validation runs

no.split <- vector(mode = 'integer', length=25)

for ( i in 1:length(no.split)) {
  cp <- rpart(formula
              , data = all.data.scale)$cptable
  no.split[i] <- cp[which.min(cp[,"xerror"]), "nsplit"]
}
table(no.split)
#####################################




all.data.scale[as.vector(model.rpart.prune$where==2),] # node 2
all.data.scale[as.vector(model.rpart.prune$where==4),] # node 4
head(all.data.scale[as.vector(model.rpart.prune$where==5),]) # node 5




library(lattice)
### breakdown of characteristics
barchart( CAT ~ Freq | NODE, data = d1 , group = CV, stack = T)


splom(~all.data.scale[as.vector(model.rpart.prune$where==5),c("OFFER","S.SD","CV","S.MIN")]
      ,pscale = 0, type = c("g", "p", "smooth"))
splom(~all.data.scale[as.vector(model.rpart.prune$where==5),c("OFFER","S.SD","CV","S.MIN")])


splom(~all.data.scale[as.vector(model.rpart.prune$where==4),c("OFFER","S.SD","CV","S.MIN")]
      ,pscale = 0, type = c("g", "p", "smooth"))
splom(~all.data.scale[as.vector(model.rpart.prune$where==4),c("OFFER","S.SD","CV","S.MIN")])



splom(~all.data.scale[as.vector(model.rpart.prune$where==2),c("OFFER","S.SD","CV","S.MIN")]
      ,pscale = 0, type = c("g", "p", "smooth"))
splom(~all.data.scale[as.vector(model.rpart.prune$where==2),c("OFFER","S.SD","CV","S.MIN")])





### customer characteristic distribution for node = 5
barchart( ~ CV | C1 + C2 + C3
          ,data = all.data.scale[as.vector(model.rpart.prune$where==5)
                                 ,c("OFFER","D.SD","CV","C1","C2","C3")]
          ,group = OFFER, stack = T)


### the SD of price differs across nodes
histogram( CV ~ SD | C1+C2+C3
           ,data = all.data.scale[as.vector(model.rpart.prune$where==5)
                                  ,c("OFFER","MIN","SD","D.SD","CV","C1","C2","C3")])




#Price distribution of each node group
histogram( CV ~ MIN | C1 + C2 + C3
           ,data = all.data.scale[as.vector(model.rpart.prune$where==4)
                                  ,c("OFFER","MIN","D.SD","CV","C1","C2","C3")]
)


histogram( CV ~ MIN | C1 + C2 + C3
           ,data = all.data.scale[as.vector(model.rpart.prune$where==4)&all.data.scale$CV==1,c("OFFER","MIN","D.SD","CV","C1","C2","C3")]
)




### distribution of SD
histogram(  ~ SD | CV
            ,data = all.data.scale[as.vector(model.rpart.prune$where==5)
                                   ,c("OFFER","D.SD","SD","CV","C1","C2","C3")])

bwplot(CV~ SD|C1+C2+C3
       ,data = all.data.scale[as.vector(model.rpart.prune$where==5)
                              ,c("OFFER","D.SD","SD","CV","C1","C2","C3")])



### number of offers vs. CV; little effect observed
histogram(  ~ OFFER |  CV
            ,data = all.data.scale[as.vector(model.rpart.prune$where==4)
                                   ,c("OFFER","D.SD","CV","C1","C2","C3")])
histogram(~CV | OFFER
          ,data = all.data.scale[as.vector(model.rpart.prune$where==5),c("OFFER","D.SD","SD","CV","C1","C2","C3")])



histogram( ~SD | C1+C2+C3
           ,data = all.data.scale[as.vector(model.rpart.prune$where==4)&all.data.scale$CV==0, c("OFFER","D.SD","SD","CV","C1","C2","C3")])





#################################################################
#################################################################
# build decision tree including SD feature

library(rpart)
library("partykit")

formula <- CV ~ S.OFFER + S.SD
formula <- CV ~ C11 + C22 + C33 + S.OFFER + S.SD
#formula <- CV ~ X + Y + A + B + C + D + E + F + G + H + I + S.OFFER + S.SD

model.rpart <-rpart(formula, data=all.data.scale)

#print(model.rpart$cptable)
# here we prune back the large initial tree:

opt<-which.min(model.rpart$cptable[,'xerror'])
cp<-model.rpart$cptable[opt,'CP']

model.rpart.prune <- prune(model.rpart, cp = cp)

plot(as.party(model.rpart.prune),
     tp_args = list(id = FALSE))
fancyRpartPlot(model.rpart.prune)


####################################
# find the distribution of the number of splits over 25 cross-validation runs

no.split <- vector(mode = 'integer', length=25)

for ( i in 1:length(no.split)) {
  cp <- rpart(formula, data = all.data.scale)$cptable
  no.split[i] <- cp[which.min(cp[,"xerror"]), "nsplit"]
}
table(no.split)
#####################################