Monday, April 14, 2014



# Date: 4/10/2014
# Ted Kim
# Answer Financial Inc. Data Project
# This script segments the customer base by their attributes to find each cluster's propensity to
# purchase the products offered.
# Decision tree models are used for classification and to validate the statistically significant
# divergence found in the standard deviation of the offered products' prices.


############ get your data
setwd('C:\\Users\\Ted\\Desktop\\Kaggle\\Answer Financial'); getwd()
all.data <-read.csv(file='AFIDA_Data.csv', header=T)
############


############ derive the minimum price, number of offers, and price SD across the three product columns (6:8)
all.data$MIN <- apply(all.data[,c(6,7,8)], 1, min, na.rm=T)
all.data$MIN[all.data$MIN==Inf] <- 0  # min over an all-NA row returns Inf; treat as no offer
all.data$OFFER <- 3 - rowSums(is.na(all.data[,c(6,7,8)]))  # count non-NA price columns only
all.data$SD <- apply(all.data[, c(6,7,8)], 1, sd, na.rm=T)
all.data$SD[is.na(all.data$SD)] <- 0  # sd is NA when fewer than two prices are present
############
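
# Quick sanity check of the derived columns on a toy frame (hypothetical values,
# assuming the three price columns look like columns 6:8 above):
toy <- data.frame(P1 = c(10, NA, NA), P2 = c(12, 8, NA), P3 = c(NA, 9, NA))
apply(toy, 1, min, na.rm = TRUE)   # 10, 8, Inf -- an all-NA row yields Inf (with a warning)
3 - rowSums(is.na(toy))            # 2, 2, 0 offers
apply(toy, 1, sd, na.rm = TRUE)    # ~1.41, ~0.71, NA -- sd needs at least two prices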


#### transform categorical data to binary format
cat <- levels(all.data$C1);cat
binarize <- function(x) {return(all.data$C1 == x)}
newcols <- 1 * sapply(cat, binarize)  # multiply by 1 to coerce the logicals to 0/1
colnames(newcols) <- cat
all.data <- cbind(all.data, newcols)
newcols[1:15,]; all.data[1:15,]

cat <- levels(all.data$C2)
binarize <- function(x) {return(all.data$C2 == x)}
newcols <- 1 * sapply(cat, binarize)
colnames(newcols) <- cat
all.data <- cbind(all.data, newcols)

cat <- levels(all.data$C3)
binarize <- function(x) {return(all.data$C3 == x)}
newcols <- 1 * sapply(cat, binarize)
colnames(newcols) <- cat
all.data <- cbind(all.data, newcols)
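
# The three blocks above could also be done in one shot with model.matrix
# (a sketch; contrasts = FALSE keeps every factor level as its own indicator
# column, and [, -1] drops the intercept column):
ind <- model.matrix(~ C1 + C2 + C3, data = all.data,
                    contrasts.arg = lapply(all.data[c("C1", "C2", "C3")],
                                           contrasts, contrasts = FALSE))[, -1]
head(ind)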


#### z-score scale the MIN, OFFER, and SD columns
all.data.scale <- cbind(all.data, scale(all.data[, c("MIN", "OFFER", "SD")]))
colnames(all.data.scale)[(ncol(all.data.scale)-2):ncol(all.data.scale)] <- c("S.MIN", "S.OFFER", "S.SD")


#### discretize SD into 5 equal-width buckets
segments <- 5
maxL <- max(all.data.scale$SD)
minL <- min(all.data.scale$SD)
theBreaks <- seq(minL, maxL, by=(maxL-minL)/segments)
all.data.scale$D.SD <- cut(all.data.scale$SD, breaks = theBreaks, include.lowest=T)  # include.lowest keeps rows at the minimum SD from becoming NA
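
# Check how the rows distribute over the five buckets; useNA = "ifany" would
# expose any rows the break points failed to cover:
table(all.data.scale$D.SD, useNA = "ifany")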


rm(newcols); rm(all.data)
rm(cat); rm(binarize)

#head(all.data.scale)
#all.data.scale <- subset(all.data.scale, select = -c(C11))


#### Convert categorical values to numeric values
all.data.scale$C11[all.data.scale$C1=="X"] <- 0
all.data.scale$C11[all.data.scale$C1=="Y"] <- 1
all.data.scale$C22[all.data.scale$C2=="A"] <- 0
all.data.scale$C22[all.data.scale$C2=="B"] <- 1
all.data.scale$C22[all.data.scale$C2=="C"] <- 2
all.data.scale$C22[all.data.scale$C2=="D"] <- 3
all.data.scale$C22[all.data.scale$C2=="E"] <- 4
all.data.scale$C22[all.data.scale$C2=="F"] <- 5
all.data.scale$C33[all.data.scale$C3=="G"] <- 0
all.data.scale$C33[all.data.scale$C3=="H"] <- 1
all.data.scale$C33[all.data.scale$C3=="I"] <- 2

# concatenate the three attribute values into a single category key (e.g. "XAG")
CAT <- do.call(paste, c(all.data.scale[c("C1","C2","C3")], sep=""))
all.data.scale$CAT <- CAT
rm(CAT)


# prepend the outcome flag CV to form a category-plus-outcome key
CATID <- do.call(paste, c(all.data.scale[c("CV","C1","C2","C3")], sep=""))
all.data.scale$CATID <- CATID
rm(CATID)

dat1 <- as.data.frame(all.data.scale[,c("CATID","CV","CAT")])


library(reshape)
dat1 <- cast(dat1, CAT ~ CV)  # with no fun.aggregate, cast defaults to length(), i.e. counts per CAT x CV cell
d1 <- as.data.frame(table(all.data.scale[, c("CAT", "CV")]))  # the same counts in long format for lattice


library(lattice)
barchart( CAT ~ Freq, data = d1 , group = CV, stack = T)
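
# The same breakdown as row proportions, i.e. the outcome mix within each
# category, as a numeric companion to the chart:
prop.table(table(all.data.scale$CAT, all.data.scale$CV), 1)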






### randomForest model
library(randomForest)
# note: CV must be a factor for randomForest to fit a classification forest;
# if it is stored as 0/1, coerce with as.factor(CV) in the formula
formula <- CV ~ C1 + C2 + C3
#formula <- CV ~ C11 + C22 + C33
rf.model <- randomForest(formula = formula  #+ MIN + OFFER + SD
                         ,data = all.data.scale
                         ,ntree = 100
                         ,importance=T)
importance(rf.model)
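
# A quick visual companion to the importance table; varImpPlot() ships with
# randomForest:
varImpPlot(rf.model, main = "Variable importance")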



### conditional inference tree models; each assignment below overwrites the
### previous one, so only the last model is plotted
library(partykit)
# alternative 1: raw factors plus pairwise interactions (note: ctree splits on
# single variables, so interaction terms may be rejected or ignored)
ctree.model <- ctree(CV ~ C1 + C2 + C3 + C1:C2 + C1:C3 + C2:C3  #+ MIN + OFFER + SD
                     ,data = all.data.scale)

# alternative 2: raw factors only
ctree.model <- ctree(CV ~ C1 + C2 + C3  #+ S.SD
                     ,data = all.data.scale)

# alternative 3: integer-coded factors
ctree.model <- ctree(CV ~ C11 + C22 + C33
                     ,data = all.data.scale)

#plot(ctree.model)
plot(ctree.model, gp = gpar(fontsize = 10)
     ,inner_panel = node_inner
     ,ip_args = list(abbreviate = T, id = F)
    )
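
# To profile the segments the tree found, each row can be mapped to its terminal
# node id (a sketch using partykit's predict method):
ctree.nodes <- predict(ctree.model, type = "node")
table(ctree.nodes)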










# Pruned decision tree (rpart)
library(rpart)
library(rpart.plot)
library(RColorBrewer)
library(rattle)
library(partykit)


formula <- CV ~ C1 + C2 + C3
#formula <- CV ~ C11 + C22 + C33

model.rpart <-rpart(formula, data=all.data.scale) #print(model.rpart$cptable)

# here we prune back the large initial tree:

opt <- which.min(model.rpart$cptable[,'xerror'])  # row of cptable with the lowest cross-validated error
cp <- model.rpart$cptable[opt,'CP']

model.rpart.prune <- prune(model.rpart, cp = cp)

plot(as.party(model.rpart.prune),
     tp_args = list(id = FALSE))


fancyRpartPlot(model.rpart.prune)
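
# A more conservative alternative to picking the minimum-xerror cp is the
# one-standard-error rule: take the simplest tree whose xerror is within one SE
# of the minimum (a sketch, not what the analysis above uses):
cpt <- model.rpart$cptable
xerr.1se <- cpt[which.min(cpt[, "xerror"]), "xerror"] + cpt[which.min(cpt[, "xerror"]), "xstd"]
cp.1se <- cpt[which(cpt[, "xerror"] <= xerr.1se)[1], "CP"]
model.rpart.1se <- prune(model.rpart, cp = cp.1se)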


# summary(model.rpart.prune)
# test_pred <- predict(model.rpart.prune, all.data.scale, type = "class")
# model.rpart.prune$frame
# model.rpart.prune$where


### tag each row with its terminal node (frame-row index) from the pruned tree
all.data.scale$NODE <- 0
all.data.scale$NODE[model.rpart.prune$where == 4] <- 4
all.data.scale$NODE[model.rpart.prune$where == 5] <- 5
all.data.scale$NODE[model.rpart.prune$where == 2] <- 2


####################################
# distribution of the optimal number of splits over 25 re-runs; rpart's internal
# cross-validation assigns folds at random, so the selected tree size varies

no.split <- vector(mode = 'integer', length=25)

for ( i in 1:length(no.split)) {
  cp <- rpart(formula
              , data = all.data.scale)$cptable
  no.split[i] <- cp[which.min(cp[,"xerror"]), "nsplit"]
}
table(no.split)
#####################################




# note: $where holds frame-row indices of the terminal nodes, not node labels
all.data.scale[model.rpart.prune$where == 2, ]        # leaf at frame row 2
all.data.scale[model.rpart.prune$where == 4, ]        # leaf at frame row 4
head(all.data.scale[model.rpart.prune$where == 5, ])  # leaf at frame row 5




library(lattice)
### breakdown of characteristics by terminal node
# d1 must be rebuilt with NODE included, otherwise the panel conditioning fails
d1 <- as.data.frame(table(all.data.scale[, c("CAT", "CV", "NODE")]))
barchart( CAT ~ Freq | NODE, data = d1 , group = CV, stack = T)


splom(~all.data.scale[as.vector(model.rpart.prune$where==5),c("OFFER","S.SD","CV","S.MIN")]
      ,pscale = 0, type = c("g", "p", "smooth"))
splom(~all.data.scale[as.vector(model.rpart.prune$where==5),c("OFFER","S.SD","CV","S.MIN")])


splom(~all.data.scale[as.vector(model.rpart.prune$where==4),c("OFFER","S.SD","CV","S.MIN")]
      ,pscale = 0, type = c("g", "p", "smooth"))
splom(~all.data.scale[as.vector(model.rpart.prune$where==4),c("OFFER","S.SD","CV","S.MIN")])



splom(~all.data.scale[as.vector(model.rpart.prune$where==2),c("OFFER","S.SD","CV","S.MIN")]
      ,pscale = 0, type = c("g", "p", "smooth"))
splom(~all.data.scale[as.vector(model.rpart.prune$where==2),c("OFFER","S.SD","CV","S.MIN")])





### customer characteristic distribution for node = 5
barchart( ~ CV | C1 + C2 + C3
          ,data = all.data.scale[as.vector(model.rpart.prune$where==5)
                                 ,c("OFFER","D.SD","CV","C1","C2","C3")]
          ,group = OFFER, stack = T)


### the SD of offered prices differs across nodes
histogram( CV ~ SD | C1+C2+C3
           ,data = all.data.scale[as.vector(model.rpart.prune$where==5)
                                  ,c("OFFER","MIN","SD","D.SD","CV","C1","C2","C3")])




#Price distribution of each node group
histogram( CV ~ MIN | C1 + C2 + C3
           ,data = all.data.scale[as.vector(model.rpart.prune$where==4)
                                  ,c("OFFER","MIN","D.SD","CV","C1","C2","C3")]
)


histogram( CV ~ MIN | C1 + C2 + C3
           ,data = all.data.scale[as.vector(model.rpart.prune$where==4)&all.data.scale$CV==1,c("OFFER","MIN","D.SD","CV","C1","C2","C3")]
)




### distribution of SD
histogram(  ~ SD | CV
            ,data = all.data.scale[as.vector(model.rpart.prune$where==5)
                                   ,c("OFFER","D.SD","SD","CV","C1","C2","C3")])

bwplot(CV~ SD|C1+C2+C3
       ,data = all.data.scale[as.vector(model.rpart.prune$where==5)
                              ,c("OFFER","D.SD","SD","CV","C1","C2","C3")])



### number of offers vs. CV; no strong effect is apparent
histogram(  ~ OFFER |  CV
            ,data = all.data.scale[as.vector(model.rpart.prune$where==4)
                                   ,c("OFFER","D.SD","CV","C1","C2","C3")])
histogram(~CV | OFFER
          ,data = all.data.scale[as.vector(model.rpart.prune$where==5),c("OFFER","D.SD","SD","CV","C1","C2","C3")])



histogram( ~SD | C1+C2+C3
           ,data = all.data.scale[as.vector(model.rpart.prune$where==4)&all.data.scale$CV==0, c("OFFER","D.SD","SD","CV","C1","C2","C3")])





#################################################################
#################################################################
# build a decision tree that includes the SD feature

library(rpart)
library("partykit")

formula <- CV ~ S.OFFER + S.SD                    # overwritten by the next line
formula <- CV ~ C11 + C22 + C33 + S.OFFER + S.SD  # integer-coded factors plus offers and SD
#formula <- CV ~ X + Y + A + B + C + D + E + F + G + H + I + S.OFFER + S.SD

model.rpart <-rpart(formula, data=all.data.scale)

#print(model.rpart$cptable)
# here we prune back the large initial tree:

opt <- which.min(model.rpart$cptable[,'xerror'])
cp <- model.rpart$cptable[opt,'CP']

model.rpart.prune <- prune(model.rpart, cp = cp)

plot(as.party(model.rpart.prune),
     tp_args = list(id = FALSE))
fancyRpartPlot(model.rpart.prune)
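
# A quick in-sample check of the pruned tree (a sketch; assumes CV is a factor so
# rpart fit a classification tree -- coerce with as.factor(CV) in the formula otherwise):
pred <- predict(model.rpart.prune, all.data.scale, type = "class")
table(predicted = pred, actual = all.data.scale$CV)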


####################################
# distribution of the optimal number of splits over 25 re-runs of rpart's
# internal cross-validation (fold assignment is random, so tree size varies)

no.split <- vector(mode = 'integer', length=25)

for ( i in 1:length(no.split)) {
  cp <- rpart(formula, data = all.data.scale)$cptable
  no.split[i] <- cp[which.min(cp[,"xerror"]), "nsplit"]
}
table(no.split)
#####################################