# Date: 4/10/2014
# Ted Kim
# Answer Financial Inc. Data Project
# This code segments the customer base by their attributes to estimate each cluster's
# propensity to purchase the products offered.
# Decision tree models are used for classification and to validate the statistically
# significant divergence found in the standard deviation of offered product prices.
############ get your data
setwd('C:\\Users\\Ted\\Desktop\\Kaggle\\Answer Financial'); getwd()
all.data <-read.csv(file='AFIDA_Data.csv', header=T)
############
############ derive MIN, SD, and number of products offered from the price columns (6:8)
all.data$MIN <- apply(all.data[,c(6,7,8)], 1, min, na.rm=T)
all.data$MIN[all.data$MIN==Inf] <- 0  # min() over an all-NA row returns Inf
# count non-missing prices only; the original summed NAs across every column
all.data$OFFER <- 3 - rowSums(is.na(all.data[, c(6,7,8)]))
all.data$SD <- apply(all.data[, c(6,7,8)], 1, sd, na.rm=T)
all.data$SD[is.na(all.data$SD)] <- 0  # sd() is NA when fewer than two prices are quoted
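# A quick sanity check (illustrative, not part of the original pipeline): OFFER should
# lie in 0..3, and SD must be 0 wherever fewer than two prices were quoted.
summary(all.data[, c("MIN", "OFFER", "SD")])
stopifnot(all(all.data$SD[all.data$OFFER < 2] == 0))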
############
#### transform categorical data to binary (one-hot) format
# 'lvls' avoids masking base::cat(); '1 *' turns the logical matrix into 0/1
lvls <- levels(all.data$C1); lvls
binarize <- function(x) {return(all.data$C1 == x)}
newcols <- 1 * sapply(lvls, binarize)
colnames(newcols) <- lvls
all.data <- cbind(all.data, newcols)
newcols[1:15,]; all.data[1:15,]
lvls <- levels(all.data$C2)
binarize <- function(x) {return(all.data$C2 == x)}
newcols <- 1 * sapply(lvls, binarize)
colnames(newcols) <- lvls
all.data <- cbind(all.data, newcols)
lvls <- levels(all.data$C3)
binarize <- function(x) {return(all.data$C3 == x)}
newcols <- 1 * sapply(lvls, binarize)
colnames(newcols) <- lvls
all.data <- cbind(all.data, newcols)
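# The three blocks above repeat one pattern; a small helper (a sketch, assuming the
# same factor columns as above) would remove the duplication:
binarize.col <- function(df, col) {
  lvls <- levels(df[[col]])
  m <- 1 * sapply(lvls, function(x) df[[col]] == x)  # 0/1 indicator matrix
  colnames(m) <- lvls
  m
}
# usage, equivalent to the C1 block above: cbind(all.data, binarize.col(all.data, "C1"))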
#### scale MIN, OFFER, and SD
all.data.scale <- cbind(all.data, scale(all.data[, c("MIN","OFFER","SD")]))
# the scaled copies land in the last three columns; name them by position
colnames(all.data.scale)[(ncol(all.data.scale)-2):ncol(all.data.scale)] <- c("S.MIN","S.OFFER","S.SD")
#### discretize SD into 5 equal-width buckets
segments <- 5
maxL <- max(all.data.scale$SD)
minL <- min(all.data.scale$SD)
theBreaks <- seq(minL, maxL, by=(maxL-minL)/segments)
# include.lowest=T keeps the minimum SD value from becoming NA
all.data.scale$D.SD <- cut(all.data.scale$SD, breaks = theBreaks, include.lowest=T)
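# Illustrative check: with include.lowest=T every row should fall into one of the 5 buckets
table(all.data.scale$D.SD, useNA = "ifany")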
rm(newcols); rm(all.data)
rm(lvls); rm(binarize)
#head(all.data.scale)
#all.data.scale <- subset(all.data.scale, select = -c(C11))
#### Convert categorical values to numeric values
all.data.scale$C11[all.data.scale$C1=="X"] <- 0
all.data.scale$C11[all.data.scale$C1=="Y"] <- 1
all.data.scale$C22[all.data.scale$C2=="A"] <- 0
all.data.scale$C22[all.data.scale$C2=="B"] <- 1
all.data.scale$C22[all.data.scale$C2=="C"] <- 2
all.data.scale$C22[all.data.scale$C2=="D"] <- 3
all.data.scale$C22[all.data.scale$C2=="E"] <- 4
all.data.scale$C22[all.data.scale$C2=="F"] <- 5
all.data.scale$C33[all.data.scale$C3=="G"] <- 0
all.data.scale$C33[all.data.scale$C3=="H"] <- 1
all.data.scale$C33[all.data.scale$C3=="I"] <- 2
all.data.scale$CAT <- do.call(paste0, all.data.scale[c("C1","C2","C3")])
all.data.scale$CATID <- do.call(paste0, all.data.scale[c("CV","C1","C2","C3")])
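# Illustrative peek at the composite keys just built
head(all.data.scale[, c("CV", "C1", "C2", "C3", "CAT", "CATID")])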
dat1 <- as.data.frame(all.data.scale[,c("CATID","CV","CAT")])
library(reshape)
dat1 <- cast(dat1, CAT ~ CV, fun.aggregate = length)  # count rows per CAT x CV cell
d1 <- as.data.frame(table(all.data.scale[, c("CAT","CV")]))  # long-format counts for the lattice plots
library(lattice)
barchart( CAT ~ Freq, data = d1 , group = CV, stack = T)
### randomForest model
library(randomForest)
formula <- CV ~ C1+C2+C3
#formula <- CV ~ C11+C22+C33
rf.model <- randomForest(formula = formula #CV ~ C1+ C2 + C3 #+ MIN + OFFER + SD
,data = all.data.scale
,ntree = 100
,importance=T)
importance(rf.model)
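# A sketch of further inspection, assuming CV is a 0/1 outcome: coercing it to a
# factor makes randomForest run in classification mode, which exposes the
# out-of-bag confusion matrix alongside the importance scores.
rf.class <- randomForest(factor(CV) ~ C1 + C2 + C3, data = all.data.scale,
                         ntree = 100, importance = T)
rf.class$confusion   # out-of-bag confusion matrix
varImpPlot(rf.class) # graphical counterpart of importance()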
### conditional inference tree models (each ctree() call below overwrites the previous; only the last is plotted)
library(partykit)
ctree.model <- ctree(CV ~ C1+ C2 + C3 + C1:C2 + C1:C3 + C2:C3 #+ MIN + OFFER + SD
,data = all.data.scale)
ctree.model <- ctree(CV ~ C1 + C2 + C3 #+ S.SD #+ C1:C2 + C1:C3 + C2:C3 #+ MIN + OFFER + SD
,data = all.data.scale)
ctree.model <- ctree(CV ~ C11+ C22 + C33 # + C1:C2 + C1:C3 + C2:C3 #+ MIN + OFFER + SD
,data = all.data.scale)
#plot(ctree.model)
plot(ctree.model, gp = gpar(fontsize = 10)
,inner_panel=node_inner
,ip_args=list(abbreviate = T, id = F)
)
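# Illustrative: terminal-node membership sizes of the (last) fitted ctree
table(predict(ctree.model, type = "node"))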
#Prune tree methods
library(rpart)
library(rpart.plot)
library(RColorBrewer)
library(rattle)
library(partykit)
formula <- CV ~ C1 + C2 + C3
#formula <- CV ~ C11 + C22 + C33
model.rpart <-rpart(formula, data=all.data.scale) #print(model.rpart$cptable)
# here we prune back the large initial tree:
opt<-which.min(model.rpart$cptable[,'xerror'])
cp<-model.rpart$cptable[opt,'CP']
model.rpart.prune <- prune(model.rpart, cp = cp)
plot(as.party(model.rpart.prune),
tp_args = list(id = FALSE))
fancyRpartPlot(model.rpart.prune)
# summary(model.rpart.prune)
# test_pred <- predict(model.rpart.prune, all.data.scale, type = "class")
# model.rpart.prune$frame
# model.rpart.prune$where
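# A minimal in-sample accuracy check (a sketch, assuming CV was fit as a class
# outcome so type = "class" applies; in-sample, so the estimate is optimistic):
test_pred <- predict(model.rpart.prune, all.data.scale, type = "class")
mean(test_pred == all.data.scale$CV)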
# record each observation's terminal leaf ($where holds frame row numbers, not node ids)
all.data.scale$NODE <- 0
all.data.scale$NODE[model.rpart.prune$where == 2] <- 2
all.data.scale$NODE[model.rpart.prune$where == 4] <- 4
all.data.scale$NODE[model.rpart.prune$where == 5] <- 5
####################################
# distribution of the number of splits over 25 refits; xerror varies with rpart's random CV folds
no.split <- vector(mode = 'integer', length=25)
for ( i in 1:length(no.split)) {
cp <- rpart(formula
, data = all.data.scale)$cptable
no.split[i] <- cp[which.min(cp[,"xerror"]), "nsplit"]
}
table(no.split)
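# The spread above comes from rpart's random CV folds; fixing the seed (illustrative)
# makes a single refit reproducible:
set.seed(42)
rpart(formula, data = all.data.scale)$cptable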
#####################################
all.data.scale[as.vector(model.rpart.prune$where==2),] # leaf at frame row 2
all.data.scale[as.vector(model.rpart.prune$where==4),] # leaf at frame row 4
head(all.data.scale[as.vector(model.rpart.prune$where==5),]) # leaf at frame row 5
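# Illustrative summary: observations and conversions per terminal node
table(all.data.scale$NODE, all.data.scale$CV)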
library(lattice)
### breakdown of characteristics by terminal node
d1 <- as.data.frame(table(all.data.scale[, c("CAT","CV","NODE")]))  # rebuild counts; the earlier d1 has no NODE column
barchart( CAT ~ Freq | NODE, data = d1 , group = CV, stack = T)
splom(~all.data.scale[as.vector(model.rpart.prune$where==5),c("OFFER","S.SD","CV","S.MIN")]
,pscale = 0, type = c("g", "p", "smooth"))
splom(~all.data.scale[as.vector(model.rpart.prune$where==5),c("OFFER","S.SD","CV","S.MIN")])
splom(~all.data.scale[as.vector(model.rpart.prune$where==4),c("OFFER","S.SD","CV","S.MIN")]
,pscale = 0, type = c("g", "p", "smooth"))
splom(~all.data.scale[as.vector(model.rpart.prune$where==4),c("OFFER","S.SD","CV","S.MIN")])
splom(~all.data.scale[as.vector(model.rpart.prune$where==2),c("OFFER","S.SD","CV","S.MIN")]
,pscale = 0, type = c("g", "p", "smooth"))
splom(~all.data.scale[as.vector(model.rpart.prune$where==2),c("OFFER","S.SD","CV","S.MIN")])
### customer characteristic distribution for node = 5
barchart( ~ CV | C1 + C2 + C3
,data = all.data.scale[as.vector(model.rpart.prune$where==5)
,c("OFFER","D.SD","CV","C1","C2","C3")]
,group = OFFER, stack = T)
### SD of price differs across the nodes; histogram() takes a one-sided formula, so CV joins the conditioning
histogram( ~ SD | C1+C2+C3+factor(CV)
,data = all.data.scale[as.vector(model.rpart.prune$where==5)
,c("OFFER","MIN","SD","D.SD","CV","C1","C2","C3")])
#Price distribution of each node group
histogram( ~ MIN | C1 + C2 + C3 + factor(CV)
,data = all.data.scale[as.vector(model.rpart.prune$where==4)
,c("OFFER","MIN","D.SD","CV","C1","C2","C3")]
)
histogram( ~ MIN | C1 + C2 + C3
,data = all.data.scale[as.vector(model.rpart.prune$where==4)&all.data.scale$CV==1,c("OFFER","MIN","D.SD","CV","C1","C2","C3")]
)
### distribution of SD
histogram( ~ SD | CV
,data = all.data.scale[as.vector(model.rpart.prune$where==5)
,c("OFFER","D.SD","SD","CV","C1","C2","C3")])
bwplot(CV~ SD|C1+C2+C3
,data = all.data.scale[as.vector(model.rpart.prune$where==5)
,c("OFFER","D.SD","SD","CV","C1","C2","C3")])
### effect of number of offers on CV: little effect is apparent
histogram( ~ OFFER | CV
,data = all.data.scale[as.vector(model.rpart.prune$where==4)
,c("OFFER","D.SD","CV","C1","C2","C3")])
histogram(~CV | OFFER
,data = all.data.scale[as.vector(model.rpart.prune$where==5),c("OFFER","D.SD","SD","CV","C1","C2","C3")])
histogram( ~SD | C1+C2+C3
,data = all.data.scale[as.vector(model.rpart.prune$where==4)&all.data.scale$CV==0, c("OFFER","D.SD","SD","CV","C1","C2","C3")])
#################################################################
#################################################################
# build decision tree including SD feature
library(rpart)
library("partykit")
#formula <- CV ~ S.OFFER + S.SD
formula <- CV ~ C11 + C22 + C33 + S.OFFER + S.SD
#formula <- CV ~ X + Y + A + B + C + D + E + F + G + H + I + S.OFFER + S.SD
model.rpart <-rpart(formula, data=all.data.scale)
#print(model.rpart$cptable)
# here we prune back the large initial tree:
opt<-which.min(model.rpart$cptable[,'xerror'])
cp<-model.rpart$cptable[opt,'CP']
model.rpart.prune <- prune(model.rpart, cp = cp)
plot(as.party(model.rpart.prune),
tp_args = list(id = FALSE))
fancyRpartPlot(model.rpart.prune)
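# A sketch comparing the two candidate formulas by minimum cross-validated error
# (illustrative; the numbers vary with the random CV folds):
for (f in list(CV ~ S.OFFER + S.SD, CV ~ C11 + C22 + C33 + S.OFFER + S.SD)) {
  cpt <- rpart(f, data = all.data.scale)$cptable
  cat(deparse(f), ": min xerror =", min(cpt[, "xerror"]), "\n")
}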
####################################
# distribution of the number of splits over 25 refits; xerror varies with rpart's random CV folds
no.split <- vector(mode = 'integer', length=25)
for ( i in 1:length(no.split)) {
cp <- rpart(formula, data = all.data.scale)$cptable
no.split[i] <- cp[which.min(cp[,"xerror"]), "nsplit"]
}
table(no.split)
#####################################