Thursday, May 1, 2014

Neural Network - Single & Multiple output nodes

##### AllState Prediction Model using Neural Network Algorithm
##### Work in progress

# NOTE(review): rm(list = ls()) and setwd() are kept so the script behaves as
# before, but both make it non-portable: the first wipes the caller's
# workspace and the second hard-codes one machine's directory layout.
rm(list = ls())

###### read in file
setwd('C:\\Users\\Ted\\Desktop\\Kaggle\\AllState')
# stringsAsFactors = TRUE is spelled out because later steps call levels() on
# car_value; since R 4.0.0 read.csv() no longer creates factors by default.
dat1 <- read.csv(file = "train.csv", header = TRUE, stringsAsFactors = TRUE)
test1 <- read.csv(file = "test_v2.csv", header = TRUE, stringsAsFactors = TRUE)

##### overview of data
str(dat1)
summary(dat1)
nrow(dat1)
head(dat1, 2)
# Number of missing values per column. is.na() on a data frame already returns
# a logical matrix, so no row/column apply() (and its coercion of every column
# to character) is needed.
colSums(is.na(dat1))


##### p is the full plan: options A-G pasted into one string per row
# (the purchase-only subset, dat2, is cut later from record_type == 1)
plan_cols <- c("A", "B", "C", "D", "E", "F", "G")
# paste0() over whole columns is vectorized; the previous row-wise apply()
# called paste() once per row of the table.
dat1$p <- do.call(paste0, unname(as.list(dat1[, plan_cols])))

##### integer codes for state and plan: the rank of each value in sorted order
state_levels <- sort(unique(dat1$state))
state_lookup <- data.frame(state = state_levels,
                           state_no = seq_along(state_levels))
plan_levels <- sort(unique(dat1$p))
plan_lookup <- data.frame(p = plan_levels,
                          p_no = seq_along(plan_levels))

# merge() is kept (rather than match()) so dat1 ends up with the same row and
# column order as before: key column first, rows sorted by key.
dat1 <- merge(dat1, state_lookup, by = "state")
dat1 <- merge(dat1, plan_lookup, by = "p")
rm(plan_cols, state_levels, state_lookup, plan_levels, plan_lookup)


##### binarize car value: one 0/1 indicator column per car_value level
# factor() makes this work whether car_value was read as a factor or as
# character (levels() on a character vector is NULL).
car_levels <- levels(factor(dat1$car_value))

# Column names for the indicators. The first level is renamed to "u" because
# it is not usable as a column name in the model formulas further down.
# NOTE(review): this assumes the first level is the blank/missing car_value;
# confirm against the data.
car_cols <- car_levels
car_cols[1] <- "u"

# Compare against the ORIGINAL level values, not the renamed labels.
# Previously the rename happened before the comparison, so column "u" tested
# car_value == "u" and was all zeros.
newcols <- vapply(
  car_levels,
  function(lv) as.integer(dat1$car_value == lv),
  integer(nrow(dat1))
)
colnames(newcols) <- car_cols
dat1 <- cbind(dat1, newcols)
rm(car_levels, car_cols, newcols)



##### do the same manipulation on test set
# Number of missing values per column.
colSums(is.na(test1))

# Full plan string, built column-wise (vectorized) from options A-G.
plan_cols <- c("A", "B", "C", "D", "E", "F", "G")
test1$p <- do.call(paste0, unname(as.list(test1[, plan_cols])))

# Reuse the integer codes already assigned on the training set (dat1) instead
# of re-ranking the test set's own values: codes built independently differ
# whenever the two sets do not contain exactly the same states / plans, which
# would make state_no and p_no mean different things in train and test.
state_lookup <- dat1[!duplicated(dat1$state), c("state", "state_no")]
plan_lookup <- dat1[!duplicated(dat1$p), c("p", "p_no")]
# all.x = TRUE keeps test rows whose value never occurs in training; their
# code is NA rather than the row being dropped.
test1 <- merge(test1, state_lookup, by = "state", all.x = TRUE)
test1 <- merge(test1, plan_lookup, by = "p", all.x = TRUE)

# car_value indicators, using the training levels so both sets get the same
# columns. The first level is renamed to "u" for the column name only.
# NOTE(review): assumes the first level is the blank/missing car_value.
car_levels <- levels(factor(dat1$car_value))
car_cols <- car_levels
car_cols[1] <- "u"
# Compare against the original level values; comparing against the renamed
# label "u" (as before) produced an all-zero "u" column.
newcols <- vapply(
  car_levels,
  function(lv) as.integer(test1$car_value == lv),
  integer(nrow(test1))
)
colnames(newcols) <- car_cols
test1 <- cbind(test1, newcols)
rm(plan_cols, state_lookup, plan_lookup, car_levels, car_cols, newcols)






##### keep only the rows with record_type == 1
is_type_one <- dat1$record_type == 1
dat2 <- dat1[is_type_one, ]
rm(is_type_one)

##### distribution of the numeric plan code in that subset
hist(x = dat2$p_no)





##### random forest model
library(randomForest)

# Plan string p modelled from the customer / quote attributes.
# Left out for now: risk_factor, c_previous, duration_previous.
formula <- p ~
  day + state + location + group_size + homeowner +
  car_age + car_value +
  age_oldest + age_youngest + married_couple +
  cost

# 100 trees, with variable-importance measures kept on the fitted object.
model.rf <- randomForest(
  formula = formula,
  data = dat2,
  ntree = 100,
  importance = TRUE
)


head(dat2)
##### neural network model
library(neuralnet)
args(neuralnet)

# neuralnet() needs all-numeric inputs, so car_value enters the model through
# its 0/1 indicator columns (u, a ... i) and state through state_no. A call
# that passes the raw car_value factor cannot be fitted, which is why only the
# model.matrix-based fit below is kept.
# Left out for now: risk_factor, c_previous, duration_previous.
m <- model.matrix(
  ~ p_no + day + state_no + location + group_size + homeowner + car_age +
    u + a + b + c + d + e + f + g + h + i +
    age_oldest + age_youngest + married_couple + cost,
  data = dat2
)

# Single output node predicting the numeric plan code from the columns of m.
# NOTE(review): p_no is an integer rank starting at 1, while
# linear.output = FALSE applies the activation function to the output node;
# with the default logistic activation the fitted values stay in (0, 1).
# Confirm whether the target should be rescaled or linear.output set to TRUE.
model.nn <- neuralnet(
  p_no ~ day + state_no + location + group_size + homeowner + car_age +
    u + a + b + c + d + e + f + g + h + i +
    age_oldest + age_youngest + married_couple + cost,
  data = m,
  hidden = 2,
  threshold = 0.01,
  linear.output = FALSE
)

# Inspect the fit: prediction summary for the first repetition, then a plot of
# the repetition neuralnet considers best.
pred.bin <- prediction(model.nn)
pred.bin$rep1
plot(model.nn, rep = "best")