##### Work in progress
# NOTE(review): rm(list = ls()) and setwd() tie this script to a fresh session
# and to one machine's directory layout. Both are kept so the script's side
# effects stay exactly as before.
rm(list = ls())
###### read in the training and test files
setwd('C:\\Users\\Ted\\Desktop\\Kaggle\\AllState')
# stringsAsFactors = TRUE is spelled out because the car_value handling later
# in this script calls levels() on that column, which only works on a factor.
# R >= 4.0.0 reads strings as character unless asked otherwise; older versions
# already behaved this way, so nothing changes there.
dat1 <- read.csv(file = "train.csv", header = TRUE, stringsAsFactors = TRUE)
test1 <- read.csv(file = "test_v2.csv", header = TRUE, stringsAsFactors = TRUE)
##### overview of data
# structure, per-column summary, row count and the first two rows
str(dat1)
summary(dat1)
nrow(dat1)
head(dat1, 2)
# number of missing values in each column (named integer vector)
vapply(dat1, function(column) sum(is.na(column)), integer(1))
##### build the plan string p by pasting options A-G together for every row
dat1$p <- apply(dat1[, LETTERS[1:7]], 1, paste, collapse = '')
##### convert factors into numbers #dat1$st <- NULL
# Lookup tables: every distinct value gets the position it has among the
# sorted distinct values (1, 2, 3, ...). Both tables are built before any
# merge is done.
state_lookup <- data.frame(state = sort(unique(dat1$state)))
state_lookup$state_no <- seq_len(nrow(state_lookup))
plan_lookup <- data.frame(p = sort(unique(dat1$p)))
plan_lookup$p_no <- seq_len(nrow(plan_lookup))
# merge() attaches the codes as new columns state_no and p_no; it also moves
# the key column to the front and sorts the rows on it.
dat1 <- merge(dat1, state_lookup, by = "state")
dat1 <- merge(dat1, plan_lookup, by = "p")
rm(state_lookup, plan_lookup)
##### binarize car value
# Adds one 0/1 indicator column to dat1 per level of car_value. The columns
# are named after the levels, except that the first level's column is named
# "u" (presumably the first level is the empty string of a missing car value
# -- TODO confirm against the data).
# as.factor() keeps an existing factor untouched and also copes with a
# character column.
car_levels <- levels(as.factor(dat1$car_value))
# The indicators are computed against the real level labels and only then is
# the first column renamed: comparing car_value with "u" itself would never
# match a row and would leave column u all zeros.
car_dummies <- vapply(
  car_levels,
  function(level) as.integer(dat1$car_value == level),
  integer(nrow(dat1))
)
dummy_names <- car_levels
dummy_names[1] <- "u"
colnames(car_dummies) <- dummy_names
dat1 <- cbind(dat1, car_dummies)
rm(car_levels, car_dummies, dummy_names)
##### do the same manipulation on test set
# number of missing values in each column
vapply(test1, function(column) sum(is.na(column)), integer(1))
# plan string p from options A-G, as on the training set
test1$p <- apply(test1[, c("A", "B", "C", "D", "E", "F", "G")], 1, paste, collapse = '')
# Reuse the integer codes already assigned on the training set, so that a
# given state or plan has the same number in both data sets. Deriving the
# codes from the test values alone gives different numbers as soon as the two
# sets of distinct values differ.
state_codes <- unique(dat1[, c("state", "state_no")])
plan_codes <- unique(dat1[, c("p", "p_no")])
# all.x = TRUE keeps test rows whose state or plan never occurs in the
# training data; their code is NA.
test1 <- merge(test1, state_codes, by = "state", all.x = TRUE)
test1 <- merge(test1, plan_codes, by = "p", all.x = TRUE)
rm(state_codes, plan_codes)
# car_value indicators: one 0/1 column per training level, so the test set
# gets the same columns as the training set. The first level's column is
# named "u". The comparison uses the real level labels; comparing with "u"
# itself would never match a row.
# NOTE(review): a car_value level present only in the test set gets no
# column -- confirm none exists.
car_levels <- levels(as.factor(dat1$car_value))
car_dummies <- vapply(
  car_levels,
  function(level) as.integer(test1$car_value == level),
  integer(nrow(test1))
)
dummy_names <- car_levels
dummy_names[1] <- "u"
colnames(car_dummies) <- dummy_names
test1 <- cbind(test1, car_dummies)
rm(car_levels, car_dummies, dummy_names)
##### final cut: dat2 keeps only the rows of dat1 whose record_type is 1
dat2 <- dat1[dat1[["record_type"]] == 1, ]
##### graph data distribution
# histogram of the integer plan codes among the kept rows
hist(dat2$p_no)
##### random forest model
library(randomForest)
# Model the plan string p from the quote attributes. risk_factor, c_previous
# and duration_previous are currently commented out of the formula.
formula <- p ~ day + state + location + group_size + homeowner + car_age +
  car_value + age_oldest + age_youngest + married_couple + cost
# + risk_factor + c_previous + duration_previous
# p is a character vector of plan codes. randomForest() only fits a
# classification forest when the response is a factor, so the response is
# converted on a copy; dat2 itself is left untouched.
rf_train <- dat2
rf_train$p <- factor(rf_train$p)
model.rf <- randomForest(
  formula = formula,
  data = rf_train,
  ntree = 100,
  importance = TRUE
)
rm(rf_train)
head(dat2)
##### neural network model
library(neuralnet)
args(neuralnet)
# First fit: 3 hidden units, 3 repetitions, car_value passed as a single
# column.
# NOTE(review): model.nn is assigned again by the second fit below, so this
# result is not kept.
model.nn <- neuralnet(
  p_no ~ day + state_no + location + group_size + homeowner + car_age +
    car_value + age_oldest + age_youngest + married_couple + cost,
  # + risk_factor + c_previous + duration_previous
  data = dat2,
  hidden = 3,
  act.fct = "logistic",
  rep = 3,
  linear.output = FALSE
)
# Numeric design matrix holding the response p_no, the predictors and the
# car_value indicator columns u, a, ..., i in place of car_value.
m <- model.matrix(
  ~ p_no + day + state_no + location + group_size + homeowner + car_age +
    u + a + b + c + d + e + f + g + h + i +
    age_oldest + age_youngest + married_couple + cost,
  # + risk_factor + c_previous + duration_previous
  data = dat2
)
# Second fit on the design matrix: 2 hidden units, stopping threshold 0.01.
model.nn <- neuralnet(
  p_no ~ day + state_no + location + group_size + homeowner + car_age +
    u + a + b + c + d + e + f + g + h + i +
    age_oldest + age_youngest + married_couple + cost,
  # + risk_factor + c_previous + duration_previous
  data = m,
  hidden = 2,
  threshold = 0.01,
  linear.output = FALSE
)
# predictions of the fitted network, then those stored under rep1
pred.bin <- prediction(model.nn)
pred.bin$rep1
# draw the network for the best repetition
plot(model.nn, rep = "best")