Saturday, August 31, 2013

Kaggle Titanic Machine Learning



Modeling missing ages for the kaggle.com
Titanic Machine Learning Competition

I have been working on this competition and would like to post the code I used for preliminary data preparation:
some quick code and plots that I used to choose a model for filling in the missing ages in the train dataset.

Comparison between a linear model, a random forest and a conditional random forest

# Load the training data. file.choose() opens an interactive file picker,
# so this line has to be run from an interactive R session.
train.data <- read.csv(file.choose(), header = TRUE)

# Number of missing values in each column. colSums(is.na()) works on the
# data frame directly, without coercing it to a matrix via apply(), and
# still works when the data frame has a single row.
colSums(is.na(train.data))


##### Categorize Cabin #####
# Reduce each Cabin value to its first character (presumably the deck
# letter -- confirm against the data dictionary) and encode every distinct
# character as a number, in order of first appearance.
cabin_letter <- substr(train.data$Cabin, 1, 1)

# Lookup table kept under its original name and column names (V1 = letter,
# V2 = numeric code) for any later code that wants to decode CabinDT.
CabinDT.lookup <- data.frame(
  V1 = unique(cabin_letter),
  V2 = as.numeric(seq_along(unique(cabin_letter))),
  stringsAsFactors = FALSE
)

# match() assigns the codes row by row. Unlike merge() against the lookup
# table, it does not re-sort the rows of train.data or move the key column
# to the front, and the codes never pass through a factor, so they are the
# same regardless of the stringsAsFactors default of the R version in use.
train.data$CabinDT <- CabinDT.lookup$V2[match(cabin_letter, CabinDT.lookup$V1)]

#### name prefix set up

# Extract the name prefix: the text starting two characters after the first
# comma and ending at the first period that is followed by whitespace (the
# period is included). A name of the form "Surname, Mr. Given" yields "Mr.".
passenger_name <- as.character(train.data[, "Name"])
prefix_start <- regexpr(",", passenger_name) + 2
prefix_stop <- regexpr("\\.\\s", passenger_name)
prefix_label <- substr(passenger_name, prefix_start, prefix_stop)

# Frequency of each extracted prefix (a compact replacement for printing
# the whole per-passenger vector).
table(prefix_label, useNA = "ifany")

# Lookup table kept under its original name and column names (V1 = prefix
# text, V2 = numeric code in order of first appearance).
prefix.lookup <- data.frame(
  V1 = unique(prefix_label),
  V2 = as.numeric(seq_along(unique(prefix_label))),
  stringsAsFactors = FALSE
)

# Encode with match() instead of merge(): the row order and column order of
# train.data are left untouched, and the codes do not depend on factor
# level ordering.
# NOTE(review): the codes are arbitrary numbers, yet the models below use
# `prefix` as a numeric predictor -- consider a factor instead; confirm
# that this is intended.
train.data$prefix <- prefix.lookup$V2[match(prefix_label, prefix.lookup$V1)]
str(train.data)






##### predict age by linear model, random forest and conditional forest #####
fm <- Age ~ Pclass + SibSp + Fare + Parch + prefix

# Training rows for the two forests: only the variables that appear in the
# formula have to be observed. Filtering on every column of train.data
# would also discard rows whose only gap is in a column the models never
# use.
age_known <- train.data[complete.cases(train.data[, all.vars(fm)]), ]

# lm() is given the full data; under the default na.action (na.omit) it
# drops the incomplete rows itself.
age.model.lm <- lm(fm, data = train.data)

library(randomForest)
library(party)  # provides cforest(), which was called without being loaded

# Both forests draw random samples; fix the seed so results are repeatable.
set.seed(2013)

# Age is numeric, so randomForest() fits a regression forest on its own.
# The former method = 'anova' is an rpart() option, not a randomForest()
# argument, and was silently ignored.
age.model.rf <- randomForest(fm, data = age_known)

age.model.cf <- cforest(fm, data = age_known)


# Predict age for every row of train.data with each of the three models.
pred.age.lm <- predict(age.model.lm, newdata = train.data)
pred.age.rf <- predict(age.model.rf, newdata = train.data)
pred.age.cf <- predict(age.model.cf, newdata = train.data)

# Convert each prediction to a data frame so that the first column can be
# taken uniformly with [, 1] whatever shape predict() returned.
pred.age.lm <- as.data.frame(pred.age.lm)
pred.age.rf <- as.data.frame(pred.age.rf)
pred.age.cf <- as.data.frame(pred.age.cf)

# Side-by-side matrix: observed Age (column 'train.data') next to the
# lm / rf / cf predictions, one row per passenger.
pred.comp <- cbind(
  train.data = train.data[, "Age"],
  lm = pred.age.lm[, 1],
  rf = pred.age.rf[, 1],
  cf = pred.age.cf[, 1]
)

summary(pred.comp)
nrow(pred.comp)
head(pred.comp)
library(ggplot2)
library(reshape2)

# Reshape the comparison matrix to long format: one row per
# (row index, column name, value) triple, renamed to ID / model / age.
# Missing ages are kept (na.rm = FALSE).
pred.comp.melt <- melt(pred.comp, na.rm = FALSE)
colnames(pred.comp.melt) <- c("ID", "model", "age")
head(pred.comp.melt)

#qplot(Value~Var1|Var2, data=pred.comp.melt)

# Shared scatter layer: age against row index, coloured by source
# (observed data or one of the three models). Built with ggplot() because
# qplot() is deprecated in current ggplot2 releases.
age_scatter <- ggplot(pred.comp.melt, aes(x = ID, y = age, colour = model)) +
  geom_point()

# The plots are wrapped in print() so they are also drawn when the script
# is run through source(), where top-level values are not auto-printed.
# `size` is kept for the line width because it is accepted by old and new
# ggplot2 alike (recent versions prefer `linewidth`).

# Straight-line trend per series, no confidence band.
print(age_scatter + geom_smooth(method = "lm", se = FALSE, size = 1.2))

# Default smoother per series with a 50% confidence band.
print(age_scatter + stat_smooth(level = 0.5, size = 1.2))

# Default smoother per series, no confidence band.
print(age_scatter + geom_smooth(se = FALSE, size = 1.2))

# Distribution of observed and predicted ages, one box per series.
boxplot(age ~ model, data = pred.comp.melt)

#par(mfrow=c(1,4))
#layout(c=(1,4))
#par(mfrow=c(1,1))



##### age prediction validation #####
#nrow(train.data[!complete.cases(train.data),])
# Re-count the missing values per column after the preparation steps.
# colSums(is.na()) operates on the data frame directly instead of coercing
# it to a matrix through nested apply() calls.
colSums(is.na(train.data))