Modeling missing Age values for the kaggle.com
Titanic Machine Learning Competition
I have been working on this competition and would like to share the code I used for preliminary data preparation.
It contains quick code and plots supporting my choice of model for filling in the missing ages in the training dataset.
Comparison between a linear model, a random forest, and a conditional random forest.
# Load the training data. file.choose() opens an interactive file picker, so
# this script is meant to be run from an interactive session.
train.data <- read.csv(file.choose(), header = TRUE)
# Count missing values per column to see which variables need imputation.
# is.na() on a data frame already returns a logical matrix, so colSums() gives
# the per-column counts without coercing the data through apply().
colSums(is.na(train.data))
##### Categorize Cabin #####
# Reduce each Cabin value to its first character and encode that character as
# a numeric code, numbered in order of first appearance.
cabin.letter <- substr(as.character(train.data$Cabin), 1, 1)
# Lookup table, kept under its original name and column names (V1 = first
# character, V2 = code). It is built with typed columns so V2 is numeric from
# the start: going through cbind() yields a character matrix, and as.numeric()
# on the converted column returns factor level codes instead of the stored
# numbers whenever strings are turned into factors (the default before R 4.0).
CabinDT.lookup <- data.frame(V1 = unique(cabin.letter),
                             V2 = as.numeric(seq_along(unique(cabin.letter))),
                             stringsAsFactors = FALSE)
# match() encodes in place; unlike merge() it neither reorders the rows of
# train.data nor moves the key column to the front.
train.data$CabinDT <- CabinDT.lookup$V2[match(cabin.letter, CabinDT.lookup$V1)]
#### name prefix set up
# Extract the text that starts two characters after the first comma in Name and
# runs up to and including the first period that is followed by whitespace;
# for a name such as "Braund, Mr. Owen Harris" this yields "Mr.".
name.text <- as.character(train.data[, 'Name'])
train.data$prefix <- substr(name.text,
                            regexpr(',', name.text) + 2,
                            regexpr('\\.\\s', name.text))
train.data$prefix
# Lookup table, kept under its original name and column names (V1 = prefix
# text, V2 = code, numbered in order of first appearance). Typed columns keep
# V2 numeric: converting a cbind() character matrix and calling as.numeric()
# returns factor level codes when strings become factors (default before
# R 4.0), which scrambles the numbering once there are more than 9 prefixes.
prefix.lookup <- data.frame(V1 = unique(train.data$prefix),
                            V2 = as.numeric(seq_along(unique(train.data$prefix))),
                            stringsAsFactors = FALSE)
# match() replaces the text by its code in place; unlike merge() it keeps the
# rows of train.data in their current order.
train.data$prefix <- prefix.lookup$V2[match(train.data$prefix, prefix.lookup$V1)]
str(train.data)
##### predict age: linear model vs. random forest vs. conditional forest #####
library(randomForest)
# cforest() was called without loading a package that provides it. It is
# exported by party (and also by partykit); party is assumed here -- adjust if
# the analysis was originally run with partykit.
library(party)

# One formula shared by all three models so they are compared on equal terms.
fm <- Age ~ Pclass + SibSp + Fare + Parch + prefix

# Rows used to fit the two forests: only the variables that appear in the
# formula are checked for completeness, so a missing value in an unrelated
# column no longer removes a usable training row.
train.complete <- train.data[complete.cases(train.data[, all.vars(fm)]), ]

# lm() is given the full data and handles incomplete rows through its
# na.action setting.
age.model.lm <- lm(fm, data = train.data)
# 'method' is not a randomForest() argument, so the former method='anova' was
# dropped.
age.model.rf <- randomForest(fm, data = train.complete)
age.model.cf <- cforest(fm, data = train.complete)

# Predict for every row of train.data, including rows whose Age is missing.
pred.age.lm <- predict(age.model.lm, newdata = train.data)
pred.age.rf <- predict(age.model.rf, newdata = train.data)
pred.age.cf <- predict(age.model.cf, newdata = train.data)
pred.age.lm <- as.data.frame(pred.age.lm)
pred.age.rf <- as.data.frame(pred.age.rf)
pred.age.cf <- as.data.frame(pred.age.cf)

# Side-by-side matrix: observed Age next to the three sets of predictions.
pred.comp <- cbind(train.data[, 'Age'],
                   pred.age.lm[, 1],
                   pred.age.rf[, 1],
                   pred.age.cf[, 1])
colnames(pred.comp) <- c('train.data', 'lm', 'rf', 'cf')
summary(pred.comp)
nrow(pred.comp)
head(pred.comp)
library(ggplot2)
library(reshape2)
# Reshape the comparison matrix to long form: one row per matrix cell, holding
# the row index, the column name and the value. Missing ages are kept
# (na.rm = FALSE).
pred.comp.melt <- melt(pred.comp, na.rm = FALSE)
colnames(pred.comp.melt) <- c('ID', 'model', 'age')
head(pred.comp.melt)
#qplot(Value~Var1|Var2, data=pred.comp.melt)
# Scatter of age against row index, coloured by source (observed / lm / rf /
# cf). Built with ggplot() + geom_point() instead of the deprecated qplot();
# the three plots below differ only in the smoother drawn on top.
age.scatter <- ggplot(pred.comp.melt, aes(x = ID, y = age, colour = model)) +
  geom_point()
# Straight-line trend per series; se = FALSE replaces the former level = 0,
# which only served to collapse the confidence band to zero width.
age.scatter +
  geom_smooth(method = 'lm', se = FALSE, size = 1.2)
# Default smoother with a 50% confidence band.
age.scatter +
  stat_smooth(level = 0.5, size = 1.2)
# Default smoother without a confidence band.
age.scatter +
  geom_smooth(se = FALSE, size = 1.2)
# Distribution of observed and predicted ages, one box per series.
boxplot(age ~ model, data = pred.comp.melt)
#par(mfrow=c(1,4))
#layout(c=(1,4))
#par(mfrow=c(1,1))
##### age prediction validation #####
#nrow(train.data[!complete.cases(train.data),])
# Re-count missing values per column of train.data. colSums(is.na()) works on
# the data frame directly instead of coercing it to a matrix through nested
# apply() calls.
colSums(is.na(train.data))