######
# Data Manipulation
######
# Load the training set exactly once. 'train.csv' in the working directory is
# used when it exists; otherwise fall back to an interactive file picker.
# (Previously the file was read from 'train.csv' and then immediately re-read
# from a picker, so the first read was discarded and a missing 'train.csv'
# aborted the script before the picker was ever shown.)
train.path <- 'train.csv'
if (!file.exists(train.path)) {
  train.path <- file.choose()
}
train.data <- read.csv(file = train.path, header = TRUE)
# Number of missing values (NA) in each column. is.na() works on the data
# frame directly, so no apply() round-trip through a character matrix is needed.
colSums(is.na(train.data))
##### categorize Cabin #####
# Encode the first character of Cabin ('' when Cabin is an empty string) as a
# numeric code, numbered in order of first appearance.
cabin.initial <- substr(train.data$Cabin, 1, 1)
cabin.levels <- unique(cabin.initial)
# Lookup table, kept under its original name and column names:
#   V1 = first character of Cabin, V2 = numeric code.
# Building it with data.frame() keeps V2 numeric from the start; the earlier
# cbind() produced a character matrix, and converting that back with
# as.numeric() yields factor level codes (not the written numbers) whenever
# strings are read as factors.
CabinDT.lookup <- data.frame(V1 = cabin.levels,
                             V2 = as.numeric(seq_along(cabin.levels)),
                             stringsAsFactors = FALSE)
# match() attaches the code row by row. Unlike merge(), it neither sorts the
# rows of train.data by the key nor moves the key column to the front.
train.data$CabinDT <- CabinDT.lookup$V2[match(cabin.initial, CabinDT.lookup$V1)]
#### name prefix set up
# Extract the part of Name that starts two characters after the first ',' and
# ends at the first '.' that is followed by whitespace, inclusive
# (e.g. 'Smith, Mr. John' gives 'Mr.').
passenger.name <- as.character(train.data[, 'Name'])
name.prefix <- substr(passenger.name,
                      regexpr(',', passenger.name) + 2,
                      regexpr('\\.\\s', passenger.name))
# Sanity check: how often each extracted prefix occurs (replaces printing the
# whole per-row vector).
table(name.prefix)
prefix.levels <- unique(name.prefix)
# Lookup table, kept under its original name and column names:
#   V1 = prefix text, V2 = numeric code in order of first appearance.
prefix.lookup <- data.frame(V1 = prefix.levels,
                            V2 = as.numeric(seq_along(prefix.levels)),
                            stringsAsFactors = FALSE)
# match() attaches the code row by row. Unlike merge(), it neither sorts the
# rows of train.data by the key nor moves the key column to the front.
train.data$prefix <- prefix.lookup$V2[match(name.prefix, prefix.lookup$V1)]
str(train.data)
# Exploratory tables and plots of Survived against Sex, Pclass and Age.
# densityplot() and histogram() are provided by the lattice package.
library(lattice)
# All calls read from train.data. Several of them previously passed `data=dt`,
# but `dt` is not assigned anywhere in the visible script, so the name would
# resolve to stats::dt (the t-distribution density function) and fail.

# Passenger counts cross-classified by sex, class and survival.
xtabs(~ Sex + Pclass + Survived, data = train.data)

# Age density, one panel per (Pclass, Survived) combination.
densityplot(~ Age | factor(Pclass) + factor(Survived),
            data = train.data,
            plot.points = FALSE,
            ref = TRUE)

# Age density per sex, with one curve per class inside each panel.
# `groups` is spelled out in full instead of relying on partial matching.
densityplot(~ Age | Sex,
            data = train.data,
            groups = Pclass,
            plot.points = FALSE,
            ref = TRUE,
            auto.key = list(title = 'PClass', columns = 3))

# Class distribution within each sex.
histogram(~ factor(Pclass) | Sex, data = train.data)

# Survival distribution within each (Pclass, Sex) combination.
histogram(~ factor(Survived) | factor(Pclass) + Sex,
          data = train.data)
### Stacked bar chart of passenger counts by class, split by sex ###
# The earlier formula was `Pclass ~ i`, but `i` is not defined in the visible
# script. NOTE(review): this assumes the intended x variable is the number of
# passengers in each Pclass/Sex/Survived cell - confirm.
# as.data.frame() on the xtabs table yields one row per cell with the count in
# the Freq column.
survival.counts <- as.data.frame(
  xtabs(~ Pclass + Sex + Survived, data = train.data)
)
# Horizontal bars: Pclass on the y axis, count on the x axis, one panel per
# sex, bars stacked by Survived. The x scale is free so each panel gets its
# own count range. `scales` is spelled out in full instead of relying on
# partial matching of `scale`.
lattice::barchart(Pclass ~ Freq | Sex,
                  data = survival.counts,
                  #groups= as.factor(Survived),
                  groups = Survived,
                  stack = TRUE,
                  #par.settings=list(axis.line=list(col=NA)),
                  auto.key = list(title = 'Survived', columns = 2),
                  scales = list(x = 'free'))
##
# Visualize the correlations among the features.
# The dendrogram shows a natural grouping into 5 or 7 clusters.
# The height of the dendrogram represents the difference in the sum of squared Euclidean distances.
# Finally, the iterative clustering graph confirms that the optimal number of clusters is 5 and 7.