Monday, September 9, 2013

Data Manipulation and Visual Clustering Analysis


######
# Data Manipulation
######

# read the training data (or use file.choose() to pick the file interactively)
train.data <- read.csv(file = 'train.csv', header = TRUE)
# train.data <- read.csv(file.choose(), header = TRUE)


# count the missing values in each column
apply(apply(train.data, 2, is.na), 2, sum)


##### categorize Cabin: recode the deck letter as a numeric code #####
train.data$CabinDT <- substr(train.data$Cabin, 1, 1)
CabinDT.lookup <- cbind(unique(train.data$CabinDT),
                        seq_along(unique(train.data$CabinDT)))
CabinDT.lookup <- as.data.frame(CabinDT.lookup)
CabinDT.lookup$V2 <- as.numeric(as.character(CabinDT.lookup$V2))
train.data <- merge(train.data, CabinDT.lookup, by.x = 'CabinDT', by.y = 'V1')
train.data$CabinDT <- train.data$V2
train.data$V2 <- NULL

#### name prefix set-up: extract the title (e.g. "Mr.", "Mrs.") from Name ####

train.data$prefix <- substr(train.data[, 'Name'],
                            regexpr(',', train.data[, 'Name']) + 2,
                            regexpr('\\.\\s', train.data[, 'Name']))

train.data$prefix   # inspect the extracted prefixes
prefix.lookup <- cbind(unique(train.data$prefix),
                       seq_along(unique(train.data$prefix)))
prefix.lookup <- as.data.frame(prefix.lookup)
prefix.lookup$V2 <- as.numeric(as.character(prefix.lookup$V2))

# replace each prefix with its numeric code
train.data <- merge(train.data, prefix.lookup, by.x = 'prefix', by.y = 'V1')
train.data$prefix <- train.data$V2
train.data$V2 <- NULL
str(train.data)



# cross-tabulate survival counts by sex and passenger class
xtabs(~ Sex + Pclass + Survived, data = train.data)


library(lattice)

# age density by passenger class and survival status
densityplot(~ Age | factor(Pclass) + factor(Survived),
            data = train.data,
            plot.points = FALSE,
            ref = TRUE)

# age density by sex, grouped by passenger class
densityplot(~ Age | Sex,
            data = train.data,
            groups = Pclass,
            plot.points = FALSE,
            ref = TRUE,
            auto.key = list(title = 'Pclass', columns = 3))


# class distribution by sex, and survival distribution by class and sex
histogram(~ factor(Pclass) | Sex, data = train.data)
histogram(~ factor(Survived) | factor(Pclass) + Sex,
          data = train.data)


### stacked barchart (cleaned up): survival counts by class, paneled by sex ###
class.sex.counts <- as.data.frame(xtabs(~ Pclass + Survived + Sex, data = train.data))
barchart(Pclass ~ Freq | Sex,
         data = class.sex.counts,
         groups = Survived,
         stack = TRUE,
         auto.key = list(title = 'Survived', columns = 2),
         scales = list(x = 'free'))



##
# Visualize the correlations among features.
# The dendrogram shows a natural clustering of 5 or 7 groups;
# its height reflects the within-group sum of squared Euclidean distances.
# Finally, an iterative clustering plot confirms the optimal number of clusters at 5 or 7.
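The plots described above are not shown with code in this post, so here is a minimal sketch of how they could be produced. The feature set (the numeric columns built earlier), the Ward linkage, and the use of k-means within-group sum of squares for the "iterative clustering" curve are assumptions, not taken from the original analysis.

# assumed feature set: numeric columns prepared above (adjust as needed)
num.features <- na.omit(train.data[, c('Survived', 'Pclass', 'Age', 'SibSp',
                                       'Parch', 'Fare', 'CabinDT', 'prefix')])

# correlations among the features (levelplot comes from lattice, loaded above)
cor.mat <- cor(num.features)
levelplot(cor.mat, xlab = '', ylab = '', main = 'Feature correlations')

# hierarchical clustering of scaled features: Ward linkage on Euclidean distances
scaled <- scale(num.features)
hc <- hclust(dist(scaled), method = 'ward.D2')
plot(hc, labels = FALSE, main = 'Dendrogram of passengers')

# iterative clustering: within-group sum of squares for k = 1..10
wss <- sapply(1:10, function(k) sum(kmeans(scaled, centers = k, nstart = 10)$withinss))
plot(1:10, wss, type = 'b', xlab = 'Number of clusters', ylab = 'Within-group sum of squares')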




