Monday, September 9, 2013

Data Manipulation and visual clustering analysis


######
# Data Manipulation
######

# Load the Titanic training data from the working directory.
train.data <- read.csv(file = "train.csv", header = TRUE)
# Interactive alternative (pick the file by hand).  The original ran this
# unconditionally, clobbering the line above; keep it commented out.
# train.data <- read.csv(file.choose(), header = TRUE)

# Missing-value count per column.  colSums(is.na(.)) replaces the original
# nested apply(apply(., 2, is.na), 2, sum), which coerced the data frame
# to a matrix twice for the same result.
colSums(is.na(train.data))


##### categorize Cabin: encode the deck letter as an integer #####

# First character of Cabin is the deck letter ("" when Cabin is blank).
train.data$CabinDT <- substr(train.data$Cabin, 1, 1)

# Integer code = position of the deck letter in first-appearance order --
# exactly what the old cbind/seq(1:n)/merge lookup produced.  match()
# computes this directly and, unlike merge(), does not reorder the rows
# of train.data as a side effect.
train.data$CabinDT <- match(train.data$CabinDT, unique(train.data$CabinDT))

#### name prefix set up: pull the title out of "Surname, Title. Given" ####

# The title starts two characters after the comma and runs through the
# period (e.g. "Mr.", "Mrs.", "Master." -- the period is included, as in
# the original substr bounds).
train.data$prefix <- substr(train.data[, "Name"],
                            regexpr(",", train.data[, "Name"]) + 2,
                            regexpr("\\.\\s", train.data[, "Name"]))

# Quick visual check of the extracted titles.
print(train.data$prefix)

# Encode each distinct title as an integer in first-appearance order --
# same codes as the old cbind/merge lookup, but match() avoids merge()'s
# row-reordering side effect.
train.data$prefix <- match(train.data$prefix, unique(train.data$prefix))
str(train.data)



# Exploratory tables and lattice plots.  The original referenced a data
# frame `dt` that is never defined in this script; train.data (built
# above) is what was intended and is used throughout here.
library(lattice)  # densityplot/histogram/barchart; was never loaded

# Survival counts broken out by sex and passenger class.
xtabs(~ Sex + Pclass + Survived, data = train.data)

# Age density, one panel per class x survival combination.
densityplot(~ Age | factor(Pclass) + factor(Survived),
            data = train.data,
            plot.points = FALSE,
            ref = TRUE)

# Age density by sex, one curve per passenger class.
densityplot(~ Age | Sex,
            data = train.data,
            group = Pclass,
            plot.points = FALSE,
            ref = TRUE,
            auto.key = list(title = "PClass", columns = 3))

# Class and survival distributions.
histogram(~ factor(Pclass) | Sex, data = train.data)
histogram(~ factor(Survived) | factor(Pclass) + Sex,
          data = train.data)

### this barchart needs clean up ###
# NOTE(review): `i` is not defined anywhere in this script, so this call
# fails as written.  It presumably should be a count for each
# Pclass/Sex/Survived cell (e.g. built via as.data.frame(table(...)))
# -- TODO confirm the intended summary before fixing.
barchart( Pclass ~ i | Sex,
          data = train.data,
          #groups= as.factor(Survived),
          groups= Survived,
          stack = TRUE,
          #par.settings=list(axis.line=list(col=NA)),
          auto.key=list(title='Survived', columns=2),
          scale=list(x='free'))



##
# Visualize the correlations among features.
# The dendrogram suggests a natural clustering into 5 or 7 groups.
# The height of the dendrogram represents the difference in the sum of
# squared Euclidean distances between clusters.
# Finally, the iterative clustering graph confirms the optimal number of
# clusters at 5 and 7.





Monday, September 2, 2013

KNN on large data set in R parallel computing HPC (ff, ffbase, doSNOW)



I was running into the problem of running data mining models on a big dataset.
There are many HPC (High Performance Computing) approaches available for this.

This KNN (k-nearest neighbors) example uses multi-core parallel computing on data on the order of 10^7 rows.




# Accelerometer knn test -- out-of-memory data + parallel backend setup.
library(ff)       # on-disk (memory-mapped) data frames
library(ffbase)   # read.csv.ffdf / save.ffdf helpers
library(doSNOW)   # foreach parallel backend

# Keep a handle on the cluster so it can be shut down later with
# stopCluster(cl).  The original passed makeCluster() straight into
# registerDoSNOW(), leaking the cluster object.
cl <- makeCluster(4, type = "SOCK")
registerDoSNOW(cl)
getDoParWorkers(); getDoParName(); getDoParVersion()

# NOTE(review): hard-coded personal path -- adjust for your machine.
wd <- setwd('C:/Users/Ted/Desktop/Kaggle/Accelerometer Biometric'); wd
td <- tempfile(); td  # dir(td)
#td <- "C:\\Users\\Ted\\AppData\\Local\\Temp\\RtmpELKYXT\\file218468623d1d"
dir(td)

# Load the large CSVs as on-disk ffdf objects instead of in-RAM frames.
ff.train     <- read.csv.ffdf(file = 'train.csv')
ff.test      <- read.csv.ffdf(file = 'test.csv')
ff.questions <- read.csv.ffdf(file = 'questions.csv')



# Persist all three ffdf objects with one call.  save.ffdf() writes the
# metadata image for `dir` on every call, so three separate calls into
# the same directory do not reliably leave all three objects reloadable;
# the documented multi-object form does.
save.ffdf(ff.train, ff.test, ff.questions, dir = './ffdb')
#load.ffdf(dir='./ffdb')




# 5%-trimmed mean of a column, per grouping key.  `v[]` materializes an
# ff vector into RAM (and is a no-op on a plain vector).
# BUG FIX: the original passed `nr.rm=T` (typo for na.rm); mean()
# silently swallowed it via `...`, so NAs were never actually removed.
trimmed.mean.by <- function(values, keys) {
  tapply(values[], keys[], mean, trim = 0.05, na.rm = TRUE)
}

# One row per training Device: its mean X/Y/Z signature.
mat.train <- cbind(x = trimmed.mean.by(ff.train$X, ff.train$Device),
                   y = trimmed.mean.by(ff.train$Y, ff.train$Device),
                   z = trimmed.mean.by(ff.train$Z, ff.train$Device))

# One row per test SequenceId: its mean X/Y/Z signature.
mat.test <- cbind(x = trimmed.mean.by(ff.test$X, ff.test$SequenceId),
                  y = trimmed.mean.by(ff.test$Y, ff.test$SequenceId),
                  z = trimmed.mean.by(ff.test$Z, ff.test$SequenceId))



# Accelerometer knn test
#library(plyr)

# train <- ddply(train, .(Device), summarize,
#                x = mean(X), y = mean(Y), z = mean(Z))
#
# test <- ddply(test, .(SequenceId), summarize,
#               x = mean(X), y = mean(Y), z = mean(Z))






# Scratch notes: ffdf subsetting vs. plain data.frame syntax.
# `obj[]` materializes the ffdf into an in-RAM data.frame first.

# this is equivalent of df[1,]
ff.questions[1,]
ff.questions[][1,]
ff.test[][1,]

# this is equivalent of df$SequenceId
ff.questions$SequenceId[]
ff.questions$SequenceId

# Select the first three columns.  BUG FIX: the original line
# `ff.questions[][,c=(1,2,3)]??` was a syntax error that aborted the
# whole script when sourced.
ff.questions[][, c(1, 2, 3)]
ff.questions[][2, 1:3]
ff.test[2, 1:3]
ff.test[][2, 1:3]
ff.test[2, 2:4]


library(class)  # provides knn()
?knn


# For each question row, run 1-NN: do the test sequence's accelerometer
# readings look like the claimed QuizDevice?
# NOTE(review): `cl = y` labels every training row TRUE/FALSE for the
# claimed device (one-vs-rest), and k defaults to 1 -- confirm this is
# the intended setup before trusting the output.
# seq_len() replaces 1:nrow(), which would loop over c(1, 0) on an
# empty questions table.
outdata <- lapply(seq_len(nrow(ff.questions)), function(i) {
  cat("Working on question", i, "\n")
  this.q <- ff.questions[][i,]

  # All test samples belonging to this question's sequence (cols 2:4 = X,Y,Z).
  this.test <- ff.test[][ff.test$SequenceId[] == this.q$SequenceId, 2:4]

  # TRUE for training rows recorded by the claimed device.
  y <- ff.train$Device[] == this.q$QuizDevice

  knn(ff.train[,2:4], this.test, cl = y)
  #knn(ff.train[][,2:4], this.test, cl = y)
  #knn(train[c("x", "y", "z")], this.test, cl = y)
})