Monday, September 9, 2013

Data Manipulation and visual clustering analysis


######
# Data Manipulation
######

# Load the Titanic training data.
# (Fixed: the data was read twice -- once from 'train.csv' and again via
# file.choose(), silently overwriting the first read. The interactive
# variant is kept commented out for running outside the project dir.)
train.data <- read.csv(file = 'train.csv', header = TRUE)
# train.data <- read.csv(file.choose(), header = TRUE)

# Count missing values per column (idiomatic replacement for the
# nested apply(apply(...)) construct; same named counts).
colSums(is.na(train.data))

##### categorize Cabin #####
# Map the cabin deck letter (first character of Cabin) to an integer
# code via a lookup table, then merge the code back onto the data.
train.data$CabinDT <- substr(train.data$Cabin, 1, 1)
CabinDT.lookup <- cbind(unique(train.data$CabinDT),
                        seq_along(unique(train.data$CabinDT)))
CabinDT.lookup <- as.data.frame(CabinDT.lookup)
CabinDT.lookup$V2 <- as.numeric(CabinDT.lookup$V2)
train.data <- merge(train.data, CabinDT.lookup, by.x = 'CabinDT', by.y = 'V1')
train.data$CabinDT <- train.data$V2
train.data$V2 <- NULL

#### name prefix set up
# Extract the title ("Mr.", "Mrs.", ...) between ", " and the first
# ". " in Name, then encode it as an integer the same way as Cabin.
train.data$prefix <- substr(train.data[, 'Name'],
                            regexpr(',', train.data[, 'Name']) + 2,
                            regexpr('\\.\\s', train.data[, 'Name']))

# train.data$prefix  # debug print of the extracted titles
prefix.lookup <- cbind(unique(train.data$prefix),
                       seq_along(unique(train.data$prefix)))
prefix.lookup <- as.data.frame(prefix.lookup)
prefix.lookup$V2 <- as.numeric(prefix.lookup$V2)

train.data <- merge(train.data, prefix.lookup, by.x = 'prefix', by.y = 'V1')
train.data$prefix <- train.data$V2
train.data$V2 <- NULL
str(train.data)



library(lattice)  # densityplot / histogram / barchart

# Three-way contingency table: survival counts by sex and class.
# (Fixed: this section referred to an undefined 'dt'; the prepared
# data frame in this post is 'train.data'.)
xtabs(~ Sex + Pclass + Survived, data = train.data)

# Age density panelled by class and survival.
densityplot(~ Age | factor(Pclass) + factor(Survived),
            data = train.data,
            plot.points = FALSE,
            ref = TRUE)

# Age density by sex, one curve per class.
densityplot(~ Age | Sex,
            data = train.data,
            groups = Pclass,        # full argument name (was 'group=')
            plot.points = FALSE,
            ref = TRUE,
            auto.key = list(title = 'Pclass', columns = 3))

histogram(~ factor(Pclass) | Sex, data = train.data)
histogram(~ factor(Survived) | factor(Pclass) + Sex,
          data = train.data)

# Stacked bar chart of survival counts by class, panelled by sex.
# (Cleaned up as the original comment requested: the undefined 'i' on
# the x axis is replaced by pre-aggregated counts.)
surv.counts <- as.data.frame(xtabs(~ Pclass + Survived + Sex,
                                   data = train.data))
barchart(Pclass ~ Freq | Sex,
         data = surv.counts,
         groups = Survived,
         stack = TRUE,
         auto.key = list(title = 'Survived', columns = 2),
         scale = list(x = 'free'))



##
#Visualize the correlations among features
#The dendrogram shows natural clusterings of 5 or 7 groups.
#The height of the dendrogram represents the difference in the sum of squared Euclidean distances.
#Finally, the iterative clustering graph confirms the optimal number of clusters at 5 and 7.





Monday, September 2, 2013

KNN on large data set in R parallel computing HPC (ff, ffbase, doSNOW)



I was running into problems running data mining models on a big dataset.
There are many HPC (High Performance Computing) solutions available.

This KNN (k-nearest neighbors) method uses multi-core parallel computing and handles data on the order of 10^7 rows.




# Accelerometer knn test
# Load on-disk data-frame support and the SNOW parallel backend.
library(ff)
library(ffbase)
library(doSNOW)

# Register a 4-worker socket cluster as the foreach backend.
# NOTE(review): the cluster handle is not saved, so it cannot be shut
# down with stopCluster() later -- consider cl <- makeCluster(...).
registerDoSNOW(makeCluster(4, type = "SOCK"))
# Sanity check: worker count and backend name/version.
getDoParWorkers();getDoParName();getDoParVersion()

# Move into the competition directory; ff writes its backing files to a
# temp dir by default.
wd <- setwd('C:/Users/Ted/Desktop/Kaggle/Accelerometer Biometric');wd
td<-tempfile();td #dir(td)
#td <- "C:\\Users\\Ted\\AppData\\Local\\Temp\\RtmpELKYXT\\file218468623d1d"
dir(td)

# Read the three CSVs into ffdf objects (chunked, larger-than-RAM safe).
ff.train <- read.csv.ffdf(file='train.csv')
ff.test <- read.csv.ffdf(file='test.csv')
ff.questions <- read.csv.ffdf(file='questions.csv')


# Persist the ffdf objects so later sessions can load.ffdf() them.
# NOTE(review): three separate save.ffdf() calls into the same dir --
# verify the later calls do not overwrite the dir's metadata; a single
# save.ffdf(ff.train, ff.test, ff.questions, dir='./ffdb') may be safer.
save.ffdf(ff.train, dir='./ffdb')
save.ffdf(ff.test, dir='./ffdb')
save.ffdf(ff.questions, dir='./ffdb')
#load.ffdf(dir='./ffdb')



# Per-device 5%-trimmed means of each accelerometer axis (training set).
# (Fixed: 'nr.rm=T' was a typo for 'na.rm=TRUE'; the misspelled argument
# was silently swallowed by mean()'s '...', so NAs would have propagated
# into the means.)
x <- tapply(ff.train$X[], ff.train$Device[],
            mean, trim = 0.05, na.rm = TRUE)

y <- tapply(ff.train$Y[], ff.train$Device[],
            mean, trim = 0.05, na.rm = TRUE)

z <- tapply(ff.train$Z[], ff.train$Device[],
            mean, trim = 0.05, na.rm = TRUE)

# One row per device: its mean (x, y, z) signature.
mat.train <- cbind(x, y, z)
rm(x, y, z)

# Same aggregation for the test set, keyed by SequenceId.
x <- tapply(ff.test$X[], ff.test$SequenceId[],
            mean, trim = 0.05, na.rm = TRUE)

y <- tapply(ff.test$Y[], ff.test$SequenceId[],
            mean, trim = 0.05, na.rm = TRUE)

z <- tapply(ff.test$Z[], ff.test$SequenceId[],
            mean, trim = 0.05, na.rm = TRUE)

# One row per test sequence.
mat.test <- cbind(x, y, z)
rm(x, y, z)



# Accelerometer knn test
#library(plyr)

# train <- ddply(train, .(Device), summarize,
#                x = mean(X), y = mean(Y), z = mean(Z))
#
# test <- ddply(test, .(SequenceId), summarize,
#               x = mean(X), y = mean(Y), z = mean(Z))






# Indexing idioms on ffdf objects:
# obj[i, ] subsets through the ffdf method, while obj[][i, ] first
# materializes the whole table in RAM as a data.frame, then subsets --
# same result, very different memory cost on large data.

# equivalent of df[1,]
ff.questions[1, ]
ff.questions[][1, ]
ff.test[][1, ]

# equivalent of df$SequenceId
ff.questions$SequenceId[]   # materialized in-RAM vector
ff.questions$SequenceId     # ff vector, still on disk

# column selection
# (Fixed: 'ff.questions[][,c=(1,2,3)]??' was a syntax error -- 'c=(...)'
# instead of 'c(...)' plus a stray '??' -- which prevented the entire
# file from parsing.)
ff.questions[][, c(1, 2, 3)]
ff.questions[][2, 1:3]
ff.test[2, 1:3]
ff.test[][2, 1:3]
ff.test[2, 2:4]


library(class)  # provides knn()
# ?knn


# For each quiz question, run 1-NN: classify every test sample in the
# question's sequence as belonging to the quiz device (TRUE) or not.
outdata <- lapply(1:nrow(ff.questions), function(i) {
  cat("Working on question", i, "\n")
  # Current question row (materialized in RAM).
  this.q <- ff.questions[][i,]

  # All test rows for this question's sequence; columns 2:4 = X, Y, Z.
  this.test <- ff.test[][ff.test$SequenceId[] == this.q$SequenceId, 2:4]

  # Class labels: one TRUE/FALSE per training row.
  # NOTE(review): this labels the raw ff.train rows, not the per-device
  # summary mat.train built above -- confirm which was intended.
  y <- ff.train$Device[] == this.q$QuizDevice

  # NOTE(review): knn() expects an in-RAM matrix/data.frame; passing the
  # ffdf subset directly may fail -- the first commented variant below
  # materializes it with [] first; verify which form actually ran.
  knn(ff.train[,2:4], this.test, cl = y)
  #knn(ff.train[][,2:4], this.test, cl = y)
  #knn(train[c("x", "y", "z")], this.test, cl = y)
})

Saturday, August 31, 2013

Kaggle Titanic Machine Learning



Missing age modeling in response to kaggle.com
Titanic Machine Learning Competition

I was working on this and would like to post the algorithms I used for preliminary data preparation:
quick code and graphical plots for selecting a model to fill in the missing ages in the train dataset.

A comparison between a linear model, a random forest, and a conditional inference forest.

# Load the Titanic training data interactively.
train.data <- read.csv(file.choose(), header = TRUE)

# Count missing values per column (idiomatic replacement for the
# nested apply(apply(...)) construct; same named counts).
colSums(is.na(train.data))

##### Categorize Cabin #####
# Map the cabin deck letter (first character of Cabin) to an integer
# code via a lookup table, then merge the code back onto the data.
train.data$CabinDT <- substr(train.data$Cabin, 1, 1)
CabinDT.lookup <- cbind(unique(train.data$CabinDT),
                        seq_along(unique(train.data$CabinDT)))
CabinDT.lookup <- as.data.frame(CabinDT.lookup)
CabinDT.lookup$V2 <- as.numeric(CabinDT.lookup$V2)
train.data <- merge(train.data, CabinDT.lookup, by.x = 'CabinDT', by.y = 'V1')
train.data$CabinDT <- train.data$V2
train.data$V2 <- NULL

#### name prefix set up
# Extract the title ("Mr.", "Mrs.", ...) between ", " and the first
# ". " in Name, then encode it as an integer the same way as Cabin.
train.data$prefix <- substr(train.data[, 'Name'],
                            regexpr(',', train.data[, 'Name']) + 2,
                            regexpr('\\.\\s', train.data[, 'Name']))

# train.data$prefix  # debug print of the extracted titles
prefix.lookup <- cbind(unique(train.data$prefix),
                       seq_along(unique(train.data$prefix)))
prefix.lookup <- as.data.frame(prefix.lookup)
prefix.lookup$V2 <- as.numeric(prefix.lookup$V2)

train.data <- merge(train.data, prefix.lookup, by.x = 'prefix', by.y = 'V1')
train.data$prefix <- train.data$V2
train.data$V2 <- NULL
str(train.data)






##### predict age by linear model #####
# Shared formula: model Age from class, family size, fare and title code.
fm <- Age ~ Pclass + SibSp + Fare + Parch + prefix

# Linear model (default na.action drops incomplete rows during fitting).
age.model.lm <- lm(fm, data = train.data)

# Random forest on complete cases only.
# (Fixed: "method='anova'" is an rpart argument, not a randomForest one;
# it was silently swallowed by '...', so it is dropped here.)
library(randomForest)
age.model.rf <- randomForest(fm,
                             data = train.data[complete.cases(train.data), ])

# Conditional inference forest.
# (Fixed: cforest() lives in the 'party' package, which was never loaded.)
library(party)
age.model.cf <- cforest(fm, data = train.data[complete.cases(train.data), ])

# Predict age for every row of the training data with each model.
pred.age.lm <- predict(age.model.lm, newdata = train.data)
pred.age.rf <- predict(age.model.rf, newdata = train.data)
pred.age.cf <- predict(age.model.cf, newdata = train.data)

pred.age.lm <- as.data.frame(pred.age.lm)
pred.age.rf <- as.data.frame(pred.age.rf)
pred.age.cf <- as.data.frame(pred.age.cf)

# Side-by-side matrix: observed Age vs the three model predictions.
pred.comp <- cbind(train.data[, 'Age'], pred.age.lm[, 1],
                   pred.age.rf[, 1], pred.age.cf[, 1])
colnames(pred.comp) <- c('train.data', 'lm', 'rf', 'cf')
summary(pred.comp)
nrow(pred.comp)
head(pred.comp)
library(ggplot2)
library(reshape2)

# Long format: one row per (observation, model) pair, so ggplot can
# colour by model.
pred.comp.melt<-melt(pred.comp, na.rm=F)
colnames(pred.comp.melt) =c('ID','model','age')
head(pred.comp.melt)

#qplot(Value~Var1|Var2, data=pred.comp.melt)

# Age vs row index, one colour per model, with three different
# smoothers (linear fit, loess with band, loess without band) to
# compare the imputations against the observed ages.
qplot(ID, age, data=pred.comp.melt, color=model) +
  geom_smooth(method='lm', level = 0,size=I(1.2))

qplot(ID, age, data=pred.comp.melt, color=model) +
  stat_smooth(level = 0.5, size=I(1.2))

qplot(ID, age, data=pred.comp.melt, color=model) +
  geom_smooth(level = 0,size=I(1.2))

# Distribution of observed/predicted ages per model.
boxplot(age~model, data=pred.comp.melt)

#par(mfrow=c(1,4))
#layout(c=(1,4))
#par(mfrow=c(1,1))


##### age prediction validation #####
#nrow(train.data[!complete.cases(train.data),])
# Per-column count of missing values (same named integer vector as the
# nested apply(apply(..., is.na), sum) idiom used above).
sapply(train.data, function(col) sum(is.na(col)))