Monday, September 2, 2013
KNN on large data set in R parallel computing HPC (ff, ffbase, doSNOW)
I was running into problems when running a data mining model on a big dataset.
There are many solutions available in HPC (High Performance Computing).
This KNN (k-nearest neighbors) method utilizes multi-core parallel computing and handles data sizes on the order of 10^7 rows.
# Accelerometer knn test
library(ff)
library(ffbase)
library(doSNOW)

# Start a 4-worker socket cluster and register it as the foreach backend.
# The handle is kept in a variable so the workers can be shut down with
# stopCluster(cluster) once the analysis is finished.
cluster <- makeCluster(4, type = "SOCK")
registerDoSNOW(cluster)
getDoParWorkers()
getDoParName()
getDoParVersion()

# NOTE: setwd() returns the directory that was current *before* the call,
# so `wd` holds the previous working directory, not the new one.
wd <- setwd("C:/Users/Ted/Desktop/Kaggle/Accelerometer Biometric")
wd

# tempfile() only generates a path name; nothing is created on disk, so
# dir(td) is expected to list nothing.
td <- tempfile()
td
#td <- "C:\\Users\\Ted\\AppData\\Local\\Temp\\RtmpELKYXT\\file218468623d1d"
dir(td)

# Read the three CSV files into disk-backed ffdf objects.
ff.train <- read.csv.ffdf(file = "train.csv")
ff.test <- read.csv.ffdf(file = "test.csv")
ff.questions <- read.csv.ffdf(file = "questions.csv")

# Persist all three objects with a single call so they are recorded
# together in the same directory and can be restored together by
# load.ffdf(dir = "./ffdb").
save.ffdf(ff.train, ff.test, ff.questions, dir = "./ffdb")
#load.ffdf(dir='./ffdb')
# Summarise each training device by the 5%-trimmed mean of its X, Y and Z
# readings. The NA-removal argument must be spelled `na.rm`; a misspelled
# name is silently absorbed by mean()'s `...` and NAs are then not removed.
grp <- ff.train$Device[]  # read the grouping column from disk once
x <- tapply(ff.train$X[], grp, mean, trim = 0.05, na.rm = TRUE)
y <- tapply(ff.train$Y[], grp, mean, trim = 0.05, na.rm = TRUE)
z <- tapply(ff.train$Z[], grp, mean, trim = 0.05, na.rm = TRUE)
mat.train <- cbind(x, y, z)  # one row per Device, columns x, y, z
rm(x, y, z, grp)

# Same summary for the test data, grouped by SequenceId.
grp <- ff.test$SequenceId[]
x <- tapply(ff.test$X[], grp, mean, trim = 0.05, na.rm = TRUE)
y <- tapply(ff.test$Y[], grp, mean, trim = 0.05, na.rm = TRUE)
z <- tapply(ff.test$Z[], grp, mean, trim = 0.05, na.rm = TRUE)
mat.test <- cbind(x, y, z)  # one row per SequenceId, columns x, y, z
rm(x, y, z, grp)
# Accelerometer knn test
#library(plyr)
# train <- ddply(train, .(Device), summarize,
# x = mean(X), y = mean(Y), z = mean(Z))
#
# test <- ddply(test, .(SequenceId), summarize,
# x = mean(X), y = mean(Y), z = mean(Z))
# Interactive examples of indexing ffdf objects.
# `obj[]` materialises the whole ffdf as an in-memory data.frame, so
# `obj[][i, j]` is ordinary data.frame indexing on that copy, while
# `obj[i, j]` indexes the ffdf directly.

# this is equivalent of df[1,]
ff.questions[1, ]
ff.questions[][1, ]
ff.test[][1, ]

# this is equivalent of df$SequenceId
ff.questions$SequenceId[]
ff.questions$SequenceId  # without [] this prints the ff vector object itself

# Select columns 1 to 3 (the column index must be built with c(...)).
ff.questions[][, c(1, 2, 3)]
ff.questions[][2, 1:3]
ff.test[2, 1:3]
ff.test[][2, 1:3]
ff.test[2, 2:4]
library(class)
?knn

# Materialise the loop-invariant data once instead of re-reading it from
# the ff files on every iteration.
questions.df <- ff.questions[]       # all questions as a data.frame
test.df <- ff.test[]                 # all test rows as a data.frame
train.xyz <- ff.train[, 2:4]         # training predictors (columns 2 to 4)
train.device <- ff.train$Device[]    # training device label per row

# For every question, classify each test row of the question's sequence as
# "recorded by the proposed device" (TRUE) or not (FALSE) using knn() with
# its default k. The result is a list with one factor of predictions per
# question.
# NOTE: lapply() runs sequentially; it does not use the foreach backend
# registered earlier in the script.
outdata <- lapply(seq_len(nrow(questions.df)), function(i) {
  cat("Working on question", i, "\n")
  this.q <- questions.df[i, ]
  # Test rows belonging to this question's sequence, predictor columns only.
  this.test <- test.df[test.df$SequenceId == this.q$SequenceId, 2:4]
  # Binary class label: does the training row come from the quiz device?
  y <- train.device == this.q$QuizDevice
  knn(train.xyz, this.test, cl = y)
  #knn(ff.train[][,2:4], this.test, cl = y)
  #knn(train[c("x", "y", "z")], this.test, cl = y)
})
Subscribe to:
Post Comments (Atom)
No comments:
Post a Comment