# Sunday, February 28, 2016

# PCA & Clustering


# Show the available columns of the training data.
names(train)

# Principal component analysis on every column of `train` except the first
# two (presumably INDEX and TARGET, given the names filtered out further
# down -- TODO confirm). Variables are standardised to unit variance first.
# The argument is spelled `scale.` in full instead of relying on partial
# matching of `scale`.
pca <- prcomp(train[, -c(1, 2)], scale. = TRUE)

# names(pca); pca$sdev; pca$rotation; pca$center; pca$scale; pca$x
# summary(pca); biplot(pca, scale = 0); head(pca$x)

###############################################
### Select number of principal components
###############################################
# Keep the first two columns of `train` next to the scores of the leading
# principal components (at most 10). The column count is capped at what
# `pca$x` actually holds so narrow inputs do not raise "subscript out of
# bounds"; drop = FALSE keeps the PC column names even for a single column.
cluster <- cbind(
  train[, c(1, 2)],
  pca$x[, seq_len(min(10L, ncol(pca$x))), drop = FALSE]
) # head(cluster)



#' Fit k-means for k = 1..y and collect the sums of squares (elbow helper).
#'
#' Every column of `x` except those named `INDEX` and `TARGET` is used as a
#' clustering feature. One Lloyd k-means fit (50 random starts) is run per
#' candidate cluster count.
#'
#' @param x Data frame of observations; columns other than `INDEX` and
#'   `TARGET` must be numeric.
#' @param y Largest number of clusters to try.
#' @return A data frame with one row per cluster count and the columns `i`
#'   (number of clusters), `km.tot.withinss` and `km.totss`; `NULL` when
#'   `y` is smaller than 1.
fn.cv.kmean <- function(x, y) {
  # Fixed seed so the random starts are reproducible between calls.
  # NOTE: this resets the caller's global RNG stream as a side effect.
  set.seed(12345)

  # The feature subset does not depend on k, so build it once.
  features <- x[, !names(x) %in% c("INDEX", "TARGET"), drop = FALSE]

  # seq_len() yields an empty sequence for y = 0, whereas 1:0 would try
  # k = 1 and then k = 0.
  fits <- lapply(seq_len(y), function(i) {
    km <- kmeans(features, i, iter.max = 1e6, nstart = 50, algorithm = "Lloyd")
    # Column names are spelled out because callers read `km.tot.withinss`.
    data.frame(i = i, km.tot.withinss = km$tot.withinss, km.totss = km$totss)
  })

  # Bind once at the end instead of growing a data frame inside the loop.
  do.call(rbind, fits)
}
# Elbow curve: total within-cluster sum of squares for 1 to 12 clusters.
cv <- fn.cv.kmean(x = cluster, y = 12)
plot(cv$i, cv$km.tot.withinss, type = "l")



# Final k-means fit with 6 clusters (presumably read off the elbow curve
# plotted above -- TODO confirm), seeded for reproducibility.
set.seed(12345)
# `kmean` is excluded together with INDEX and TARGET: the cluster ids are
# stored in that column just below, so re-running this section would
# otherwise feed the previous assignment back in as a clustering feature.
km <- kmeans(
  cluster[, !names(cluster) %in% c("INDEX", "TARGET", "kmean")],
  6,
  iter.max = 1e6, nstart = 50, algorithm = "Lloyd"
)
# Store the cluster id of every row next to its principal-component scores.
cluster$kmean <- km$cluster
# Cross-tabulate cluster membership against TARGET.
table(km$cluster, cluster$TARGET)
# summary(km); head(cluster)
# names(km)


# Mosaic plot of the same contingency table, then a raw scatter of
# cluster id against TARGET.
plot(table(km$cluster, cluster$TARGET))
plot(km$cluster, cluster$TARGET)





library(scatterplot3d)

# 3-D views of the first two principal components against the k-means
# cluster id and against TARGET.
#
# The cluster column is referenced by its full name `kmean`: `cluster$km`
# only worked through `$` partial matching on data frames and silently
# returns NULL on tibbles.
#
# `color` is not passed together with highlight.3d = TRUE because
# scatterplot3d documents `color` as ignored in that case.
scatterplot3d(cluster$PC1, cluster$PC2, cluster$kmean, main = "",
              highlight.3d = TRUE, col.grid = "lightblue", col.axis = "blue")

scatterplot3d(cluster$PC1, cluster$PC2, cluster$TARGET,
              highlight.3d = TRUE, col.grid = "lightblue", col.axis = "blue")

# Cluster id on the z axis, points coloured by TARGET.
# NOTE(review): if TARGET is coded 0/1, colour 0 is the background colour and
# those points will not be visible -- confirm the coding of TARGET.
scatterplot3d(cluster$PC1, cluster$PC2, cluster$kmean, pch = 1,
              color = cluster$TARGET, col.grid = "lightblue", col.axis = "blue")

# TARGET on the z axis, points coloured by cluster id.
scatterplot3d(cluster$PC1, cluster$PC2, cluster$TARGET,
              color = cluster$kmean, col.grid = "lightblue", col.axis = "blue")

# Mean absolute difference between cluster id and TARGET.
# NOTE(review): k-means ids are arbitrary labels, so this is only meaningful
# if the ids happen to line up with the TARGET coding -- verify before
# interpreting it as an error rate.
error <- abs(cluster$kmean - cluster$TARGET)
mean(error)



# Add caption




############################
############################

# Hierarchical clustering with complete, average and single linkage.
#
# Features are all columns of `cluster` except the first two and except
# `kmean`. The k-means cluster id is appended to `cluster` earlier in this
# script, so a plain `cluster[, -c(1, 2)]` would include that label in the
# distance computation.
hc.features <- cluster[, setdiff(names(cluster)[-c(1, 2)], "kmean"), drop = FALSE]

# The distance matrix is the same for all three linkages; compute it once.
hc.dist <- dist(hc.features)

hc.complete <- hclust(hc.dist, method = "complete")
hc.average  <- hclust(hc.dist, method = "average")
hc.single   <- hclust(hc.dist, method = "single")
# summary(hc.average); names(hc.average)


# Dendrograms, one per linkage method.
plot(hc.complete)
plot(hc.average)
plot(hc.single)








# No comments:

# Post a Comment