Sunday, February 28, 2016

Tree-based imputation, Part 1 of 2


Missing value imputation via rpart

# Start from an empty workspace and attach rpart, which supplies the
# trees used for the imputation further down.
rm(list = ls())
library(rpart)

# NOTE(review): the working directory is pinned to one machine's folder;
# the relative file names used for reading and writing resolve against it.
setwd("C:\\Users\\Ted\\Documents\\MSPA\\2016 Winter\\Pred 411\\Unit_03")

# Load the raw data, then show its structure and its class.
train <- read.csv("wine.csv")
str(train)
class(train)

# Force the name of the first column to INDEX.
# NOTE(review): presumably the header arrives mangled in the file -- confirm.
names(train)[1] <- "INDEX"


# Print the number of missing values in every column.
vapply(train, function(column) sum(is.na(column)), integer(1))

# Names of the columns that hold at least one missing value.
missCol <- colnames(train)[vapply(train, anyNA, logical(1))]
# Result recorded for the wine data:
# "ResidualSugar"      "Chlorides"          "FreeSulfurDioxide"
# "TotalSulfurDioxide" "pH"                 "Sulphates"
# "Alcohol"            "STARS"



# Impute every column named in missCol with a pruned rpart tree.
#
# For each column:
#   1. fit a tree on the rows where the column is observed, using every other
#      column except INDEX and TARGET as predictors (this includes the
#      *_MIFLAG indicators and imputed values added by earlier iterations);
#   2. prune the tree to the complexity parameter with the lowest
#      cross-validated error (xerror) in its cp table;
#   3. add a <column>_MIFLAG indicator (1 = value was missing, 0 = observed);
#   4. overwrite the missing entries with the tree's predictions for those
#      same rows.
#
# Each pruned tree is also stored in the workspace as
# model.rpart.prune.<column> so it can be inspected after the loop.
excluded_cols <- c("INDEX", "TARGET")

# seq_along() skips the loop entirely when missCol is empty;
# 1:length(missCol) would iterate over c(1, 0) in that case.
for (i in seq_along(missCol)) {
  var     <- missCol[i]
  MIFlag  <- paste0(var, "_MIFLAG")
  formula <- as.formula(paste(var, "~ ."))

  is_missing <- is.na(train[[var]])
  # Captured before the flag column is added, so the prediction data has
  # exactly the columns the model was fitted on.
  model_cols <- setdiff(names(train), excluded_cols)

  model.rpart <- rpart(formula, data = train[!is_missing, model_cols])

  # The xerror column is only available when cross-validation results exist;
  # without it there is nothing to prune against, so the tree is kept as is.
  if ("xerror" %in% colnames(model.rpart$cptable)) {
    opt <- which.min(model.rpart$cptable[, "xerror"])
    cp  <- model.rpart$cptable[opt, "CP"]
    model.rpart.prune <- prune(model.rpart, cp = cp)
  } else {
    model.rpart.prune <- model.rpart
  }
  assign(paste0("model.rpart.prune.", var), model.rpart.prune)

  train[[MIFlag]] <- ifelse(is_missing, 1, 0)

  # Predict for the rows that are actually missing. predict() without newdata
  # returns fitted values for the training rows only; feeding that shorter
  # vector to ifelse() recycles it and fills each gap with another row's value.
  if (any(is_missing)) {
    train[is_missing, var] <- predict(
      model.rpart.prune,
      newdata = train[is_missing, model_cols, drop = FALSE]
    )
  }
}

# Re-count the missing values per column to confirm the imputation left none
# (apply(is.na(train), 2, any) is an alternative yes/no check).
vapply(train, function(column) sum(is.na(column)), integer(1))

# Save the imputed data as comma-separated text, without row names.
write.table(train, file = "wine_train.csv", sep = ",", row.names = FALSE)

# Attach rattle, which supplies asRules().
library(rattle)
# Pass the pruned trees for STARS and ResidualSugar to asRules()
# (presumably lists each tree as a set of rules -- confirm in the rattle docs).
# The objects are expected to exist in the workspace under the name
# model.rpart.prune.<column>; nothing in these lines creates them.
asRules(model.rpart.prune.STARS)
asRules(model.rpart.prune.ResidualSugar)

## http://stats.stackexchange.com/questions/72251/an-example-lasso-regression-using-glmnet-for-binary-outcome
## http://stats.stackexchange.com/questions/77546/how-to-interpret-glmnet

No comments:

Post a Comment