# Sunday, February 28, 2016
# Tree-based imputation, part 1 of 2
# Missing-value imputation via rpart
# ---- Setup and data load ----
# Loads the wine data set and records which columns contain missing values.
#
# NOTE(review): the original script started with rm(list = ls()). It was
# dropped because wiping the caller's workspace is a side effect a script
# should not have; run in a fresh R session instead.
library(rpart)

# NOTE(review): hard-coded, machine-specific working directory. It is kept
# because the relative file names used by this script resolve against it;
# consider replacing it with a project-relative path.
setwd("C:\\Users\\Ted\\Documents\\MSPA\\2016 Winter\\Pred 411\\Unit_03")

train <- read.csv("wine.csv")
str(train); class(train)

# Force the first column's name to INDEX.
# NOTE(review): presumably guards against a mangled header in the CSV
# (e.g. a byte-order mark) -- confirm against the raw file.
colnames(train)[1] <- "INDEX"

# Number of missing values per column. is.na() on a data frame already
# returns a logical matrix, so colSums() replaces the nested apply() calls.
colSums(is.na(train))

# Names of the columns that contain at least one NA.
missCol <- names(which(colSums(is.na(train)) > 0))
# "ResidualSugar" "Chlorides" "FreeSulfurDioxide"
# "TotalSulfurDioxide" "pH" "Sulphates"
# "Alcohol" "STARS"
# ---- Tree-based imputation ----
# For every column listed in missCol:
#   1. fit an rpart tree for that column on the rows where it is observed,
#      using every column except INDEX and TARGET as a predictor;
#   2. prune the tree at the complexity parameter with the lowest
#      cross-validated error;
#   3. add a <column>_MIFLAG indicator (1 = value was missing, 0 = observed);
#   4. overwrite the missing values with the pruned tree's predictions.
#
# Columns are processed in order and `train` is updated in place, so later
# models see the values and flag columns produced by earlier iterations.
#
# NOTE(review): the cross-validated error depends on the random number
# generator; call set.seed() beforehand if reproducible trees are required.

# Pruned trees keyed by column name, so they can be inspected after the loop.
imputation_models <- list()

# Iterating over the names directly is safe when missCol is empty, unlike
# 1:length(missCol), which would yield c(1, 0).
for (var in missCol) {
  flag_col <- paste0(var, "_MIFLAG")
  is_missing <- is.na(train[[var]])
  predictors <- train[, !names(train) %in% c("INDEX", "TARGET"), drop = FALSE]

  fit <- rpart(
    as.formula(paste(var, "~ .")),
    data = predictors[!is_missing, , drop = FALSE]
  )

  # cptable has no xerror column when cross-validation did not run; in that
  # case there is nothing to prune against, so the tree is kept as fitted.
  cp_table <- fit$cptable
  if ("xerror" %in% colnames(cp_table)) {
    best_row <- which.min(cp_table[, "xerror"])
    pruned <- prune(fit, cp = cp_table[best_row, "CP"])
  } else {
    pruned <- fit
  }
  imputation_models[[var]] <- pruned

  train[[flag_col]] <- as.numeric(is_missing)

  # Predict specifically for the rows whose value is missing. Calling
  # predict() without newdata returns fitted values for the training rows
  # only; recycling that shorter vector through ifelse() filled the gaps
  # with predictions that belonged to other rows.
  train[[var]][is_missing] <- predict(
    pruned,
    newdata = predictors[is_missing, , drop = FALSE]
  )
}

# Check that the missing values have been imputed (expect all zeros).
colSums(is.na(train))

write.table(train, "wine_train.csv", sep = ",", row.names = FALSE)

# Show the pruned trees for selected columns as rule lists. The models are
# read from imputation_models; the per-column model.rpart.prune.* objects
# referenced originally were never created. Columns that were not imputed
# are skipped instead of raising an error.
library(rattle)
for (var in intersect(c("STARS", "ResidualSugar"), names(imputation_models))) {
  asRules(imputation_models[[var]])
}
## http://stats.stackexchange.com/questions/72251/an-example-lasso-regression-using-glmnet-for-binary-outcome
## http://stats.stackexchange.com/questions/77546/how-to-interpret-glmnet
# Subscribe to:
# Post Comments (Atom)
# No comments:
# Post a Comment