setwd('C:\\Users\\Ted\\Documents\\Kaggle\\RedHat\\Data')
library(data.table)
library(ggplot2)
library(randomForest)
library(lattice)
library(caret)
people = fread("people.csv")
# convert the logical char_* columns in people to 0/1 integers, by reference
p_logi <- names(people)[which(sapply(people, is.logical))]
for (col in p_logi) set(people, j = col,
                        value = as.integer(people[[col]]))
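# set() assigns by reference, so no copy of people is made; a toy check
# (throwaway table, not part of the pipeline):
# tmp = data.table(f = c(TRUE, FALSE, NA))
# set(tmp, j = 'f', value = as.integer(tmp[['f']]))
# tmp$f  # 1 0 NA -- TRUE/FALSE become 1/0, NA is preserved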
train = fread("act_train.csv")
# left-join activities onto people; rows without an outcome are dropped below
dt = merge(x=people, y=train, by = "people_id", all.x = T)
#dt$outcome = as.factor(dt$outcome)
rm(people); rm(train)
set.seed(12345)
# explore on a random 10% sample to keep the run time manageable
s = sample(nrow(dt), replace=FALSE, floor(nrow(dt)*0.1))
dt = dt[s,]; rm(s)
dt = dt[!is.na(outcome),]  # keep only rows with a training outcome
#############################################################
#############################################################
### one-hot encoding can be implemented instead of impact coding ###
# count distinct levels per column; screen everything except ids, dates,
# the very-high-cardinality columns and the target
dt.level = sapply(lapply(dt, unique), length)
var = names(dt.level)[!names(dt.level) %in%
                        c('people_id','date.x','date.y','activity_id','char_10.y','group_1','outcome')]
# chi-square screen: test each candidate column's association with the
# outcome, restricted to levels occurring more than 100 times
pvalue = NULL
for (i in var) {
  a = table(dt[[i]])
  a = a[a > 100]
  b = chisq.test(table(
    dt[get(i) %in% names(a) & !is.na(outcome), .(get(i), outcome)]
  ))
  isNum = ifelse(is.numeric(dt[[i]]), 'numeric', 'character')
  level = length(unique(dt[[i]]))
  pvalue = rbind(pvalue, data.frame(i, b$p.value, isNum, level))
}
pvalue
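# The screening table can then be filtered, e.g. keeping predictors whose
# chi-square p-value is below 0.05 (column name b.p.value as built above):
# keep = subset(pvalue, b.p.value < 0.05)$i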
# return a model of the conditional probability
# of the dependent variable (depvar) by level
# assumes outcome is 0/1 (or logical) and not NA
impactModel = function(xcol, depvar) {
  n = length(depvar)
  p = sum(depvar) / n  # grand average of the 0/1 outcome
  # duplicate output for NA (average NA towards grand uniform average)
  x = c(xcol, xcol)
  y = c(depvar, depvar)
  x[(1+n):(2*n)] = NA
  levelcounts = table(x, y, useNA="always")
  # smoothed conditional probability of outcome==1 for each level of x
  condprobmodel = (levelcounts[,2]+p)/(levelcounts[,1]+levelcounts[,2]+1.0)
  # apply model example: applyImpactModel(condprobmodel, data[,varname])
  condprobmodel
} # impact_char_1.y = impactModel(dt$char_1.y, dt$outcome)
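# Worked toy example (assumes 0/1 outcomes, as in this data):
# impactModel(c('a','a','a','b'), c(1, 1, 0, 1))
# grand average p = 0.75; level 'a' -> (2 + 0.75)/(3 + 1) = 0.6875,
# level 'b' -> (1 + 0.75)/(1 + 1) = 0.875, and the NA row is exactly p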
# apply model to column to essentially return condprobmodel[rawx]
# both NA's and new levels are smoothed to original grand average
applyImpactModel = function(condprobmodel, xcol) {
  naval = condprobmodel[is.na(names(condprobmodel))]  # smoothed grand average
  condprobvec = numeric(length(xcol)) + naval
  for (nm in names(condprobmodel)) {
    if (!is.na(nm)) {
      # which() avoids the error R raises when a logical subscript
      # containing NA is used in an assignment
      condprobvec[which(xcol == nm)] = condprobmodel[nm]
    }
  }
  condprobvec
} # dt$impact_char_1.y = applyImpactModel(impact_char_1.y, dt$char_1.y)
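# Toy usage: unseen levels and NAs both fall back to the grand average:
# m = impactModel(c('a','a','a','b'), c(1, 1, 0, 1))
# applyImpactModel(m, c('a', 'b', 'c', NA))  # 0.6875 0.875 0.75 0.75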
# Original - convert a character variable to its impact-model outcome:
# impact_char_1.y = impactModel(dt$char_1.y, dt$outcome)
# dt$impact_char_1.y = applyImpactModel(impact_char_1.y, dt$char_1.y)
# debugonce(function_name); function_name()
# impact-code the mid-cardinality character variables (7 to 1999 levels)
var1 = subset(pvalue, isNum == 'character' & level > 6 & level < 2000)
var1 = as.character(var1$i)
for (i in var1) {
  a = paste0('impact_', i)
  print(a)
  dt[[a]] = applyImpactModel(impactModel(dt[[i]], dt$outcome),
                             dt[[i]])
}
# single-column version of the loop above:
# i = 'char_8.y'
# a = paste0('impact_', i)
# dt[[a]] = applyImpactModel(impactModel(dt[[i]], dt$outcome), dt[[i]])
# one-hot encode the low-cardinality character variables (<= 6 levels)
var2 = subset(pvalue, isNum == 'character' & level <= 6)
var2 = as.character(var2$i)
a = paste('~', paste(var2, collapse=' + '))
dmy = dummyVars(a, data=dt, fullRank=TRUE)
trsf = data.table(predict(dmy, newdata=dt))  # predict once, reuse below
var2 = names(trsf)
dt = cbind(dt, trsf)
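# Toy sketch of dummyVars (caret, loaded above); fullRank=TRUE drops one
# reference level per variable, as model.matrix does:
# d = data.frame(g = c('x','y','z'))
# predict(dummyVars(~ g, data = d, fullRank = TRUE), newdata = d)
# returns indicator columns for 'y' and 'z'; 'x' is the implicit baseline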
fn_remove_na = function(DT) {
  # replace every NA with 0, in place (DT is modified by reference)
  # either of the following for loops
  # by name:
  # for (j in names(DT))
  #   set(DT, which(is.na(DT[[j]])), j, 0)
  # or by number (slightly faster than by name):
  for (j in seq_len(ncol(DT)))
    set(DT, which(is.na(DT[[j]])), j, 0)
}
fn_remove_na(dt)
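# Quick check on a throwaway table (not part of the pipeline):
# tmp = data.table(a = c(1, NA, 3), b = c(NA, 2, NA))
# fn_remove_na(tmp)
# tmp  # every NA is now 0, modified by reference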
#############################################################
#############################################################
library(xgboost)
library(Matrix)
fn.cv.xgb = function(x, kfold, ntree, depth, eta) {
  # k-fold cross-validation over a grid of rounds, depth and learning rate
  set.seed(12345)
  x$s = ceiling(runif(nrow(x), 0, kfold))  # random fold assignment in 1..kfold
  cv = NULL
  for (kfold_idx in 1:kfold) {
    train = x[!x$s == kfold_idx, ]
    valid = x[x$s == kfold_idx, ]
    Ytrain = train[['outcome']]
    Yvalid = valid[['outcome']]
    # sparse design matrices, dropping the fold id and the target
    train = sparse.model.matrix(~ . - 1, data = train[, !colnames(train) %in% c('s','outcome'), with=F])
    valid = sparse.model.matrix(~ . - 1, data = valid[, !colnames(valid) %in% c('s','outcome'), with=F])
    for (ntree_idx in seq(10, ntree, by=10)) {
      for (depth_idx in seq(1, depth, by=1)) {
        for (eta_idx in seq(0.2, eta, by=0.1)) {
          # use the grid values (*_idx), not the loop maxima
          model.xgb <- xgboost(data=train, label=Ytrain,
                               max_depth=depth_idx, nrounds=ntree_idx, eta=eta_idx,
                               objective = "binary:logistic", verbose=0)
          P_xgb = predict(model.xgb, train)
          error = (P_xgb - Ytrain)^2
          error_train = sum(error); mean_error_train = mean(error)
          n_train = nrow(train)
          P_xgb = predict(model.xgb, valid)
          error = (P_xgb - Yvalid)^2
          error_valid = sum(error); mean_error_valid = mean(error)
          n_valid = nrow(valid)
          cv = rbind(cv, data.frame(kfold_idx, ntree_idx, depth_idx, eta_idx,
                                    n_train, error_train, mean_error_train,
                                    n_valid, error_valid, mean_error_valid))
        }
      }
    }
  }
  return(cv)
}
# arguments: fn.cv.xgb(x, kfold, ntree, depth, eta)
# cv = fn.cv.xgb(dt, 5, 50, 6, 1.2)
cv = fn.cv.xgb(dt, 5, 50, 4, 0.4)
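# To pick hyper-parameters, average the validation error over folds
# (a sketch; column names as produced by fn.cv.xgb above):
# grid = aggregate(mean_error_valid ~ ntree_idx + depth_idx + eta_idx, data = cv, FUN = mean)
# grid[which.min(grid$mean_error_valid), ]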