Saturday, February 4, 2017

xgboost and impact coding
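A quick walk-through on the Kaggle Red Hat data: screen the categorical variables with a chi-square test against the outcome, impact-code the high-cardinality ones (replace each level with a smoothed conditional probability of the outcome), one-hot encode the low-cardinality ones, and then run a small cross-validated grid search with xgboost on a 10% sample.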

rm(list=ls())
setwd('C:\\Users\\Ted\\Documents\\Kaggle\\RedHat\\Data')

library(data.table)
library(ggplot2)
library(randomForest)
library(lattice)
library(caret)


people = fread("people.csv")
# convert the logical columns in people.csv to 0/1 integers
p_logi <- names(people)[which(sapply(people, is.logical))]

for (col in p_logi) set(people, j = col,
                        value = as.integer(people[[col]]))

train  = fread("act_train.csv")
dt     = merge(x=people, y=train, by = "people_id", all.x = T)
#dt$outcome = as.factor(dt$outcome)

rm(people); rm(train)
# work with a 10% sample and drop rows with no outcome
set.seed(12345)
s = sample(nrow(dt), floor(nrow(dt)*0.1), replace=FALSE)
dt = dt[s,]; rm(s)
dt = dt[!is.na(outcome),]

#############################################################
#############################################################
### one-hot encoding can be implemented instead of impact coding ###


# number of unique levels per column; keep candidate variables,
# excluding ids, dates, and the very high-cardinality columns
dt.level = sapply(lapply(dt, unique), length)
var = names(dt.level)[!names(dt.level) %in% 
                          c('people_id','date.x','date.y','activity_id','char_10.y','group_1','outcome')]

# chi-square test of each candidate variable against outcome,
# restricted to levels with more than 100 rows
pvalue = NULL
for (i in var) {
  a = table(dt[[i]])
  a = a[a > 100]
  b = chisq.test(table(
        dt[get(i) %in% names(a) & !is.na(outcome), .(get(i), outcome)]
        )
      )
  isNum = ifelse(is.numeric(dt[[i]]), 'numeric', 'character')
  level = length(unique(dt[[i]]))
  pvalue = rbind(pvalue, data.frame(i, b$p.value, isNum, level))
}
pvalue
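
Sorting this table by the chi-square p-value (the data.frame() call above names that column b.p.value) gives a quick ranking of which variables look most associated with the outcome:

pvalue[order(pvalue$b.p.value), ]   # smallest p-values first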


# return a model of the conditional probability
# of the outcome by level of xcol
# assumes depvar is a 0/1 (or logical) outcome with no NAs
impactModel = function(xcol, depvar) {
  n = length(depvar)
  p = sum(as.numeric(depvar))/n   # grand average of the outcome
  # duplicate the data with x set to NA so the NA level shrinks toward the grand average
  x = c(xcol, xcol)
  y = c(depvar, depvar)
  x[(1+n):(2*n)] = NA
  levelcounts = table(x, y, useNA="always")
  # smoothed conditional probability: (positives + p) / (total + 1)
  condprobmodel = (levelcounts[,2]+p)/(levelcounts[,1]+levelcounts[,2]+1.0) 
  # apply model example: applyImpactModel(condprobmodel, data[,varname])
  condprobmodel
}  # impact_char_1.y = impactModel(dt$char_1.y, dt$outcome)



# apply model to column to essentially return condprobmodel[rawx]
# both NA's and new levels are smoothed to original grand average 
applyImpactModel = function(condprobmodel, xcol) {
  naval = condprobmodel[is.na(names(condprobmodel))]
  dim = length(xcol)
  condprobvec = numeric(dim) + naval
  for(nm in names(condprobmodel)) {
    if(!is.na(nm)) {
      condprobvec[xcol==nm] = condprobmodel[nm]
    }
  }
  condprobvec
} # dt$impact_char_1.y = applyImpactModel(impact_char_1.y, dt$char_1.y)
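
A quick sanity check on these two helpers, using small made-up vectors (not the Red Hat data): the grand average here is 0.5, so an unseen level falls back to 0.5, while observed levels get smoothed conditional probabilities.

toy_x = c('a','a','a','b','b','c')
toy_y = c(1, 1, 0, 0, 0, 1)
m = impactModel(toy_x, toy_y)
m                                    # smoothed P(outcome = 1 | level) for 'a', 'b', 'c' (and NA)
applyImpactModel(m, c('a','b','z'))  # 'z' was never seen, so it gets the grand-average value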

# Original usage - convert a single variable (e.g. char_1.y) to its impact-coded version
# impact_char_1.y = impactModel(dt$char_1.y, dt$outcome)
# dt$impact_char_1.y = applyImpactModel(impact_char_1.y, dt$char_1.y)
# debugonce(function_name); function_name()


# impact-code the character variables with many levels (more than 6 and fewer than 2000)
var1 = subset(pvalue, isNum == 'character' & level > 6 & level < 2000)
var1 = as.character(var1$i)

for (i in var1){
  a = paste0('impact_', i)
  print(a)
  dt[[a]] = applyImpactModel(impactModel(dt[[i]], dt$outcome),
                             dt[[i]])
}

# a = 'char_8.y'
# b = paste0('impact_', a)
# dt[[b]] = applyImpactModel(impactModel(dt[[a]], dt$outcome),
#                            dt[[a]])


# one-hot encode the low-cardinality character variables (6 levels or fewer) with caret::dummyVars
var2 = subset(pvalue, isNum == 'character' & level <= 6)
var2 = as.character(var2$i)

a = paste('~', paste(var2, collapse=' + '))

dmy  = dummyVars(a, data=dt, fullRank=TRUE)
trsf = data.table(predict(dmy, newdata=dt))
var2 = names(trsf)
dt   = cbind(dt, trsf)
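
To eyeball the indicator columns that dummyVars just added (var2 now holds their names):

head(dt[, var2, with = FALSE])   # the new 0/1 columns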


# replace NA with 0 in every column, in place (data.table::set updates by reference)
fn_remove_na = function(DT) {
  # either of the following loops works
  # by name:
  #   for (j in names(DT))
  #     set(DT, which(is.na(DT[[j]])), j, 0)
  
  # or by column number (slightly faster than by name):
  for (j in seq_len(ncol(DT)))
    set(DT, which(is.na(DT[[j]])), j, 0)
}
fn_remove_na(dt)
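
A quick check that the replacement worked:

sum(is.na(dt))   # expect 0 after fn_remove_na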





#############################################################
#############################################################

library(xgboost)
library(Matrix)

fn.cv.xgb = function(x, kfold, ntree, depth, eta){
  set.seed(12345)
  # keep the outcome plus the numeric feature columns only
  # (raw id/character columns are dropped before building the model matrices)
  x = x[, names(x)[sapply(x, is.numeric)], with = FALSE]
  # assign each row to one of kfold folds
  x$s = ceiling(runif(nrow(x), 0, kfold))
  cv = NULL
  
  for (kfold_idx in 1:kfold){
    train = x[!x$s == kfold_idx, ]
    valid = x[x$s == kfold_idx, ]
    Ytrain = train[['outcome']]
    Yvalid = valid[['outcome']]
    train = sparse.model.matrix(~ . - 1,
              data = train[, !colnames(train) %in% c('s','outcome'), with = FALSE])
    valid = sparse.model.matrix(~ . - 1,
              data = valid[, !colnames(valid) %in% c('s','outcome'), with = FALSE])
    
    for(ntree_idx in seq(10, ntree, by=10)){
      
      for(depth_idx in seq(1, depth, by=1)){
        
        for(eta_idx in seq(0.2, eta, by=0.1)){
          
          # train with the hyperparameters of the current grid point
          model.xgb <- xgboost(data=train, label=Ytrain,
                               max_depth=depth_idx, nrounds=ntree_idx, eta=eta_idx,
                               objective = "binary:logistic", verbose=0)
          
          
          # squared error on the training fold
          P_xgb = predict(model.xgb, train)
          error = (P_xgb - Ytrain)^2
          error_train = sum(error); mean_error_train = mean(error)
          n_train = nrow(train)
          
          # squared error on the held-out fold
          P_xgb = predict(model.xgb, valid)
          error = (P_xgb - Yvalid)^2
          error_valid = sum(error); mean_error_valid = mean(error)
          n_valid = nrow(valid)
          
          cv = rbind(cv, data.frame(kfold_idx, ntree_idx, depth_idx, eta_idx,
                                    n_train, error_train, mean_error_train,
                                    n_valid, error_valid, mean_error_valid))
        }
      }
    }
  }
  return(cv)
}
#function(x,kfold,ntree,depth,eta)
# cv=fn.cv.xgb(dt, 5, 50, 6, 1.2)

cv=fn.cv.xgb(dt, 5, 50, 4, 0.4)
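
One way (not in the original script) to summarize the grid results is to average the validation error over the folds and pick the combination with the lowest mean; the column names below come from the data.frame built inside fn.cv.xgb:

cv_summary = aggregate(mean_error_valid ~ ntree_idx + depth_idx + eta_idx,
                       data = cv, FUN = mean)
cv_summary[which.min(cv_summary$mean_error_valid), ]   # best (ntree, depth, eta) on the held-out folds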



