Wednesday, December 3, 2014

Avazu CTR Competition

Kaggle Avazu CTR

#This code attempts to utilize the following concepts:
#Gradient Boosting Model, Stochastic Simulation, Learning rate (shrinkage) optimization, bootstrapping

rm(list=ls())
setwd("C:/Users/Ted/Documents/Kaggle/Avazu/Data")

library("ff")
library("ffbase")
library("doParallel")
library("data.table")
library(ggplot2)
library(lattice)



csvfile <- file.path(getwd(),'ff.train.pct.sample','train_5pct.csv')
train <- fread(csvfile, header=TRUE, stringsAsFactors=TRUE)


summary(train);str(train);head(train) #stringsAsFactors doesn't work with fread here


## train.1p[,site_id := as.factor(site_id)]
names_factors<- colnames(train)
for (col in names_factors) set(train, j=col, value=as.factor(train[[col]]))



#### get day & 6-hour time blocks ####


train$date <- paste('20',substring(train$hour, first = 1, last = 2),
                       substring(train$hour, first=3, last=8), sep='')

train$date <- paste(substring(train$date, first=1, last=4), '-',
                       substring(train$date, first=5, last=6), '-',
                       substring(train$date, first=7, last=8), ' ',
                       substring(train$date, first=9, last=19), sep='')

## train$date <- as.POSIXct(strptime(train$date, "%Y-%m-%d %H"))
## train$date <- as.Date(train$date, "%Y-%m-%d")

train$day <- weekdays(as.Date(train$date))

train$day1 <- train$day
train$day1 <- gsub("Monday"   , 1 ,train$day1)
train$day1 <- gsub("Tuesday"  , 2, train$day1)
train$day1 <- gsub("Wednesday", 3, train$day1)
train$day1 <- gsub("Thursday" , 4, train$day1)
train$day1 <- gsub("Friday"   , 5, train$day1)
train$day1 <- gsub("Saturday" , 6, train$day1)
train$day1 <- gsub("Sunday"   , 7, train$day1)


## keep only the HH part of the original YYMMDDHH stamp, then bucket into the four 6-hour blocks
hh <- as.integer(substring(train$hour, first=7, last=8))
train$hour <- paste0("Q", hh %/% 6 + 1)   # Q1 = 00-05, Q2 = 06-11, Q3 = 12-17, Q4 = 18-23
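A quick sanity check that the engineered time features came out as intended (just cross-tabs, nothing model-related):

## verify the 6-hour buckets and the day-of-week mapping
table(train$hour)            # row counts per Q1-Q4 block
table(train$day, train$hour) # weekday vs time-of-day cross-tab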


names_factors<- colnames(train)
for (col in names_factors) set(train, j=col, value=as.factor(train[[col]]))


##final check
summary(train);str(train);head(train)


## save model feed data
csvfile <- file.path(getwd(),'ff.train.pct.modelfeed','train_5pct.csv')
system.time(
  write.table(train, file=csvfile, sep=",", row.names=FALSE, col.names=TRUE)
)

gc()
help(memory.size);memory.limit();memory.size()


############################################################################
############################################################################

rm(list=ls())

library(data.table)

## load model feed data
setwd("C:/Users/Ted/Documents/Kaggle/Avazu/Data")
csvfile <- file.path(getwd(),'ff.train.pct.modelfeed','train_3pct.csv')
train <- fread(csvfile, header=TRUE)


## find distinct levels in each column
names_factors<- colnames(train)
for (col in names_factors) set(train, j=col, value=as.factor(train[[col]]))

lvl_cnt <- NULL
for (col in names_factors)  { lvl_cnt[col] <- (length(levels(train[[col]]))) }
lvl_cnt <- as.data.frame(lvl_cnt); lvl_cnt$name <- rownames(lvl_cnt)
colnames(lvl_cnt)[1] <- 'count'; lvl_cnt[with(lvl_cnt, order(count)),]
##xtabs(C21 ~ click, data=train)


## impact modeling
impactModel = function(xcol, ycol){
  n = length(ycol)
  p = sum(as.numeric(as.character(ycol)))/n  # as.character first so a factor click column still yields the 0/1 rate
  #duplicate output for NA (average NA towards grand uniform average)
  x = c(xcol, xcol)
  y = c(ycol, ycol)
  x[(1+n):(2*n)] = NA
  levelcounts = table(x, y, useNA="always")
  condprobmodel = (levelcounts[,2]+p)/(levelcounts[,1]+levelcounts[,2]+1.0)
  # apply model example: applyImpactModel(condprobmodel, data[,varname])
  condprobmodel
}

applyImpactModel = function(condprobmodel, xcol) {
  naval = condprobmodel[is.na(names(condprobmodel))]
  dim = length(xcol)
  condprobvec = numeric(dim) + naval
  for(nm in names(condprobmodel)) {
    if(!is.na(nm)) {
      condprobvec[xcol==nm] = condprobmodel[nm]
    }
  }
  condprobvec
}


impact_c20 = impactModel(train$C20, train$click)
# train <- as.data.frame(train)
train$impact_c20 = applyImpactModel(impact_c20, train$C20)
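The same two helpers should extend to the other high-cardinality identifiers; a minimal sketch, assuming the usual Avazu columns site_id and device_ip survived into this sample:

## impact-code additional high-cardinality columns with the same helpers
for (col in c("site_id", "device_ip")) {
  m <- impactModel(train[[col]], train$click)
  set(train, j = paste0("impact_", col), value = applyImpactModel(m, train[[col]]))
}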



## randomForest model
library(randomForest)
set.seed(12345)
formula <- click ~ hour + device_type + device_conn_type + C18 + C1 + banner_pos + C15 +
                   C16 + site_category + app_category  + day  ## C21 + C19

formula1 <- click ~ device_type + device_conn_type + C18 + C1 + banner_pos + C15 +
  C16 + site_category * app_category  + day + hour ## C21 + C19


system.time(
model.rf <- randomForest(formula = formula,
                         data = train,
                         ntree=150, importance=T))

##   1pct
##   user  system elapsed
## 249.31    4.13  253.62



print(model.rf)
attributes(model.rf)
importance(model.rf)
varImpPlot(model.rf)
plot(model.rf)
summary(model.rf)

## first formula
table(predict(model.rf), train$click)
###### with hour x1 #######
# 0      1
# 0 334812  67290
# 1    801   1386
###### with hour x2 #######
# 0      1
# 0 334911  67361
# 1    702   1315


## second formula
table(predict(model.rf), train$click)
# 0      1
# 0 334896  67421
# 1    717   1255
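Both confusion matrices show the forest almost never predicts the positive (click) class, which is what you'd expect with roughly 17% positives and no correction. One option worth trying is to balance each tree's bootstrap sample with sampsize (a sketch only, not something benchmarked here):

## sketch: stratified, balanced bootstrap samples per tree
n1 <- sum(train$click == "1")                 # number of positive (click) rows
model.rf.bal <- randomForest(formula = formula,
                             data = train,
                             ntree = 150,
                             sampsize = c(n1, n1),  # one entry per class level ("0", "1")
                             importance = TRUE)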


# reference snippet (generic iris example, kept for later reuse): test a random forest on new data
irisPred <- predict(rf, newdata=testData)

# check the results
table(irisPred, testData$Species)
plot(margin(rf, testData$Species))


head(train)


## prepare the test set (assumes the raw Avazu test file has already been read into `test` with fread)
names_factors<- colnames(test)
for (col in names_factors) set(test, j=col, value=as.factor(test[[col]]))


## predict
predict.rf <- predict(model.rf, newdata=test)
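For an actual leaderboard entry, Kaggle expects an id,click file with a predicted click probability per row; a minimal sketch, assuming the raw test file (with its id column) is what was read into test:

## sketch: class-1 probabilities written in the id,click submission format
prob.rf    <- predict(model.rf, newdata=test, type="prob")[, "1"]
submission <- data.frame(id = test$id, click = prob.rf)
write.csv(submission, file="submission_rf.csv", row.names=FALSE, quote=FALSE)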


# reference snippet (iris regression example, kept for the R-squared check below): fit the random forest model
model <- randomForest(Sepal.Length~.,
                      data = training,
                      importance=TRUE,
                      keep.forest=TRUE
)
print(model)

#what are the important variables (via permutation)
varImpPlot(model, type=1)

#predict the outcome of the testing data
predicted <- predict(model, newdata=testing[ ,-1])

# what is the proportion variation explained in the outcome of the testing data?
# i.e., what is 1-(SSerror/SStotal)
actual <- testing$Sepal.Length
rsq <- 1-sum((actual-predicted)^2)/sum((actual-mean(actual))^2)
print(rsq)



# gbm model fitting

library(gbm)
library(dplyr)
library(data.table)

setwd("C:\\Users\\intrepid-honor-803\\Documents\\Kaggle\\Avazu\\Data")
csvfile <- file.path(getwd(),'ff.train.pct.modelfeed','train_50pct.csv')
train <- fread(csvfile, header=TRUE)



## click <- train$click 
click <- as.numeric(train$click)

## train <- select(train, -click)
train <- select(train, 
                device_type, device_conn_type, C18, C1, banner_pos, C15, 
                C16, site_category, app_category, day, hour)


names_factors<- colnames(train)
for (col in names_factors) set(train, j=col, value=as.factor(train[[col]]))

?gbm
model.gbm = gbm.fit(x=train,
                    y=click,      
                    distribution = "bernoulli", 
                    # use "gaussian" for regression, or "adaboost" for exponential (AdaBoost-style) loss
                    n.trees=10000,
                    shrinkage=0.05, 
                    # smaller values of shrinkage typically give slightly better performance
                    # the cost is that the model takes longer to run for smaller values
                    interaction.depth=3,
                    # use CV to choose interaction depth
                    n.minobsinnode=500,
                    # n.minobsinnode has an important effect on overfitting!
                    # decreasing this number may result in overfitting
                    nTrain=round(nrow(train) * 0.8),
                    # var.monotone=c(),
                    # can help with overfitting, will smooth bumpy curves
                    verbose=TRUE
)

summary(model.gbm)
gbm.perf(model.gbm) ## gbm.perf(model.gbm, method="test")
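Since the competition metric is log loss, it is worth scoring the 20% holdout that nTrain left out, at the iteration count gbm.perf picks; a minimal sketch (assumes train and click are still the objects built above):

## sketch: log loss on the holdout implied by nTrain
best.iter <- gbm.perf(model.gbm, method="test")
holdout   <- (round(nrow(train) * 0.8) + 1):nrow(train)
p.gbm     <- predict(model.gbm, newdata=train[holdout, ], n.trees=best.iter, type="response")
p.gbm     <- pmin(pmax(p.gbm, 1e-15), 1 - 1e-15)   # guard against log(0)
-mean(click[holdout] * log(p.gbm) + (1 - click[holdout]) * log(1 - p.gbm))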
save(model.gbm, file="gbm_depth3_minobs500_n10000.rda")
load("C:\\Users\\intrepid-honor-803\\Documents\\Kaggle\\Avazu\\Data\\predicted\\1gbm_depth3_minobs500_n5000\\gbm_depth3_minobs500_n5000.rda")
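The shrinkage (learning-rate) optimization mentioned at the top can be done the same way: refit on the identical 80/20 split with a few values and keep the one with the lowest holdout deviance. A rough sketch, run before rm(train) (the 3000-tree cap is an arbitrary choice; smaller shrinkage generally needs more trees):

## sketch: compare holdout deviance across a few shrinkage values
shrinkages  <- c(0.01, 0.05, 0.1)
holdout.dev <- sapply(shrinkages, function(s) {
  fit <- gbm.fit(x=train, y=click,
                 distribution="bernoulli",
                 n.trees=3000,
                 shrinkage=s,
                 interaction.depth=3,
                 n.minobsinnode=500,
                 nTrain=round(nrow(train) * 0.8),
                 verbose=FALSE)
  min(fit$valid.error)   # best holdout deviance reached with this shrinkage
})
data.frame(shrinkage=shrinkages, holdout_deviance=holdout.dev)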
rm(train)

R packages


List of R packages to track for updates

    • ggplot2 / ggvis (interactive)
    • dplyr
    • ff/bigglm
    • reshape2
    • data.table
    • gbm
    • doParallel
    • rattle/caret
    • stringr

http://www-bcf.usc.edu/~gareth/ISL/
http://adv-r.had.co.nz/
http://www.louisaslett.com/RStudio_AMI/
http://www.r-bloggers.com/in-depth-introduction-to-machine-learning-in-15-hours-of-expert-videos/