Wednesday, December 3, 2014

Avazu CTR Competition

Kaggle Avazu CTR

#This code attempts to utilize the following concepts:
#Gradient Boosting Model, Stochastic Simulation, Learning rate (shrinkage) optimization, bootstrapping
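## (sketch, not from the original post) one way the 5% row sample read later
## ('ff.train.pct.sample/train_5pct.csv') could have been drawn from the raw
## Kaggle train.csv; the path, seed and object names here are assumptions
library(data.table)
set.seed(2014)
full <- fread("train.csv", header=TRUE)
idx  <- sample(nrow(full), size=round(0.05 * nrow(full)))
dir.create("ff.train.pct.sample", showWarnings=FALSE)
write.csv(full[idx], file.path("ff.train.pct.sample", "train_5pct.csv"), row.names=FALSE)
rm(full, idx)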

rm(list=ls())
setwd("C:/Users/Ted/Documents/Kaggle/Avazu/Data")

library("ff")
library("ffbase")
library("doParallel")
library("data.table")
library(ggplot2)
library(lattice)



csvfile <- file.path(getwd(),'ff.train.pct.sample','train_5pct.csv')
train <- fread(csvfile, header=TRUE, stringsAsFactors=TRUE)


summary(train);str(train);head(train) #stringsAsFactors doesn't work with fread here
## rm(list=ls())  # (removed: clearing the workspace here would delete 'train', which the code below still needs)


## train.1p[,site_id := as.factor(site_id)]
## convert every column to a factor (workaround, since fread left them as character)
names_factors <- colnames(train)
for (col in names_factors) set(train, j=col, value=as.factor(train[[col]]))



#### derive day of week & 6-hour time-of-day block ####


train$date <- paste('20',substring(train$hour, first = 1, last = 2),
                       substring(train$hour, first=3, last=8), sep='')

train$date <- paste(substring(train$date, first=1, last=4), '-',
                       substring(train$date, first=5, last=6), '-',
                       substring(train$date, first=7, last=8), ' ',
                       substring(train$date, first=9, last=19), sep='')

## train$date <- as.POSIXct(train, strptime(train$date, "%Y-%M-%D %H"))
## train$date <- as.Date(train$date, "%Y-%M-%D %H")
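## (sketch) a working conversion, for reference: the two commented attempts above use
## the wrong format codes (%M is minutes, %D is %m/%d/%y); the string built above is
## "YYYY-MM-DD HH", so the matching format is "%Y-%m-%d %H". 'ts_check' is illustrative.
ts_check <- as.POSIXct(train$date, format="%Y-%m-%d %H", tz="UTC")
head(ts_check)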

train$day <- weekdays(as.Date(train$date))

train$day1 <- train$day
train$day1 <- gsub("Monday"   , 1 ,train$day1)
train$day1 <- gsub("Tuesday"  , 2, train$day1)
train$day1 <- gsub("Wednesday", 3, train$day1)
train$day1 <- gsub("Thursday" , 4, train$day1)
train$day1 <- gsub("Friday"   , 5, train$day1)
train$day1 <- gsub("Saturday" , 6, train$day1)
train$day1 <- gsub("Sunday"   , 7, train$day1)


## keep hour of day (last two characters of 'hour', 00-23) and bucket it into
## four 6-hour blocks: Q1 = 00-05, Q2 = 06-11, Q3 = 12-17, Q4 = 18-23
train$hour <- substring(train$hour, first=7, last=8)
train$hour <- paste0("Q", as.integer(train$hour) %/% 6 + 1)


names_factors<- colnames(train)
for (col in names_factors) set(train, j=col, value=as.factor(train[[col]]))


##final check
summary(train);str(train);head(train)


## save model feed data
csvfile <- file.path(getwd(),'ff.train.pct.modelfeed','train_5pct.csv')
system.time(
  write.table(train, file=csvfile, sep=",", row.names=FALSE, col.names=TRUE)
)

gc()
help(memory.size);memory.limit();memory.size()


############################################################################
############################################################################

rm(list=ls())

library(data.table)

## load model feed data
setwd("C:/Users/Ted/Documents/Kaggle/Avazu/Data")
csvfile <- file.path(getwd(),'ff.train.pct.modelfeed','train_3pct.csv')
train <- fread(csvfile, header=TRUE)


## find the number of distinct levels in each column
names_factors<- colnames(train)
for (col in names_factors) set(train, j=col, value=as.factor(train[[col]]))

lvl_cnt <- NULL
for (col in names_factors)  { lvl_cnt[col] <- (length(levels(train[[col]]))) }
lvl_cnt <- as.data.frame(lvl_cnt); lvl_cnt$name <- rownames(lvl_cnt)
colnames(lvl_cnt)[1] <- 'count'; lvl_cnt[with(lvl_cnt, order(count)),]
##xtabs(C21 ~ click, data=train)
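## (sketch) why the level counts matter: randomForest cannot split on factors with more
## than 32 levels (53 in recent versions), so high-cardinality IDs such as site_id or
## device_id must be dropped or impact-coded first. 'max_levels', 'keep_cols' and
## 'train_rf' are illustrative names, not from the original post.
max_levels <- 32
keep_cols  <- lvl_cnt$name[lvl_cnt$count <= max_levels]
train_rf   <- train[, keep_cols, with=FALSE]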


## impact modeling (encode a high-cardinality factor as a smoothed conditional click rate)
impactModel = function(xcol, ycol){
  # xcol: categorical predictor (as character); ycol: numeric 0/1 outcome
  n = length(ycol)
  p = sum(as.numeric(ycol))/n
  #duplicate output for NA (average NA towards grand uniform average)
  x = c(xcol, xcol)
  y = c(ycol, ycol)
  x[(1+n):(2*n)] = NA
  levelcounts = table(x, y, useNA="always")
  condprobmodel = (levelcounts[,2]+p)/(levelcounts[,1]+levelcounts[,2]+1.0)
  # apply model example: applyImpactModel(condprobmodel, data[,varname])
  condprobmodel
}

applyImpactModel = function(condprobmodel, xcol) {
  naval = condprobmodel[is.na(names(condprobmodel))]
  dim = length(xcol)
  condprobvec = numeric(dim) + naval
  for(nm in names(condprobmodel)) {
    if(!is.na(nm)) {
      condprobvec[xcol==nm] = condprobmodel[nm]
    }
  }
  condprobvec
}


## Avazu columns are upper case (C20); pass a plain character predictor and a numeric
## 0/1 outcome, since the impact functions above expect those types
impact_c20 = impactModel(as.character(train$C20), as.numeric(as.character(train$click)))
# train <- as.data.frame(train)
train$impact_c20 = applyImpactModel(impact_c20, as.character(train$C20))
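## (sketch) quick sanity check: the impact value for each C20 level should sit close to
## that level's observed click rate, shrunk toward the overall rate. 'obs_ctr' is illustrative.
obs_ctr <- tapply(as.numeric(as.character(train$click)), train$C20, mean)
head(cbind(observed=obs_ctr, impact=impact_c20[names(obs_ctr)]))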



## randomForest model
library(randomForest)
set.seed(12345)
formula <- click ~ hour + device_type + device_conn_type + C18 + C1 + banner_pos + C15 +
                   C16 + site_category + app_category  + day  ## C21 + C19

formula1 <- click ~ device_type + device_conn_type + C18 + C1 + banner_pos + C15 +
  C16 + site_category * app_category  + day + hour ## C21 + C19


system.time(
model.rf <- randomForest(formula = formula,
                         data = train,
                         ntree=150, importance=T))

##   1pct
##   user  system elapsed
## 249.31    4.13  253.62



print(model.rf)
attributes(model.rf)
importance(model.rf)
varImpPlot(model.rf)
plot(model.rf)
summary(model.rf)

## first formula
table(predict(model.rf), train$click)
###### with hour x1 #######
# 0      1
# 0 334812  67290
# 1    801   1386
###### with hour x2 #######
# 0      1
# 0 334911  67361
# 1    702   1315


## second formula
table(predict(model.rf), train$click)
# 0      1
# 0 334896  67421
# 1    717   1255


# test random forest using test data
# (generic scoring template from the randomForest examples; 'rf', 'testData' and the
#  iris objects below are placeholders, not Avazu data)
irisPred <- predict(rf, newdata=testData)

# check the results
table(irisPred, testData$Species)
plot(margin(rf, testData$Species))


head(train)


names_factors<- colnames(test)
for (col in names_factors) set(test, j=col, value=as.factor(test[[col]]))


## predict
predict.rf <- predict(model.rf, newdata=test)


#fit the randomForest model
#(generic regression template; 'training', 'testing' and Sepal.Length below are
# placeholders from an iris-style example, not Avazu data)
model <- randomForest(Sepal.Length~.,
                      data = training,
                      importance=TRUE,
                      keep.forest=TRUE
)
print(model)

#what are the important variables (via permutation)
varImpPlot(model, type=1)

#predict the outcome of the testing data
predicted <- predict(model, newdata=testing[ ,-1])

# what is the proportion variation explained in the outcome of the testing data?
# i.e., what is 1-(SSerror/SStotal)
actual <- testing$Sepal.Length
rsq <- 1-sum((actual-predicted)^2)/sum((actual-mean(actual))^2)
print(rsq)



# gbm model fitting

library(gbm)
library(dplyr)
library(data.table)

setwd("C:\\Users\\intrepid-honor-803\\Documents\\Kaggle\\Avazu\\Data")
csvfile <- file.path(getwd(),'ff.train.pct.modelfeed','train_50pct.csv')
train <- fread(csvfile, header=TRUE)



## click <- train$click 
click <- as.numeric(train$click)

## train <- select(train, -click)
train <- select(train, 
                device_type, device_conn_type, C18, C1, banner_pos, C15, 
                C16, site_category, app_category, day, hour)


names_factors<- colnames(train)
for (col in names_factors) set(train, j=col, value=as.factor(train[[col]]))

?gbm
model.gbm = gbm.fit(x=train,
                    y=click,      
                    distribution = "bernoulli", 
                    # gaussian for GBM regression or adaboost
                    n.trees=10000,
                    shrinkage=0.05, 
                    # smaller values of shrinkage typically give slightly better performance,
                    # at the cost of a longer run time
                    interaction.depth=3,
                    # use CV to choose interaction depth
                    n.minobsinnode=500,
                    # n.minobsinnode has an important effect on overfitting!
                    # decreasing this number can lead to overfitting
                    nTrain=round(nrow(train) * 0.8),
                    # var.monotone=c(),
                    # can help with overfitting, will smooth bumpy curves
                    verbose=TRUE
)

summary(model.gbm)
gbm.perf(model.gbm) ## gbm.perf(model.gbm, method="test")
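## (sketch) scoring with the iteration chosen on the held-out 20%; 'test' is assumed to be
## the Avazu test set prepared with the same columns, in the same order, as 'train'
best_iter <- gbm.perf(model.gbm, method="test")
pred <- predict(model.gbm, newdata=test, n.trees=best_iter, type="response")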
save(model.gbm, file="gbm_depth3_minobs500_n10000")
load("C:\\Users\\intrepid-honor-803\\Documents\\Kaggle\\Avazu\\Data\\predicted\\1gbm_depth3_minobs500_n5000\\gbm_depth3_minobs500_n5000.rda")
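## (sketch) the learning-rate (shrinkage) optimization mentioned in the header comment:
## refit over a small grid and keep the value with the lowest held-out deviance.
## gbm's default bag.fraction = 0.5 already provides the stochastic subsampling.
## 'shrink_grid', 'heldout_dev' and 'best_shrink' are illustrative names.
shrink_grid <- c(0.01, 0.05, 0.1)
heldout_dev <- sapply(shrink_grid, function(s) {
  fit <- gbm.fit(x=train, y=click,
                 distribution="bernoulli",
                 n.trees=2000, shrinkage=s,
                 interaction.depth=3, n.minobsinnode=500,
                 nTrain=round(nrow(train) * 0.8),
                 verbose=FALSE)
  min(fit$valid.error)   # validation deviance at that shrinkage's best iteration
})
best_shrink <- shrink_grid[which.min(heldout_dev)]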
rm(train)
