Kaggle Avazu CTR
#This code attempts to utilize the following concepts:
#Gradient Boosting Model, Stochastic Simulation, Learning rate (shrinkage) optimization, bootstraping
rm(list=ls())
setwd("C:/Users/Ted/Documents/Kaggle/Avazu/Data")
library("ff")
library("ffbase")
library("doParallel")
library("data.table")
library(ggplot2)
library(lattice)
csvfile <- file.path(getwd(),'ff.train.pct.sample','train_5pct.csv')
train <- fread(csvfile, header=TRUE, stringsAsFactors=TRUE)
summary(train);str(train);head(train) #stringsASFactors doesn't work
rm(list=ls())
## train.1p[,site_id := as.factor(site_id)]
names_factors<- colnames(train)
for (col in names_factors) set(train, j=col, value=as.factor(train[[col]]))
#### get day & 3hr time zone ####
train$date <- paste('20',substring(train$hour, first = 1, last = 2),
substring(train$hour, first=3, last=8), sep='')
train$date <- paste(substring(train$date, first=1, last=4), '-',
substring(train$date, first=5, last=6), '-',
substring(train$date, first=7, last=8), ' ',
substring(train$date, first=9, last=19), sep='')
## train$date <- as.POSIXct(train, strptime(train$date, "%Y-%M-%D %H"))
## train$date <- as.Date(train$date, "%Y-%M-%D %H")
train$day <- weekdays(as.Date(train$date))
train$day1 <- train$day
train$day1 <- gsub("Monday" , 1 ,train$day1)
train$day1 <- gsub("Tuesday" , 2, train$day1)
train$day1 <- gsub("Wednesday", 3, train$day1)
train$day1 <- gsub("Thursday" , 4, train$day1)
train$day1 <- gsub("Friday" , 5, train$day1)
train$day1 <- gsub("Saturday" , 6, train$day1)
train$day1 <- gsub("Sunday" , 7, train$day1)
train$hour <- substring(train$hour, first=7, last=8)
train$hour <- gsub("00", "Q1", train$hour); train$hour <- gsub("01", "Q1", train$hour); train$hour <- gsub("02", "Q1", train$hour);
train$hour <- gsub("03", "Q1", train$hour); train$hour <- gsub("04", "Q1", train$hour); train$hour <- gsub("05", "Q1", train$hour);
train$hour <- gsub("06", "Q2", train$hour); train$hour <- gsub("07", "Q2", train$hour); train$hour <- gsub("08", "Q2", train$hour);
train$hour <- gsub("09", "Q2", train$hour); train$hour <- gsub("10", "Q2", train$hour); train$hour <- gsub("11", "Q2", train$hour);
train$hour <- gsub("12", "Q3", train$hour); train$hour <- gsub("13", "Q3", train$hour); train$hour <- gsub("14", "Q3", train$hour);
train$hour <- gsub("15", "Q3", train$hour); train$hour <- gsub("16", "Q3", train$hour); train$hour <- gsub("17", "Q3", train$hour);
train$hour <- gsub("18", "Q4", train$hour); train$hour <- gsub("19", "Q4", train$hour); train$hour <- gsub("20", "Q4", train$hour);
train$hour <- gsub("21", "Q4", train$hour); train$hour <- gsub("22", "Q4", train$hour); train$hour <- gsub("23", "Q4", train$hour);
names_factors<- colnames(train)
for (col in names_factors) set(train, j=col, value=as.factor(train[[col]]))
##final check
summary(train);str(train);head(train)
## save model feed data
csvfile <- file.path(getwd(),'ff.train.pct.modelfeed','train_5pct.csv')
system.time(
write.table(train, file=csvfile, row.names=FALSE, col.names=TRUE)
)
gc()
help(memory.size);memory.limit();memory.size()
############################################################################
############################################################################
rm(list=ls())
library(data.table)
## load model feed data
setwd("C:/Users/Ted/Documents/Kaggle/Avazu/Data")
csvfile <- file.path(getwd(),'ff.train.pct.modelfeed','train_3pct.csv')
train <- fread(csvfile, header=TRUE)
## find distinct levels in each columns
names_factors<- colnames(train)
for (col in names_factors) set(train, j=col, value=as.factor(train[[col]]))
lvl_cnt <- NULL
for (col in names_factors) { lvl_cnt[col] <- (length(levels(train[[col]]))) }
lvl_cnt <- as.data.frame(lvl_cnt); lvl_cnt$name <- rownames(lvl_cnt)
colnames(lvl_cnt)[1] <- 'count'; lvl_cnt[with(lvl_cnt, order(count)),]
##xtabs(C21 ~ click, data=train)
## imipact modeling
impactModel = function(xcol, ycol){
n = length(ycol)
p = sum(as.numeric(ycol))/n
#duplicate output for NA (average NA towards grand uniform average)
x = c(xcol, xcol)
y = c(ycol, ycol)
x[(1+n):(2*n)] = NA
levelcounts = table(x, y, useNA="always")
condprobmodel = (levelcounts[,2]+p)/(levelcounts[,1]+levelcounts[,2]+1.0)
# apply model example: applyImpactModel(condprobmodel, data[,varname])
condprobmodel
}
applyImpactModel = function(condprobmodel, xcol) {
naval = condprobmodel[is.na(names(condprobmodel))]
dim = length(xcol)
condprobvec = numeric(dim) + naval
for(nm in names(condprobmodel)) {
if(!is.na(nm)) {
condprobvec[xcol==nm] = condprobmodel[nm]
}
}
condprobvec
}
impact_c20 = impactModel(train$c20, train$click)
# train <- as.data.frame(train)
train$impact_c20 = applyImpactModel(impact_c20, train$c20)
## randomForest model
library(randomForest)
set.seed(12345)
formula <- click ~ hour + device_type + device_conn_type + C18 + C1 + banner_pos + C15 +
C16 + site_category + app_category + day ## C21 + C19
formula1 <- click ~ device_type + device_conn_type + C18 + C1 + banner_pos + C15 +
C16 + site_category * app_category + day + hour ## C21 + C19
system.time(
model.rf <- randomForest(formula = formula,
data = train,
ntree=150, importance=T))
## 1pct
## user system elapsed
## 249.31 4.13 253.62
print(model.rf)
attributes(model.rf)
importance(model.rf)
varImpPlot(model.rf)
plot(model.rf)
summary(model.rf)
## first formula
table(predict(model.rf), train.1p$click)
###### with hour x1 #######
# 0 1
# 0 334812 67290
# 1 801 1386
###### with hour x2 #######
# 0 1
# 0 334911 67361
# 1 702 1315
## second formula
table(predict(model.rf), train.1p$click)
# 0 1
# 0 334896 67421
# 1 717 1255
# test random forest using test data
irisPred <- predict(rf, newdata=testData)
# check the results
table(irisPred, testData$Species)
plot(margin(rf, testData$Species))
head(train)
names_factors<- colnames(test)
for (col in names_factors) set(test, j=col, value=as.factor(test[[col]]))
## predict
predict.rf <- predict(model.rf, newdata=test)
#fit the randomforest model
model <- randomForest(Sepal.Length~.,
data = training,
importance=TRUE,
keep.forest=TRUE
)
print(model)
#what are the important variables (via permutation)
varImpPlot(model, type=1)
#predict the outcome of the testing data
predicted <- predict(model, newdata=testing[ ,-1])
# what is the proportion variation explained in the outcome of the testing data?
# i.e., what is 1-(SSerror/SStotal)
actual <- testing$Sepal.Length
rsq <- 1-sum((actual-predicted)^2)/sum((actual-mean(actual))^2)
print(rsq)
#Gradient Boosting Model, Stochastic Simulation, Learning rate (shrinkage) optimization, bootstraping
rm(list=ls())
setwd("C:/Users/Ted/Documents/Kaggle/Avazu/Data")
library("ff")
library("ffbase")
library("doParallel")
library("data.table")
library(ggplot2)
library(lattice)
csvfile <- file.path(getwd(),'ff.train.pct.sample','train_5pct.csv')
train <- fread(csvfile, header=TRUE, stringsAsFactors=TRUE)
summary(train);str(train);head(train) #stringsASFactors doesn't work
rm(list=ls())
## train.1p[,site_id := as.factor(site_id)]
names_factors<- colnames(train)
for (col in names_factors) set(train, j=col, value=as.factor(train[[col]]))
#### get day & 3hr time zone ####
train$date <- paste('20',substring(train$hour, first = 1, last = 2),
substring(train$hour, first=3, last=8), sep='')
train$date <- paste(substring(train$date, first=1, last=4), '-',
substring(train$date, first=5, last=6), '-',
substring(train$date, first=7, last=8), ' ',
substring(train$date, first=9, last=19), sep='')
## train$date <- as.POSIXct(train, strptime(train$date, "%Y-%M-%D %H"))
## train$date <- as.Date(train$date, "%Y-%M-%D %H")
train$day <- weekdays(as.Date(train$date))
train$day1 <- train$day
train$day1 <- gsub("Monday" , 1 ,train$day1)
train$day1 <- gsub("Tuesday" , 2, train$day1)
train$day1 <- gsub("Wednesday", 3, train$day1)
train$day1 <- gsub("Thursday" , 4, train$day1)
train$day1 <- gsub("Friday" , 5, train$day1)
train$day1 <- gsub("Saturday" , 6, train$day1)
train$day1 <- gsub("Sunday" , 7, train$day1)
train$hour <- substring(train$hour, first=7, last=8)
train$hour <- gsub("00", "Q1", train$hour); train$hour <- gsub("01", "Q1", train$hour); train$hour <- gsub("02", "Q1", train$hour);
train$hour <- gsub("03", "Q1", train$hour); train$hour <- gsub("04", "Q1", train$hour); train$hour <- gsub("05", "Q1", train$hour);
train$hour <- gsub("06", "Q2", train$hour); train$hour <- gsub("07", "Q2", train$hour); train$hour <- gsub("08", "Q2", train$hour);
train$hour <- gsub("09", "Q2", train$hour); train$hour <- gsub("10", "Q2", train$hour); train$hour <- gsub("11", "Q2", train$hour);
train$hour <- gsub("12", "Q3", train$hour); train$hour <- gsub("13", "Q3", train$hour); train$hour <- gsub("14", "Q3", train$hour);
train$hour <- gsub("15", "Q3", train$hour); train$hour <- gsub("16", "Q3", train$hour); train$hour <- gsub("17", "Q3", train$hour);
train$hour <- gsub("18", "Q4", train$hour); train$hour <- gsub("19", "Q4", train$hour); train$hour <- gsub("20", "Q4", train$hour);
train$hour <- gsub("21", "Q4", train$hour); train$hour <- gsub("22", "Q4", train$hour); train$hour <- gsub("23", "Q4", train$hour);
names_factors<- colnames(train)
for (col in names_factors) set(train, j=col, value=as.factor(train[[col]]))
##final check
summary(train);str(train);head(train)
## save model feed data
csvfile <- file.path(getwd(),'ff.train.pct.modelfeed','train_5pct.csv')
system.time(
write.table(train, file=csvfile, row.names=FALSE, col.names=TRUE)
)
gc()
help(memory.size);memory.limit();memory.size()
############################################################################
############################################################################
rm(list=ls())
library(data.table)
## load model feed data
setwd("C:/Users/Ted/Documents/Kaggle/Avazu/Data")
csvfile <- file.path(getwd(),'ff.train.pct.modelfeed','train_3pct.csv')
train <- fread(csvfile, header=TRUE)
## find distinct levels in each columns
names_factors<- colnames(train)
for (col in names_factors) set(train, j=col, value=as.factor(train[[col]]))
lvl_cnt <- NULL
for (col in names_factors) { lvl_cnt[col] <- (length(levels(train[[col]]))) }
lvl_cnt <- as.data.frame(lvl_cnt); lvl_cnt$name <- rownames(lvl_cnt)
colnames(lvl_cnt)[1] <- 'count'; lvl_cnt[with(lvl_cnt, order(count)),]
##xtabs(C21 ~ click, data=train)
## imipact modeling
impactModel = function(xcol, ycol){
n = length(ycol)
p = sum(as.numeric(ycol))/n
#duplicate output for NA (average NA towards grand uniform average)
x = c(xcol, xcol)
y = c(ycol, ycol)
x[(1+n):(2*n)] = NA
levelcounts = table(x, y, useNA="always")
condprobmodel = (levelcounts[,2]+p)/(levelcounts[,1]+levelcounts[,2]+1.0)
# apply model example: applyImpactModel(condprobmodel, data[,varname])
condprobmodel
}
applyImpactModel = function(condprobmodel, xcol) {
naval = condprobmodel[is.na(names(condprobmodel))]
dim = length(xcol)
condprobvec = numeric(dim) + naval
for(nm in names(condprobmodel)) {
if(!is.na(nm)) {
condprobvec[xcol==nm] = condprobmodel[nm]
}
}
condprobvec
}
impact_c20 = impactModel(train$c20, train$click)
# train <- as.data.frame(train)
train$impact_c20 = applyImpactModel(impact_c20, train$c20)
## randomForest model
library(randomForest)
set.seed(12345)
formula <- click ~ hour + device_type + device_conn_type + C18 + C1 + banner_pos + C15 +
C16 + site_category + app_category + day ## C21 + C19
formula1 <- click ~ device_type + device_conn_type + C18 + C1 + banner_pos + C15 +
C16 + site_category * app_category + day + hour ## C21 + C19
system.time(
model.rf <- randomForest(formula = formula,
data = train,
ntree=150, importance=T))
## 1pct
## user system elapsed
## 249.31 4.13 253.62
print(model.rf)
attributes(model.rf)
importance(model.rf)
varImpPlot(model.rf)
plot(model.rf)
summary(model.rf)
## first formula
table(predict(model.rf), train.1p$click)
###### with hour x1 #######
# 0 1
# 0 334812 67290
# 1 801 1386
###### with hour x2 #######
# 0 1
# 0 334911 67361
# 1 702 1315
## second formula
table(predict(model.rf), train.1p$click)
# 0 1
# 0 334896 67421
# 1 717 1255
# test random forest using test data
irisPred <- predict(rf, newdata=testData)
# check the results
table(irisPred, testData$Species)
plot(margin(rf, testData$Species))
head(train)
names_factors<- colnames(test)
for (col in names_factors) set(test, j=col, value=as.factor(test[[col]]))
## predict
predict.rf <- predict(model.rf, newdata=test)
#fit the randomforest model
model <- randomForest(Sepal.Length~.,
data = training,
importance=TRUE,
keep.forest=TRUE
)
print(model)
#what are the important variables (via permutation)
varImpPlot(model, type=1)
#predict the outcome of the testing data
predicted <- predict(model, newdata=testing[ ,-1])
# what is the proportion variation explained in the outcome of the testing data?
# i.e., what is 1-(SSerror/SStotal)
actual <- testing$Sepal.Length
rsq <- 1-sum((actual-predicted)^2)/sum((actual-mean(actual))^2)
print(rsq)
# gbm model fitting
library(gbm)
library(dplyr)
library(data.table)
setwd("C:\\Users\\intrepid-honor-803\\Documents\\Kaggle\\Avazu\\Data")
csvfile <- file.path(getwd(),'ff.train.pct.modelfeed','train_50pct.csv')
train <- fread(csvfile, header=TRUE)
## click <- train$click
click <- as.numeric(train$click)
## train <- select(train, -click)
train <- select(train,
device_type, device_conn_type, C18, C1, banner_pos, C15,
C16, site_category, app_category, day, hour)
names_factors<- colnames(train)
for (col in names_factors) set(train, j=col, value=as.factor(train[[col]]))
?gbm
model.gbm = gbm.fit(x=train,
y=click,
distribution = "bernoulli",
# gaussian for GBM regression or adaboost
n.trees=10000,
shrinkage=0.05,
# smaller values of shrinkage typically give slighly better performance
# the cost is that the model takes longer to run for smaller values
interaction.depth=3,
#use CV to choose interaction delpth
n.minobsinnode=500,
# n.minobsinmode has an importnt effect on overfitting!
# decrease in this number may result the overfitting
nTrain=round(nrow(train) * 0.8),
# var.monotone=c(),
# can help with overfitting, will smooth bumpy curves
verbose=TRUE
)
summary(model.gbm)
gbm.perf(model.gbm) ## gbm.perf(model.gbm, method="test")
save(model.gbm, file="gbm_depth3_minobs500_n10000")
load("C:\\Users\\intrepid-honor-803\\Documents\\Kaggle\\Avazu\\Data\\predicted\\1gbm_depth3_minobs500_n5000\\gbm_depth3_minobs500_n5000.rda")
rm(train)