Last updated: 2018-04-22
Code version: a7d2119
The goal of this script is to run XGBoost on a model which includes information about the essays.
# Libraries
library(dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
library(stringr)
library(tidytext)
Warning: package 'tidytext' was built under R version 3.4.4
library(ggplot2)
library(tidyverse)
── Attaching packages ──────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
✔ tibble 1.4.2 ✔ readr 1.1.1
✔ tidyr 0.7.2 ✔ purrr 0.2.4
✔ tibble 1.4.2 ✔ forcats 0.2.0
── Conflicts ─────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
library("xgboost")
Attaching package: 'xgboost'
The following object is masked from 'package:dplyr':
slice
library(caret)
Loading required package: lattice
Attaching package: 'caret'
The following object is masked from 'package:purrr':
lift
# Open the datasets
train <- read.csv("~/Dropbox/DonorsChoose/train.csv")
test <- read.csv("~/Dropbox/DonorsChoose/test.csv")
resources <- read.csv("~/Dropbox/DonorsChoose/resources.csv")
combine_train_test <- rbind(train[,1:15], test)
# First, we want to select project id, title name, and if the project was approved or not
id_title <- c(1, 9)
train_text <- combine_train_test[,id_title]
train_text[,1] <- as.character(train_text[,1])
train_text[,2] <- as.character(train_text[,2])
train_text <- as.tibble(train_text)
tidy_books <- train_text %>% unnest_tokens(word, project_title)
freq_table <- count(tidy_books, id)
title_word_count_by_project <- as.data.frame(freq_table)
get_min_med <- function(name_of_tibble){
# Get minimium and median
janeaustensentiment <- tidy_books %>% inner_join(get_sentiments("afinn"))
#janeaustensentiment <- tidy_books %>% left_join(get_sentiments("afinn"))
check_min <- aggregate(janeaustensentiment$score, by = list(janeaustensentiment$id), FUN = min)
colnames(check_min) <- c("id", "min_score")
check_med <- aggregate(janeaustensentiment$score, by = list(janeaustensentiment$id), FUN = median)
colnames(check_med) <- c("id", "med_score")
check_corr_titles <- cbind(check_min$id, check_min$min_score, check_med$med_score)
colnames(check_corr_titles) <- c("id", "min_score", "med_score")
check_corr_titles <- as.data.frame(check_corr_titles)
return(check_corr_titles)
}
#min_med_title <- get_min_med(tidy_books)
# First, we want to select project id, title name, and if the project was approved or not
id_title <- c(1, 10)
train_text <- combine_train_test[,id_title]
train_text[,1] <- as.character(train_text[,1])
train_text[,2] <- as.character(train_text[,2])
train_text <- as.tibble(train_text)
tidy_books <- train_text %>% unnest_tokens(word, project_essay_1)
# Find how many words in essay1
freq_table <- count(tidy_books, id)
essay1_word_count_by_project <- as.data.frame(freq_table)
summary(title_word_count_by_project$id %in% essay1_word_count_by_project$id)
Mode FALSE TRUE
logical 1 260114
which((title_word_count_by_project$id %in% essay1_word_count_by_project$id) == FALSE)
[1] 121566
min_med_essay1 <- get_min_med(tidy_books)
Joining, by = "word"
# First, we want to select project id, title name, and if the project was approved or not
id_title <- c(1, 11)
train_text <- combine_train_test[,id_title]
train_text[,1] <- as.character(train_text[,1])
train_text[,2] <- as.character(train_text[,2])
train_text <- as.tibble(train_text)
tidy_books <- train_text %>% unnest_tokens(word, project_essay_2)
# Find how many words in essay1
freq_table <- count(tidy_books, id)
essay2_word_count_by_project <- as.data.frame(freq_table)
min_med_essay2 <- get_min_med(tidy_books)
Joining, by = "word"
word_count <- merge(title_word_count_by_project, essay1_word_count_by_project, by = c("id"))
colnames(word_count) <- c("id", "title_count", "essay1_count")
total_word_count <- merge(word_count, essay2_word_count_by_project, by = c("id"))
colnames(total_word_count) <- c("id", "title_count", "essay1_count", "essay2_count")
# Total price
resources[,1] <- as.character(resources[,1])
resources_total_price <- as.data.frame(cbind(resources$id, resources$quantity*resources$price), stringsAsFactors = FALSE)
resources_total_price[,2] <- as.numeric(resources_total_price[,2])
resources_total_price2 <- aggregate(resources_total_price[,2], by=list(Category=resources_total_price[,1]), FUN=sum)
# Total quantity
resources_quantity_total <- aggregate(resources$quantity, by=list(Category=resources$id), FUN=sum)
resources_together <- as.data.frame(cbind(resources_total_price2, resources_quantity_total[,2]), stringsAsFactors = FALSE)
colnames(resources_together) <- c("id", "total_amount", "total_items")
# Merge resources with training and test data
training_data <- merge(resources_together, train, by = c("id"))
testing_data <- merge(resources_together, test, by = c("id"))
# Merge word count with training and test data
training_data <- merge(total_word_count, training_data, by = c("id"))
testing_data <- merge(total_word_count, testing_data, by = c("id"))
## set the seed to make your partition reproductible
set.seed(123)
## 75% of the sample size
smp_size <- floor(0.75 * nrow(training_data))
train_ind <- sample(seq_len(nrow(training_data)), size = smp_size)
train <- training_data[train_ind, ]
test2 <- training_data[-train_ind, ]
################################## Training data #############################
#basic_features <- c(2,3,5,6,8,9,17)
basic_features <- c(2,3,4,5,6,8,9,11,12,20)
train_data <- train[,basic_features]
# XGBoost only works with numeric vectors
train_data[,1] <- as.numeric(train_data[,1])
train_data[,2] <- as.numeric(train_data[,2])
train_data[,3] <- as.numeric(train_data[,3])
train_data[,4] <- as.numeric(train_data[,4])
train_data[,5] <- as.numeric(train_data[,5])
train_data[,6] <- as.numeric(train_data[,6])
train_data[,7] <- as.numeric(train_data[,7])
train_data[,8] <- as.numeric(train_data[,8])
train_data[,9] <- as.numeric(train_data[,9])
#train_data <- as.list(train_data)
train_data <- as.matrix(train_data)
train_labels <- as.matrix(train[,21])
################################## Test data #############################
#basic_features <- c(2,3,5,6,8,9,17)
test2_data <- test2[,basic_features]
# XGBoost only works with numeric vectors
test2_data[,1] <- as.numeric(test2_data[,1])
test2_data[,2] <- as.numeric(test2_data[,2])
test2_data[,3] <- as.numeric(test2_data[,3])
test2_data[,4] <- as.numeric(test2_data[,4])
test2_data[,5] <- as.numeric(test2_data[,5])
test2_data[,6] <- as.numeric(test2_data[,6])
test2_data[,7] <- as.numeric(test2_data[,7])
test2_data[,8] <- as.numeric(test2_data[,8])
test2_data[,9] <- as.numeric(test2_data[,9])
#train_data <- as.list(train_data)
test2_data <- as.matrix(test2_data)
test2_labels <- as.matrix(test2[,21])
############ Run dtrain and dtest, weight by the unequal number of positive and negative cases ##############
dtrain <- xgb.DMatrix(data = train_data, label=train_labels)
dtest <- xgb.DMatrix(data = test2_data, label=test2_labels)
watchlist <- list(train=dtrain, test=dtest)
negative_cases <- sum(train_labels == 0)
positive_cases <- sum(train_labels == 1)
bst <- xgb.train(data=dtrain, max_depth=70, eta=1, nthread = 2, nrounds=24, watchlist=watchlist, scale_pos_weight = negative_cases/positive_cases, eval_metric = "error", eval_metric = "logloss", objective = "binary:logistic")
[1] train-error:0.244239 train-logloss:0.530805 test-error:0.354965 test-logloss:0.637852
[2] train-error:0.177652 train-logloss:0.423910 test-error:0.321793 test-logloss:0.610615
[3] train-error:0.127893 train-logloss:0.345935 test-error:0.299780 test-logloss:0.594886
[4] train-error:0.094684 train-logloss:0.287466 test-error:0.281129 test-logloss:0.583542
[5] train-error:0.071617 train-logloss:0.244014 test-error:0.271639 test-logloss:0.580794
[6] train-error:0.054387 train-logloss:0.209074 test-error:0.262720 test-logloss:0.578922
[7] train-error:0.041865 train-logloss:0.181967 test-error:0.254679 test-logloss:0.579009
[8] train-error:0.032682 train-logloss:0.160441 test-error:0.248440 test-logloss:0.580337
[9] train-error:0.025732 train-logloss:0.142391 test-error:0.244728 test-logloss:0.582083
[10] train-error:0.020533 train-logloss:0.128021 test-error:0.240993 test-logloss:0.584089
[11] train-error:0.016330 train-logloss:0.115869 test-error:0.237346 test-logloss:0.587236
[12] train-error:0.012705 train-logloss:0.105682 test-error:0.235457 test-logloss:0.590627
[13] train-error:0.009915 train-logloss:0.097113 test-error:0.231634 test-logloss:0.593656
[14] train-error:0.007850 train-logloss:0.089877 test-error:0.230470 test-logloss:0.597247
[15] train-error:0.005873 train-logloss:0.083046 test-error:0.227614 test-logloss:0.600507
[16] train-error:0.004782 train-logloss:0.077277 test-error:0.226582 test-logloss:0.603295
[17] train-error:0.003735 train-logloss:0.072467 test-error:0.224736 test-logloss:0.605002
[18] train-error:0.002841 train-logloss:0.068158 test-error:0.224802 test-logloss:0.607706
[19] train-error:0.002241 train-logloss:0.064100 test-error:0.222935 test-logloss:0.610833
[20] train-error:0.001677 train-logloss:0.060586 test-error:0.221727 test-logloss:0.613448
[21] train-error:0.001252 train-logloss:0.057545 test-error:0.220870 test-logloss:0.616021
[22] train-error:0.000974 train-logloss:0.054636 test-error:0.220540 test-logloss:0.619788
[23] train-error:0.000791 train-logloss:0.051975 test-error:0.218805 test-logloss:0.622311
[24] train-error:0.000630 train-logloss:0.049618 test-error:0.218344 test-logloss:0.624725
test_data <- testing_data[,basic_features]
test_data[,1] <- as.numeric(test_data[,1])
test_data[,2] <- as.numeric(test_data[,2])
test_data[,3] <- as.numeric(test_data[,3])
test_data[,4] <- as.numeric(test_data[,4])
test_data[,5] <- as.numeric(test_data[,5])
test_data[,6] <- as.numeric(test_data[,6])
test_data[,7] <- as.numeric(test_data[,7])
test_data[,8] <- as.numeric(test_data[,8])
test_data[,9] <- as.numeric(test_data[,9])
#train_data <- as.list(train_data)
test_data <- as.matrix(test_data)
pred <- predict(bst, test_data)
make_csv <- as.data.frame(cbind(testing_data$id, pred), stringsAsFactors = FALSE)
colnames(make_csv) <- cbind("id", "project_is_approved")
boxplot(as.numeric(make_csv$project_is_approved), ylim = c(0,1), main = "Probability of approval for each project")
order_id <- make_csv[order(match(make_csv$id, test$id)), ]
dim(order_id)
[1] 78035 2
write.csv(order_id, "../data/sample_submission_word_count2.csv", row.names = FALSE, sep= ",")
Warning in write.csv(order_id, "../data/
sample_submission_word_count2.csv", : attempt to set 'sep' ignored
The score is 0.57629.
train_data_together <- as.data.frame(rbind(train_data, test2_data))
train_labels_together <- as.factor(rbind(train_labels, test2_labels))
levels(train_labels_together) <- c("first_class", "second_class")
#colnames(train_labels_together) <- c("project_is_approved")
train_labels_together <- as.factor(train_labels_together)
fitControl <- trainControl(method="none",classProbs = TRUE)
xgbGrid <- expand.grid(nrounds = 100,
max_depth = 3,
eta = .05,
gamma = 0,
colsample_bytree = .8,
min_child_weight = 1,
subsample = 1)
control <- trainControl(method = "none", number = 5,
classProbs = TRUE,
summaryFunction = twoClassSummary)
metric <- "ROC"
fit.xgb <- train(x = train_data_together, y = train_labels_together, method="xgbTree", metric=metric, trControl=control, preProc = c("center", "scale"), tuneGrid = xgbGrid)
predictedval <- predict(fit.xgb, newdata=test_data, type = 'prob')
make_csv <- as.data.frame(cbind(testing_data$id, predictedval$second_class), stringsAsFactors = FALSE)
colnames(make_csv) <- cbind("id", "project_is_approved")
order_id <- make_csv[order(match(make_csv$id, test$id)), ]
head(order_id)
id project_is_approved
69905 p233245 0.933689400553703
28961 p096795 0.862884551286697
70819 p236235 0.895998820662498
70040 p233680 0.795785814523697
51515 p171879 0.826388254761696
4829 p016071 0.827602431178093
dim(order_id)
[1] 78035 2
write.csv(order_id, "../data/sample_submission_metric_ROC.csv", row.names = FALSE, sep= ",")
Warning in write.csv(order_id, "../data/
sample_submission_metric_ROC.csv", : attempt to set 'sep' ignored
The score is 0.64169.
sessionInfo()
R version 3.4.3 (2017-11-30)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: OS X El Capitan 10.11.6
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] bindrcpp_0.2 caret_6.0-79 lattice_0.20-35 xgboost_0.6.4.1
[5] forcats_0.2.0 purrr_0.2.4 readr_1.1.1 tidyr_0.7.2
[9] tibble_1.4.2 tidyverse_1.2.1 ggplot2_2.2.1 tidytext_0.1.8
[13] stringr_1.3.0 dplyr_0.7.4
loaded via a namespace (and not attached):
[1] httr_1.3.1 ddalpha_1.3.1.1 splines_3.4.3
[4] sfsmisc_1.1-2 jsonlite_1.5 foreach_1.4.4
[7] prodlim_1.6.1 modelr_0.1.1 assertthat_0.2.0
[10] stats4_3.4.3 DRR_0.0.3 cellranger_1.1.0
[13] yaml_2.1.18 robustbase_0.92-8 ipred_0.9-6
[16] pillar_1.1.0 backports_1.1.2 glue_1.2.0
[19] digest_0.6.15 rvest_0.3.2 colorspace_1.3-2
[22] recipes_0.1.2 htmltools_0.3.6 Matrix_1.2-13
[25] plyr_1.8.4 psych_1.7.8 timeDate_3043.102
[28] pkgconfig_2.0.1 CVST_0.2-1 broom_0.4.3
[31] haven_1.1.1 scales_0.5.0 gower_0.1.2
[34] lava_1.6 git2r_0.21.0 withr_2.1.2
[37] nnet_7.3-12 lazyeval_0.2.1 cli_1.0.0
[40] mnormt_1.5-5 survival_2.41-3 magrittr_1.5
[43] crayon_1.3.4 readxl_1.0.0 evaluate_0.10.1
[46] tokenizers_0.2.0 janeaustenr_0.1.5 nlme_3.1-131
[49] SnowballC_0.5.1 MASS_7.3-48 xml2_1.1.1
[52] dimRed_0.1.0 foreign_0.8-69 class_7.3-14
[55] tools_3.4.3 data.table_1.10.4-3 hms_0.4.0
[58] kernlab_0.9-25 munsell_0.4.3 compiler_3.4.3
[61] RcppRoll_0.2.2 rlang_0.1.6 grid_3.4.3
[64] iterators_1.0.9 rstudioapi_0.7 rmarkdown_1.9
[67] gtable_0.2.0 ModelMetrics_1.1.0 codetools_0.2-15
[70] reshape2_1.4.3 R6_2.2.2 lubridate_1.7.1
[73] knitr_1.20 bindr_0.1 rprojroot_1.3-2
[76] stringi_1.1.7 parallel_3.4.3 Rcpp_0.12.15
[79] rpart_4.1-12 tidyselect_0.2.3 DEoptimR_1.0-8
This R Markdown site was created with workflowr