Last updated: 2018-03-29
Code version: 9102019
The following RMarkdown file uses files from Donors Choose and performs preliminary sentiment analysis. The sentiment analysis was aided by the code and explanations provided in https://www.tidytextmining.com/.
# Libraries
library(dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
library(stringr)
library(tidytext)
Warning: package 'tidytext' was built under R version 3.4.4
library(ggplot2)
library(tidyverse)
── Attaching packages ────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
✔ tibble 1.4.2 ✔ readr 1.1.1
✔ tidyr 0.7.2 ✔ purrr 0.2.4
✔ tibble 1.4.2 ✔ forcats 0.2.0
── Conflicts ───────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
library(cowplot)
Attaching package: 'cowplot'
The following object is masked from 'package:ggplot2':
ggsave
# Open the datasets
train <- read.csv("~/Dropbox/DonorsChoose/train.csv")
test <- read.csv("~/Dropbox/DonorsChoose/test.csv")
resources <- read.csv("~/Dropbox/DonorsChoose/resources.csv")
# First, we want to select project id, title name, and if the project was approved or not
id_title <- c(1, 16, 9)
train_text <- train[,id_title]
train_text[,1] <- as.character(train_text[,1])
train_text[,2] <- as.numeric(train_text[,2])
train_text[,3] <- as.character(train_text[,3])
train_text <- as.tibble(train_text)
tidy_books <- train_text %>% unnest_tokens(word, project_title)
# Find out how many words the title is
freq_table <- count(tidy_books, id)
title_word_count_by_project <- left_join(freq_table, train_text[,1:2], by = c("id"))
ggplot(title_word_count_by_project, aes(x = factor(project_is_approved), y = n)) + geom_boxplot()
ggplot(title_word_count_by_project, aes(x = n, fill = factor(project_is_approved))) +
geom_bar(position = "fill")
# Take out stop (common) words
tidy_books <- tidy_books %>%
anti_join(stop_words)
Joining, by = "word"
freq_table <- count(tidy_books, id)
title_word_count_by_project <- left_join(freq_table, train_text[,1:2], by = c("id"))
ggplot(title_word_count_by_project, aes(x = factor(project_is_approved), y = n)) + geom_boxplot()
title_n <- ggplot(title_word_count_by_project, aes(x = n, fill = factor(project_is_approved))) +
geom_bar(position = "fill") + ggtitle("Number of words by approval status")
# Get a sense of which words are the most common
tidy_books %>%
count(word, sort = TRUE)
# A tibble: 22,308 x 2
word n
<chr> <int>
1 learning 15587
2 technology 9677
3 classroom 9590
4 students 8486
5 reading 8053
6 books 6815
7 seating 6618
8 math 5520
9 flexible 5393
10 learn 4927
# ... with 22,298 more rows
tidy_books %>%
count(word, sort = TRUE) %>%
filter(n > 2000) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n)) +
geom_col() +
xlab(NULL) +
coord_flip()
janeaustensentiment <- tidy_books %>% inner_join(get_sentiments("afinn"))
Joining, by = "word"
summary(janeaustensentiment)
id project_is_approved word
Length:54170 Min. :0.0000 Length:54170
Class :character 1st Qu.:1.0000 Class :character
Mode :character Median :1.0000 Mode :character
Mean :0.8403
3rd Qu.:1.0000
Max. :1.0000
score
Min. :-4.000
1st Qu.: 1.000
Median : 2.000
Mean : 1.609
3rd Qu.: 2.000
Max. : 5.000
janeaustensentiment[1:10,]
# A tibble: 10 x 4
id project_is_approved word score
<chr> <dbl> <chr> <int>
1 p036502 1.00 super 3
2 p039565 0 calm 2
3 p185307 0 inspired 2
4 p185307 0 increase 1
5 p185307 0 gain 2
6 p013780 1.00 clean 2
7 p063374 1.00 reach 1
8 p103285 1.00 active 1
9 p181781 1.00 fabulous 4
10 p114989 1.00 fidgety -2
# Is there a correlation between the average score and whether or not it gets accepted?
check_corr_titles <- aggregate(janeaustensentiment$score, by = list(janeaustensentiment$id), FUN = mean)
colnames(check_corr_titles) <- c("id", "score")
check_corr_titles2 <- left_join(check_corr_titles, train_text[,1:2], by = c("id"))
ggplot(check_corr_titles2, aes(x = factor(project_is_approved), y = score)) + geom_boxplot()
title_a <- ggplot(check_corr_titles2,aes(x = score, fill = factor(project_is_approved))) +
geom_bar(position = "fill") + ggtitle("Avg. AFINN sentiment score by approval status")
# Is there a relationship between the most positive word and whether it gets approved?
check_corr_titles <- aggregate(janeaustensentiment$score, by = list(janeaustensentiment$id), FUN = max)
colnames(check_corr_titles) <- c("id", "score")
check_corr_titles2 <- left_join(check_corr_titles, train_text[,1:2], by = c("id"))
ggplot(check_corr_titles2, aes(x = factor(project_is_approved), y = score)) + geom_boxplot()
title_b <- ggplot(check_corr_titles2,aes(x = score, fill = factor(project_is_approved))) +
geom_bar(position = "fill") + ggtitle("Max. AFINN sentiment score by approval status")
# Is there a relationship between the most negative word and whether it gets approved?
check_corr_titles <- aggregate(janeaustensentiment$score, by = list(janeaustensentiment$id), FUN = min)
colnames(check_corr_titles) <- c("id", "score")
check_corr_titles2 <- left_join(check_corr_titles, train_text[,1:2], by = c("id"))
ggplot(check_corr_titles2, aes(x = factor(project_is_approved), y = score)) + geom_boxplot()
title_c <- ggplot(check_corr_titles2,aes(x = score, fill = factor(project_is_approved))) +
geom_bar(position = "fill") + ggtitle("Min. AFINN sentiment score by approval status")
# Is there a relationship between the most common value and whether it gets approved?
check_corr_titles <- aggregate(janeaustensentiment$score, by = list(janeaustensentiment$id), FUN = median)
colnames(check_corr_titles) <- c("id", "score")
check_corr_titles2 <- left_join(check_corr_titles, train_text[,1:2], by = c("id"))
ggplot(check_corr_titles2, aes(x = factor(project_is_approved), y = score)) + geom_boxplot()
title_d <- ggplot(check_corr_titles2,aes(x = score, fill = factor(project_is_approved))) +
geom_bar(position = "fill") + ggtitle("Median AFINN sentiment score by approval status")
# Sentiment analysis - bing
janeaustensentiment <- tidy_books %>% inner_join(get_sentiments("bing"))
Joining, by = "word"
janeaustensentiment[1:10,]
# A tibble: 10 x 4
id project_is_approved word sentiment
<chr> <dbl> <chr> <chr>
1 p036502 1.00 super positive
2 p039565 0 calm positive
3 p185307 0 gain positive
4 p013780 1.00 clean positive
5 p181781 1.00 fabulous positive
6 p114989 1.00 wobble negative
7 p114989 1.00 fidgety negative
8 p226941 1.00 boost positive
9 p173555 0 love positive
10 p055350 1.00 flexible positive
# Is there a connection between the average score and whether or not it gets accepted?
length(which(janeaustensentiment$project_is_approved == 1 & janeaustensentiment$sentiment == "positive"))
[1] 43948
length(which(janeaustensentiment$project_is_approved == 0 & janeaustensentiment$sentiment == "positive"))
[1] 8586
length(which(janeaustensentiment$project_is_approved == 1 & janeaustensentiment$sentiment == "negative"))
[1] 9548
length(which(janeaustensentiment$project_is_approved == 0 & janeaustensentiment$sentiment == "negative"))
[1] 1532
bingnegative <- get_sentiments("bing") %>%
filter(sentiment == "negative")
wordcounts <- tidy_books %>%
group_by(id) %>%
summarize(words = n())
sentiment_ratio <- tidy_books %>%
semi_join(bingnegative) %>%
group_by(id, project_is_approved) %>%
summarize(negativewords = n()) %>%
left_join(wordcounts, by = c("id")) %>%
mutate(ratio = negativewords/words) %>%
top_n(1) %>%
ungroup()
Joining, by = "word"
Selecting by ratio
ggplot(sentiment_ratio, aes(x = factor(project_is_approved), y = ratio)) + geom_boxplot()
janeaustensentiment <- tidy_books %>% inner_join(get_sentiments("nrc"))
Joining, by = "word"
janeaustensentiment[1:20,]
# A tibble: 20 x 4
id project_is_approved word sentiment
<chr> <dbl> <chr> <chr>
1 p036502 1.00 word positive
2 p036502 1.00 word trust
3 p039565 0 calm positive
4 p039565 0 dance joy
5 p039565 0 dance positive
6 p039565 0 dance trust
7 p233823 1.00 learn positive
8 p185307 0 inspired joy
9 p185307 0 inspired positive
10 p185307 0 inspired surprise
11 p185307 0 inspired trust
12 p185307 0 increase positive
13 p185307 0 gain anticipation
14 p185307 0 gain joy
15 p185307 0 gain positive
16 p013780 1.00 clean joy
17 p013780 1.00 clean positive
18 p013780 1.00 clean trust
19 p013780 1.00 culinary positive
20 p013780 1.00 culinary trust
# First, we want to select project id, title name, and if the project was approved or not
id_title <- c(1, 16, 10)
train_text <- train[,id_title]
train_text[,1] <- as.character(train_text[,1])
train_text[,2] <- as.numeric(train_text[,2])
train_text[,3] <- as.character(train_text[,3])
train_text <- as.tibble(train_text)
tidy_books <- train_text %>% unnest_tokens(word, project_essay_1)
# Find how many words in essay1
freq_table <- count(tidy_books, id)
title_word_count_by_project <- left_join(freq_table, train_text[,1:2], by = c("id"))
ggplot(title_word_count_by_project, aes(x = factor(project_is_approved), y = n)) + geom_boxplot()
ggplot(title_word_count_by_project, aes(x = n, fill = factor(project_is_approved))) +
geom_bar(position = "fill")
# Take out stop (common) words
tidy_books <- tidy_books %>%
anti_join(stop_words)
Joining, by = "word"
freq_table <- count(tidy_books, id)
title_word_count_by_project <- left_join(freq_table, train_text[,1:2], by = c("id"))
ggplot(title_word_count_by_project, aes(x = factor(project_is_approved), y = n)) + geom_boxplot()
essay1_n <- ggplot(title_word_count_by_project, aes(x = n, fill = factor(project_is_approved))) +
geom_bar(position = "fill")
# Get a sense of which words are the most common
tidy_books %>%
count(word, sort = TRUE)
# A tibble: 46,131 x 2
word n
<chr> <int>
1 students 610304
2 school 310246
3 learning 136342
4 learn 118186
5 classroom 117935
6 love 77494
7 day 74593
8 grade 57162
9 class 56279
10 free 51354
# ... with 46,121 more rows
tidy_books %>%
count(word, sort = TRUE) %>%
filter(n > 20000) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n)) +
geom_col() +
xlab(NULL) +
coord_flip()
janeaustensentiment <- tidy_books %>% inner_join(get_sentiments("afinn"))
Joining, by = "word"
summary(janeaustensentiment)
id project_is_approved word
Length:1144903 Min. :0.00 Length:1144903
Class :character 1st Qu.:1.00 Class :character
Mode :character Median :1.00 Mode :character
Mean :0.85
3rd Qu.:1.00
Max. :1.00
score
Min. :-4.000
1st Qu.: 1.000
Median : 2.000
Mean : 1.394
3rd Qu.: 2.000
Max. : 5.000
janeaustensentiment[1:10,]
# A tibble: 10 x 4
id project_is_approved word score
<chr> <dbl> <chr> <int>
1 p036502 1.00 risk -2
2 p036502 1.00 obstacles -2
3 p036502 1.00 excited 3
4 p036502 1.00 exposed -1
5 p036502 1.00 motivated 2
6 p036502 1.00 hard -1
7 p036502 1.00 excited 3
8 p039565 0 rich 2
9 p039565 0 free 1
10 p039565 0 blocks -1
# Is there a relationship between the average score and whether or not it gets accepted?
check_corr_titles <- aggregate(janeaustensentiment$score, by = list(janeaustensentiment$id), FUN = mean)
colnames(check_corr_titles) <- c("id", "score")
check_corr_titles2 <- left_join(check_corr_titles, train_text[,1:2], by = c("id"))
ggplot(check_corr_titles2, aes(x = factor(project_is_approved), y = score)) + geom_boxplot()
essay1_a <- ggplot(check_corr_titles2,aes(x = score, fill = factor(project_is_approved))) +
geom_bar(position = "fill")
# Is there a relationship between the most positive word and whether it gets approved?
check_corr_titles <- aggregate(janeaustensentiment$score, by = list(janeaustensentiment$id), FUN = max)
colnames(check_corr_titles) <- c("id", "score")
check_corr_titles2 <- left_join(check_corr_titles, train_text[,1:2], by = c("id"))
ggplot(check_corr_titles2, aes(x = factor(project_is_approved), y = score)) + geom_boxplot()
essay1_b <- ggplot(check_corr_titles2,aes(x = score, fill = factor(project_is_approved))) +
geom_bar(position = "fill")
# Is there a relationship between the most negative word and whether it gets approved?
check_corr_titles <- aggregate(janeaustensentiment$score, by = list(janeaustensentiment$id), FUN = min)
colnames(check_corr_titles) <- c("id", "score")
check_corr_titles2 <- left_join(check_corr_titles, train_text[,1:2], by = c("id"))
ggplot(check_corr_titles2, aes(x = factor(project_is_approved), y = score)) + geom_boxplot()
essay1_c <- ggplot(check_corr_titles2,aes(x = score, fill = factor(project_is_approved))) +
geom_bar(position = "fill")
# Is there a relationship between the most common value and whether it gets approved?
check_corr_titles <- aggregate(janeaustensentiment$score, by = list(janeaustensentiment$id), FUN = median)
colnames(check_corr_titles) <- c("id", "score")
check_corr_titles2 <- left_join(check_corr_titles, train_text[,1:2], by = c("id"))
ggplot(check_corr_titles2, aes(x = factor(project_is_approved), y = score)) + geom_boxplot()
essay1_d <- ggplot(check_corr_titles2,aes(x = score, fill = factor(project_is_approved))) +
geom_bar(position = "fill")
# First, we want to select project id, essay 2, and if the project was approved or not
id_title <- c(1, 16, 11)
train_text <- train[,id_title]
train_text[,1] <- as.character(train_text[,1])
train_text[,2] <- as.numeric(train_text[,2])
train_text[,3] <- as.character(train_text[,3])
train_text <- as.tibble(train_text)
tidy_books <- train_text %>% unnest_tokens(word, project_essay_2)
# Find how many words in essay1
freq_table <- count(tidy_books, id)
title_word_count_by_project <- left_join(freq_table, train_text[,1:2], by = c("id"))
ggplot(title_word_count_by_project, aes(x = factor(project_is_approved), y = n)) + geom_boxplot()
ggplot(title_word_count_by_project, aes(x = n, fill = factor(project_is_approved))) +
geom_bar(position = "fill")
# Take out stop (common) words
tidy_books <- tidy_books %>%
anti_join(stop_words)
Joining, by = "word"
freq_table <- count(tidy_books, id)
title_word_count_by_project <- left_join(freq_table, train_text[,1:2], by = c("id"))
ggplot(title_word_count_by_project, aes(x = factor(project_is_approved), y = n)) + geom_boxplot()
essay2_n <- ggplot(title_word_count_by_project, aes(x = n, fill = factor(project_is_approved))) +
geom_bar(position = "fill")
# Get a sense of which words are the most common
tidy_books %>%
count(word, sort = TRUE)
# A tibble: 64,340 x 2
word n
<chr> <int>
1 students 670368
2 classroom 158719
3 learning 154165
4 reading 105082
5 school 104381
6 learn 88350
7 books 81544
8 skills 76292
9 technology 67841
10 time 66121
# ... with 64,330 more rows
tidy_books %>%
count(word, sort = TRUE) %>%
filter(n > 20000) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n)) +
geom_col() +
xlab(NULL) +
coord_flip()
janeaustensentiment <- tidy_books %>% inner_join(get_sentiments("afinn"))
Joining, by = "word"
summary(janeaustensentiment)
id project_is_approved word
Length:1057401 Min. :0.0000 Length:1057401
Class :character 1st Qu.:1.0000 Class :character
Mode :character Median :1.0000 Mode :character
Mean :0.8498
3rd Qu.:1.0000
Max. :1.0000
score
Min. :-5.000
1st Qu.: 1.000
Median : 2.000
Mean : 1.368
3rd Qu.: 2.000
Max. : 5.000
janeaustensentiment[1:10,]
# A tibble: 10 x 4
id project_is_approved word score
<chr> <dbl> <chr> <int>
1 p036502 1.00 favorite 2
2 p036502 1.00 dream 1
3 p036502 1.00 struggling -2
4 p039565 0 excitement 3
5 p233823 1.00 wonderful 4
6 p233823 1.00 advanced 1
7 p185307 0 inspired 2
8 p185307 0 active 1
9 p185307 0 gaining 2
10 p185307 0 inspired 2
# Is there a relationship between the average score and whether or not it gets accepted?
check_corr_titles <- aggregate(janeaustensentiment$score, by = list(janeaustensentiment$id), FUN = mean)
colnames(check_corr_titles) <- c("id", "score")
check_corr_titles2 <- left_join(check_corr_titles, train_text[,1:2], by = c("id"))
ggplot(check_corr_titles2, aes(x = factor(project_is_approved), y = score)) + geom_boxplot()
essay2_a <- ggplot(check_corr_titles2,aes(x = score, fill = factor(project_is_approved))) +
geom_bar(position = "fill")
# Is there a relationship between the most positive word and whether it gets approved?
check_corr_titles <- aggregate(janeaustensentiment$score, by = list(janeaustensentiment$id), FUN = max)
colnames(check_corr_titles) <- c("id", "score")
check_corr_titles2 <- left_join(check_corr_titles, train_text[,1:2], by = c("id"))
ggplot(check_corr_titles2, aes(x = factor(project_is_approved), y = score)) + geom_boxplot()
essay2_b <-ggplot(check_corr_titles2,aes(x = score, fill = factor(project_is_approved))) +
geom_bar(position = "fill")
# Is there a relationship between the most negative word and whether it gets approved?
check_corr_titles <- aggregate(janeaustensentiment$score, by = list(janeaustensentiment$id), FUN = min)
colnames(check_corr_titles) <- c("id", "score")
check_corr_titles2 <- left_join(check_corr_titles, train_text[,1:2], by = c("id"))
ggplot(check_corr_titles2, aes(x = factor(project_is_approved), y = score)) + geom_boxplot()
essay2_c <-ggplot(check_corr_titles2,aes(x = score, fill = factor(project_is_approved))) +
geom_bar(position = "fill")
# Is there a relationship between the most common value and whether it gets approved?
check_corr_titles <- aggregate(janeaustensentiment$score, by = list(janeaustensentiment$id), FUN = median)
colnames(check_corr_titles) <- c("id", "score")
check_corr_titles2 <- left_join(check_corr_titles, train_text[,1:2], by = c("id"))
ggplot(check_corr_titles2, aes(x = factor(project_is_approved), y = score)) + geom_boxplot()
essay2_d <- ggplot(check_corr_titles2,aes(x = score, fill = factor(project_is_approved))) +
geom_bar(position = "fill")
plot_grid(title_n, essay1_n, essay2_n, labels = c("A", "B", "C"), ncol = 1)
plot_grid(title_a, essay1_a, essay2_a, labels = c("A", "B", "C"), ncol = 1)
Warning: position_stack requires non-overlapping x intervals
Warning: position_stack requires non-overlapping x intervals
Warning: position_stack requires non-overlapping x intervals
plot_grid(title_b, essay1_b, essay2_b, labels = c("A", "B", "C"), ncol = 1)
plot_grid(title_c, essay1_c, essay2_c, labels = c("A", "B", "C"), ncol = 1)
plot_grid(title_d, essay1_d, essay2_d, labels = c("A", "B", "C"), ncol = 1)
sessionInfo()
R version 3.4.3 (2017-11-30)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: OS X El Capitan 10.11.6
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] bindrcpp_0.2 cowplot_0.9.2 forcats_0.2.0 purrr_0.2.4
[5] readr_1.1.1 tidyr_0.7.2 tibble_1.4.2 tidyverse_1.2.1
[9] ggplot2_2.2.1 tidytext_0.1.8 stringr_1.3.0 dplyr_0.7.4
loaded via a namespace (and not attached):
[1] reshape2_1.4.3 haven_1.1.1 lattice_0.20-35
[4] colorspace_1.3-2 htmltools_0.3.6 SnowballC_0.5.1
[7] yaml_2.1.18 utf8_1.1.3 rlang_0.1.6
[10] pillar_1.1.0 foreign_0.8-69 glue_1.2.0
[13] modelr_0.1.1 readxl_1.0.0 bindr_0.1
[16] plyr_1.8.4 munsell_0.4.3 gtable_0.2.0
[19] cellranger_1.1.0 rvest_0.3.2 psych_1.7.8
[22] evaluate_0.10.1 labeling_0.3 knitr_1.20
[25] parallel_3.4.3 broom_0.4.3 tokenizers_0.2.0
[28] Rcpp_0.12.15 backports_1.1.2 scales_0.5.0
[31] jsonlite_1.5 mnormt_1.5-5 hms_0.4.0
[34] digest_0.6.15 stringi_1.1.7 grid_3.4.3
[37] rprojroot_1.3-2 cli_1.0.0 tools_3.4.3
[40] magrittr_1.5 lazyeval_0.2.1 janeaustenr_0.1.5
[43] crayon_1.3.4 pkgconfig_2.0.1 Matrix_1.2-12
[46] xml2_1.1.1 lubridate_1.7.1 assertthat_0.2.0
[49] rmarkdown_1.9 httr_1.3.1 rstudioapi_0.7
[52] R6_2.2.2 nlme_3.1-131 git2r_0.21.0
[55] compiler_3.4.3
This R Markdown site was created with workflowr