Last updated: 2018-03-29
Code version: 9102019
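The following RMarkdown file uses files from Donors Choose and performs preliminary sentiment analysis. The sentiment analysis was aided by the code and explanations provided in "Text Mining with R: A Tidy Approach" by Julia Silge and David Robinson (https://www.tidytextmining.com/).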
# Libraries
library(dplyr)
library(tibble)
library(tidytext)
library(ggplot2)
library(cowplot)
# Open the datasets
# Load the DonorsChoose CSV files.
# stringsAsFactors = FALSE keeps ids, titles and essays as character vectors
# on every R version (before R 4.0 read.csv() converted strings to factors,
# which is wasteful for free text and forces as.character() later on).
train <- read.csv(
  "~/Dropbox/DonorsChoose/train.csv",
  stringsAsFactors = FALSE
)
test <- read.csv(
  "~/Dropbox/DonorsChoose/test.csv",
  stringsAsFactors = FALSE
)
resources <- read.csv(
  "~/Dropbox/DonorsChoose/resources.csv",
  stringsAsFactors = FALSE
)
# Select the project id, the approval flag and the project title.
# Columns are picked by name rather than by position so the selection keeps
# working if the column order of train.csv ever changes; the order below
# (id, project_is_approved, text) is what the later train_text[, 1:2] joins
# rely on.
id_title <- c("id", "project_is_approved", "project_title")
train_text <- train[, id_title]
train_text$id <- as.character(train_text$id)
train_text$project_is_approved <- as.numeric(train_text$project_is_approved)
train_text$project_title <- as.character(train_text$project_title)
# as_tibble() replaces the deprecated as.tibble()
train_text <- as_tibble(train_text)
# One row per (project, word): split each title into lower-cased word tokens
tidy_books <- train_text %>%
  unnest_tokens(word, project_title)
# Count how many words each project's title contains
freq_table <- tidy_books %>%
  count(id)
# Attach the approval flag (first two columns of train_text) to the counts
title_word_count_by_project <- freq_table %>%
  left_join(train_text[, 1:2], by = "id")
# Title length by approval status
title_word_count_by_project %>%
  ggplot(aes(x = factor(project_is_approved), y = n)) +
  geom_boxplot()
# Proportion of approved/rejected projects at each title length
title_word_count_by_project %>%
  ggplot(aes(x = n, fill = factor(project_is_approved))) +
  geom_bar(position = "fill")
# Take out stop (common) words
# Drop every token that appears in tidytext's stop_words lexicon.
# The pipe has to end in anti_join(): left dangling, it absorbed the
# freq_table assignment on the following line and the statement failed.
tidy_books <- tidy_books %>%
  anti_join(stop_words, by = "word")
# Re-count the words per title now that stop words are gone
freq_table <- count(tidy_books, id)
title_word_count_by_project <- left_join(
  freq_table,
  train_text[, 1:2],
  by = "id"
)
ggplot(
  title_word_count_by_project,
  aes(x = factor(project_is_approved), y = n)
) +
  geom_boxplot()
# Kept (not printed) for the combined plot_grid() figure
title_n <- ggplot(
  title_word_count_by_project,
  aes(x = n, fill = factor(project_is_approved))
) +
  geom_bar(position = "fill") +
  ggtitle("Number of words by approval status")
# Get a sense of which words are the most common
# Word frequencies across all titles, most frequent first
count(tidy_books, word, sort = TRUE)
# A tibble: 22,308 x 2
word n
<chr> <int>
1 learning 15587
2 technology 9677
3 classroom 9590
4 students 8486
5 reading 8053
6 books 6815
7 seating 6618
8 math 5520
9 flexible 5393
10 learn 4927
# ... with 22,298 more rows
# Bar chart of the words that occur more than 2000 times in titles.
# The chain must end on a ggplot layer: with a trailing `+` after xlab(NULL)
# the assignment below was parsed as part of the plot and failed.
# coord_flip() puts the words on the vertical axis so the labels stay legible.
tidy_books %>%
  count(word, sort = TRUE) %>%
  filter(n > 2000) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip()
# Sentiment analysis - AFINN: keep only the tokens that have an AFINN entry.
# The join key is spelled out instead of relying on the implicit natural join.
janeaustensentiment <- tidy_books %>%
  inner_join(get_sentiments("afinn"), by = "word")
id project_is_approved word
Length:54170 Min. :0.0000 Length:54170
Class :character 1st Qu.:1.0000 Class :character
Mode :character Median :1.0000 Mode :character
Mean :0.8403
3rd Qu.:1.0000
Max. :1.0000
Min. :-4.000
1st Qu.: 1.000
Median : 2.000
Mean : 1.609
3rd Qu.: 2.000
Max. : 5.000
# A tibble: 10 x 4
id project_is_approved word score
<chr> <dbl> <chr> <int>
1 p036502 1.00 super 3
2 p039565 0 calm 2
3 p185307 0 inspired 2
4 p185307 0 increase 1
5 p185307 0 gain 2
6 p013780 1.00 clean 2
7 p063374 1.00 reach 1
8 p103285 1.00 active 1
9 p181781 1.00 fabulous 4
10 p114989 1.00 fidgety -2
# Is there a correlation between the average score and whether or not it gets accepted?
# Mean AFINN score of the title words, per project
check_corr_titles <- setNames(
  aggregate(
    janeaustensentiment$score,
    by = list(janeaustensentiment$id),
    FUN = mean
  ),
  c("id", "score")
)
check_corr_titles2 <- check_corr_titles %>%
  left_join(train_text[, 1:2], by = "id")
check_corr_titles2 %>%
  ggplot(aes(x = factor(project_is_approved), y = score)) +
  geom_boxplot()
title_a <- check_corr_titles2 %>%
  ggplot(aes(x = score, fill = factor(project_is_approved))) +
  geom_bar(position = "fill") +
  ggtitle("Avg. AFINN sentiment score by approval status")
# Is there a relationship between the most positive word and approval?
check_corr_titles <- setNames(
  aggregate(
    janeaustensentiment$score,
    by = list(janeaustensentiment$id),
    FUN = max
  ),
  c("id", "score")
)
check_corr_titles2 <- check_corr_titles %>%
  left_join(train_text[, 1:2], by = "id")
check_corr_titles2 %>%
  ggplot(aes(x = factor(project_is_approved), y = score)) +
  geom_boxplot()
title_b <- check_corr_titles2 %>%
  ggplot(aes(x = score, fill = factor(project_is_approved))) +
  geom_bar(position = "fill") +
  ggtitle("Max. AFINN sentiment score by approval status")
# Is there a relationship between the most negative word and approval?
check_corr_titles <- setNames(
  aggregate(
    janeaustensentiment$score,
    by = list(janeaustensentiment$id),
    FUN = min
  ),
  c("id", "score")
)
check_corr_titles2 <- check_corr_titles %>%
  left_join(train_text[, 1:2], by = "id")
check_corr_titles2 %>%
  ggplot(aes(x = factor(project_is_approved), y = score)) +
  geom_boxplot()
title_c <- check_corr_titles2 %>%
  ggplot(aes(x = score, fill = factor(project_is_approved))) +
  geom_bar(position = "fill") +
  ggtitle("Min. AFINN sentiment score by approval status")
# Is there a relationship between the median score and approval?
check_corr_titles <- setNames(
  aggregate(
    janeaustensentiment$score,
    by = list(janeaustensentiment$id),
    FUN = median
  ),
  c("id", "score")
)
check_corr_titles2 <- check_corr_titles %>%
  left_join(train_text[, 1:2], by = "id")
check_corr_titles2 %>%
  ggplot(aes(x = factor(project_is_approved), y = score)) +
  geom_boxplot()
title_d <- check_corr_titles2 %>%
  ggplot(aes(x = score, fill = factor(project_is_approved))) +
  geom_bar(position = "fill") +
  ggtitle("Median AFINN sentiment score by approval status")
# Sentiment analysis - bing
# Label each title token that appears in the bing lexicon with its sentiment.
# The join key is given explicitly so the join does not depend on (or print a
# message about) whichever column names happen to be shared.
janeaustensentiment <- tidy_books %>%
  inner_join(get_sentiments("bing"), by = "word")
# A tibble: 10 x 4
id project_is_approved word sentiment
<chr> <dbl> <chr> <chr>
1 p036502 1.00 super positive
2 p039565 0 calm positive
3 p185307 0 gain positive
4 p013780 1.00 clean positive
5 p181781 1.00 fabulous positive
6 p114989 1.00 wobble negative
7 p114989 1.00 fidgety negative
8 p226941 1.00 boost positive
9 p173555 0 love positive
10 p055350 1.00 flexible positive
# Is there a connection between the bing sentiment (positive/negative) of the words and whether or not the project gets accepted?
# Number of positive-sentiment tokens belonging to approved projects
with(
  janeaustensentiment,
  sum(project_is_approved == 1 & sentiment == "positive", na.rm = TRUE)
)
[1] 43948
# Number of positive-sentiment tokens belonging to rejected projects
with(
  janeaustensentiment,
  sum(project_is_approved == 0 & sentiment == "positive", na.rm = TRUE)
)
[1] 8586
# Number of negative-sentiment tokens belonging to approved projects
with(
  janeaustensentiment,
  sum(project_is_approved == 1 & sentiment == "negative", na.rm = TRUE)
)
[1] 9548
# Number of negative-sentiment tokens belonging to rejected projects
with(
  janeaustensentiment,
  sum(project_is_approved == 0 & sentiment == "negative", na.rm = TRUE)
)
[1] 1532
# Negative half of the bing lexicon
bingnegative <- get_sentiments("bing") %>%
  filter(sentiment == "negative")
# Total number of (stop-word-free) tokens per project
wordcounts <- tidy_books %>%
  group_by(id) %>%
  summarize(words = n())
# Share of each project's tokens that are negative.
# The chain has to be closed with ungroup(): left dangling after top_n(), the
# pipe fed into the ggplot() call below, which referenced sentiment_ratio
# before it existed. Join keys and the top_n() weight are spelled out; ratio
# is the last column, i.e. what top_n(1) selected by default.
sentiment_ratio <- tidy_books %>%
  semi_join(bingnegative, by = "word") %>%
  group_by(id, project_is_approved) %>%
  summarize(negativewords = n()) %>%
  left_join(wordcounts, by = "id") %>%
  mutate(ratio = negativewords / words) %>%
  top_n(1, ratio) %>%
  ungroup()
# Negative-word ratio by approval status
ggplot(sentiment_ratio, aes(x = factor(project_is_approved), y = ratio)) +
  geom_boxplot()
# Sentiment analysis - NRC: a token can match several NRC categories, so this
# join may return more than one row per token. The key is given explicitly.
janeaustensentiment <- tidy_books %>%
  inner_join(get_sentiments("nrc"), by = "word")
Joining, by = "word"
# A tibble: 20 x 4
id project_is_approved word sentiment
<chr> <dbl> <chr> <chr>
1 p036502 1.00 word positive
2 p036502 1.00 word trust
3 p039565 0 calm positive
4 p039565 0 dance joy
5 p039565 0 dance positive
6 p039565 0 dance trust
7 p233823 1.00 learn positive
8 p185307 0 inspired joy
9 p185307 0 inspired positive
10 p185307 0 inspired surprise
11 p185307 0 inspired trust
12 p185307 0 increase positive
13 p185307 0 gain anticipation
14 p185307 0 gain joy
15 p185307 0 gain positive
16 p013780 1.00 clean joy
17 p013780 1.00 clean positive
18 p013780 1.00 clean trust
19 p013780 1.00 culinary positive
20 p013780 1.00 culinary trust
# Select the project id, the approval flag and the text of essay 1.
# Columns are picked by name rather than by position so the selection keeps
# working if the column order of train.csv ever changes; the order below
# (id, project_is_approved, text) is what the later train_text[, 1:2] joins
# rely on.
id_title <- c("id", "project_is_approved", "project_essay_1")
train_text <- train[, id_title]
train_text$id <- as.character(train_text$id)
train_text$project_is_approved <- as.numeric(train_text$project_is_approved)
train_text$project_essay_1 <- as.character(train_text$project_essay_1)
# as_tibble() replaces the deprecated as.tibble()
train_text <- as_tibble(train_text)
# One row per (project, word): split each essay into lower-cased word tokens
tidy_books <- train_text %>%
  unnest_tokens(word, project_essay_1)
# Count how many words each project's essay 1 contains
freq_table <- tidy_books %>%
  count(id)
# Attach the approval flag (first two columns of train_text) to the counts
title_word_count_by_project <- freq_table %>%
  left_join(train_text[, 1:2], by = "id")
# Essay length by approval status
title_word_count_by_project %>%
  ggplot(aes(x = factor(project_is_approved), y = n)) +
  geom_boxplot()
# Proportion of approved/rejected projects at each essay length
title_word_count_by_project %>%
  ggplot(aes(x = n, fill = factor(project_is_approved))) +
  geom_bar(position = "fill")
# Take out stop (common) words
# Drop every token that appears in tidytext's stop_words lexicon.
# The pipe has to end in anti_join(): left dangling, it absorbed the
# freq_table assignment on the following line and the statement failed.
tidy_books <- tidy_books %>%
  anti_join(stop_words, by = "word")
# Re-count the words per essay now that stop words are gone
freq_table <- count(tidy_books, id)
title_word_count_by_project <- left_join(
  freq_table,
  train_text[, 1:2],
  by = "id"
)
ggplot(
  title_word_count_by_project,
  aes(x = factor(project_is_approved), y = n)
) +
  geom_boxplot()
# Kept (not printed) for the combined plot_grid() figure
essay1_n <- ggplot(
  title_word_count_by_project,
  aes(x = n, fill = factor(project_is_approved))
) +
  geom_bar(position = "fill")
# Get a sense of which words are the most common
# Word frequencies across all first essays, most frequent first
count(tidy_books, word, sort = TRUE)
# A tibble: 46,131 x 2
word n
<chr> <int>
1 students 610304
2 school 310246
3 learning 136342
4 learn 118186
5 classroom 117935
6 love 77494
7 day 74593
8 grade 57162
9 class 56279
10 free 51354
# ... with 46,121 more rows
# Bar chart of the words that occur more than 20000 times in essay 1.
# The chain must end on a ggplot layer: with a trailing `+` after xlab(NULL)
# the assignment below was parsed as part of the plot and failed.
# coord_flip() puts the words on the vertical axis so the labels stay legible.
tidy_books %>%
  count(word, sort = TRUE) %>%
  filter(n > 20000) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip()
# Sentiment analysis - AFINN: keep only the tokens that have an AFINN entry.
# The join key is spelled out instead of relying on the implicit natural join.
janeaustensentiment <- tidy_books %>%
  inner_join(get_sentiments("afinn"), by = "word")
Joining, by = "word"
id project_is_approved word
Length:1144903 Min. :0.00 Length:1144903
Class :character 1st Qu.:1.00 Class :character
Mode :character Median :1.00 Mode :character
Mean :0.85
3rd Qu.:1.00
Max. :1.00
Min. :-4.000
1st Qu.: 1.000
Median : 2.000
Mean : 1.394
3rd Qu.: 2.000
Max. : 5.000
# A tibble: 10 x 4
id project_is_approved word score
<chr> <dbl> <chr> <int>
1 p036502 1.00 risk -2
2 p036502 1.00 obstacles -2
3 p036502 1.00 excited 3
4 p036502 1.00 exposed -1
5 p036502 1.00 motivated 2
6 p036502 1.00 hard -1
7 p036502 1.00 excited 3
8 p039565 0 rich 2
9 p039565 0 free 1
10 p039565 0 blocks -1
# Is there a relationship between the average score and whether or not it gets accepted?
# Mean AFINN score of the essay 1 words, per project
check_corr_titles <- setNames(
  aggregate(
    janeaustensentiment$score,
    by = list(janeaustensentiment$id),
    FUN = mean
  ),
  c("id", "score")
)
check_corr_titles2 <- check_corr_titles %>%
  left_join(train_text[, 1:2], by = "id")
check_corr_titles2 %>%
  ggplot(aes(x = factor(project_is_approved), y = score)) +
  geom_boxplot()
essay1_a <- check_corr_titles2 %>%
  ggplot(aes(x = score, fill = factor(project_is_approved))) +
  geom_bar(position = "fill")
# Is there a relationship between the most positive word and approval?
check_corr_titles <- setNames(
  aggregate(
    janeaustensentiment$score,
    by = list(janeaustensentiment$id),
    FUN = max
  ),
  c("id", "score")
)
check_corr_titles2 <- check_corr_titles %>%
  left_join(train_text[, 1:2], by = "id")
check_corr_titles2 %>%
  ggplot(aes(x = factor(project_is_approved), y = score)) +
  geom_boxplot()
essay1_b <- check_corr_titles2 %>%
  ggplot(aes(x = score, fill = factor(project_is_approved))) +
  geom_bar(position = "fill")
# Is there a relationship between the most negative word and approval?
check_corr_titles <- setNames(
  aggregate(
    janeaustensentiment$score,
    by = list(janeaustensentiment$id),
    FUN = min
  ),
  c("id", "score")
)
check_corr_titles2 <- check_corr_titles %>%
  left_join(train_text[, 1:2], by = "id")
check_corr_titles2 %>%
  ggplot(aes(x = factor(project_is_approved), y = score)) +
  geom_boxplot()
essay1_c <- check_corr_titles2 %>%
  ggplot(aes(x = score, fill = factor(project_is_approved))) +
  geom_bar(position = "fill")
# Is there a relationship between the median score and approval?
check_corr_titles <- setNames(
  aggregate(
    janeaustensentiment$score,
    by = list(janeaustensentiment$id),
    FUN = median
  ),
  c("id", "score")
)
check_corr_titles2 <- check_corr_titles %>%
  left_join(train_text[, 1:2], by = "id")
check_corr_titles2 %>%
  ggplot(aes(x = factor(project_is_approved), y = score)) +
  geom_boxplot()
essay1_d <- check_corr_titles2 %>%
  ggplot(aes(x = score, fill = factor(project_is_approved))) +
  geom_bar(position = "fill")
# Select the project id, the approval flag and the text of essay 2.
# Columns are picked by name rather than by position so the selection keeps
# working if the column order of train.csv ever changes; the order below
# (id, project_is_approved, text) is what the later train_text[, 1:2] joins
# rely on.
id_title <- c("id", "project_is_approved", "project_essay_2")
train_text <- train[, id_title]
train_text$id <- as.character(train_text$id)
train_text$project_is_approved <- as.numeric(train_text$project_is_approved)
train_text$project_essay_2 <- as.character(train_text$project_essay_2)
# as_tibble() replaces the deprecated as.tibble()
train_text <- as_tibble(train_text)
# One row per (project, word): split each essay into lower-cased word tokens
tidy_books <- train_text %>%
  unnest_tokens(word, project_essay_2)
# Count how many words each project's essay 2 contains
freq_table <- tidy_books %>%
  count(id)
# Attach the approval flag (first two columns of train_text) to the counts
title_word_count_by_project <- freq_table %>%
  left_join(train_text[, 1:2], by = "id")
# Essay length by approval status
title_word_count_by_project %>%
  ggplot(aes(x = factor(project_is_approved), y = n)) +
  geom_boxplot()
# Proportion of approved/rejected projects at each essay length
title_word_count_by_project %>%
  ggplot(aes(x = n, fill = factor(project_is_approved))) +
  geom_bar(position = "fill")
# Take out stop (common) words
# Drop every token that appears in tidytext's stop_words lexicon.
# The pipe has to end in anti_join(): left dangling, it absorbed the
# freq_table assignment on the following line and the statement failed.
tidy_books <- tidy_books %>%
  anti_join(stop_words, by = "word")
# Re-count the words per essay now that stop words are gone
freq_table <- count(tidy_books, id)
title_word_count_by_project <- left_join(
  freq_table,
  train_text[, 1:2],
  by = "id"
)
ggplot(
  title_word_count_by_project,
  aes(x = factor(project_is_approved), y = n)
) +
  geom_boxplot()
# Kept (not printed) for the combined plot_grid() figure
essay2_n <- ggplot(
  title_word_count_by_project,
  aes(x = n, fill = factor(project_is_approved))
) +
  geom_bar(position = "fill")
# Get a sense of which words are the most common
# Word frequencies across all second essays, most frequent first
count(tidy_books, word, sort = TRUE)
# A tibble: 64,340 x 2
word n
<chr> <int>
1 students 670368
2 classroom 158719
3 learning 154165
4 reading 105082
5 school 104381
6 learn 88350
7 books 81544
8 skills 76292
9 technology 67841
10 time 66121
# ... with 64,330 more rows
# Bar chart of the words that occur more than 20000 times in essay 2.
# The chain must end on a ggplot layer: with a trailing `+` after xlab(NULL)
# the assignment below was parsed as part of the plot and failed.
# coord_flip() puts the words on the vertical axis so the labels stay legible.
tidy_books %>%
  count(word, sort = TRUE) %>%
  filter(n > 20000) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip()
# Sentiment analysis - AFINN: keep only the tokens that have an AFINN entry.
# The join key is spelled out instead of relying on the implicit natural join.
janeaustensentiment <- tidy_books %>%
  inner_join(get_sentiments("afinn"), by = "word")
id project_is_approved word
Length:1057401 Min. :0.0000 Length:1057401
Class :character 1st Qu.:1.0000 Class :character
Mode :character Median :1.0000 Mode :character
Mean :0.8498
3rd Qu.:1.0000
Max. :1.0000
Min. :-5.000
1st Qu.: 1.000
Median : 2.000
Mean : 1.368
3rd Qu.: 2.000
Max. : 5.000
# A tibble: 10 x 4
id project_is_approved word score
<chr> <dbl> <chr> <int>
1 p036502 1.00 favorite 2
2 p036502 1.00 dream 1
3 p036502 1.00 struggling -2
4 p039565 0 excitement 3
5 p233823 1.00 wonderful 4
6 p233823 1.00 advanced 1
7 p185307 0 inspired 2
8 p185307 0 active 1
9 p185307 0 gaining 2
10 p185307 0 inspired 2
# Is there a relationship between the average score and whether or not it gets accepted?
# Mean AFINN score of the essay 2 words, per project
check_corr_titles <- setNames(
  aggregate(
    janeaustensentiment$score,
    by = list(janeaustensentiment$id),
    FUN = mean
  ),
  c("id", "score")
)
check_corr_titles2 <- check_corr_titles %>%
  left_join(train_text[, 1:2], by = "id")
check_corr_titles2 %>%
  ggplot(aes(x = factor(project_is_approved), y = score)) +
  geom_boxplot()
essay2_a <- check_corr_titles2 %>%
  ggplot(aes(x = score, fill = factor(project_is_approved))) +
  geom_bar(position = "fill")
# Is there a relationship between the most positive word and approval?
check_corr_titles <- setNames(
  aggregate(
    janeaustensentiment$score,
    by = list(janeaustensentiment$id),
    FUN = max
  ),
  c("id", "score")
)
check_corr_titles2 <- check_corr_titles %>%
  left_join(train_text[, 1:2], by = "id")
check_corr_titles2 %>%
  ggplot(aes(x = factor(project_is_approved), y = score)) +
  geom_boxplot()
essay2_b <- check_corr_titles2 %>%
  ggplot(aes(x = score, fill = factor(project_is_approved))) +
  geom_bar(position = "fill")
# Is there a relationship between the most negative word and approval?
check_corr_titles <- setNames(
  aggregate(
    janeaustensentiment$score,
    by = list(janeaustensentiment$id),
    FUN = min
  ),
  c("id", "score")
)
check_corr_titles2 <- check_corr_titles %>%
  left_join(train_text[, 1:2], by = "id")
check_corr_titles2 %>%
  ggplot(aes(x = factor(project_is_approved), y = score)) +
  geom_boxplot()
essay2_c <- check_corr_titles2 %>%
  ggplot(aes(x = score, fill = factor(project_is_approved))) +
  geom_bar(position = "fill")
# Is there a relationship between the median score and approval?
check_corr_titles <- setNames(
  aggregate(
    janeaustensentiment$score,
    by = list(janeaustensentiment$id),
    FUN = median
  ),
  c("id", "score")
)
check_corr_titles2 <- check_corr_titles %>%
  left_join(train_text[, 1:2], by = "id")
check_corr_titles2 %>%
  ggplot(aes(x = factor(project_is_approved), y = score)) +
  geom_boxplot()
essay2_d <- check_corr_titles2 %>%
  ggplot(aes(x = score, fill = factor(project_is_approved))) +
  geom_bar(position = "fill")
# Stack the title / essay 1 / essay 2 versions of each plot in one column,
# labelled A, B and C respectively.
# Word counts
plot_grid(
  plotlist = list(title_n, essay1_n, essay2_n),
  labels = c("A", "B", "C"),
  ncol = 1
)
# Mean AFINN score
plot_grid(
  plotlist = list(title_a, essay1_a, essay2_a),
  labels = c("A", "B", "C"),
  ncol = 1
)
# Maximum AFINN score
plot_grid(
  plotlist = list(title_b, essay1_b, essay2_b),
  labels = c("A", "B", "C"),
  ncol = 1
)
# Minimum AFINN score
plot_grid(
  plotlist = list(title_c, essay1_c, essay2_c),
  labels = c("A", "B", "C"),
  ncol = 1
)
# Median AFINN score
plot_grid(
  plotlist = list(title_d, essay1_d, essay2_d),
  labels = c("A", "B", "C"),
  ncol = 1
)
