-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmotor_system.Rmd
126 lines (110 loc) · 2.79 KB
/
motor_system.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
---
title: "motor system"
author: "Timing Liu"
date: "11/04/2021"
output: html_document
---
## Day 1: Processing and word cloud
```{r setup, include=FALSE}
# Document-wide knitr default: echo the source of every chunk in the output.
knitr::opts_chunk$set(
  echo = TRUE
)
```
```{r}
# Chunk option applied from here on: figures are rendered at 300 dpi.
knitr::opts_chunk$set(dpi = 300)
# pdftools provides pdf_text(), used below to read the lecture PDF.
# One-off installation: install.packages("pdftools")
library(pdftools)
```
```{r}
# Read the lecture PDF into a character vector; each element is treated as
# one page by the code that follows.
text <- pdf_text("Motor System.pdf")
typeof(text)
length(text)
# remove lecturer's information (the first element)
# Negative indexing replaces 2:length(text): for a single-element vector that
# range counts backwards (2, 1) and would produce an NA entry instead of an
# empty result.
text <- text[-1]
```
```{r}
library(tidyverse)
# One row per remaining element of `text`: a running index and the text itself.
# NOTE(review): `page` indexes the elements kept in `text`, so it may be offset
# from the PDF's own page numbers — confirm if exact page numbers matter.
text_df <- tibble(
  page = seq_len(length(text)),
  text = text
)
```
```{r}
library(tidytext)
data(stop_words)
# Split every page into one word per row, then drop uninformative tokens.
tidy_text_df <- text_df %>%
  unnest_tokens(word, text) %>%
  # remove stop words; the join key is spelled out so the match column is
  # explicit and no "Joining, by = ..." message is emitted
  anti_join(stop_words, by = "word") %>%
  # remove every token that contains a digit
  filter(!str_detect(word, "[:digit:]"))
```
### Stemming
```{r}
# SnowballC provides wordStem(). One-off installation:
# install.packages("SnowballC")
library(SnowballC)
# Same table as tidy_text_df, with every word replaced by its stem.
stemmed_tidy_text_df <- mutate(tidy_text_df, word = wordStem(word))
```
### Word cloud
I decided to continue with the unstemmed text this time, so that the lecturer's plural and singular usages stay distinguishable.
```{r}
# install.packages("wordcloud")
library(wordcloud)
# Count the unstemmed tokens, matching the note above this chunk: plural and
# singular forms must stay separate, which stemming would merge.
word_counts <- count(tidy_text_df, word)
# Everything drawn between png() and dev.off() is written to motor.png.
png("motor.png")
# wordcloud() is called for its drawing side effect only, so its result is
# neither stored nor printed (printing it added a stray NULL to the output).
with(word_counts, wordcloud(word, n, max.words = 100))
dev.off()
```
## Day 2: Word frequency
### ggrough
N.B. `ggrough` was last updated two years ago, so it no longer works with most of the current `ggplot2` engine.
```{r}
# remotes::install_github("xvrdm/ggrough")
# install.packages("showtext")
library(ggrough)
library(showtext)
font_add_google("Rock Salt", "Rock Salt")
showtext::showtext_auto()
# Bar chart of the stems that occur more than 100 times, ordered by count.
p <- stemmed_tidy_text_df %>%
  count(word, sort = TRUE) %>%
  filter(n > 100) %>%
  mutate(word = fct_reorder(word, n)) %>%
  ggplot() +
  aes(word, n) +
  geom_col() +
  labs(title = "Word frequency in motor lectures", y = NULL, x = NULL) +
  # NOTE(review): "Rock Salt" is registered above, but the theme requests the
  # "sans" family — confirm which font is intended.
  theme(
    text = element_text(colour = "snow", family = "sans"),
    axis.text = element_text(colour = "snow"),
    plot.background = element_rect(fill = "#223583"),
    panel.background = element_rect(fill = "#223583")
  )
p
# Named rough_options rather than options so the list does not shadow the
# name of base::options() in the global environment.
rough_options <- list(
  Background = list(roughness = 2),
  GeomCol = list(fill_style = "zigzag", angle_noise = 0.5, fill_weight = 2)
)
get_rough_chart(p, rough_options, font_size_booster = 1)
# The plot is passed explicitly instead of relying on ggplot2's last-plot
# state; note that this writes the ggplot object `p` to tmp.jpeg.
ggsave("tmp.jpeg", plot = p)
```
### Replot rough plot with `rroughviz`
```{r}
# devtools::install_github('tidyss/rroughviz')
library(rroughviz)
# Stems that occur more than 100 times, most frequent first.
frequent_words <- stemmed_tidy_text_df %>%
  count(word, sort = TRUE) %>%
  filter(n > 100)
# Join back onto the token table so each frequent stem is repeated once per
# occurrence, then pass the raw word vector to rough_bar().
# The join key is explicit so the match column is unambiguous and no
# "Joining, by = ..." message is emitted.
frequent_words %>%
  inner_join(stemmed_tidy_text_df, by = "word") %>%
  pull(word) %>%
  rough_bar()
```
### Sentiment analysis
```{r}
afinn <- get_sentiments("afinn")
# Net AFINN score per page: keep only words present in the lexicon and sum
# their values within each page.
sentiment_plot <- tidy_text_df %>%
  inner_join(afinn, by = "word") %>%
  group_by(page) %>%
  summarize(sentiment = sum(value)) %>%
  ggplot() +
  # ggplot components are combined with `+`; a pipe at this point would feed
  # the aes() mapping into geom_line() instead of adding it to the plot
  aes(x = page, y = sentiment) +
  geom_line() +
  theme_minimal(base_size = 30) +
  labs(title = "Sentiment analysis of motor lectures using AFINN")
sentiment_plot
# Save the plot object explicitly rather than relying on last-plot state.
ggsave("sentiment.png", plot = sentiment_plot)
```