-
Notifications
You must be signed in to change notification settings - Fork 0
/
quote_top_words.R
46 lines (35 loc) · 1.52 KB
/
quote_top_words.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
#' Find the top three words of each quote by TF-IDF
#'
#' Every quote is treated as one document. The text is stripped of
#' punctuation, lower-cased, stripped of numbers and English stop words,
#' whitespace-normalised and stemmed. The remaining terms are scored with the
#' `weightTfIdf` weighting of the tm package.
#' - TF = how often a term occurs within a quote
#' - IDF = based on total number of documents / number of documents which
#'   contain the word
#'   - indicates how common a term is across all documents
#'
#' @param t A dataframe of quotes. It must contain a `Text` column holding the
#'   quote text; every other column is passed through unchanged.
#' @return A dataframe of quotes, with the top three words for each quote
#'   appended at the end as the columns `first_word`, `second_word` and
#'   `third_word` (highest score first). The words are the stemmed terms.
#'   If a quote does not have three top words (terms with a non-zero score),
#'   the remaining columns are NA.
top_words <- function(t) {
  if (!is.data.frame(t) || !"Text" %in% names(t)) {
    stop("`t` must be a data frame with a `Text` column.", call. = FALSE)
  }

  word_cols <- c("first_word", "second_word", "third_word")
  n_quotes <- nrow(t)

  # One row per quote, one column per rank; NA means "no such top word".
  top <- matrix(
    NA_character_,
    nrow = n_quotes,
    ncol = length(word_cols),
    dimnames = list(NULL, word_cols)
  )

  if (n_quotes > 0) {
    # Attached (rather than namespace-qualified) because the original function
    # attached both packages and calling scripts may rely on that.
    library(tm)
    library(magrittr)

    # Build and pre-process the corpus; the order of the steps matters
    # (e.g. stop words are removed before stemming).
    corpus <- as.character(t[["Text"]]) %>%
      VectorSource() %>%
      Corpus() %>%
      tm_map(removePunctuation) %>%
      tm_map(content_transformer(tolower)) %>%
      tm_map(removeNumbers) %>%
      tm_map(removeWords, stopwords("english")) %>%
      tm_map(stripWhitespace) %>%
      tm_map(stemDocument)

    # Document-term matrix weighted by TF-IDF: rows are quotes, columns terms.
    tfidf <- corpus %>%
      DocumentTermMatrix(control = list(weighting = weightTfIdf)) %>%
      as.matrix()
    terms <- colnames(tfidf)

    for (i in seq_len(nrow(tfidf))) {
      # Rank a plain numeric vector and look the terms up by position; this
      # works for any vocabulary size, including fewer than three terms.
      scores <- as.numeric(tfidf[i, ])
      # na.last = NA drops NA/NaN scores, so the comparison below is safe.
      ranked <- order(scores, decreasing = TRUE, na.last = NA)
      # TF-IDF scores are non-negative; a zero score is not a "top word".
      ranked <- ranked[scores[ranked] > 0]
      ranked <- ranked[seq_len(min(length(ranked), length(word_cols)))]
      top[i, seq_along(ranked)] <- terms[ranked]
    }
  }

  # Append the result columns at the end, independent of how the input
  # columns are ordered or named.
  cbind(t, as.data.frame(top, stringsAsFactors = FALSE))
}