# Install the packages this script loads, but only those that are missing.
# Reinstalling a package whose DLL is already loaded in the session fails on
# Windows with "cannot remove prior installation ... Permission denied", so
# packages that are already installed are left untouched.
# Wrapped in local() so the helper variables do not leak into the workspace.
local({
  required_packages <- c(
    "tm", "topicmodels", "reshape2", "ggplot2",
    "wordcloud", "pals", "tidytext", "dplyr"
  )
  # system.file() returns "" for a package that is not installed; unlike
  # requireNamespace() it does not load the package while checking.
  is_installed <- vapply(
    required_packages,
    function(pkg) nzchar(system.file(package = pkg)),
    logical(1)
  )
  missing_packages <- required_packages[!is_installed]
  if (length(missing_packages) > 0) {
    # Use HTTPS so packages are not fetched over an unencrypted connection.
    install.packages(missing_packages, repos = "https://cran.us.r-project.org")
  }
})
library(tm)
## Loading required package: NLP
library(topicmodels)
library(reshape2)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(wordcloud)
## Loading required package: RColorBrewer
library(pals)
library(tidytext)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Read the CSV from the course website into a data frame.
filename <- "https://ximarketing.github.io/class/teachingfiles/lda.csv"
x <- read.csv(filename, header = TRUE)

# Wrap the data frame as a tm corpus.
# NOTE(review): tm's DataframeSource expects `doc_id` and `text` columns --
# confirm the CSV provides them.
corpus <- Corpus(DataframeSource(x))
# Document-term matrix of the raw, unprocessed corpus.
dtm <- DocumentTermMatrix(corpus)

# Clean the text step by step: lower-case, drop English stop words, strip
# punctuation (keeping intra-word dashes), drop numbers, stem, and collapse
# repeated whitespace.
processedCorpus <- corpus %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english")) %>%
  tm_map(removePunctuation, preserve_intra_word_dashes = TRUE) %>%
  tm_map(removeNumbers) %>%
  tm_map(stemDocument, language = "en") %>%
  tm_map(stripWhitespace)

# Build the document-term matrix from the cleaned corpus, keeping terms that
# occur in at least `minimumFrequency` documents.
minimumFrequency <- 1
DTM <- DocumentTermMatrix(
  processedCorpus,
  control = list(bounds = list(global = c(minimumFrequency, Inf)))
)
# have a look at the number of documents and terms in the matrix
dim(DTM)
## [1] 8134 5015

# Drop documents that have no terms left after cleaning.
# NOTE(review): apply() converts the sparse matrix to a dense one first;
# slam::row_sums(DTM) would avoid that if memory becomes a concern.
raw.sum <- apply(DTM, 1, FUN = sum)
DTM <- DTM[raw.sum != 0, ]
# Settings for the Gibbs sampler: RNG seed, burn-in iterations, sampling
# iterations, and the number of topics to fit.
SEED <- 10112
BURNIN <- 1000
ITER <- 1000
ntopics <- 2

# Fit the LDA topic model on the document-term matrix.
model_lda <- LDA(
  DTM,
  k = ntopics,
  method = "Gibbs",
  control = list(seed = SEED, burnin = BURNIN, iter = ITER)
)

# Show the 10 highest-ranked terms for each topic.
terms(model_lda, 10)
## Topic 1 Topic 2
## [1,] "servic" "clinic"
## [2,] "use" "onlin"
## [3,] "quick" "medic"
## [4,] "recommend" "time"
## [5,] "deliveri" "day"
## [6,] "excel" "get"
## [7,] "fast" "need"
## [8,] "profession" "doctor"
## [9,] "easi" "prescript"
## [10,] "order" "appoint"
# Per-topic term probabilities (the "beta" matrix) in tidy form: one row per
# topic/term pair.
topics <- tidy(model_lda, matrix = "beta")

# Keep the 10 highest-beta terms of each topic (ties are kept, as with the
# superseded top_n()), sorted by topic and then by descending beta.
top_terms <- topics %>%
  group_by(topic) %>%
  slice_max(beta, n = 10) %>%
  ungroup() %>%
  arrange(topic, desc(beta))

# One bar-chart panel per topic. reorder_within() / scale_y_reordered() order
# the terms by beta separately inside each facet.
top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(beta, term, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  scale_y_reordered()