# Packages this script attaches with library() further down.
required_packages <- c(
  "tm", "topicmodels", "reshape2", "ggplot2",
  "wordcloud", "pals", "tidytext", "dplyr"
)

# Install only what is missing. Reinstalling a package that is already loaded
# in the running session fails on Windows ("cannot remove prior installation",
# "Permission denied" on the package DLL), so an unconditional
# install.packages() on every run is both slow and error-prone.
# system.file() locates an installed package without loading its namespace.
is_installed <- vapply(
  required_packages,
  function(pkg) nzchar(system.file(package = pkg)),
  logical(1)
)
missing_packages <- required_packages[!is_installed]

if (length(missing_packages) > 0) {
  # Use HTTPS so the downloaded packages cannot be tampered with in transit.
  install.packages(missing_packages, repos = "https://cran.us.r-project.org")
}
library(tm)
## Loading required package: NLP
library(topicmodels)
library(reshape2)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
library(wordcloud)
## Loading required package: RColorBrewer
library(pals)
library(tidytext)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Read the documents to be modelled from the course website.
filename <- "https://ximarketing.github.io/class/teachingfiles/lda.csv"
x <- read.csv(filename, header = TRUE)

# NOTE(review): tm's DataframeSource() expects the first two columns to be
# `doc_id` and `text`; presumably the CSV is laid out that way -- confirm.
corpus <- Corpus(DataframeSource(x))

# Document-term matrix of the unprocessed corpus. It is not referenced again
# in the visible part of this script.
dtm <- DocumentTermMatrix(corpus)

# Text clean-up. Order matters: stop words are removed while apostrophes are
# still present, and stemming runs on punctuation- and digit-free tokens.
processedCorpus <- tm_map(corpus, content_transformer(tolower))
processedCorpus <- tm_map(processedCorpus, removeWords, stopwords("english"))
processedCorpus <- tm_map(
  processedCorpus,
  removePunctuation,
  preserve_intra_word_dashes = TRUE
)
processedCorpus <- tm_map(processedCorpus, removeNumbers)
processedCorpus <- tm_map(processedCorpus, stemDocument, language = "en")
processedCorpus <- tm_map(processedCorpus, stripWhitespace)

# Lower bound passed to the `global` bounds of the document-term matrix;
# the upper bound is left open (Inf).
minimumFrequency <- 1
DTM <- DocumentTermMatrix(
  processedCorpus,
  control = list(bounds = list(global = c(minimumFrequency, Inf)))
)
# have a look at the number of documents and terms in the matrix
dim(DTM)
## [1] 8134 5015

# Drop documents that have no terms left after preprocessing.
# slam::row_sums() sums the sparse matrix directly; apply() would first
# convert the whole matrix to a dense one (documents x terms doubles).
# slam is a hard dependency of tm, so it is installed whenever tm is.
raw.sum <- slam::row_sums(DTM)
DTM <- DTM[raw.sum != 0, ]
# Number of topics to extract.
ntopics <- 2

# Gibbs sampler settings: fixed seed for reproducibility, burn-in length and
# number of iterations.
SEED <- 10112
BURNIN <- 1000
ITER <- 1000

# Fit the LDA topic model on the document-term matrix.
model_lda <- LDA(
  DTM,
  k = ntopics,
  method = "Gibbs",
  control = list(seed = SEED, burnin = BURNIN, iter = ITER)
)

# Show the first 10 terms listed for each topic.
terms(model_lda, 10)
##       Topic 1      Topic 2    
##  [1,] "servic"     "clinic"   
##  [2,] "use"        "onlin"    
##  [3,] "quick"      "medic"    
##  [4,] "recommend"  "time"     
##  [5,] "deliveri"   "day"      
##  [6,] "excel"      "get"      
##  [7,] "fast"       "need"     
##  [8,] "profession" "doctor"   
##  [9,] "easi"       "prescript"
## [10,] "order"      "appoint"
# Extract the "beta" matrix of the fitted model in tidy (long) form; the code
# below relies on its `topic`, `term` and `beta` columns.
topics <- tidy(model_lda, matrix = "beta")

# Keep the 10 highest-beta terms of each topic (ties are kept), ordered by
# topic and then by decreasing beta. slice_max() replaces the superseded
# top_n().
top_terms <- topics %>%
  group_by(topic) %>%
  slice_max(beta, n = 10) %>%
  ungroup() %>%
  arrange(topic, desc(beta))

# One bar-chart panel per topic. reorder_within() / scale_y_reordered() sort
# the terms by beta separately inside each panel.
top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(beta, term, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  scale_y_reordered()