Word Cloud Generation

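In this file we are going to generate a word cloud based on HKU’s Wikipedia page. Please install the following packages before proceeding: tm, SnowballC, wordcloud, and RColorBrewer (for example, by running install.packages(c("tm", "SnowballC", "wordcloud", "RColorBrewer")) once).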

Then we load the packages:

library("tm")
## Warning: package 'tm' was built under R version 4.3.2
## Loading required package: NLP
library("SnowballC")
library("wordcloud")
## Warning: package 'wordcloud' was built under R version 4.3.2
## Loading required package: RColorBrewer
library("RColorBrewer")

Then load the text, which is available online:

# Read the remote text file (readLines yields one string per line) and wrap
# those lines in a tm corpus.
wiki_url <- "https://ximarketing.github.io/class/ABOM/wiki.txt"
text <- readLines(wiki_url)
line_source <- VectorSource(text)
docs <- Corpus(line_source)

We need to process the document before analyzing it:

# Clean the corpus one transformation at a time; every step overwrites `docs`
# with the transformed corpus. Lines starting with `##` are the console output
# captured when the preceding statement ran.
# NOTE(review): each step reports "transformation drops documents"; the
# frequency table printed further down is still populated, so this looks
# harmless here -- confirm against the tm documentation.
# NOTE(review): stop words are removed before punctuation is stripped,
# presumably so that stop words containing apostrophes still match -- confirm
# before reordering these steps.
# Convert the text to lower case
docs <- tm_map(docs, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(docs, content_transformer(tolower)):
## transformation drops documents
# Remove numbers
docs <- tm_map(docs, removeNumbers)
## Warning in tm_map.SimpleCorpus(docs, removeNumbers): transformation drops
## documents
# Remove common English stop words (e.g., the, it, an)
docs <- tm_map(docs, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(docs, removeWords, stopwords("english")):
## transformation drops documents
# Remove punctuation (e.g., ?, !, .)
docs <- tm_map(docs, removePunctuation)
## Warning in tm_map.SimpleCorpus(docs, removePunctuation): transformation drops
## documents
# Collapse extra white space
docs <- tm_map(docs, stripWhitespace)
## Warning in tm_map.SimpleCorpus(docs, stripWhitespace): transformation drops
## documents

Analyzing the 20 most frequent words:

# Total up how often each term occurs across the corpus and rank the terms
# from most to least frequent.
dtm <- TermDocumentMatrix(docs)
term_totals <- rowSums(as.matrix(dtm))
ranked_totals <- sort(term_totals, decreasing = TRUE)
# `freq` is a named vector, so the terms also end up as the row names of `d`.
d <- data.frame(word = names(ranked_totals), freq = ranked_totals)
# Print the 20 top-ranked terms.
head(d, n = 20)
##                    word freq
## hong               hong   34
## university   university   33
## kong               kong   31
## faculty         faculty   13
## council         council   12
## medicine       medicine   10
## students       students   10
## chinese         chinese    8
## first             first    7
## law                 law    7
## college         college    6
## hku                 hku    6
## china             china    6
## founded         founded    5
## school           school    5
## universitys universitys    5
## government   government    5
## number           number    5
## members         members    5
## chan               chan    5

Generating a word cloud:

# Seed the random number generator so that repeated runs start from the same
# state before the cloud is drawn.
set.seed(1234)
# Eight colours from the "Dark2" ColorBrewer palette.
cloud_colours <- brewer.pal(n = 8, name = "Dark2")
# Draw up to 200 words (anything occurring at least once qualifies), with
# random.order switched off and rot.per set to 0.35.
wordcloud(
  words = d$word,
  freq = d$freq,
  min.freq = 1,
  max.words = 200,
  random.order = FALSE,
  rot.per = 0.35,
  colors = cloud_colours
)

Generating a word frequency plot:

# Bar chart of the frequencies in the first (up to) 15 rows of `d`, with the
# words as bar labels (las = 2 turns the labels perpendicular to the axis).
# head() returns only rows that exist; indexing with 1:15 instead would pad the
# selection with NA rows whenever `d` holds fewer than 15 words.
top_words <- head(d, n = 15)
barplot(top_words$freq, las = 2, names.arg = top_words$word,
        col = "lightblue", main = "Most frequent words",
        ylab = "Word frequencies")