In this file, we generate a word cloud based on HKU's Wikipedia page. Please install the following packages before proceeding: tm, SnowballC, wordcloud, and RColorBrewer.
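If these packages are not yet installed, a typical one-time setup (assuming a standard CRAN mirror is configured) looks like this:
# Install the required packages (only needs to be run once)
install.packages(c("tm", "SnowballC", "wordcloud", "RColorBrewer"))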
Then we load the packages:
library("tm")
## Warning: package 'tm' was built under R version 4.3.2
## Loading required package: NLP
library("SnowballC")
library("wordcloud")
## Warning: package 'wordcloud' was built under R version 4.3.2
## Loading required package: RColorBrewer
library("RColorBrewer")
Next, we load the text, which is available online:
text <- readLines("https://ximarketing.github.io/class/ABOM/wiki.txt")
docs <- Corpus(VectorSource(text))
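As an optional sanity check (not part of the original script), you can confirm how many lines were read and preview the start of the corpus:
length(docs)       # number of documents (one per line of the text file)
inspect(docs[1:2]) # preview the first two documents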
We need to preprocess the text before analyzing it. (The “transformation drops documents” warnings below come from applying tm_map() to a SimpleCorpus and can generally be ignored here.)
# Convert the text to lower case
docs <- tm_map(docs, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(docs, content_transformer(tolower)):
## transformation drops documents
# Remove numbers
docs <- tm_map(docs, removeNumbers)
## Warning in tm_map.SimpleCorpus(docs, removeNumbers): transformation drops
## documents
# Remove common English stopwords (e.g., the, it, an)
docs <- tm_map(docs, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(docs, removeWords, stopwords("english")):
## transformation drops documents
# Remove punctuation (e.g., ?, !, .)
docs <- tm_map(docs, removePunctuation)
## Warning in tm_map.SimpleCorpus(docs, removePunctuation): transformation drops
## documents
# Eliminate extra whitespace
docs <- tm_map(docs, stripWhitespace)
## Warning in tm_map.SimpleCorpus(docs, stripWhitespace): transformation drops
## documents
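The SnowballC package loaded above is only needed if you also want to stem words so that, for example, “student” and “students” are counted together. Stemming is not applied in this walkthrough; as an optional sketch, you could keep a stemmed copy under a new name (docs_stemmed is just an illustrative name) without affecting the results below:
# Optional: create a stemmed copy of the corpus (the original docs, and
# hence the frequency table below, are left unchanged)
docs_stemmed <- tm_map(docs, stemDocument)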
Building a term-document matrix and analyzing the 20 most frequent words:
dtm <- TermDocumentMatrix(docs)
m <- as.matrix(dtm)
v <- sort(rowSums(m), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v)
head(d, 20)
##                    word freq
## hong               hong   34
## university   university   33
## kong               kong   31
## faculty         faculty   13
## council         council   12
## medicine       medicine   10
## students       students   10
## chinese         chinese    8
## first             first    7
## law                 law    7
## college         college    6
## hku                 hku    6
## china             china    6
## founded         founded    5
## school           school    5
## universitys universitys    5
## government   government    5
## number           number    5
## members         members    5
## chan               chan    5
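As an optional aside (not part of the original script), the tm package also provides helpers for exploring the term-document matrix directly; the thresholds below are arbitrary examples:
findFreqTerms(dtm, lowfreq = 10)                      # terms appearing at least 10 times
findAssocs(dtm, terms = "university", corlimit = 0.3) # terms correlated with "university"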
Generating a word cloud:
set.seed(1234)
wordcloud(words = d$word, freq = d$freq, min.freq = 1,
          max.words = 200, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
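By default the cloud is drawn in the active graphics device. If you prefer to save it to an image file, one option (the filename and dimensions are only examples) is to wrap the same call in a graphics device:
# Optional: write the word cloud to a PNG file instead of the screen
png("hku_wordcloud.png", width = 800, height = 800)
set.seed(1234)
wordcloud(words = d$word, freq = d$freq, min.freq = 1,
          max.words = 200, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
dev.off()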
Generating a word frequency plot:
barplot(d[1:15, ]$freq, las = 2, names.arg = d[1:15, ]$word,
        col = "lightblue", main = "Most frequent words",
        ylab = "Word frequencies")