Inspired by a wordcloud from Obama’s State of the Union address, let’s look at creating a wordcloud for Singapore’s National Day Rally 2014 speech using R.
library(rvest)
library(RCurl)
## Loading required package: bitops
url <- "http://www.pmo.gov.sg/mediacentre/prime-minister-lee-hsien-loongs-national-day-rally-2014-speech-english"
# Download the raw HTML of the speech page as a character string.
curlSpeech <- getURL(url)
# Parse the downloaded HTML and keep only the text of the nodes matching the
# ".view-mode-full" CSS selector. read_html() is used instead of html(),
# which rvest has deprecated in its favour.
speech <- curlSpeech %>%
  read_html() %>%
  html_nodes(".view-mode-full") %>%
  html_text()
library(RWeka)
library(tm)
## Loading required package: NLP
# Wrap the scraped speech text in a tm corpus.
text <- Corpus(VectorSource(speech))
# pre-process the speech for tokenization
# Transformer that replaces every match of `pattern` with a single space.
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
text <- text %>%
  tm_map(content_transformer(tolower)) %>%
  # Blank out anything that looks like an HTML tag.
  tm_map(toSpace, "<[^<>]+>") %>%
  # Newlines become spaces. The pattern must be the newline escape; a bare
  # "n" would blank out every letter "n" and split words apart.
  tm_map(toSpace, "\n") %>%
  # Any character that is not a letter or digit becomes a space.
  tm_map(toSpace, "[^a-zA-Z0-9]") %>%
  # Slash, at-sign and a literal pipe; the pipe is escaped so it is not
  # read as an (empty) regex alternative.
  tm_map(toSpace, "/|@|\\|") %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeWords, stopwords("english")) %>%
  # Collapse whitespace last so gaps left by the steps above are cleaned up.
  tm_map(stripWhitespace)
# Split the cleaned speech into single-word tokens (n-grams of length 1),
# build the document-term matrix, and total each term's count across the
# corpus, most frequent term first.
Tokenizer <- function(x) {
  NGramTokenizer(x, Weka_control(min = 1, max = 1))
}
dtm <- DocumentTermMatrix(text, control = list(tokenize = Tokenizer))
tm_freq <- sort(
  colSums(as.matrix(dtm)),
  decreasing = TRUE
)
library(wordcloud)
## Loading required package: RColorBrewer
# Fix the random seed so the word layout is reproducible between runs.
set.seed(39)
# Draw the wordcloud: at most 200 words, sized by frequency, coloured with
# six colours from the "Dark2" palette.
wordcloud(
  words = names(tm_freq),
  freq = tm_freq,
  max.words = 200,
  colors = brewer.pal(6, "Dark2")
)