# imdb_usingR.R
# load packages (install once, if not already installed)
# install.packages(c("rvest", "XML", "magrittr", "tm", "wordcloud", "RWeka"))
library(rvest)
library(XML)
library(magrittr)
library(tm)
library(wordcloud)
# RWeka depends on Java via rJava; clearing JAVA_HOME is a common workaround
# when rJava picks up the wrong Java installation
Sys.setenv(JAVA_HOME = "")
library(RWeka)
# load the IMDb reviews page and extract the review text
# (the ".show-more__control" selector matched IMDb's review markup when this was
#  written and may need updating if the page layout changes)
imdbUrl <- "https://www.imdb.com/title/tt7767422/reviews?ref_=tt_urv"
html <- read_html(imdbUrl)
imdbReview <- html %>% html_nodes(".show-more__control") %>% html_text()
imdbReview[1]
write.table(imdbReview, "IMDBReview.txt")
getwd()
str(imdbReview)
length(imdbReview)
# build a corpus from the scraped reviews
corpusData <- Corpus(VectorSource(imdbReview))
inspect(corpusData[1])
# coerce the text to valid UTF-8 so later transformations do not fail on stray bytes
corpusData <- tm_map(corpusData, content_transformer(function(x) iconv(enc2utf8(x), sub = "byte")))
# data cleaning
data <- tm_map(corpusData, content_transformer(tolower))
inspect(data[1])
data <- tm_map(data, removePunctuation)
data <- tm_map(data, removeNumbers)
inspect(data[1])
# remove stopwords
data <- tm_map(data, removeWords, stopwords(kind = "en"))
inspect(data[1])
data <- tm_map(data, stripWhitespace)
inspect(data[1])
# TDM
tdm <- TermDocumentMatrix(data)
tdm
nonSparseData <- removeSparseTerms(tdm, 0.90)
nonSparseData
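# note: removeSparseTerms(tdm, 0.90) keeps only terms whose sparsity is at most 0.90,
# i.e. terms appearing in at least ~10% of the reviews; here it is only inspected,
# and the full tdm is used for the counts below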
tdm <- as.matrix(tdm)
dim(tdm)
wordSum <- rowSums(tdm)
wordSum
max(wordSum)
wordSubset <- subset(wordSum, wordSum > 20)
wordSubset
# plot
barplot(wordSubset, las=2, col = rainbow(30))
# rebuild the term-document matrix and re-plot with a lower frequency threshold (> 10)
data <- tm_map(data, stripWhitespace)
tdm <- TermDocumentMatrix(data)
tdm <- as.matrix(tdm)
wordSum <- rowSums(tdm)
wordSubset <- subset(wordSum, wordSum > 10)
barplot(wordSubset, las=2, col = rainbow(30))
# re-plot with an even lower frequency threshold (> 5)
data <- tm_map(data, stripWhitespace)
tdm <- TermDocumentMatrix(data)
tdm <- as.matrix(tdm)
wordSum <- rowSums(tdm)
wordSubset <- subset(wordSum, wordSum > 5)
barplot(wordSubset, las=2, col = rainbow(30))
# word cloud
windows()   # opens a new graphics window on Windows; use x11() or quartz() elsewhere
wordcloud(words = names(wordSubset), freq = wordSubset, random.order = F, colors = rainbow(30))
# bigram word cloud
# NGramTokenizer works on plain text, so collapse the cleaned corpus to a character vector first
reviewText <- sapply(data, as.character)
bitoken <- NGramTokenizer(reviewText, Weka_control(min = 2, max = 2))
biWord <- data.frame(table(bitoken))
sortBiword <- biWord[order(biWord$Freq, decreasing = TRUE), ]
windows()
wordcloud(sortBiword$bitoken, sortBiword$Freq, colors = rainbow(30))
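# a small optional sketch: bar plot of the 15 most frequent bigrams, built from the
# sortBiword table computed above (15 is an arbitrary cut-off)
topBigrams <- head(sortBiword, 15)
barplot(topBigrams$Freq, names.arg = as.character(topBigrams$bitoken),
        las = 2, col = rainbow(15))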
# sentiment analysis
# load the positive-word, negative-word, and stopword lists interactively
# (plain-text files, one word per line)
positiveWords <- readLines(file.choose())
negativeWords <- readLines(file.choose())
stopwords <- readLines(file.choose())   # custom stopword list (not used further below)
positiveMatch <- match(names(wordSum), positiveWords)
positiveMatch
positiveMatch <- !is.na(positiveMatch)
freq <- wordSum[positiveMatch]
freq
name <- names(freq)
wordcloud(name, freq = freq, colors = rainbow(30))
negativeMatch <- match(names(wordSum), negativeWords)
negativeMatch
negativeMatch <- !is.na(negativeMatch)
freq <- wordSum[negativeMatch]
freq
name <- names(freq)
wordcloud(name, freq = freq, colors = rainbow(30))
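# a minimal optional sketch (not part of the original analysis): summarise overall
# sentiment by comparing the total frequency of matched positive vs. negative words
positiveFreq <- sum(wordSum[!is.na(match(names(wordSum), positiveWords))])
negativeFreq <- sum(wordSum[!is.na(match(names(wordSum), negativeWords))])
sentimentScore <- (positiveFreq - negativeFreq) / (positiveFreq + negativeFreq)
sentimentScore   # > 0 suggests mostly positive reviews, < 0 mostly negative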