-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathAmazon saree_ana_task_1_using r.R
144 lines (99 loc) · 3.37 KB
/
Amazon saree_ana_task_1_using r.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# Package setup: install only the packages that are missing, then attach them.
# Calling install.packages() unconditionally would re-download and reinstall
# every package each time the script runs.
required_pkgs <- c("rvest", "XML", "magrittr")
missing_pkgs <- required_pkgs[
  !vapply(required_pkgs, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}
library(rvest)
library(XML)
library(magrittr)
######### Amazon URL ###########
# Base URL of the product's "all reviews" listing (without a page number).
aurl <- "https://www.amazon.in/Devangi-Fashion-pure-Saree-DF_Patola_106_Red/product-reviews/B07YFHJ92P/ref=cm_cr_othr_d_show_all_btm?ie=UTF8&reviewerType=all_reviews"

# Number of review pages to request.
n_pages <- 20L

# Download one page of the listing and return the text of every
# `.review-text` node on it as a character vector.
# The page is selected with the `pageNumber` query parameter; appending
# "=<i>" straight onto the base URL only corrupts the `reviewerType` value
# and does not select a page.
fetch_review_page <- function(page) {
  page_url <- paste0(aurl, "&pageNumber=", page)
  read_html(page_url) %>%
    html_nodes(".review-text") %>%
    html_text()
}

# Collect the reviews of all pages into one character vector. Building the
# list first avoids re-copying the vector on every iteration.
amazon_reviews <- unlist(lapply(seq_len(n_pages), fetch_review_page))

# Persist the raw reviews in the current working directory and show where
# that is.
write.table(amazon_reviews, "saree.txt")
getwd()
#### Sentiment Analysis ####
# Work on the scraped review text and show its structure and size.
txt <- amazon_reviews
str(txt)
length(txt)
# View() needs a GUI data viewer, so only open it in interactive sessions.
if (interactive()) {
  View(txt)
}
# install.packages("tm")
library(tm)
# Convert the character data to corpus type
x <- Corpus(VectorSource(txt))
inspect(x[1])
# Re-encode the text, replacing unconvertible bytes with their hex codes.
# Custom functions are wrapped in content_transformer() so that tm_map()
# keeps returning a proper corpus.
x <- tm_map(
  x,
  content_transformer(function(doc) iconv(enc2utf8(doc), sub = "byte"))
)
# Data Cleansing
x1 <- tm_map(x, content_transformer(tolower))
inspect(x1[1])
# Remove stop words BEFORE punctuation: the English stop-word list contains
# contractions such as "don't", which no longer match once the apostrophe
# has been stripped.
x1 <- tm_map(x1, removeWords, stopwords("english"))
inspect(x1[1])
x1 <- tm_map(x1, removePunctuation)
inspect(x1[1])
inspect(x1[5])
x1 <- tm_map(x1, removeNumbers)
inspect(x1[1])
# Stripping white spaces left behind by the removals above
x1 <- tm_map(x1, stripWhitespace)
inspect(x1[1])
# Term document matrix
# converting unstructured data to structured format using TDM
# (terms in rows, documents in columns)
tdm <- TermDocumentMatrix(x1)
tdm
# Document-term matrix (documents in rows, terms in columns). It is built
# directly from the corpus; a transposed copy of `tdm` assigned just before
# would be overwritten immediately.
dtm <- DocumentTermMatrix(x1)
# To remove sparse entries upon a specific value
corpus.dtm.frequent <- removeSparseTerms(tdm, 0.98)
# Convert to a dense base matrix so base functions such as rowSums() apply.
tdm <- as.matrix(tdm)
dim(tdm)
# Preview at most the first 100 terms x 100 documents. The bounds are clamped
# to the real dimensions so that a small corpus does not raise
# "subscript out of bounds".
preview_rows <- seq_len(min(100L, nrow(tdm)))
preview_cols <- seq_len(min(100L, ncol(tdm)))
tdm[preview_rows, preview_cols, drop = FALSE]
inspect(x[1])
# Bar plot
# Total number of occurrences of every term across all documents.
w <- rowSums(tdm)
w
# Keep only the terms that occur at least 30 times.
is_frequent <- w >= 30
w_sub <- w[which(is_frequent)]
w_sub
# One bar per frequent term, with the axis labels drawn perpendicular.
barplot(height = w_sub, col = rainbow(30), las = 2)
##### Word cloud #####
# Install wordcloud only when it is missing, then attach it.
if (!requireNamespace("wordcloud", quietly = TRUE)) {
  install.packages("wordcloud")
}
library(wordcloud)
# Cloud of the frequent terms only.
wordcloud(words = names(w_sub), freq = w_sub)
# All terms, ordered from most to least frequent.
w_sub1 <- sort(rowSums(tdm), decreasing = TRUE)
head(w_sub1)
wordcloud(words = names(w_sub1), freq = w_sub1) # all words are considered
# better visualization
wordcloud(
  words = names(w_sub1), freq = w_sub1, random.order = FALSE,
  colors = rainbow(30), scale = c(2, 0.5), rot.per = 0.4
)
# dev.new() opens the platform's default graphics device; windows() exists
# only on Windows and would stop the script on other systems.
dev.new()
wordcloud(
  words = names(w_sub1), freq = w_sub1, random.order = FALSE,
  colors = rainbow(30), scale = c(3, 0.5), rot.per = 0.3
)
# The font registration below is a Windows-only API, so it is skipped on
# other platforms.
if (.Platform$OS.type == "windows") {
  windowsFonts(JP1 = windowsFont("MS Gothic"))
  par(family = "JP1")
}
# Cloud built directly from the cleaned corpus.
wordcloud(x1, scale = c(2, 0.5))
############# Wordcloud2 ###############
# Install wordcloud2 when it is missing. Note that installed.packages() only
# LISTS the packages found in a library path and installs nothing.
if (!requireNamespace("wordcloud2", quietly = TRUE)) {
  install.packages("wordcloud2")
}
library(wordcloud2)
# wordcloud2() takes a data frame with the word in the first column and its
# frequency in the second.
w1 <- data.frame(word = names(w_sub), freq = w_sub, stringsAsFactors = FALSE)
wordcloud2(w1, size = 0.3, shape = "circle")
wordcloud2(w1, size = 0.3, shape = "triangle")
wordcloud2(w1, size = 0.3, shape = "star")
#####################################
# Loading positive and negative words
# Each file.choose() call opens an interactive file picker; the files are
# requested in the order below and read line by line.
pos_file <- file.choose() # pick positive-words.txt
pos.words <- readLines(con = pos_file)
neg_file <- file.choose() # pick negative-words.txt
neg.words <- readLines(con = neg_file)
# NOTE(review): presumably a stop-word list; `stopwdrds` is not referenced
# anywhere in the visible part of this script - confirm it is still needed.
stopword_file <- file.choose()
stopwdrds <- readLines(con = stopword_file)
### Positive word cloud ###
# Logical mask: TRUE for every term that appears in the positive lexicon.
pos.matches <- names(w_sub1) %in% pos.words
freq_pos <- w_sub1[pos.matches]
# Use a dedicated variable name so the base function names() is not shadowed.
pos_terms <- names(freq_pos)
if (length(freq_pos) > 0) {
  # dev.new() opens the platform's default graphics device; windows() exists
  # only on Windows.
  dev.new()
  wordcloud(
    pos_terms, freq_pos,
    scale = c(4, 1), colors = brewer.pal(8, "Dark2")
  )
} else {
  message("No positive words found in the reviews; skipping word cloud.")
}
### Matching Negative words ###
# Logical mask: TRUE for every term that appears in the negative lexicon.
neg.matches <- names(w_sub1) %in% neg.words
freq_neg <- w_sub1[neg.matches]
# Use a dedicated variable name so the base function names() is not shadowed.
neg_terms <- names(freq_neg)
if (length(freq_neg) > 0) {
  # dev.new() opens the platform's default graphics device; windows() exists
  # only on Windows.
  dev.new()
  wordcloud(
    neg_terms, freq_neg,
    scale = c(4, .5), colors = brewer.pal(8, "Dark2")
  )
} else {
  message("No negative words found in the reviews; skipping word cloud.")
}