-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathAmazon saree_ana_task_1_using python.py
166 lines (121 loc) · 5.81 KB
/
Amazon saree_ana_task_1_using python.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
import requests  # HTTP client used to fetch each Amazon review page
from bs4 import BeautifulSoup as bs  # HTML parser used to pull review text out of the page
import re
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Scrape the review text from every review page of one Amazon saree listing
# and accumulate them into a single list of strings.
saree_reviews = []
for page in range(1, 150):
    # BUG FIX: the page index used to be concatenated directly after
    # "reviewerType=all_reviews", corrupting that query value so every
    # request fetched the same first page. Amazon paginates reviews with
    # the `pageNumber` query parameter.
    url = (
        "https://www.amazon.in/Devangi-Fashion-pure-Saree-DF_Patola_106_Red/"
        "product-reviews/B07YFHJ92P/ref=cm_cr_othr_d_show_all_btm"
        "?ie=UTF8&reviewerType=all_reviews&pageNumber=" + str(page)
    )
    response = requests.get(url)
    # Parse the fetched HTML so we can query it by tag/class.
    soup = bs(response.content, "html.parser")
    # Review bodies live in <span> elements with this class string.
    # BUG FIX: attrs was previously a set literal {"class", "..."} instead
    # of the intended {attribute: value} dict.
    reviews = soup.find_all(
        "span", attrs={"class": "a-size-base review-text review-text-content"}
    )
    # BUG FIX: the inner loop previously reused `i`, clobbering the page
    # counter; collect the text of each review tag directly instead.
    page_reviews = [tag.text for tag in reviews]
    saree_reviews = saree_reviews + page_reviews  # append this page's reviews
# Persist the raw scraped reviews to disk so the scrape need not be repeated.
with open("saree.txt", "w", encoding="utf8") as output:
    output.write(str(saree_reviews))

# Join all the reviews into one paragraph for corpus-level processing.
ip_rev_string = " ".join(saree_reviews)

import nltk

# Keep only letters and spaces, then lowercase everything.
# (Same pattern as before — the original spelled it via implicit string
# concatenation: "[^A-Za-z" "]+" == "[^A-Za-z ]+".)
ip_rev_string = re.sub("[^A-Za-z ]+", " ", ip_rev_string).lower()
# Collapse any remaining digit/space runs into a single space.
ip_rev_string = re.sub("[0-9 ]+", " ", ip_rev_string)

# Individual word tokens contained in the saree reviews.
ip_reviews_words = ip_rev_string.split(" ")
# TF-IDF featurization of the review tokens (unigrams through trigrams).
from sklearn.feature_extraction.text import TfidfVectorizer

# BUG FIX: the token list was previously passed as the first positional
# argument, which is TfidfVectorizer's `input` parameter (it expects
# 'content', 'file' or 'filename', not data). The data belongs in
# fit_transform().
vectorizer = TfidfVectorizer(use_idf=True, ngram_range=(1, 3))
X = vectorizer.fit_transform(ip_reviews_words)

# Load the custom stop-word list (one word per line).
# NOTE(review): hard-coded absolute Windows path — consider parameterizing.
with open(r"C:\Users\UMANG\OneDrive\Desktop\text m\Datasets NLP (2)\stop.txt", "r") as sw:
    stop_words = sw.read()
stop_words = stop_words.split("\n")

# Drop stop words from the token list before building the word clouds.
ip_reviews_words = [w for w in ip_reviews_words if w not in stop_words]
# Re-join the filtered tokens into a single paragraph: WordCloud.generate()
# operates on one string input.
ip_rev_string = " ".join(ip_reviews_words)

# Corpus-level word cloud over all review words.
wordcloud_ip = WordCloud(
    background_color='White',
    width=1800,
    height=1400
).generate(ip_rev_string)
plt.imshow(wordcloud_ip)

# Load the positive-word lexicon (one word per line) from the local path.
# NOTE(review): hard-coded absolute Windows path — consider parameterizing.
with open(r"C:\Users\UMANG\OneDrive\Desktop\text m\Datasets NLP (2)\positive-words.txt", "r") as pos:
    poswords = pos.read().split("\n")

# Positive word cloud: only the review words present in the positive lexicon.
ip_pos_in_pos = " ".join([w for w in ip_reviews_words if w in poswords])
wordcloud_pos_in_pos = WordCloud(
    background_color='White',
    width=1800,
    height=1400
).generate(ip_pos_in_pos)
plt.figure(2)
plt.imshow(wordcloud_pos_in_pos)

# Load the negative-word lexicon (one word per line) from the local path.
with open(r"C:\Users\UMANG\OneDrive\Desktop\text m\Datasets NLP (2)\negative-words.txt", "r") as neg:
    negwords = neg.read().split("\n")

# Negative word cloud: only the review words present in the negative lexicon.
ip_neg_in_neg = " ".join([w for w in ip_reviews_words if w in negwords])
wordcloud_neg_in_neg = WordCloud(
    background_color='black',
    width=1800,
    height=1400
).generate(ip_neg_in_neg)
plt.figure(3)
plt.imshow(wordcloud_neg_in_neg)
# Word cloud built from bigram frequencies.
nltk.download('punkt')
# BUG FIX: WordNetLemmatizer needs the 'wordnet' corpus, which was never
# downloaded — lemmatization below would raise a LookupError on a fresh
# NLTK installation.
nltk.download('wordnet')
from wordcloud import WordCloud, STOPWORDS

WNL = nltk.WordNetLemmatizer()

# Lowercase and tokenize the cleaned review paragraph.
text = ip_rev_string.lower()
# Remove single quotes early since they confuse the tokenizer.
text = text.replace("'", "")
tokens = nltk.word_tokenize(text)
text1 = nltk.Text(tokens)

# Strip punctuation/special characters from each token.
text_content = [
    ''.join(re.split("[ .,;:!?‘’``''@#$%^_&*()<>{}~\n\t\\\-]", word))
    for word in text1
]

# Remove stop words using wordcloud's built-in English stop-word set.
stopwords_wc = set(STOPWORDS)
text_content = [word for word in text_content if word not in stopwords_wc]
# Keep only non-empty entries (punctuation-only tokens became "").
text_content = [s for s in text_content if len(s) != 0]
# Lemmatize to collapse inflected forms of the same word.
text_content = [WNL.lemmatize(t) for t in text_content]

# Build adjacent-word bigrams and render each as a "word1 word2" string.
bigrams_list = list(nltk.bigrams(text_content))
print(bigrams_list)
dictionary2 = [' '.join(tup) for tup in bigrams_list]
print(dictionary2)

# Count bigram frequencies with a (2, 2) n-gram CountVectorizer.
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(ngram_range=(2, 2))
bag_of_words = vectorizer.fit_transform(dictionary2)
sum_words = bag_of_words.sum(axis=0)
words_freq = [(word, sum_words[0, idx]) for word, idx in vectorizer.vocabulary_.items()]
words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
print(words_freq[:100])

# Generate the bigram word cloud from the frequency dict.
words_dict = dict(words_freq)
WC_height = 1000
WC_width = 1500
WC_max_words = 200
wordCloud = WordCloud(max_words=WC_max_words, height=WC_height, width=WC_width, stopwords=stopwords_wc)
wordCloud.generate_from_frequencies(words_dict)
plt.figure(4)
plt.title('Most frequently occurring bigrams connected by same colour and font size')
plt.imshow(wordCloud, interpolation='bilinear')
plt.axis("off")
plt.show()