# imdb_using python.py
# load packages
import requests
from bs4 import BeautifulSoup as bs
import re
import matplotlib.pyplot as plt
# pip install wordcloud
from wordcloud import WordCloud, STOPWORDS
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
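# Assumed environment (illustrative, not part of the original script): the
# third-party packages above must be installed first, e.g.
#   pip install requests beautifulsoup4 wordcloud nltk scikit-learn matplotlib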
# empty review list
reviews = []
url = "https://www.imdb.com/title/tt0468569/reviews?ref_=ttexrv_sa_3"
response = requests.get(url)
soup = bs(response.content, "html.parser")
# each review body lives in a div with class "text show-more__control"
review = soup.find_all("div", attrs={"class": "text show-more__control"})
print("review :", review)
for r in review:
    reviews.append(r.text)
print(reviews)
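# A hedged aside (not in the original script): a single requests.get only
# returns the first page of reviews; IMDB loads the rest behind a "Load More"
# button, so `reviews` holds roughly the first page of entries. A minimal
# sanity check before trusting the parse:
if response.status_code != 200:
    raise RuntimeError(f"request failed: HTTP {response.status_code}")
if not reviews:
    raise RuntimeError("no review divs found - IMDB may have changed its markup")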
# store the scraped reviews to disk
with open("review.txt", "w", encoding="utf8") as output:
    output.write(str(reviews))

# read the data back
with open("review.txt", "r", encoding="utf8") as inpt:
    reviewStr = inpt.read()

# keep letters only, then lowercase
reviewStr = re.sub("[^A-Za-z]+", " ", reviewStr).lower()
# tokenize on spaces
reviewWords = reviewStr.split(" ")

# tf-idf over the tokens (unigrams through trigrams)
vectorizer = TfidfVectorizer(use_idf=True, ngram_range=(1, 3))
X = vectorizer.fit_transform(reviewWords)
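# Illustrative peek (not in the original script; assumes scikit-learn >= 1.0
# for get_feature_names_out): the ten highest-weighted tf-idf terms.
import numpy as np
termScores = np.asarray(X.sum(axis=0)).ravel()
terms = vectorizer.get_feature_names_out()
topIdx = termScores.argsort()[::-1][:10]
print([(terms[i], round(float(termScores[i]), 3)) for i in topIdx])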
with open(r"C:\Users\Megha\Downloads\Datasets NLP\stopwords_en.txt", "r") as sw:
stopWords = sw.read()
stopWords = stopWords.split("\n")
stopWords.extend(["dark", "knight",""])
words = [w for w in reviewWords if w not in stopWords]
reviewStr = " ".join(words)
# wordcloud over all remaining tokens
wordCloud = WordCloud(width=2000, height=2000).generate(reviewStr)
plt.imshow(wordCloud)
plt.axis("off")
plt.show()
# positive wordcloud
with open(r"C:\Users\Megha\Downloads\Datasets NLP\positive-words.txt", "r") as sw:
    positiveWord = sw.read()
positiveWord = positiveWord.split("\n")
pWords = [w for w in reviewWords if w in positiveWord]
reviewStr = " ".join(pWords)
wordCloud = WordCloud(width=2000, height=2000).generate(reviewStr)
plt.imshow(wordCloud)
plt.axis("off")
plt.show()
# negative wordcloud
with open(r"C:\Users\Megha\Downloads\Datasets NLP\negative-words.txt", "r") as sw:
    negativeWord = sw.read()
negativeWord = negativeWord.split("\n")
nWords = [w for w in reviewWords if w in negativeWord]
reviewStr = " ".join(nWords)
wordCloud = WordCloud(width=2000, height=2000).generate(reviewStr)
plt.imshow(wordCloud)
plt.axis("off")
plt.show()
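# Aside (illustrative, not in the original): membership tests against a list
# are O(n) per token; converting the lexicons to sets makes the two filters
# above noticeably faster on large inputs.
positiveSet, negativeSet = set(positiveWord), set(negativeWord)
pWords = [w for w in reviewWords if w in positiveSet]
nWords = [w for w in reviewWords if w in negativeSet]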
# bigram analysis: re-read the raw reviews
with open("review.txt", "r", encoding="utf8") as inpt:
    reviewStr = inpt.read()

nltk.download('punkt')
nltk.download('wordnet')  # required by WordNetLemmatizer
wnl = nltk.WordNetLemmatizer()

t = reviewStr.lower()
t = t.replace("'", "")
token = nltk.word_tokenize(t)
text = nltk.Text(token)

# strip punctuation and whitespace from each token
textContent = ["".join(re.split(r"[ .,:;!?'\"@#$%^&*()<>{}~\n\t\\-]", word)) for word in token]

# stopword set plus leftover tokenizer artifacts
sStopword = set(stopWords)
customWord = ["[", "''", ",", "``"]
sStopword = sStopword.union(customWord)
textContent = [word for word in textContent if word not in sStopword]

# lemmatize, then pair adjacent tokens into bigrams
textContent = [wnl.lemmatize(w) for w in textContent]
bigramList = list(nltk.bigrams(textContent))
dictionary = [' '.join(b) for b in bigramList]
# bigram frequencies via a bag-of-words count
vectorizer = CountVectorizer(ngram_range=(2, 2))
bow = vectorizer.fit_transform(dictionary)
sumWords = bow.sum(axis=0)
wordFreq = [(word, sumWords[0, idx]) for word, idx in vectorizer.vocabulary_.items()]
wordFreq = sorted(wordFreq, key=lambda x: x[1], reverse=True)
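# Quick illustrative check (not in the original script): print the ten most
# frequent bigrams before plotting them.
print(wordFreq[:10])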
# wordcloud weighted by bigram frequency
data = dict(wordFreq)
wc = WordCloud(width=2000, height=2000, stopwords=sStopword).generate_from_frequencies(data)
plt.imshow(wc)
plt.axis("off")
plt.show()