forked from yoconana/Information-Retrieval
-
Notifications
You must be signed in to change notification settings - Fork 0
/
outputQuery.py
46 lines (33 loc) · 1.03 KB
/
outputQuery.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# Xi Chen
# 3/16/2016
# Prompt the user for a query, then tokenize it, remove stop words, and stem the remaining tokens.
from lxml import html
from lxml.html.clean import clean_html
import string
import os
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import os
import codecs # otherwise, it has ascii encoding error
import preprocess
# Read the query text from the user and hand it straight to the project's
# preprocess.cleanquery helper (presumably normalises the text -- see that
# module for the exact behaviour). This runs at import time.
query = preprocess.cleanquery(raw_input('Enter your query:'))
def outputStringQuery(query):
    """Tokenize a query, drop English stop words and stem what is left.

    The text is split with NLTK's ``word_tokenize``; tokens that appear in
    NLTK's English stop-word list are discarded; every remaining token is
    reduced with a ``PorterStemmer`` and converted with ``str``.

    Args:
        query: The query text to process.

    Returns:
        The stemmed tokens joined by single spaces. An empty string is
        returned when no token survives filtering.

    Tokens whose stem cannot be converted with ``str`` are printed and left
    out of the result rather than aborting the whole query.
    """
    ps = PorterStemmer()
    stop_words = set(stopwords.words("english"))

    stemmed_words = []
    for w in word_tokenize(query):
        # Membership is an exact string match; no case folding happens here.
        if w in stop_words:
            continue
        try:
            stemmed_words.append(str(ps.stem(w)))
        except UnicodeError:
            # UnicodeError is the common parent of UnicodeDecodeError and
            # UnicodeEncodeError. Under Python 2, str() on a non-ASCII
            # unicode stem raises the *encode* variant, which the narrower
            # UnicodeDecodeError handler would let escape.
            print(w)
    return ' '.join(stemmed_words)
# Replace the cleaned query with its tokenized, stop-word-filtered, stemmed
# form; the module-level name `query` now holds the final processed string.
query = outputStringQuery(query)