-
Notifications
You must be signed in to change notification settings - Fork 0
/
utility.py
116 lines (101 loc) · 2.74 KB
/
utility.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
inv_index = dict()
allTweets = []
# DEBUGGING FUNCTION: Reads all the tweets held in the file
def readtweets(filename):
print("***********READING TWEETS***********")
print("------------------------------------")
tweetnum = 1
file = open(filename, "r")
line = file.readline()
while line:
print("[Tweet " + str(tweetnum) + "]", line)
line = file.readline()
tweetnum += 1
file.close()
return
# Checks whether the file path is valid
def validFile(filename):
try:
file = open(filename, "r")
line = file.readline()
except:
return False
return True
# Stores tweets in an array
def storeTweets(filename):
file = open(filename, "r")
line = file.readline()
while line:
allTweets.append(line)
line = file.readline()
allTweets[len(allTweets)-1] += "\n"
file.close()
print("Stored " + str(len(allTweets)) + " tweets.")
return
# Build inverted index from file (tweet number, word placement)
def buildIndex(filename):
tweetnum = 0
file = open(filename, "r")
line = file.readline().split()
while line:
tweetnum += 1
for i in range(0, len(line)):
word = line[i].lower()
if word not in inv_index:
inv_index[word] = [(tweetnum, i)]
else:
inv_index[word].append((tweetnum, i))
line = file.readline().split()
file.close()
return
# Finds & Prints tweets that at least matches one word in pattern
def scoreTweets(inv_index, pattern):
# Prevents from reprinting tweets
numMatches = 0
wordMatches = [0]*len(allTweets)
for word in pattern.split(" "):
if word in inv_index:
tweet_list = inv_index[word]
else:
continue
for pair in tweet_list:
index = pair[0]-1
if wordMatches[index] == 0:
numMatches += 1
wordMatches[index] = 1
else:
wordMatches[index] += 1
return wordMatches, numMatches
def positionScore(inv_index, pattern):
return
def printRelevant(wordMatches, numMatches):
# Find the indicies of top 10 values
if numMatches == 0:
print("There are no relevant tweets.")
return
sortedMatches = sorted(range(len(wordMatches)), key=lambda x: wordMatches[x])[::-1]
printNum = 10
if numMatches < printNum:
printNum = numMatches
print("There are", printNum, "Relevant Tweets")
print("---------------------------------------")
for i in range(printNum):
tweetIdx = sortedMatches[i]
print("[Tweet " + str(tweetIdx+1) + "]:", end=" ")
print(allTweets[tweetIdx])
def derestrict(pattern):
newpattern = ""
for word in pattern.split(" "):
newpattern += (word + " ")
newpattern += (word +"s ")
newpattern += (word + "ing ")
newpattern += (word + "ed ")
return newpattern
def getTweets(filename):
storeTweets(filename)
buildIndex(filename)
query = input("Enter your query: ")
query = derestrict(query)
result = scoreTweets(inv_index, query)
printRelevant(result[0], result[1])
return