-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathreader.py
251 lines (210 loc) · 10.5 KB
/
reader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
# Creating a program to scrape .txt files of Reddit pages for usernames of commenters
# Importing os module to be able to iterate through files in the `txt-files` directory
import copy
import os
path = "/Users/syrrgordon/Desktop/development/CSCI/Spring-Break-Lab/txt-files/"
# Every line that was identified as a username, duplicates included
usernames = []
# Usernames with duplicates removed
unique_users = []
# Markers showing that the line after a "level" line is not a real username
restricted_words = [
    'AutoModerator', '[deleted]', 'Comment deleted by user',
    'Comment removed by moderator', ' ', 'https://www.',
]
# Final, cleaned-up version of the usernames
clean_usernames = []
# Maps each file path to the list of usernames found in that file
dates_and_names = {}
# Maps each file path to the list of flagged comments found in that file
dates_and_comments = {}
# Using `os.listdir()` to list the names of the entries in the directory
dir_list = os.listdir(path)
# Full paths of the files to scan. Only .txt entries are kept so that stray
# entries (hidden files, sub-directories) are never opened as text, and the
# names are sorted because `os.listdir()` returns them in arbitrary order.
dir_list_with_path = [
    os.path.join(path, file)
    for file in sorted(dir_list)
    if file.endswith('.txt')
]
# Opening every file and collecting the commenter usernames it contains
for file in dir_list_with_path:
    # Usernames found in this file only
    files_usernames = []
    # `with` closes the file even if reading it raises an error
    with open(file, 'r') as opened_file:
        # Turning each line of the file into an element of a list
        lines = opened_file.readlines()
    # Every username in the file is preceded by a line holding the user's
    # level, so each line is paired with the line that follows it.
    # Pairing by position (instead of `lines.index(line)`) matters because
    # identical "level" lines repeat, and `index` only ever finds the first.
    # The last line has no follower and is therefore never treated as a level.
    for line, next_line in zip(lines, lines[1:]):
        if 'level' not in line:
            continue
        # A real username contains none of the restricted words and is longer
        # than two characters (the trailing newline is included in the count)
        has_restricted_word = any(word in next_line for word in restricted_words)
        if not has_restricted_word and len(next_line) > 2:
            usernames.append(next_line)
            # Adding the username to the list for this specific file
            files_usernames.append(next_line)
    # Recording this file's usernames under the file's path
    dates_and_names[file] = files_usernames
# Counting how many comments each user made in a single pass over `usernames`.
# The dictionary keeps the usernames in the order they were first seen.
users_comments_frequency = {}
for name in usernames:
    users_comments_frequency[name] = users_comments_frequency.get(name, 0) + 1
# The dictionary keys are exactly the usernames with duplicates removed
unique_users = list(users_comments_frequency)
# Removing the newline characters from the usernames and sorting them
clean_usernames = sorted(user.strip('\n') for user in unique_users)
# Saving the number of unique/original commenters
number_of_commenters = len(clean_usernames)
num_unique_commenters = "Number of unique commenters: " + str(number_of_commenters)
# For each user who published a post, keep the user names of those who
# published 2 or more posts (newline characters stripped)
users_with_multiple_posts = [
    name.strip('\n')
    for name, count in users_comments_frequency.items()
    if count >= 2
]
num_repeat_commenters = "The following users published two or more comments: " + str(users_with_multiple_posts)
# Print out the posts that mention the following symptoms: (i) cough, (ii) cold, (iii) fever
# The following lines gather the actual comments
# Words that cause a comment to be flagged
flag_words = ['cough', 'cold', 'fever']
# Every flagged comment from every file, duplicates included
comments = []
# Opening all the files (again)
for file in dir_list_with_path:
    # `with` closes the file even if reading it raises an error
    with open(file, 'r') as opened_file:
        # Turning each line of the file into an element of a list
        lines = opened_file.readlines()
    # Flagged comments specific to this file. A line is flagged once, no
    # matter how many flag words it contains. The comparison is done on the
    # lower-cased line so that "Fever" or "COUGH" are caught as well.
    # The line must also contain a space — presumably to skip single-word
    # lines such as usernames; confirm against the input files.
    files_comments = [
        line
        for line in lines
        if ' ' in line and any(flag in line.lower() for flag in flag_words)
    ]
    comments.extend(files_comments)
    # Recording this file's flagged comments under the file's path
    dates_and_comments[file] = files_comments
# Flagged comments with duplicates removed, in first-seen order
unique_comments = list(dict.fromkeys(comments))
# TODO: re-enable the three print lines below to display the flagged comments
# print("Comments that mention fevers, colds, or coughs: ")
# for comment in comments:
#     print(comment)
# Over the course of the week, was there an increase in the number of users who published posts?
# One list of unique usernames per file, in the same order as `dates_and_names`
# (`dict.fromkeys` drops duplicates while keeping first-seen order)
final_unique_users = [
    list(dict.fromkeys(names)) for names in dates_and_names.values()
]
# The last eight characters of each file path are used as that file's date
# NOTE(review): assumes the file names end in a sortable 8-character date
# tag (extension included) — confirm against the actual file names
dates = [file_path[-8:] for file_path in dates_and_names]
# Merging the dates and the per-file username lists into one dictionary
# with dates as the keys and that day's unique commenters as the values
dates_and_names_clean = dict(zip(dates, final_unique_users))
# Sorting the dictionary keys by date order
myKeys = sorted(dates_and_names_clean)
sorted_dict = {day: dates_and_names_clean[day] for day in myKeys}
# The number of unique posters for each day, earliest key first
frequency = [len(names) for names in sorted_dict.values()]
# Checking to see if there was an increase in the number of users who published posts over the course of the week
# Comparing the first value in the frequency list to the last value
def user_frequency(counts=None):
    """Describe how the number of commenters changed from the first day to the last.

    Args:
        counts: Per-day commenter counts ordered from the first day to the
            last. Defaults to the module-level `frequency` list.

    Returns:
        A sentence saying whether the count increased, decreased, or did
        not change between the first and the last entry.

    Raises:
        IndexError: If `counts` is empty.
    """
    if counts is None:
        counts = frequency
    first_day, last_day = counts[0], counts[-1]
    # The counts run from the earliest day to the latest, so a larger final
    # value means the number of users went up
    if last_day > first_day:
        return 'There was an increase in the number of users who published comments over the course of a week.'
    if last_day < first_day:
        return 'There was a decrease in the number of users who published comments over the course of a week.'
    return 'There was no change in the number of users who published comments over the course of a week.'
# Was there an increase in the number of posts that mentioned the following symptoms: (i) cough, (ii) cold, (iii) fever?
# One list of unique flagged comments per file, in the same order as
# `dates_and_comments` (`dict.fromkeys` drops duplicates, keeps first-seen order)
final_unique_comments = [
    list(dict.fromkeys(flagged)) for flagged in dates_and_comments.values()
]
# The last eight characters of each file path are used as that file's date
# NOTE(review): assumes the file names end in a sortable 8-character date
# tag (extension included) — confirm against the actual file names
dates = [file_path[-8:] for file_path in dates_and_comments]
# Merging the dates and the per-file comment lists into one dictionary
# with dates as the keys and that day's unique flagged comments as the values
dates_and_comments_clean = dict(zip(dates, final_unique_comments))
# Sorting the dictionary keys by date order
myKeys_2 = sorted(dates_and_comments_clean)
sorted_dict_2 = {day: dates_and_comments_clean[day] for day in myKeys_2}
# The number of unique comments that mention symptoms for each day, earliest key first
comment_frequency = [len(flagged) for flagged in sorted_dict_2.values()]
# Checking to see if there was an increase in the number of comments that mentioned the symptoms
# Comparing the first value in the frequency list to the last value
def comment_freq(counts=None):
    """Describe how the number of symptom-related posts changed from the first day to the last.

    Args:
        counts: Per-day counts of flagged comments ordered from the first
            day to the last. Defaults to the module-level
            `comment_frequency` list.

    Returns:
        A sentence saying whether the count increased, decreased, or did
        not change between the first and the last entry.

    Raises:
        IndexError: If `counts` is empty.
    """
    if counts is None:
        counts = comment_frequency
    first_day, last_day = counts[0], counts[-1]
    # The counts run from the earliest day to the latest, so a larger final
    # value means the number of flagged posts went up
    if last_day > first_day:
        return 'There was an increase in the number of posts that mentioned the following symptoms: cough, cold, or fever over the course of a week.'
    if last_day < first_day:
        return 'There was a decrease in the number of posts that mentioned the following symptoms: cough, cold, or fever over the course of a week.'
    return "There was no change in the number of posts that mentioned the following symptoms: cough, cold, or fever over the course of a week."
# Writing all the requested information to a .txt file to make reading it easier
# NOTE(review): mode "a" appends, so every run adds another copy of the report
# to info.txt and the print below shows all earlier runs too — switch to "w"
# if only the latest report is wanted
with open("info.txt", "a") as f:
    f.write(num_unique_commenters + '\n')
    f.write(num_repeat_commenters + '\n')
    f.write(user_frequency() + '\n')
    f.write(comment_freq() + '\n')
# Reading the file back and printing its full contents; `with` makes sure the
# handle is closed afterwards
with open("info.txt", 'r') as f:
    print(f.read())