# A PROGRAM TO ANALYSE THE LANGUAGE USED
# TO TALK ABOUT PREHISTORIC ROCK ART IN SCOTLAND
# ONLINE AT THE MEGALITHIC PORTAL
# - - - - - - - - - -
from finalModule import *
import pandas as pd
import caffeine  # presumably imported for its side effect of keeping the machine awake during long scrapes
import time
import sys
import csv
import os
# - - - - - - - - - -
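# NOTE: the helper functions used below come in via the star import from
# finalModule. For reference, this is the interface the script assumes,
# reconstructed from the call sites (a sketch, not the module itself):
#   stripNonAlphaNum(text)          -> str with non-alphanumeric characters removed
#   locateArticles(index_page_url)  -> iterable of BeautifulSoup tags, one per search result
#   extractData(article_url)        -> tuple of raw article data, or False
#   Top20(text, csv_writer, label)  -> writes the 20 most frequent words to csv_writer
#   polarityKey(dirname), subjectivityKey(dirname)  -> save colour-scale key images
#   polarityPrint(text, dirname, tag), subjectivityPrint(text, dirname, tag)
#                                   -> save fingerprint visualisations
#   polarityLoHi(text), subjectivityLoHi(text)
#                                   -> (lowest-scoring sentences, highest-scoring sentences)
# - - - - - - - - - -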
# CREATE NEW DIRECTORY TO STORE ALL SEARCH RESULTS
print('')
# Loop which allows the user to create a new directory or use an existing one
while True:
    print('Create new search directory.')
    # Ask the user to specify a name for the new directory
    search_name = input('Enter name for this search: ')
    # If it doesn't already exist, create the new directory and break out of the loop
    if not os.path.exists(search_name):
        os.makedirs(search_name)
        print('Directory created.')
        break
    else:
        # If the directory already exists, ask the user to confirm use (break) or decline (restart the loop)
        if input('Directory already exists, use this directory? [y/n] \n * Warning: this may overwrite existing files: ') == 'y':
            print('Using existing directory.')
            break
print('')
# Create an alphanumeric-only version of the search name for use in filenames
clean_name = stripNonAlphaNum(search_name)
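# For illustration, stripNonAlphaNum could be as simple as the following
# (a sketch of the assumed behaviour, not the finalModule implementation):
#   import re
#   def stripNonAlphaNum(text):
#       return re.sub(r'[^A-Za-z0-9]', '', text)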
# - - - - - - - - - -
# EXTRACT RAW DATA FROM EACH ARTICLE LISTED ON A SEARCH INDEX PAGE AND WRITE TO CSV
# Prepare a CSV file to store the raw data from each article
filename_raw = search_name + '/' + clean_name + '_raw_data'
raw_file = open(filename_raw + '.csv', 'w', newline='')  # newline='' avoids blank rows on Windows
f_raw = csv.writer(raw_file)
f_raw.writerow(['Site name', 'URL', 'Region', 'Country', 'Description text', 'Caption text', 'Comment text', 'All text'])
# Loop which allows the user to extract data from multiple search index pages
while True:
    # Ask the user to input a search index page URL
    if input('Get article data from search index page? [y/n]: ') == 'y':
        index_page_url = input('Enter search index page URL: ')
        print('Getting article data...')
        # Find the individual article URLs in the search index page
        articles = locateArticles(index_page_url)
        for i in articles:
            elem = i.next_element
            if elem.name == 'b':
                # Extract the individual article URL
                url = 'https://www.megalithic.co.uk/' + i.get('href')
                time.sleep(3)
                # Extract the raw data from the article
                data_tuple = extractData(url)
                # Only continue with articles which have the correct topic class
                if data_tuple is not False:
                    # Write all raw data to the CSV file
                    f_raw.writerow(data_tuple)
                # Throttle requests to the server with a delay of 3 seconds
                time.sleep(3)
        print('Article data stored.')
        print('')
    else:
        break
# Close the CSV so the data is flushed to disk before the analysis stage reads it back in
raw_file.close()
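# extractData is assumed to return one tuple per article, matching the header
# row written above: (site name, URL, region, country, description text,
# caption text, comment text, all text) -- or False when the article does not
# belong to the rock art topic class.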
# - - - - - - - - - -
# Loop which asks the user to proceed to analysis of the raw data, or close the program
while True:
    if input('Proceed with data analysis? [y/n]: ') == 'y':
        print('Data analysis in progress...')
        break
    else:
        print('Data analysis cancelled.')
        sys.exit()
# - - - - - - - - - -
# FIND THE TOP 20 MOST FREQUENT WORDS USED IN ALL TEXT,
# THE DESCRIPTION, THE IMAGE CAPTIONS, AND THE COMMENTS
# Prepare a CSV file to store the top 20 words, their frequency and length, and their text type (e.g. description, comment)
filename_t20 = search_name + '/' + clean_name + '_top_20_words'
t20_file = open(filename_t20 + '.csv', 'w', newline='')
f_t20 = csv.writer(t20_file)
f_t20.writerow(['Ranking', 'Keyword', 'Frequency', 'Length', 'Text type'])
# Read the four text-type columns back in from the raw data CSV
raw_csv = filename_raw + '.csv'
df = pd.read_csv(raw_csv, usecols=['Description text', 'Caption text', 'Comment text', 'All text'])
# Combine all rows of each text column into one string per text type
all_cat = df['All text'].str.cat(sep=' ')
des_cat = df['Description text'].str.cat(sep=' ')
cap_cat = df['Caption text'].str.cat(sep=' ')
com_cat = df['Comment text'].str.cat(sep=' ')
# Extract the top 20 words for each text type and write all results to one CSV file
Top20(all_cat, f_t20, 'all text')
Top20(des_cat, f_t20, 'description')
Top20(cap_cat, f_t20, 'caption')
Top20(com_cat, f_t20, 'comment')
t20_file.close()
# Confirm that the data has been extracted
print('Top 20 words extracted.')
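# A minimal sketch of the counting Top20 is assumed to perform (illustrative
# only; the real implementation lives in finalModule and likely also filters
# out stop words):
#   from collections import Counter
#   def top20(text, writer, text_type):
#       counts = Counter(text.lower().split())
#       for rank, (word, freq) in enumerate(counts.most_common(20), start=1):
#           writer.writerow([rank, word, freq, len(word), text_type])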
# - - - - - - - - - -
# FIND THE POLARITY FINGERPRINT OF THE TEXTUAL DATA
# Create a key image to show how the colour scale relates to polarity values
polarityKey(search_name)
# Create a polarity fingerprint visualisation for each text type
polarityPrint(all_cat, search_name, '_all_text')
polarityPrint(des_cat, search_name, '_des')
polarityPrint(cap_cat, search_name, '_cap')
polarityPrint(com_cat, search_name, '_com')
# Confirm that the visualisation has been created
print('Polarity fingerprint created.')
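# The polarity scores behind the fingerprint are assumed to come from TextBlob
# sentiment analysis, roughly along these lines (a sketch of the assumed
# scoring, with each sentence's score then mapped onto the colour scale):
#   from textblob import TextBlob
#   for sentence in TextBlob(all_cat).sentences:
#       score = sentence.sentiment.polarity  # -1.0 (negative) to 1.0 (positive)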
# - - - - - - - - - -
# FIND THE SUBJECTIVITY FINGERPRINT OF THE TEXTUAL DATA
# Create a key image to show how the colour scale relates to subjectivity values
subjectivityKey(search_name)
# Create a subjectivity fingerprint visualisation for each text type
subjectivityPrint(all_cat, search_name, '_all_text')
subjectivityPrint(des_cat, search_name, '_des')
subjectivityPrint(cap_cat, search_name, '_cap')
subjectivityPrint(com_cat, search_name, '_com')
# Confirm that the visualisation has been created
print('Subjectivity fingerprint created.')
print('')
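# If finalModule scores sentences with TextBlob, the subjectivity value is
# sentence.sentiment.subjectivity, which runs from 0.0 (fully objective)
# to 1.0 (fully subjective).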
# - - - - - - - - - -
# FIND THE SENTENCES WITH THE HIGHEST AND LOWEST VALUES
# FOR POLARITY AND SUBJECTIVITY
# Find the sentences with the lowest and highest polarity values
p_result = polarityLoHi(all_cat)
# Find the sentences with the lowest and highest subjectivity values
s_result = subjectivityLoHi(all_cat)
# Write each set of sentences to its own TXT file
for suffix, sentences in [('_most_neg', p_result[0]), ('_most_pos', p_result[1]),
                          ('_most_obj', s_result[0]), ('_most_sub', s_result[1])]:
    with open(search_name + '/' + clean_name + suffix + '.txt', 'w') as f:
        for sentence in sentences:
            f.write(str(sentence) + '\n\n')
print('Highest and lowest sentiment value sentences extracted.')
print('')
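# polarityLoHi and subjectivityLoHi are assumed to rank sentences by score and
# return the two extremes as a pair of lists, roughly like this sketch
# (the cutoff n is hypothetical):
#   def polarityLoHi_sketch(text, n=10):
#       ranked = sorted(TextBlob(text).sentences,
#                       key=lambda s: s.sentiment.polarity)
#       return ranked[:n], ranked[-n:]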
# - - - - - - - - - -
# CREATE A VISUALISATION OF THE TOP 20 WORDS DATA
# Prompt the user to run the separate visualisation program (it doesn't run properly if launched from within this program)
print('* Run visualisation program to complete analysis.')
print('')
# - - - - - - - - - -