#!/usr/bin/env python3
import argparse
import os
import signal
import subprocess
import threading
import time
from urllib.parse import quote_plus, urlparse
#####################################################################################
# Argument inputs:
#
# input_file: A list of search entries, one per line
# context: An additional search term appended to every query
# domain: The target site's domain, used to filter viable search results
# engine: The search engine to query
# sleep: The number of seconds to pause after each chunk of 10 entries
#
#####################################################################################
argument_parser = argparse.ArgumentParser(description='Compile profile URLs for a list of UFC fighters')
argument_parser.add_argument('input_file', type=argparse.FileType('r'))
argument_parser.add_argument('-c', '--context', default='')
argument_parser.add_argument('-d', '--domain', default='sherdog.com')
argument_parser.add_argument('-e', '--engine', default='https://duckduckgo.com')
argument_parser.add_argument('-s', '--sleep', type=int, default=30)
args = argument_parser.parse_args()
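# A hypothetical example invocation (fighters.txt is assumed to contain one
# search entry per line; it is not shipped with this script):
#
#   ./compile_urls.py fighters.txt --context "ufc" --domain sherdog.com --sleep 60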
def get_output_files():
    # Create a timestamped log directory next to this script and open the
    # output/error files inside it.
    cwd = os.path.dirname(os.path.abspath(__file__))
    output_directory = "{}/logs/{}".format(cwd.rstrip('/'), int(time.time()))
    print("Creating logs output directory {}".format(output_directory))
    os.makedirs(output_directory, exist_ok=True)
    output_file = open("{}/output.txt".format(output_directory), 'w')
    error_file = open("{}/error.txt".format(output_directory), 'w')
    return output_file, error_file
def kill_process(pid):
    # Hard-kill a hung subprocess; tolerate the race where it already exited.
    try:
        os.kill(pid, signal.SIGKILL)
        os.waitpid(-1, os.WNOHANG)
        print("Process {0} killed".format(pid))
    except (ProcessLookupError, ChildProcessError):
        pass
###########################################
# #
######## Main processing routine ##########
# #
###########################################
output_file, error_file = get_output_files()
# Iterate over the list of UFC fighters from the input file
processed = 0
for fighter in args.input_file:
    fighter = fighter.strip()
    if not fighter:
        continue
    print("Searching for link to {}...".format(fighter))
    # Approach derived from the example at:
    # http://blog.quibb.org/2010/11/crawling-the-web-with-lynx/
    query = "{} {} {}".format(args.domain, fighter, args.context)
    url = "{0}?q={1}".format(args.engine, quote_plus(query))
    # Pass the command as an argument list so fighter names containing shell
    # metacharacters cannot break the invocation.
    process = subprocess.Popen(['lynx', '-dump', '-nolist', '-notitle', url],
                               stdout=subprocess.PIPE)
    # Watchdog: kill lynx if it has not finished within 60 seconds.
    timer = threading.Timer(60.0, kill_process, args=[process.pid])
    timer.start()
    results = process.stdout.read()
    timer.cancel()
    results = results.decode('utf-8', 'replace')
    found_link = False
    for line in results.splitlines():
        print(line)
        try:
            # Prefix '//' so urlparse treats the line as a network location
            # and populates netloc.
            parsed_url = urlparse('//' + line.strip())
            if parsed_url.netloc == args.domain:
                output_file.write("{0},{1}\n".format(
                    fighter,
                    parsed_url.netloc + parsed_url.path
                ))
                found_link = True
                break
        except ValueError:
            continue
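    # For reference, a minimal sketch of the netloc trick above, assuming a
    # hypothetical result line "sherdog.com/fighter/Jon-Jones-27944":
    #
    #   >>> urlparse('//' + 'sherdog.com/fighter/Jon-Jones-27944').netloc
    #   'sherdog.com'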
    if not found_link:
        print("Unable to find link to {0}".format(fighter))
        output_file.write("{0},\n".format(fighter))
        error_file.write(fighter + "\n")
    processed += 1
    # Pause periodically to avoid hammering the search engine.
    if processed % 10 == 0:
        print('Delaying for {0} seconds'.format(args.sleep))
        time.sleep(args.sleep)

output_file.close()
error_file.close()