-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathcrawl.py
57 lines (43 loc) · 1.39 KB
/
crawl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
"""Async/await crawler."""
import argparse
import asyncio
import logging
import sys
from crawler import Crawler
def _build_arg_parser():
    """Create the command-line parser used by this script.

    Options:
        root: positional root URL.
        -v / --verbose: counted flag stored as ``level``; starts at 2 and
            goes up by one per repetition.
        -o / --out: sitemap output file, stored as ``out``.
    """
    parser = argparse.ArgumentParser(description="async/await crawler")
    parser.add_argument('root', help='Root URL')
    parser.add_argument(
        '-v', '--verbose',
        dest='level',
        action='count',
        default=2,
        help='Verbose logging (repeat for more verbose)',
    )
    parser.add_argument(
        '-o', '--out',
        dest='out',
        default='sitemap.html',
        help='Sitemap output file',
    )
    return parser


# Module-level parser instance; main() calls ARGS.parse_args().
ARGS = _build_arg_parser()
def fix_url(url):
    """Prefix a scheme-less URL with http://.

    A URL counts as having a scheme only when it *starts* with one
    (a letter followed by letters, digits, '+', '.' or '-', then '://').
    A '://' that appears later, e.g. inside a query string such as
    'example.com/?next=http://other.org', does not count, so such URLs
    still get the 'http://' prefix.

    Args:
        url: URL string as typed by the user.

    Returns:
        The URL unchanged if it already begins with a scheme, otherwise
        the URL with 'http://' prepended.
    """
    # Local import keeps this block self-contained; the module itself does
    # not import re.
    import re

    if re.match(r'[A-Za-z][A-Za-z0-9+.-]*://', url) is None:
        url = 'http://' + url
    return url
def main():
    """Parse the command line, crawl from the root URL, and report.

    Configures logging from the -v count, builds a Crawler on a freshly
    created event loop, runs ``crawler.crawl()`` to completion, then calls
    ``crawler.report()`` and releases the crawler and the loop. Ctrl-C
    during the crawl is caught so the report is still produced.
    """
    args = ARGS.parse_args()
    if not args.root:
        print('Use --help for command line help')
        return

    # Pick the logging level by index, clamping so that extra -v flags
    # beyond the end of the list still select the most verbose level.
    levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG]
    logging.basicConfig(level=levels[min(args.level, len(levels) - 1)])

    # Create the loop explicitly: relying on asyncio.get_event_loop() to
    # create one implicitly is deprecated and fails on recent Python.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        root = fix_url(args.root)
        crawler = Crawler(root=root, loop=loop, out=args.out)
        try:
            loop.run_until_complete(crawler.crawl())
        except KeyboardInterrupt:
            sys.stderr.flush()
            print('\nInterrupted\n')
        finally:
            try:
                crawler.report()
            finally:
                # Runs even if report() raised, so the crawler is always
                # closed.
                crawler.close()
                # next two lines are required for actual aiohttp resource
                # cleanup
                # TODO(Nour): Check!
                loop.stop()
                loop.run_forever()
    finally:
        # Always close the loop, whatever happened above.
        loop.close()
# Script entry point: run main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()