forked from m4ll0k/BBTz
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfind-all-links.py
53 lines (43 loc) · 1.06 KB
/
find-all-links.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
#!/usr/bin/python
import requests
import json
import sys
# Command line: <script> [domain] [output]; the output file is optional.
# NOTE(review): the upper bound of 4 lets one extra, unused argument through
# even though the usage string only names two -- confirm whether intended.
if len(sys.argv) < 2 or len(sys.argv) > 4:
    print('%s [domain] [output]'%sys.argv[0])
    sys.exit(0)

# URLs gathered by the script; filled in place by archive().
all_links = []

# First argument: the target, given either as a bare host or as a full URL.
domain = sys.argv[1]
# Second argument (optional): path of the file the links are written to.
filename = sys.argv[2] if len(sys.argv) > 2 else None

# ``urlparse`` only exists as a top-level module on Python 2; on Python 3 the
# same API lives in ``urllib.parse``.  Bind it under the same module name so
# the script runs on both.
try:
    import urlparse
except ImportError:
    import urllib.parse as urlparse

parse = urlparse.urlparse(domain)
if parse.netloc:
    # A URL with a scheme was given: keep only its network location.
    domain = parse.netloc
elif parse.path != '':
    # No scheme: urlparse puts the whole input into ``path``.
    domain = parse.path
# Otherwise (no netloc and no path) ``domain`` is left exactly as given.
def archive():
    """Collect archived URLs for ``domain`` from the Wayback Machine CDX API.

    Requests the CDX search endpoint for ``*.<domain>/*`` (JSON output,
    collapsed by ``urlkey``), parses the response body as JSON and walks
    every field of every row.  A field is kept when it contains ``domain``
    and starts with ``http``; kept values are appended to the module-level
    ``all_links`` list, skipping any value already present.

    Reads the module-level ``domain`` and mutates ``all_links`` in place.
    Returns ``None``.  Exceptions raised by ``requests.get`` or
    ``json.loads`` are not caught here and propagate to the caller.
    """
    content = requests.get('http://web.archive.org/cdx/search/cdx?url=*.%s/*&output=json&collapse=urlkey'%domain).content
    rows = json.loads(content)

    # Membership is tracked in a set: testing ``x not in all_links`` on the
    # list itself is a linear scan per candidate, i.e. quadratic overall.
    # Seeding from ``all_links`` keeps entries added before this call unique.
    seen = set(all_links)
    for row in rows:
        for field in row:
            # The membership test comes last so it only ever sees values that
            # already passed the string checks.
            if domain in field and field.startswith('http') and field not in seen:
                seen.add(field)
                all_links.append(field)
##
## http://index.commoncrawl.org/CC-MAIN-2018-22-index?url=*.rezserver.com/*&output=json
##
# Populate ``all_links`` from the web archive before emitting anything.
archive()

if filename is not None:
    # An output path was given: write one link per line.  The ``with`` block
    # closes the file even if a write fails part-way through.
    with open(filename, 'w+') as out:
        for link in all_links:
            # Remove the '\u3000' sequence and any remaining backslash-u
            # text, then drop every non-ASCII character so the output file
            # is plain ASCII.
            link = link.replace('\u3000','')
            link = link.replace(r'\u','')
            out.write('%s\n'%link.encode('ascii', 'ignore').decode('ascii'))
else:
    # No output file: print each link to stdout unmodified.
    for link in all_links:
        print(link)