-
Notifications
You must be signed in to change notification settings - Fork 1
/
DD.py
219 lines (168 loc) · 7.34 KB
/
DD.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
import os, sys
import requests
from urllib.parse import unquote
from bs4 import BeautifulSoup
import re
import argparse
# Module-level configuration; mutated by url_from_argv() and main().
out_dir = '.' #out dir to write files to
# 'today' -> download only the newest darshan page; 'all' -> every page found.
date = 'today'
force_dl = False #doesn't Re-Download Existing
def main():
    """Entry point: resolve the target URL, locate darshan pages, download their images."""
    global out_dir
    url = url_from_argv()
    print('\n[*]Starting at ',url)
    page_urls = findDarshanURL(url,date,force_dl)
    for page_url in page_urls:
        # The last non-empty path segment names the output directory
        # (trailing '/' in the URL makes the last segment empty).
        segments = page_url.split('/')
        out_dir = segments[-1] or segments[-2]
        photo_urls = extract_DarshanImages(page_url)
        print('[*]{} Images URLs to download: \n{}'.format(len(photo_urls),photo_urls))
        saveImg(photo_urls)
    print('\nYS. Hari Bol! :) ')
    openFolder(out_dir)  # show the user where the files landed
def url_from_argv():
    """Build the darshan category/page URL from command-line arguments.

    Side effects: sets the module globals ``date`` (when --all is given)
    and ``force_dl`` (when --force is given).  Falls back to prompting
    interactively for a temple name when neither --temple nor --url is
    supplied.

    Returns:
        str: URL of the darshan page to scrape.
    """
    global date, force_dl
    url = ''  # stays empty until an argument (or the prompt) fills it in
    parser = argparse.ArgumentParser()
    parser.add_argument('--temple','-t','--Temple','--iskcon','--ISKCON','-i', nargs='*', help='Name of temple like Mayapur')
    parser.add_argument('--url','-u','--URL', help='Explicitly the URL page of Darshan')
    parser.add_argument('--page','-p','--Page', help='Previous pages to dowload')
    parser.add_argument('--all','-a','--All', action='store_true', help='If all pages are to be downloaded')
    parser.add_argument('--page','-p','--Page', help='Previous pages to dowload') if False else None  # placeholder removed
    parser.add_argument('--force','-f', action='store_true', help='To re-download even if already Downloaded in Past!')
    # parse_known_args tolerates extra/unknown flags instead of erroring out
    args,unknown_args = parser.parse_known_args()
    print('[*]Using Args: ',args)
    if args.all:
        date = 'all' #default is today as global
    if args.force:
        force_dl = True
    if args.temple:
        Temple = ' '.join(args.temple) #nargs='*' yields a list; join to one name
        Temple = Temple.lower().replace(' ','-')
        url = 'https://darshan.iskcondesiretree.com/category/iskcon-{}/'.format(Temple)
    elif args.url:
        url = args.url  # BUGFIX: was `arg.url`, which raised NameError
    if args.page:
        url = url + '/page/{}'.format(args.page)
    if not url:
        Temple = input("Which Temple?\n")
        Temple = Temple.lower().replace(' ','-')
        url = 'https://darshan.iskcondesiretree.com/category/iskcon-{}/'.format(Temple)
    return url
def extract_DarshanImages(url):
    """Fetch one darshan page and return the .jpg image URLs found on it.

    Args:
        url (str): URL of a single darshan page.

    Returns:
        list[str]: de-duplicated image hrefs, in first-seen page order.
    """
    print('[*]Downloading from ',url)
    page_x = requests.get(url)
    soup = BeautifulSoup(page_x.content, 'lxml')
    # Anchors whose href contains .jpg (case-insensitive) are the photos.
    img_urls = soup.find_all("a",href=re.compile(r"\.jpg",re.IGNORECASE))
    dPhotos = [img_url.get("href") for img_url in img_urls]
    # BUGFIX: the original computed list(set(dPhotos)) but discarded the
    # result, so duplicates were never removed.  dict.fromkeys de-duplicates
    # while preserving the order the links appear on the page.
    dPhotos = list(dict.fromkeys(dPhotos))
    return dPhotos
def findDarshanURL(url,date='today',force_dl=False):
    """Scan a category page and return the darshan-page URLs to download.

    Args:
        url (str): category page URL to scan for dated darshan links.
        date (str): 'today' -> only the newest page; 'all' -> every page found.
        force_dl (bool): when True, include pages whose folder already exists.

    Returns:
        list[str]: darshan page URLs (empty when everything already exists
        or no dated links were found).
    """
    D_page = requests.get(url)
    soup = BeautifulSoup(D_page.content, 'lxml')
    title = soup.title.text
    # Dated darshan pages look like .../<name>-<day>-<month>-<year>
    all_links = soup.find_all("a",href=re.compile(r"https://darshan.iskcondesiretree.com/.*?[\d]+-[\d\w]+-[\d]+"))
    exists_flag = False
    if not force_dl:
        exists_flag, all_links = checkIfAlreadyDownloaded(all_links,date) # to avoid Re-Downloading
    dURL = []  # BUGFIX: initialize; the original left dURL unbound on the
               # fall-through path and raised UnboundLocalError at return
    if force_dl or not exists_flag:
        # BUGFIX: compare strings with == — `is` tests object identity and is
        # not guaranteed to hold for equal strings.
        if date == 'today':
            if all_links:  # BUGFIX: guard against IndexError on an empty page
                dURL = [all_links[0].get("href")] #list just for uniformity with 'all'
                print('[*]Going after {}'.format(dURL))
        elif date == 'all':
            dURL = list(set(link.get("href") for link in all_links))
            print('[*]Too many pages to download... ',dURL)
        # any other `date` value falls through and returns []
    else:
        print('\n[-]Already Exists! Quitting. Use --force flag to Force Download')
    return dURL
def checkIfAlreadyDownloaded(all_links,date='today'):
    """Filter out links whose dated folder already exists on disk.

    Args:
        all_links: iterable of link objects (or strings); each is matched via
            str() against the dated-URL pattern to extract the folder name.
        date (str): 'today' stops at the first already-downloaded link, since
            only the newest page matters in that mode.

    Returns:
        list: ``[already_downloaded, new_links]`` where already_downloaded is
        True when no new links remain.
    """
    new_links_list = []
    for link in all_links:
        # str(link): iterating a bs4 result list can yield NavigableString
        # objects rather than Tags, so match against the string form.
        date_info = re.search(r"https://darshan.iskcondesiretree.com/(.*?[\d]+-[\d\w]+-[\d]+)",str(link))
        if date_info is None:
            continue  # BUGFIX: an unmatched link used to raise TypeError below
        if not os.path.isdir(date_info[1]):
            new_links_list.append(link)
        else:
            print('[-]Exists: {} SKIPPING!'.format(date_info[1]))
            # BUGFIX: == instead of `is` for string comparison.
            if date == 'today':
                # For 'today' only the 1st link matters; without the break an
                # older page would become the "first" link when the newest
                # already exists.
                break
    already_downloaded = not new_links_list
    return [already_downloaded, new_links_list]
def saveImg(imgURLs):
    """Download each image URL into the global out_dir, one file per URL.

    File names come from the last URL path segment, unquoted twice (the site
    double-encodes them) with '+' restored to spaces.
    """
    print('[*]It looks like our Prayer has been granted!')
    # Hoisted out of the loop: the target directory only needs creating once.
    os.makedirs(out_dir, exist_ok=True)
    for url in imgURLs:
        r = requests.get(url)
        try:
            file_name = unquote(unquote(url.split('/')[-1])).replace('+',' ')
            with open(out_dir + '/' + file_name,'wb') as f:
                f.write(r.content)
            print('[+]Downloaded: ',url)
        except OSError:
            # BUGFIX: was a bare `except:` that swallowed everything,
            # including KeyboardInterrupt; only file-system errors belong here.
            print('[-]Failed to write file.\n')
    print('\n')
def openFolder(out_dir):
    """Open the downloaded folder in the platform's file manager (best effort)."""
    # BUGFIX: the original test `'win' in sys.platform` also matches 'darwin'
    # (macOS), where os.startfile does not exist; check the prefix instead.
    # NOTE(review): out_dir is interpolated into a shell command — paths with
    # shell metacharacters would misbehave; acceptable for this script's use.
    if sys.platform.startswith('win'):
        os.startfile(out_dir)
    elif sys.platform == 'darwin':
        os.system('open {} &> /dev/null &'.format(out_dir))
    else:
        os.system('nautilus {} &> /dev/null &'.format(out_dir)) #to supress the WARNINGS
def url_from_argv_OLD():
    """Legacy hand-rolled sys.argv parser; superseded by url_from_argv()."""
    global date
    url = u'https://darshan.iskcondesiretree.com/category/iskcon-kolkata/' #default url
    argv = sys.argv
    if len(argv) < 2:
        return url  # no arguments: keep the default URL
    flag = argv[1]
    if flag == '--all':
        date = 'all'
        try:
            if argv[2] in ('--url','-url','-u','-t','--temple'):
                url = argv[3]
                if '.com' not in url: # for just temple name given as --url
                    url = 'https://darshan.iskcondesiretree.com/category/iskcon-{}/'.format(url)
                    print('[-]Doesn\'t look like a URL: using ',url)
        except:
            # argv[2]/argv[3] missing -> IndexError; keep the default
            print('[-]NO definite URL. Using defualt ...',url)
    elif flag in ('--url','-u'):
        url = argv[2]
    elif flag in ('--iskcon','--temple'):
        try:
            temple = argv[2]
        except:
            temple = input("Which Temple?\n")
        url = 'https://darshan.iskcondesiretree.com/category/iskcon-{}/'.format(temple.lower().replace(' ','-'))
    return url
# Run only when executed as a script, not on import.
if __name__ == '__main__':
    main()