 # -*- coding: utf-8 -*-"
 # vim: set expandtab tabstop=4 shiftwidth=4:
 """
-$Id$
+This file is part of the XSSer project, https://xsser.03c8.net
 
-This file is part of the xsser project, http://xsser.03c8.net
-
-Copyright (c) 2011/2016 psy <[email protected]>
+Copyright (c) 2010/2019 | psy <[email protected]>
 
 xsser is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free
@@ -40,14 +38,10 @@ class EmergencyLanding(Exception):
 class Crawler(object):
     """
     Crawler class.
-
-    Crawls a webpage looking for url arguments.
-    Dont call from several threads! You should create a new one
-    for every thread.
     """
     def __init__(self, parent, curlwrapper=None, crawled=None, pool=None):
         # verbose: 0-no printing, 1-prints dots, 2-prints full output
-        self.verbose = 1
+        self.verbose = 0
         self._parent = parent
         self._to_crawl = []
         self._parse_external = True
@@ -81,7 +75,10 @@ def _find_args(self, url):
         find parameters in given url.
         """
         parsed = urllib2.urlparse.urlparse(url)
-        qs = urlparse.parse_qs(parsed.query)
+        if "C=" in parsed.query and "O=" in parsed.query:
+            qs = ""
+        else:
+            qs = urlparse.parse_qs(parsed.query)
         if parsed.scheme:
             path = parsed.scheme + "://" + parsed.netloc + parsed.path
         else:
@@ -92,6 +89,14 @@ def _find_args(self, url):
                 if not zipped or not path in zipped[0]:
                     self._found_args[key].append([path, url])
                     self.generate_result(arg_name, path, url)
+        if not qs:
+            parsed = urllib2.urlparse.urlparse(url)
+            if path.endswith("/"):
+                attack_url = path + "XSS"
+            else:
+                attack_url = path + "/XSS"
+            if not attack_url in self._parent.crawled_urls:
+                self._parent.crawled_urls.append(attack_url)
         ncurrent = sum(map(lambda s: len(s), self._found_args.values()))
         if ncurrent >= self._max:
             self._armed = False
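The new branches in _find_args above skip Apache-style directory-listing links, whose query strings only carry the C=/O= sort arguments, and append an "XSS" marker to the path of parameter-less URLs so those pages still get probed. Below is a minimal sketch of that decision logic, assuming Python 2 and only the stdlib urlparse module; the candidate() helper and the sample URLs are illustrative, not part of XSSer:

import urlparse

def candidate(url):
    parsed = urlparse.urlparse(url)
    if "C=" in parsed.query and "O=" in parsed.query:
        return None  # directory-listing sort links, nothing to inject
    if not parsed.query:
        path = parsed.scheme + "://" + parsed.netloc + parsed.path
        return path + "XSS" if path.endswith("/") else path + "/XSS"
    return urlparse.parse_qs(parsed.query)  # real GET parameters to fuzz

print(candidate("http://host/icons/?C=N;O=D"))  # None
print(candidate("http://host/blog/"))           # http://host/blog/XSS
print(candidate("http://host/item?id=1"))       # {'id': ['1']}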
@@ -121,6 +126,7 @@ def crawl(self, path, depth=3, width=0, local_only=True):
         attack_urls = []
         if not self._parent._landing and self._armed:
             self._crawl(basepath, path, depth, width)
+            # now parse all found items
         if self._ownpool:
             self.pool.dismissWorkers(len(self.pool.workers))
             self.pool.joinAllDismissedWorkers()
@@ -138,7 +144,7 @@ def generate_result(self, arg_name, path, url):
         for key, val in qs.iteritems():
             qs_joint[key] = val[0]
         attack_qs = dict(qs_joint)
-        attack_qs[arg_name] = "VECTOR"
+        attack_qs[arg_name] = "XSS"
         attack_url = path + '?' + urllib.urlencode(attack_qs)
         if not attack_url in self._parent.crawled_urls:
             self._parent.crawled_urls.append(attack_url)
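For URLs that do carry parameters, generate_result (the hunk above) rebuilds the query string with the injection placeholder, now "XSS" rather than "VECTOR", substituted for one argument at a time. A rough stand-alone sketch under the same Python 2 assumptions; build_attack_urls() is a hypothetical helper, not XSSer's API:

import urllib
import urlparse

def build_attack_urls(url):
    parsed = urlparse.urlparse(url)
    path = parsed.scheme + "://" + parsed.netloc + parsed.path
    qs = urlparse.parse_qs(parsed.query)
    flat = dict((k, v[0]) for k, v in qs.iteritems())  # keep the first value of each argument
    for arg_name in flat:
        attack_qs = dict(flat)
        attack_qs[arg_name] = "XSS"  # placeholder later swapped for real payloads
        yield path + "?" + urllib.urlencode(attack_qs)

for attack_url in build_attack_urls("http://host/view?page=2&id=7"):
    print(attack_url)  # e.g. http://host/view?id=7&page=XSS (parameter order may vary)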
@@ -178,37 +184,35 @@ def _curl_main(self, pars):
         self._get_done(basepath, depth, width, path, res, c_info)
 
     def _get_error(self, request, error):
-        try:
-            path, depth, width, basepath = request.args[0]
-            e_type, e_value, e_tb = error
-            if e_type == pycurl.error:
-                errno, message = e_value.args
-                if errno == 28:
-                    print("requests pyerror -1")
-                    self.enqueue_jobs()
-                    self._requests.remove(path)
-                    return # timeout
-                else:
-                    self.report('crawler curl error: ' + message + ' (' + str(errno) + ')')
-            elif e_type == EmergencyLanding:
-                pass
+        path, depth, width, basepath = request.args[0]
+        e_type, e_value, e_tb = error
+        if e_type == pycurl.error:
+            errno, message = e_value.args
+            if errno == 28:
+                print("requests pyerror -1")
+                self.enqueue_jobs()
+                self._requests.remove(path)
+                return # timeout
             else:
-                traceback.print_tb(e_tb)
-                self.report('crawler error: ' + str(e_value) + ' ' + path)
-            if not e_type == EmergencyLanding:
-                for reporter in self._parent._reporters:
-                    reporter.mosquito_crashed(path, str(e_value))
-            self.enqueue_jobs()
-            self._requests.remove(path)
-        except:
-            return
+                self.report('crawler curl error: ' + message + ' (' + str(errno) + ')')
+        elif e_type == EmergencyLanding:
+            pass
+        else:
+            traceback.print_tb(e_tb)
+            self.report('crawler error: ' + str(e_value) + ' ' + path)
+        if not e_type == EmergencyLanding:
+            for reporter in self._parent._reporters:
+                reporter.mosquito_crashed(path, str(e_value))
+        self.enqueue_jobs()
+        self._requests.remove(path)
 
     def _emergency_parse(self, html_data, start=0):
         links = set()
         pos = 0
-        if not html_data:
-            return
-        data_len = len(html_data)
+        try:
+            data_len = len(html_data)
+        except:
+            data_len = html_data
         while pos < data_len:
             if len(links)+start > self._max:
                 break
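In the unwrapped _get_error above, errno 28 from pycurl is libcurl's CURLE_OPERATION_TIMEDOUT, which is why that branch only re-queues pending jobs and returns. A self-contained illustration of catching that case, assuming pycurl is installed; the unroutable address and the 1-second limit exist only to force a timeout:

import pycurl

c = pycurl.Curl()
c.setopt(pycurl.URL, "http://10.255.255.1/")  # unroutable address, used only to force a timeout
c.setopt(pycurl.TIMEOUT, 1)
try:
    c.perform()
except pycurl.error as e:
    errno, message = e.args
    if errno == 28:  # CURLE_OPERATION_TIMEDOUT
        print("timeout: " + message)
    else:
        print("curl error %d: %s" % (errno, message))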
@@ -236,35 +240,31 @@ def enqueue_jobs(self):
             next_job = self._to_crawl.pop()
             self._crawl(*next_job)
 
-    def _get_done(self, basepath, depth, width, path, html_data, content_type): # request, result):
+    def _get_done(self, basepath, depth, width, path, html_data, content_type):
         if not self._armed or len(self._parent.crawled_urls) >= self._max:
             raise EmergencyLanding
         try:
             encoding = content_type.split(";")[1].split("=")[1].strip()
         except:
             encoding = None
         try:
-            soup = BeautifulSoup(html_data, from_encoding=encoding)
+            soup = BeautifulSoup(html_data, fromEncoding=encoding)
             links = None
         except:
             soup = None
             links = self._emergency_parse(html_data)
-
         for reporter in self._parent._reporters:
             reporter.start_crawl(path)
-
         if not links and soup:
-            links = soup.find_all('a')
-            forms = soup.find_all('form')
-
+            links = soup.findAll('a')
+            forms = soup.findAll('form')
         for form in forms:
             pars = {}
             if form.has_key("action"):
                 action_path = urlparse.urljoin(path, form["action"])
             else:
                 action_path = path
-            for input_par in form.find_all('input'):
-
+            for input_par in form.findAll('input'):
                 if not input_par.has_key("name"):
                     continue
                 value = "foo"
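The parsing change above moves from the bs4 spelling (find_all, from_encoding) to the legacy BeautifulSoup 3 API (findAll, fromEncoding), consistent with a BeautifulSoup 3-style import such as from BeautifulSoup import BeautifulSoup. A small sketch of that older API, assuming BeautifulSoup 3.x is installed; the HTML string stands in for a fetched page:

from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3.x package name

html = '<a href="/a?x=1">a</a><form action="/post"><input name="q"></form>'
soup = BeautifulSoup(html, fromEncoding="utf-8")
links = soup.findAll('a')     # anchors fed back into the crawl queue
forms = soup.findAll('form')  # forms whose inputs become attack parameters
print([a['href'] for a in links])    # ['/a?x=1']
print([f['action'] for f in forms])  # ['/post']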
@@ -284,8 +284,6 @@ def _get_done(self, basepath, depth, width, path, html_data, content_type): # re
         elif self.verbose:
             sys.stdout.write(".")
             sys.stdout.flush()
-        if not links:
-            return
         if len(links) > self._max:
             links = links[:self._max]
         for a in links:
@@ -323,7 +321,6 @@ def _check_url(self, basepath, path, href, depth, width):
             self._find_args(href)
             for reporter in self._parent._reporters:
                 reporter.add_link(path, href)
-            self.report("\n[Info] Spidering: " + str(href))
             if self._armed and depth > 0:
                 if len(self._to_crawl) < self._max:
                     self._to_crawl.append([basepath, href, depth - 1, width])