
Version 2 Support #14

Open · wants to merge 3 commits into master
Empty file added classes2/__init__.py
742 changes: 742 additions & 0 deletions classes2/discovery.py
@@ -0,0 +1,742 @@
import re
import socket
import urlparse
from collections import Counter, defaultdict

from HTMLParser import HTMLParser


class DiscoverAllCMS(object):
# match all fingerprints against all responses
# this might generate false positives

def __init__(self, data):
self.cache = data['cache']
self.results = data['results']
self.matcher = data['matcher']
self.fps = data['fingerprints']
self.printer = data['printer']

# only used for pretty printing of debugging info
self.tmp_set = set()

def run(self):
self.printer.print_debug_line('Checking for more matches in cache (option -a) ...', 1)

# find matches for all the responses in the cache
for fp_category in ['cms', 'platform']:
for fp_type in self.fps.data[fp_category]:
fps = self.fps.data[fp_category][fp_type]['fps']

for response in self.cache.get_responses():
matches = self.matcher.get_result(fps, response)
for fp in matches:
self.results.add( fp_category, fp['name'], fp['output'], fp )

if (fp['name'], fp['output']) not in self.tmp_set:
self.printer.print_debug_line('- Found match: %s %s' % (fp['name'], fp['output']) , 2)

self.tmp_set.add((fp['name'], fp['output']))


class DiscoverCMS(object):

def __init__(self, options, data):
self.printer = data['printer']
self.matcher = data['matcher']
self.requester = data['requester']
self.result = data['results']
self.printer = data['printer']

self.batch_size = options['batch_size']
self.num_cms_to_find = options['stop_after']
self.find_all_cms = options['run_all']

# only used for pretty printing of debugging info
self.tmp_set = set()

self.queue = defaultdict(list)
for fp_type in data['fingerprints'].data['cms']:
for fp in data['fingerprints'].data['cms'][fp_type]['fps']:
self.queue[fp['url']].append(fp)



def get_queue(self, cms=None):
queue = []
if cms is None:
for i in range(self.batch_size):
try:
url, fp_list = self.queue.popitem()
queue.append(fp_list)
except KeyError:
break
else:
			# the following procedure is *not* optimal:
			# the self.queue dict is completely destroyed
			# and rebuilt each time this procedure is called :(

# create a temp queue dict
tmp_queue = defaultdict(list)

# remove elements from the dict until it is empty
while len(self.queue) > 0:
url, fp_list = self.queue.popitem()

				# remove all the elements of a queue entry's list
				# one-by-one and check whether the fingerprints
				# belong to the specified 'cms'
tmp_list = []
out_list = []

while len(fp_list) > 0:
# remove the fingerprint
fp = fp_list.pop()

# if the fingerprint matches the cms, add it to the
# out_list for the current url
# otherwise add it to the tmp_list
if fp['name'] == cms:
out_list.append(fp)
else:
tmp_list.append(fp)

				# if there are elements in tmp_list (the new list of fps
				# that do *not* match the 'cms'), add it to the tmp_queue's
				# entry for the current url
if len(tmp_list) > 0:
tmp_queue[url].extend(tmp_list)

# if matches for the specified cms have been found, add the list
# to the fingerprintQueue for the requester
if len(out_list) > 0:
queue.append(out_list)

# replace the queue with the tmp queue
self.queue = tmp_queue

return queue
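For context, here is a hedged sketch (not part of this PR) of the same per-CMS filtering expressed without tearing the queue apart and rebuilding it; the helper name and standalone form are illustrative only:

def filter_queue_for_cms(queue, cms):
	# Return a list of fingerprint lists matching `cms` and prune those
	# fingerprints from `queue` (a dict mapping url -> list of fp dicts) in place.
	batch = []
	for url in list(queue.keys()):
		matching = [fp for fp in queue[url] if fp['name'] == cms]
		remaining = [fp for fp in queue[url] if fp['name'] != cms]
		if matching:
			batch.append(matching)
		if remaining:
			queue[url] = remaining
		else:
			del queue[url]
	return batch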



def run(self):
batch_no = 0
self.printer.print_debug_line('Determining CMS type ...', 1)

detected_cms = []
stop_searching = len(detected_cms) >= self.num_cms_to_find

while (not stop_searching or self.find_all_cms) and (not len(self.queue) == 0):
self.printer.print_debug_line('Checking fingerprint group no. %s ...' % (batch_no, ) , 3)

# set the requester queue
results = self.requester.run('CMS', self.get_queue())

# search for CMS matches
cms_matches = []
while not results.empty():
fingerprints, response = results.get()

for fp in self.matcher.get_result(fingerprints, response):
self.result.add( 'cms', fp['name'], fp['output'], fp)
cms_matches.append(fp['name'])

# search for the found CMS versions
for cms in cms_matches:

# skip checking the cms, if it has already been detected
if cms in detected_cms: continue

if cms not in self.tmp_set:
self.tmp_set.add(cms)
self.printer.print_debug_line('- Found CMS match: %s' % (cms, ) , 2)

# set the requester queue with only fingerprints for the cms
results = self.requester.run('CMS_version', self.get_queue(cms))

# find the results
self.printer.print_debug_line('Determining CMS version ...', 1)
while results.qsize() > 0:
res_fps,response = results.get()
for fp in self.matcher.get_result(res_fps, response):
self.result.add( 'cms', fp['name'], fp['output'], fp)

if (fp['name'], fp['output']) not in self.tmp_set:
self.tmp_set.add((fp['name'], fp['output']))
self.printer.print_debug_line('- Found version: %s %s' % (fp['name'], fp['output']) , 2)


# update the stop criteria
detected_cms.append(cms)

stop_searching = (len(detected_cms) >= self.num_cms_to_find) or len(self.queue) == 0
batch_no += 1



class DiscoverCookies(object):

def __init__(self, data):
self.data = data
self.printer = data['printer']

def run(self):
self.printer.print_debug_line('Checking for cookies ...' , 1)

cookies = set()
for r in self.data['cache'].get_responses():
try:
c = r.headers['set-cookie'].strip().split('=')[0]
if c not in cookies:
self.printer.print_debug_line('- Found cookie: %s' % (c,) , 2)

cookies.add(c)

except:
pass

self.data['results'].site_info['cookies'] = cookies


class DiscoverSubdomains:

def __init__(self, url, data):
self.results = data['results']
self.subdomains = data['fingerprints'].data['subdomains']['fps']
self.url = url


def run(self):
		domain = urlparse.urlparse(self.url).netloc
domain = domain.split(':')[0]

valid = set()
for subdomain in self.subdomains:
d = subdomain + '.' + domain
try:
valid.add((d, socket.gethostbyname(d)))
except:
continue

return valid




class DiscoverErrorPage:
# find error pages on the site
# the requester has a built-in list of items and patterns
# to remove before calculating a checksum of pages that
	# should not exist

def __init__(self, options, data):
self.host = options['url']
self.fps = data['fingerprints'].data['error_pages']['fps']
self.requester = data['requester']
self.printer = data['printer']


def run(self):
self.requester.find_404s = True

self.printer.print_debug_line('Error page detection ...', 1)

queue = [[fp] for fp in self.fps]
results = self.requester.run('ErrorPages', queue)

error_pages = set()
while results.qsize() > 0:
fp, response = results.get()
if response is not None:
error_pages.add(response.md5_404)
error_pages.add(response.md5_404_text)
self.printer.print_debug_line('- Error page fingerprint: %s, %s - %s' % (response.md5_404, response.md5_404_text, fp[0]['url']), 2)

self.requester.find_404s = False

return error_pages


class DiscoverInteresting(object):
def __init__(self, options, data):
self.url = options['url']
self.printer = data['printer']
self.requester = data['requester']
self.matcher = data['matcher']
self.result = data['results']
self.error_pages = data['error_pages']
self.cache = data['cache']
self.category = "interesting"

# add the fingerprints to the queue, ensuring that
		# all fps with the same url are collected in a list
self.queue = defaultdict(list)
for fp in data['fingerprints'].data['interesting']['fps']:
self.queue[fp['url']].append(fp)


def run(self):
self.printer.print_debug_line('Detecting interesting files ...', 1)

# process the results
results = self.requester.run('Interesting', list(self.queue.values()))

while results.qsize() > 0:
fps,response = results.get()

# if the response includes a 404 md5, check if the response
# is a redirection to a known error page
# this is a fix for https://github.com/jekyc/wig/issues/7
if response is not None:
redirected = response.md5_404 in self.error_pages
redirected = redirected or (response.md5_404_text in self.error_pages)
redirected = redirected or (response.md5_404_text == self.cache[self.url].md5_404_text)

# if it is an error page, skip it
if redirected: continue

			# if the response does not have a 404 md5, something must have gone wrong,
# skip checking the page
else:
continue

for fp in self.matcher.get_result(fps, response):
self.result.add( self.category, None, None, fp, weight=1)
try:
self.printer.print_debug_line('- Found file: %s (%s)' % (fp['url'], fp['note'] ), 2)
except:
pass


class DiscoverIP(object):

def __init__(self, path):
self.path = path

def run(self):

try:
hostname = self.path.split('//')[1]
hostname = hostname.split('/')[0]
ip = socket.gethostbyname(hostname)
except Exception as e:
#print(e)
ip = 'Unknown'

return ip


class DiscoverJavaScript(object):
def __init__(self, options, data):
self.printer = data['printer']
self.cache = data['cache']
self.matcher = data['matcher']
self.result = data['results']

self.fingerprints = []
for fp_type in data['fingerprints'].data['js']:
self.fingerprints.extend(data['fingerprints'].data['js'][fp_type]['fps'])


def run(self):
self.printer.print_debug_line('Detecting Javascript ...', 1)
for response in self.cache.get_responses():

# match only if the response is JavaScript
# check content type
content_type = response.headers['content-type'] if 'content-type' in response.headers else ''
# and extension
is_js = 'javascript' in content_type or '.js' in response.url.split('.')[-1]

# if the response is JavaScript try to match it to the known fingerprints
if is_js:
matches = self.matcher.get_result(self.fingerprints, response)
for fp in matches:
self.result.add( 'js', fp['name'], fp['output'], fingerprint=fp, weight=1)

self.printer.print_debug_line('- Found JavaScript: %s %s' % (fp['name'], fp['output']), 2)



# Used by the DiscoverMore crawler.
# Extracts 'src' and 'href' references from script, img, and link tags.
class LinkExtractor(HTMLParser):
	def __init__(self, strict):
		# 'strict' is accepted only for call-compatibility; Python 2's
		# HTMLParser takes no such argument, so it is ignored here
		HTMLParser.__init__(self)
self.results = set()

def get_results(self):
return self.results

def handle_starttag(self, tag, attrs):
try:
url = ''
if tag == 'script' or tag == 'img':
for attr in attrs:
if attr[0] == 'src':
self.results.add(attr[1])
if tag == 'link':
for attr in attrs:
if attr[0] == 'href':
self.results.add(attr[1])
except:
pass
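A quick illustration of what LinkExtractor collects (hedged example, not part of the diff; the HTML snippet is made up):

parser = LinkExtractor(strict=False)
parser.feed('<html><head>'
	'<link href="/css/style.css" rel="stylesheet">'
	'<script src="/js/app.js"></script>'
	'</head><body><img src="/logo.png"></body></html>')
print(parser.get_results())
# expected members: '/css/style.css', '/js/app.js', '/logo.png'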



class DiscoverMore(object):

def __init__(self, options, data):
self.host = options['url']
self.threads = options['threads']
self.printer = data['printer']
self.cache = data['cache']
self.result = data['results']
self.matcher = data['matcher']
self.requester = data['requester']
self.fingerprints = data['fingerprints']


def _get_urls(self, response):
# only get urls from elements that use 'src' to avoid
# fetching resources provided by <a>-tags, as this could
# lead to the crawling of the whole application
regexes = [ 'src="(.+?)"', "src='(.+?)'"]

urls = set()
for regex in regexes:
for match in re.findall(regex, response.body):
urls.add( match )

return urls


def run(self):
self.printer.print_debug_line('Detecting links ...', 1)
resources = set()
parser = LinkExtractor(strict=False)

for req in self.cache.get_responses():
# skip pages that do not set 'content-type'
# these might be binaries
if not 'content-type' in req.headers:
continue

# skip responses that have been discovered
# with 'DiscoverMore'
if req.crawled_response:
continue

# only scrape pages that can contain links/references
if 'text/html' in req.headers['content-type']:
tmp = self._get_urls(req)

parser.feed(req.body)
tmp = tmp.union( parser.get_results())

for i in tmp:
					url_data = urlparse.urlparse(i)

# skip data urls
if url_data.path.startswith('data:'): continue

resources.add( i )

# the items in the resource set should mimic a list of fingerprints:
		# a fingerprint is a dict with at least a 'url' key
self.printer.print_debug_line('- Discovered %s new resources' % (len(resources), ), 2)

# prepare the urls
queue = defaultdict(list)
for url in resources:
queue[url].append({'url': url})


# fetch'em
results = self.requester.run('DiscoverMore', list(queue.values()))


class DiscoverOS:
def __init__(self, options, data):
self.printer = data['printer']
self.cache = data['cache']
self.results = data['results']
self.fingerprints = data['fingerprints'].data['os']['fps']

self.os = Counter()
self.os_family_list = Counter()
self.matched_packages = set()


def search_and_prioritize_os(self, pkg_name, pkg_version):
for fp in self.fingerprints:
if fp['pkg_name'] == pkg_name and fp['pkg_version'] == pkg_version:
weight = 1 if not 'weight' in fp else fp['weight']

if not type(fp['os_version']) == type([]):
fp['os_version'] = [fp['os_version']]

for os_version in fp['os_version']:
if fp['os_name'].lower() in self.os_family_list:
self.printer.print_debug_line('- Prioritizing fingerprints for OS: %s' % (fp['os_name'], ), 7)
self.os[ (fp['os_name'], os_version) ] += weight * 100
else:
self.os[ (fp['os_name'], os_version) ] += weight


def find_match_in_headers(self, response):
headers = response.headers
if 'server' in headers:
line = headers['server']

if "(" in line:
os = line[line.find('(')+1:line.find(')')]

# hack for RHEL
if os == 'Red Hat':
os = 'Red Hat Enterprise Linux'

line = line[:line.find('(')-1] + line[line.find(')')+1: ]
else:
os = None

if os is not None:
self.os_family_list[os.lower()] += 1

for part in line.split(" "):
try:
pkg,version = list(map(str.lower, part.split('/')))
self.search_and_prioritize_os(pkg, version)

except Exception as e:
continue


def find_match_in_results(self):
platforms = self.results.scores['platform']

for pkg in platforms:
for version in platforms[pkg]:
				# hack for ASP.NET: trim the build number so only the major.minor
				# version is matched, e.g. '4.0.30319' -> '4.0' (the 4.5.x line
				# keeps an extra digit, e.g. '4.5.2')
				if pkg == 'ASP.NET':
					version = version[:3] if not version.startswith("4.5") else version[:5]

self.search_and_prioritize_os(pkg, version)


def finalize(self):
# add OS to results: self.os: {(os, version): weight, ...}
results = []
for p in self.os:
results.append({'version': p[1], 'os': p[0], 'count': self.os[p]})

if len(results) == 0: return

prio = sorted(results, key=lambda x:x['count'], reverse=True)
max_count = prio[0]['count']
for i in prio:
if i['count'] == max_count:
self.results.add('os', i['os'], i['version'], weight=i['count'])
self.printer.print_debug_line('- Found OS: %s %s' % (i['os'], i['version']), 2)
else:
break


def run(self):
self.printer.print_debug_line('Detecting OS ...', 1)
headers = set()
responses = self.cache.get_responses()

# find matches in the header
for response in responses:
self.find_match_in_headers(response)

# find match in current results
self.find_match_in_results()

# do some house keeping
self.finalize()




class DiscoverPlatform:

def __init__(self, options, data):
self.printer = data['printer']
self.requester = data['requester']
self.matcher = data['matcher']
self.result = data['results']
self.printer = data['printer']

self.threads = options['threads']
self.batch_size = options['batch_size']

self.queue = defaultdict(list)
for fp_type in data['fingerprints'].data['platform']:
for fp in data['fingerprints'].data['platform'][fp_type]['fps']:
self.queue[fp['url']].append(fp)

# only used for pretty printing of debugging info
self.tmp_set = set()

def run(self):
self.printer.print_debug_line('Detecting platform ...', 1)

while len(self.queue) > 0:
queue = []
for i in range(self.batch_size):
try:
url, fp_list = self.queue.popitem()
queue.append(fp_list)
except KeyError:
break

			results = self.requester.run('Platform', queue)

			# search for platform matches
while not results.empty():
fingerprints, response = results.get()
matches = self.matcher.get_result(fingerprints, response)
for fp in matches:
self.result.add('platform', fp['name'], fp['output'], fp)

if (fp['name'], fp['output']) not in self.tmp_set:
self.printer.print_debug_line('- Found platform %s %s' % (fp['name'], fp['output']), 2)

self.tmp_set.add((fp['name'], fp['output']))



class DiscoverTitle:

def __init__(self, options, data):
self.data = data
self.url = options['url']
self.printer = data['printer']

def run(self):
self.printer.print_debug_line('Getting title ...', 1)

r = self.data['requester'].run('Title', [[{'url': '/'}]])

front_page = self.data['cache'][self.url]

try:
title = re.findall('<title>\s*(.*)\s*</title>', front_page.body)[0]
title = title.strip()
except:
title = ''

try:
self.printer.print_debug_line('- Found title: %s' % (title, ), 2)
except:
pass

return title


class DiscoverTools:
def __init__(self, data):
self.fps = data['fingerprints']
self.results = data['results']
self.printer = data['printer']


def run(self):
self.printer.print_debug_line('Searching for tools ...', 1)
cms_results = self.results.get_versions()

		# loop over the CMSes in the results
for cms,_ in cms_results:
# loop over all the translations
for fn in self.fps.translator:
# check if the translated name is the same as the cms
if self.fps.translator[fn]['name'] == cms and 'tool' in self.fps.translator[fn]:
for tool in self.fps.translator[fn]['tool']:
self.results.add_tool(cms, tool['name'], tool['link'])
self.printer.print_debug_line('- Found tool: %s (%s)' % (tool['name'], tool['link']), 2)



class DiscoverUrlLess(object):
def __init__(self, options, data):
self.printer = data['printer']
self.cache = data['cache']
self.results = data['results']
self.matcher = data['matcher']
self.fingerprints = data['fingerprints']


def run(self):
self.printer.print_debug_line('Matching urlless fingerprints...', 1)

# only used for pretty printing of debugging info
tmp_set = set()

for fp_category in ['cms', 'platform']:
for fp_type in self.fingerprints.data[fp_category]:
fps = self.fingerprints.data[fp_category][fp_type]['fps']
fps = [fp for fp in fps if fp['url'] == '']

# find matches for all the responses in the cache
for response in self.cache.get_responses():
matches = self.matcher.get_result(fps, response)
for fp in matches:

						url_data = urlparse.urlparse(response.get_url())
fp['url'] = url_data.path


show_all_detections = True
if 'show_all_detections' in fp:
show_all_detections = fp['show_all_detections']

if (fp['name'], fp['output']) in tmp_set:
if show_all_detections:
self.results.add(fp_category, fp['name'], fp['output'], fingerprint=fp, weight=1)

else:
self.printer.print_debug_line('- Found fingerprint: %s %s' % (fp['name'], fp['output']), 2)
self.results.add(fp_category, fp['name'], fp['output'], fingerprint=fp, weight=1)

tmp_set.add((fp['name'], fp['output']))




class DiscoverVulnerabilities:
def __init__(self, data):
self.printer = data['printer']
self.results = data['results']
self.fps = []

vuln_sources = data['fingerprints'].data['vulnerabilities']

for source in vuln_sources:
self.fps.extend(data['fingerprints'].data['vulnerabilities'][source]['fps'])


def run(self):
self.printer.print_debug_line('Searching for vulnerabilities ...', 1)

cms_results = self.results.get_versions()

vendors = Counter()
for r in cms_results: vendors[r[0]] += 1

# if there are more than 5 results,
# skip displaying vuln count, as the
# results are unreliable
for cms, version in cms_results:
if vendors[cms] > 5: continue

try:
for fp in self.fps:
if fp['name'] == cms and fp['version'] == version:
self.results.add_vulnerabilities(cms, version, fp['num_vulns'], fp['link'])
self.printer.print_debug_line('- Found vulnerability: %s %s: %s' % (cms, version, fp['num_vulns']), 2)

except Exception as e:
print(e)
pass
334 changes: 334 additions & 0 deletions classes2/request2.py
@@ -0,0 +1,334 @@
import sys
try:
import concurrent.futures
except ImportError:
	print 'Install futures: sudo pip install futures'
	sys.exit(1)
import hashlib
import re
import string
import random
import urlparse
import urllib2
from HTMLParser import HTMLParser


class HTMLStripper(HTMLParser):
def __init__(self):
self.reset()
self.strict = False
self.convert_charrefs = True
self.tagtext = []
def handle_data(self, d):
self.tagtext.append(d)
def get_tagtext(self):
return ''.join(self.tagtext)


def _clean_page(page):
	# this is the same method nmap's http.lua uses for error page detection
# nselib/http.lua: clean_404
# remove information from the page that might not be static

# time
page = re.sub(b'(\d?\d:?){2,3}', b'',page)
page = re.sub(b'AM', b'',page, flags=re.IGNORECASE)
page = re.sub(b'PM', b'',page, flags=re.IGNORECASE)
page = re.sub(b'(\d){13}', b'', page) # timestamp

# date with 4 digit year
	page = re.sub(b'(\d){8}', b'', page)
page = re.sub(b'\d{4}-\d{2}-\d{2}', b'',page)
page = re.sub(b'\d{4}/\d{2}/\d{2}', b'',page)
page = re.sub(b'\d{2}-\d{2}-\d{4}', b'',page)
page = re.sub(b'\d{2}/\d{2}/\d{4}', b'',page)

# date with 2 digit year
	page = re.sub( b'(\d){6}', b'', page)
page = re.sub( b'\d{2}-\d{2}-\d{2}', b'',page)
page = re.sub( b'\d{2}/\d{2}/\d{2}', b'',page)

# links and paths
page = re.sub( b'/[^ ]+', b'', page)
page = re.sub( b'[a-zA-Z]:\\[^ ]+', b'', page)

# return the fingerprint of the stripped page
return hashlib.md5(page).hexdigest().lower()
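For illustration (not part of the diff): two error pages that differ only in volatile details such as paths and times reduce to the same fingerprint after cleaning. The sample bodies below are made up.

page_a = b'<html>Error 404: /missing/page.html not found at 13:37</html>'
page_b = b'<html>Error 404: /other/thing.css not found at 09:12</html>'
print(_clean_page(page_a) == _clean_page(page_b))  # True: same soft-404 fingerprint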


def _create_response(response):
R = Response()

url = response.geturl()
response_info = urlparse.urlparse(url)
body = response.read()

# get the page text only
parser = HTMLStripper()
parser.feed(body.decode('utf-8', 'ignore'))
page_text = parser.get_tagtext()

R.set_body(body)
R.protocol = response_info.scheme
R.host = response_info.netloc
R.url = url
R.status = {'code': response.code, 'text': response.msg}
	R.headers = {pair[0].lower(): pair[1] for pair in response.headers.items()}
R.md5 = hashlib.md5(body).hexdigest().lower()
R.md5_404 = _clean_page(body)
R.md5_404_text = _clean_page(page_text.encode('utf-8', 'ignore'))

return(R)


#######################################################################
#
# Override urllib2 classes
#
#######################################################################

class OutOfScopeException(Exception):
def __init__(self, org_url, new_url):
self.original_netloc = org_url.netloc
self.new_netloc = new_url.netloc

def __str__(self):
return repr( "%s is not in scope %s" % (self.new_netloc, self.original_netloc) )


class UnknownHostName(Exception):
def __init__(self, url):
self.url = url

def __str__(self):
return "Unknown host: %s" % (self.url,)


class ErrorHandler(urllib2.HTTPDefaultErrorHandler):
def http_error_default(self, req, fp, code, msg, hdrs):
return(fp)


class RedirectHandler(urllib2.HTTPRedirectHandler):
"""
This currently only checks if the redirection netloc is
	the same as the netloc for the request.
NOTE: this is very strict, as it will not allow redirections
from 'example.com' to 'www.example.com'
"""

def http_error_302(self, req, fp, code, msg, headers):
if 'location' in headers:
org_url = urlparse.urlparse(req.get_full_url())
new_url = urlparse.urlparse(headers['location'])

# if the location starts with '/' the path is relative
if headers['location'].startswith('/'):
new_url = new_url._replace(scheme=org_url.scheme, netloc=org_url.netloc)

if not new_url.netloc == org_url.netloc:
raise OutOfScopeException(org_url, new_url)

# call python's built-in redirection handler
return urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)

http_error_301 = http_error_303 = http_error_307 = http_error_302
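To illustrate the strictness noted in the docstring (hedged example, not part of the diff): the netloc comparison in http_error_302 treats even a redirect to a www subdomain as out of scope.

from urlparse import urlparse  # urllib.parse in the Python 3 code base
org_url = urlparse('http://example.com/login')
new_url = urlparse('http://www.example.com/login')
print(new_url.netloc == org_url.netloc)  # False -> OutOfScopeException would be raised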


#######################################################################
#
# Custom request and response classes
#
#######################################################################

class Response:
"""
	This object is used to store response information.
	The standard HTTPResponse object cannot be pickled,
	which the caching process requires.
"""

def __init__(self):
self.url = ''
self.protocol = ''
self.host = ''
self.status = {}
self.headers = {}
self.body = ''

self.md5 = None
self.md5_404 = None
self.should_be_error_page = False

self.crawled_response = False

chars = string.ascii_uppercase + string.digits
self.id = ''.join(random.choice(chars) for _ in range(16))


def get_url(self):
url_data = urlparse.urlparse(self.url)

		if url_data.scheme == '': url_data = url_data._replace(scheme=self.protocol)
		if url_data.netloc == '': url_data = url_data._replace(netloc=self.host)

return url_data.geturl()


def set_body(self, body):
# check if the encoding is specified in the http header
content_type = 'Content-Type'.lower()

if content_type not in self.headers:
self.body = str(body).decode(errors='replace')

else:
# find content-type definitions
content_types = {'text': False, 'charset': None}

for item in self.headers[content_type].split(';'):
if 'text' in item:
content_types['text'] = True

if 'charset' in item:
content_types['charset'] = item.split('=')[1]

# set the encoding to use
if content_types['charset'] is not None:
self.body = str(body).decode(content_types['charset'], errors='replace')
elif content_types['text']:
self.body = str(body).decode('ISO-8859-1', errors='replace')
else:
self.body = str(body).decode(errors='replace')


def __repr__(self):
def get_string(r):
string = r.url + '\n'
string += '%s %s\n' %(r.status['code'], r.status['text'])
string += '\n'.join([header +': '+ r.headers[header] for header in r.headers])
string += '\n\n'
string += 'MD5: ' + self.md5 + '\n'
string += 'MD5 Error page: ' + self.md5_404 + '\n'
return string

return get_string(self)


class Requester:
def __init__(self, options, data):
self.threads = options['threads']
self.proxy = options['proxy']
self.user_agent = options['user_agent']

self.data = data
self.cache = data['cache']
self.requested = data['requested']
self.printer = data['printer']

self.is_redirected = False
self.find_404s = False
self.fingerprintQueue = None

self.url_data = urlparse.urlparse(options['url'])
if options['prefix']:
			self.url_data = self.url_data._replace(path=options['prefix'] + self.url_data.path)
self.url = urlparse.urlunparse(self.url_data)

def _create_fetcher(self, redirect_handler=True):
args = [ErrorHandler]
if self.proxy == None:
args.append(urllib2.ProxyHandler({}))
elif not self.proxy == False:
protocol = self.url_data.scheme
args.append(urllib2.ProxyHandler({protocol: self.proxy}))

if redirect_handler:
args.append(RedirectHandler)

opener = urllib2.build_opener(*args)
opener.addheaders = [('User-agent', self.user_agent)]
return opener

def detect_redirect(self):
parse = urlparse.urlparse

# the original url
org_url = self.url_data

# get an opener doing redirections
try:
opener = self._create_fetcher(redirect_handler=False)
response = opener.open(self.url)
except:
raise UnknownHostName(self.url)

# the new url
new_url = parse(response.geturl())

# detect a redirection
new_loc = new_url.scheme + '://' + new_url.netloc
org_loc = org_url.scheme + '://' + org_url.netloc

self.is_redirected = not(new_loc == org_loc)

if self.is_redirected:
self.printer.print_debug_line('%s redirects to %s' % (org_loc, new_loc),2)
else:
self.printer.print_debug_line('%s does not redirect' % (org_loc, ), 2)

		# create a response object and add it to the cache
R = _create_response(response)
self.cache[new_loc] = R
self.cache[self.url] = R

return (self.is_redirected, new_loc)


def request(self, fp_list, run_type):

url = fp_list[0]['url']
complete_url = urlparse.urljoin(self.url, url)

R = None

# check if the url is out of scope
url_data = urlparse.urlparse(complete_url)
host_data = urlparse.urlparse(self.url)

if not url_data.netloc == host_data.netloc:
pass

elif not complete_url in self.cache:
try:
opener = self._create_fetcher()
request = urllib2.Request(complete_url)
response = opener.open(request)
R = _create_response(response)

if run_type == 'DiscoverMore':
R.crawled_response = True

self.cache[complete_url] = R
self.cache[response.geturl()] = R
except Exception as e:
pass
else:
R = self.cache[complete_url]

return (fp_list, R)


def run(self, run_type=None, fp_lists=[]):
with concurrent.futures.ThreadPoolExecutor(max_workers=self.threads) as executor:
future_list = []

for fp_list in fp_lists:
future_list.append(executor.submit(self.request, fp_list, run_type))

for future in concurrent.futures.as_completed(future_list):
self.requested.put(future.result())

return self.requested
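A rough usage sketch under Python 2 (not part of the diff; the option and data keys shown are simply the ones Requester reads, and the URL and values are placeholders):

import Queue

options = {'url': 'http://example.com/', 'prefix': '', 'threads': 10,
	'proxy': None, 'user_agent': 'Mozilla/5.0'}
data = {'cache': {}, 'requested': Queue.Queue(), 'printer': None}

requester = Requester(options, data)
results = requester.run('CMS', [[{'url': '/robots.txt'}]])
while not results.empty():
	fp_list, response = results.get()  # response is a Response object, or None on failure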
15 changes: 10 additions & 5 deletions wig.py
@@ -1,4 +1,4 @@
#!/usr/bin/env python3
#!/usr/bin/python
"""
wig - WebApp Information Gatherer
@@ -26,17 +26,22 @@
"""


import time, queue, sys, argparse
import time, sys, argparse
from classes.cache import Cache
from classes.results import Results
from classes.fingerprints import Fingerprints
from classes.discovery import *
from classes.headers import ExtractHeaders
from classes.matcher import Match
from classes.printer import Printer
from classes.output import OutputPrinter, OutputJSON
from classes.request2 import Requester, UnknownHostName

if sys.version_info.major == 3:
import queue
from classes.discovery import *
from classes.request2 import Requester, UnknownHostName
elif sys.version_info.major == 2:
import Queue as queue
from classes2.discovery import *
from classes2.request2 import Requester, UnknownHostName


class Wig(object):