Commit 8d55134

committed Feb 17, 2015
Initial Push
1 parent 25fbb6d commit 8d55134

File tree

3 files changed: +168 -0 lines changed

‎brutescrape.banner (+15)

====================================================================================
__________ __ _________
\______ \_______ __ ___/ |_ ____ / _____/ ________________ ______ ____
| | _/\_ __ \ | \ __\/ __ \ \_____ \_/ ___\_ __ \__ \ \____ \_/ __ \
| | \ | | \/ | /| | \ ___/ / \ \___| | \// __ \| |_> > ___/
|______ / |__| |____/ |__| \___ >_______ /\___ >__| (____ / __/ \___ >
\/ \/ \/ \/ \/|__| \/

Brutescrape | A web scraper for generating password files based on plain text found
             in specific web pages.
Written by Peter Kim <Author, The Hacker Playbook>
                     <CEO, Secure Planet LLC>

Usage | python brutescrape.py
====================================================================================

‎brutescrape.py (+104)

#!/usr/bin/python

#Secure Planet LLC

import urllib2
import re
import os, sys

from collections import OrderedDict

banner_file = "brutescrape.banner"

def banner():
    global banner_file
    open_banner = open(banner_file, "r")
    for line in open_banner:
        print line.rstrip()
    open_banner.close()

def stripHTMLTags(html):
    #Strip HTML tags from any string and transform special entities.
    text = html

    #Apply rules in given order.
    rules = [
        { r'>\s+' : u'>'},                              # Remove spaces after a tag opens or closes.
        { r'\s+' : u' '},                               # Collapse consecutive whitespace.
        { r'\s*<br\s*/?>\s*' : u'\n'},                  # Newline after a <br>.
        { r'</(div)\s*>\s*' : u'\n'},                   # Newline after </div>.
        { r'</(p|h\d)\s*>\s*' : u'\n\n'},               # Blank line after </p> and headings.
        { r'<head>.*<\s*(/head|body)[^>]*>' : u'' },    # Remove <head> to </head>.
        { r'<a\s+href="([^"]+)"[^>]*>.*</a>' : r'\1' }, # Show link targets instead of link text.
        { r'[ \t]*<[^<]*?/?>' : u'' },                  # Remove remaining tags.
        { r'^\s+' : u'' }                               # Remove leading whitespace.
    ]

    for rule in rules:
        for (k, v) in rule.items():
            try:
                regex = re.compile(k)
                text = regex.sub(v, text)
            except re.error:
                pass #Skip any rule that fails to apply.

    #Replace special HTML entities.
    special = {
        '&nbsp;' : ' ', '&amp;' : '&', '&quot;' : '"',
        '&lt;' : '<', '&gt;' : '>'
    }

    for (k, v) in special.items():
        text = text.replace(k, v)

    return text

banner()
#Create an empty list for generation logic.
y_arr = []

try:
    file_list = open('sites.scrape', 'r')
    sites = file_list.read().split(',')
    file_list.close()
except IOError:
    print "[!] Could not read sites.scrape. Exiting."
    sys.exit(1)

for site in sites:
    try:
        site = site.strip()
        print "[*] Downloading Content For : " + site
        x_arr = []
        response = urllib2.urlopen(site)
        x = stripHTMLTags(response.read())
        #Replace junk found in our response.
        x = x.replace('\n', ' ')
        x = x.replace(',', ' ')
        x = x.replace('.', ' ')
        x = x.replace('/', ' ')
        x = re.sub('[^A-Za-z0-9]+', ' ', x)
        x_arr = x.split(' ')
        for y in x_arr:
            y = y.strip()
            if y and (len(y) > 4):
                #Strip leftover URL-encoding prefixes (%2F, %23, %3F, %3D with the '%' already removed).
                if y[:2] in ('2F', '23', '3F', '3D'):
                    y = y[2:]
                y_arr.append(y)
    except:
        pass #Skip sites that fail to download or parse.

#Deduplicate while preserving first-seen order.
y_arr_unique = OrderedDict.fromkeys(y_arr).keys()
print "[*] Processing List"
f_write = open("passwordList.txt", "w")
for yy in y_arr_unique:
    #Skip purely numeric tokens.
    if yy.strip().isdigit():
        pass
    else:
        f_write.write(yy.strip() + "\n")
f_write.close()
print "[*] Wordlist Generation Complete."
print "[*] Output Located: passwordList.txt"
print "[*] Total Count of Passwords >> " + str(len(y_arr_unique))

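The dedupe step above relies on OrderedDict.fromkeys keeping one copy of each word in first-seen order, so the wordlist follows page order rather than being sorted. A quick Python 3 illustration of that behavior:

```python
from collections import OrderedDict

# Duplicates in scrape order, as they might come out of the tokenizer.
words = ["hackme", "secure", "hackme", "planet", "secure"]

# fromkeys builds one dict key per unique word, preserving first appearance;
# listing the keys yields the deduplicated sequence.
unique = list(OrderedDict.fromkeys(words))
print(unique)  # ['hackme', 'secure', 'planet']
```

A plain dict would behave the same on Python 3.7+, but OrderedDict matches the script and is explicit about the ordering guarantee.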
‎readme.txt (+49)

====================================================================================
__________ __ _________
\______ \_______ __ ___/ |_ ____ / _____/ ________________ ______ ____
| | _/\_ __ \ | \ __\/ __ \ \_____ \_/ ___\_ __ \__ \ \____ \_/ __ \
| | \ | | \/ | /| | \ ___/ / \ \___| | \// __ \| |_> > ___/
|______ / |__| |____/ |__| \___ >_______ /\___ >__| (____ / __/ \___ >
\/ \/ \/ \/ \/|__| \/

Brutescrape | A web scraper for generating password files based on plain text found
             in specific web pages.
Written by Peter Kim <Author, The Hacker Playbook>
                     <CEO, Secure Planet LLC>

Usage | python brutescrape.py
====================================================================================

< About >

Brutescrape is a tool designed to parse out text from specific web pages and generate password lists for bruteforcing with that text.
The main idea is to create password lists that are specific to an organization. The user ends up with a password list containing
keywords specific to the target entity, which gives a better chance of recovering credentials used within that entity. Furthermore,
rule files in the user's favorite password cracking tool can further increase the chances of recovering plain text passwords from
an organization.

Ex. >> The user is performing a penetration test against HackMe, Inc. The user knows HackMe has a website, http://www.hackme.com/, and
runs Brutescrape against this site. The user now has a password file created by parsing the text within HackMe's website. The user
then runs this wordlist against a list of hashes found during an earlier phase of the pentest, using oclHashcat, and recovers the
plain text of a hash: "hackme".

In this example, the user found a very weak password. Such cases are rare, as organizations usually have password policies in place.
Rule files are generally more effective at recovering these plain text hash values, so the user cracks the hashes again, this time
using a rule file that appends 4 digits from 0000 - 9999 to the end of every word in the list.

Ah! More hashes are cracked: "hackme4331, hackme9901". How about a rule that converts every word to leet speak?

More hashes cracked: "h4ckm3, h4ckm3,inc.P455". And so on and so forth.
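The digit-append and leet-speak rules can be sketched in a few lines. A minimal Python 3 illustration (the function names are made up for this example; real crackers like oclHashcat use their own rule syntax):

```python
# Two common mangling rules approximated in plain Python:
# a 4-digit suffix (0000-9999) and a simple leet substitution.

# Character substitutions for a basic leet-speak rule.
LEET = str.maketrans({"a": "4", "e": "3", "i": "1", "o": "0", "s": "5"})

def append_digits(word, width=4, limit=10000):
    # "hackme" -> ["hackme0000", "hackme0001", ..., "hackme9999"]
    return [word + str(n).zfill(width) for n in range(limit)]

def leetify(word):
    # "hackme" -> "h4ckm3"
    return word.translate(LEET)

candidates = append_digits("hackme")
print(candidates[4331])   # hackme4331
print(leetify("hackme"))  # h4ckm3
```

One base word expands to 10,000 suffixed candidates, which is why rule files multiply the value of a small, targeted wordlist.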

< Usage >

Using the script is simple. List the target webpage(s) in your "sites.scrape" file like so-

http://www.site.com,http://www.site2.com,http://www.site3.com/index.php,http://www.site4.com/admin

Then run the script-

python brutescrape.py

And that's it. The target sites defined in your "sites.scrape" file will be parsed, and the extracted words will be written to a file
named "passwordList.txt".
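Note that urllib2 ties brutescrape.py to Python 2. On a modern interpreter, the core pipeline (fetch, strip tags, tokenize, filter, dedupe) can be sketched like this; the words_from_html helper is illustrative, not part of Brutescrape:

```python
import re
from collections import OrderedDict
from urllib.request import urlopen  # Python 3 replacement for urllib2

# Crude tag stripper; the real script applies a longer ordered rule list.
TAG_RE = re.compile(r"<[^>]+>")

def words_from_html(html):
    # Mirror brutescrape.py's filters: strip tags, keep only alphanumerics,
    # drop tokens of 4 chars or fewer and purely numeric tokens, then
    # dedupe while preserving first-seen order.
    text = TAG_RE.sub(" ", html)
    text = re.sub(r"[^A-Za-z0-9]+", " ", text)
    tokens = [t for t in text.split() if len(t) > 4 and not t.isdigit()]
    return list(OrderedDict.fromkeys(tokens))

# Against a live site (network required), roughly:
#   html = urlopen("http://www.hackme.com/").read().decode("utf-8", "replace")
#   words = words_from_html(html)

sample = "<p>Welcome to HackMe, Inc. Security matters at HackMe.</p>"
print(words_from_html(sample))  # ['Welcome', 'HackMe', 'Security', 'matters']
```

Writing the resulting list one word per line reproduces the passwordList.txt format the script emits.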
