diff --git a/README.md b/README.md
index b118dc2..2ba0d4e 100644
--- a/README.md
+++ b/README.md
@@ -5,13 +5,14 @@
 ## Description
 
-RobotScraper is an open-source tool designed to scrape and analyze the `robots.txt` file of a specified domain. This Python script helps in identifying directories and pages that are allowed or disallowed by the `robots.txt` file and can save the results if needed. It is useful for web security researchers, SEO analysts, and anyone interested in examining the structure and access rules of a website.
+RobotScraper is an open-source tool designed to scrape and analyze both the `robots.txt` and `sitemap.xml` files of a specified domain. This Python script helps in identifying directories and pages that are allowed or disallowed by the `robots.txt` file, as well as all URLs listed in the `sitemap.xml` file. Results can be saved to output files for further analysis. It is useful for web security researchers, SEO analysts, and anyone interested in examining the structure and access rules of a website.
 
 ## Requirements
 
 - Python 3.x
 - `requests` package
 - `beautifulsoup4` package
+- `xml.etree.ElementTree` (included in Python standard library)
 
 ## Installation
 
@@ -31,8 +32,36 @@ RobotScraper is an open-source tool designed to scrape and analyze the `robots.t
 To run the RobotScraper, you can use the following command syntax:
 
 ```sh
-python robotScraper.py domain [-s output.txt]
+python robotScraper.py -d domain [-s output.txt] [-m mode]
 ```
+
+### Parameters
+
+- `-d, --domain`: Specifies the target domain to analyze
+- `-s, --save`: Enable saving output and specify the output file for robots.txt results
+- `-m, --mode`: Specify the mode: `sitemap` scans only `sitemap.xml`; when no mode is given, both `robots.txt` and `sitemap.xml` are checked
+
+### Examples
+
+```sh
+# Check both robots.txt and sitemap.xml (sitemap results saved to sitemap.txt)
+python robotScraper.py -d example.com
+
+# Check both, saving robots.txt results to output.txt (sitemap results go to sitemap.txt)
+python robotScraper.py -d example.com -s output.txt
+
+# Check only sitemap.xml and save results to urls.txt
+python robotScraper.py -d example.com -m sitemap -s urls.txt
+```
+
+### Features
+
+- Extracts and analyzes all entries from robots.txt
+- Extracts all URLs from sitemap.xml, including nested sitemaps
+- Verifies the accessibility of found URLs
+- Handles SSL certificate verification issues
+- Color-coded terminal output for better readability
+- Saves results to specified output files
+
 # Disclaimer
 
 This tool is intended for educational and research purposes only. The author and contributors are not responsible for any misuse of this tool. Users are advised to use this tool responsibly and only on systems for which they have explicit permission. Unauthorized access to systems, networks, or data is illegal and unethical. Always obtain proper authorization before conducting any kind of activities that could impact other users or systems.
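A note on the sitemap handling this patch adds: `sitemap.xml` documents declare the sitemaps.org XML namespace, so `xml.etree.ElementTree` lookups only match when that namespace is passed explicitly, which is why `getSitemap` below builds a `{"ns": ...}` mapping. A minimal, self-contained sketch of that lookup (the sample document and URLs are placeholders, not taken from the patch):

```python
import xml.etree.ElementTree as ET

# Placeholder sitemap.xml document in the shape getSitemap expects.
SAMPLE_SITEMAP = """<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://example.com/</loc></url>
  <url><loc>https://example.com/about</loc></url>
</urlset>"""

root = ET.fromstring(SAMPLE_SITEMAP)
namespace = {"ns": "http://www.sitemaps.org/schemas/sitemap/0.9"}

# Without the namespace mapping, root.findall(".//url") would match nothing,
# because each element is really named "{http://www.sitemaps.org/...}url".
for url in root.findall(".//ns:url", namespace):
    loc = url.find("ns:loc", namespace)
    if loc is not None and loc.text:
        print(loc.text)
```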
diff --git a/robotScraper.py b/robotScraper.py
index 3418ee8..c2ea5c1 100644
--- a/robotScraper.py
+++ b/robotScraper.py
@@ -1,6 +1,12 @@
 import sys
 import requests
+import urllib3
+import xml.etree.ElementTree as ET
 from bs4 import BeautifulSoup
+from urllib.parse import urlparse
+
+# Disable SSL warnings
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 
 class colors:
     HEADER = '\033[1;35m'
@@ -25,15 +31,15 @@ def banner():
 |_| \_\___/|_.__/ \___/ \__|____/ \___|_| \__,_| .__/ \___|_| |_|
 
 """ + colors.ENDC)
-    print(colors.WARNING + "RobotScraper v.1.0 - Open Source Project | " + colors.OKGREEN + "Author: " + colors.WARNING + "Robotshell | " + colors.OKGREEN + "Twitter: " + colors.WARNING + "https://twitter.com/robotshelld\n" + colors.ENDC)
+    print(colors.WARNING + "RobotScraper v.1.1 - Open Source Project | " + colors.OKGREEN + "Author: " + colors.WARNING + "Robotshell | " + colors.OKGREEN + "Twitter: " + colors.WARNING + "https://twitter.com/robotshelld\n" + colors.ENDC)
 
 #CORE FUNCTION
-def getRobots(domain,enable_save, filename):
+def getRobots(domain, enable_save, filename):
 
     print (colors.OKCYAN + "Starting RobotScraper to recollect directories and pages from " + colors.WARNING + "robots.txt " + colors.OKCYAN + "in " + colors.FAIL + domain + colors.ENDC)
     print (colors.OKCYAN + "[+] Checking if the" + colors.WARNING + " robots.txt " + colors.OKCYAN + "file exists" + colors.ENDC)
 
-    r = requests.get("https://" + domain + "/robots.txt")
+    r = requests.get("https://" + domain + "/robots.txt", verify=False)
 
     if r.status_code == 200:
         print (colors.OKCYAN + "[✓] File" + colors.WARNING + " robots.txt " + colors.OKCYAN + "exists:" + colors.ENDC)
@@ -61,7 +67,7 @@
             if directory[0] == '/':
                 newDomain = "https://" + domain + directory
-                r2 = requests.get(newDomain)
+                r2 = requests.get(newDomain, verify=False)
 
                 print (colors.OKCYAN + "[+] Checking " + colors.WARNING + newDomain + colors.ENDC, end = '')
@@ -84,40 +90,125 @@
 
 
 #MAIN FUNCTION
+def getSitemap(domain, enable_save, filename):
+    print(colors.OKCYAN + "Starting RobotScraper to extract URLs from " + colors.WARNING + "sitemap.xml " + colors.OKCYAN + "in " + colors.FAIL + domain + colors.ENDC)
+    print(colors.OKCYAN + "[+] Checking if " + colors.WARNING + "sitemap.xml " + colors.OKCYAN + "file exists" + colors.ENDC)
+
+    try:
+        r = requests.get("https://" + domain + "/sitemap.xml", verify=False)
+
+        if r.status_code == 200:
+            print(colors.OKCYAN + "[✓] File " + colors.WARNING + "sitemap.xml " + colors.OKCYAN + "exists:" + colors.ENDC)
+            print()
+
+            sitemap_urls = []
+
+            try:
+                root = ET.fromstring(r.content)
+                namespace = {"ns": "http://www.sitemaps.org/schemas/sitemap/0.9"}
+
+                for url in root.findall(".//ns:url", namespace):
+                    loc = url.find("ns:loc", namespace)
+                    if loc is not None and loc.text:
+                        sitemap_urls.append(loc.text)
+                        print(colors.OKGREEN + "[✓] Found URL: " + colors.WARNING + loc.text + colors.ENDC)
+
+                        if enable_save == 1:
+                            with open(filename, "a") as file:
+                                file.write(loc.text + "\n")
+
+                for sitemap in root.findall(".//ns:sitemap", namespace):
+                    loc = sitemap.find("ns:loc", namespace)
+                    if loc is not None and loc.text:
+                        print(colors.OKGREEN + "[✓] Found nested sitemap: " + colors.WARNING + loc.text + colors.ENDC)
+                        try:
+                            nested_r = requests.get(loc.text, verify=False)
+                            if nested_r.status_code == 200:
+                                nested_root = ET.fromstring(nested_r.content)
+                                for url in nested_root.findall(".//ns:url", namespace):
+                                    nested_loc = url.find("ns:loc", namespace)
+                                    if nested_loc is not None and nested_loc.text:
+                                        sitemap_urls.append(nested_loc.text)
+                                        print(colors.OKGREEN + "[✓] Found URL in nested sitemap: " + colors.WARNING + nested_loc.text + colors.ENDC)
+
+                                        if enable_save == 1:
+                                            with open(filename, "a") as file:
+                                                file.write(nested_loc.text + "\n")
+                        except Exception as e:
+                            print(colors.FAIL + "[✗] Error processing nested sitemap: " + colors.WARNING + str(e) + colors.ENDC)
+            except ET.ParseError:
+                print(colors.FAIL + "[✗] XML parsing error. Trying alternative parsing method." + colors.ENDC)
+                soup = BeautifulSoup(r.content, 'xml')
+                urls = soup.find_all('loc')
+
+                for url in urls:
+                    sitemap_urls.append(url.text)
+                    print(colors.OKGREEN + "[✓] Found URL: " + colors.WARNING + url.text + colors.ENDC)
+
+                    if enable_save == 1:
+                        with open(filename, "a") as file:
+                            file.write(url.text + "\n")
+
+            print(colors.OKCYAN + "\n[+] Total URLs found in sitemap: " + colors.WARNING + str(len(sitemap_urls)) + colors.ENDC)
+            return sitemap_urls
+        else:
+            print(colors.FAIL + "[✗] Sitemap.xml not found (Status code: " + str(r.status_code) + ")" + colors.ENDC)
+            return []
+    except Exception as e:
+        print(colors.FAIL + "[✗] Error accessing sitemap: " + colors.WARNING + str(e) + colors.ENDC)
+        return []
+
 def main():
-    banner()
-    enable_save = 0
-    filename = ""
-
-    if len(sys.argv) == 1:
-        print (colors.FAIL + "ERROR: No domain or parameters found" + colors.ENDC)
-    elif len(sys.argv) == 2:
-        arg = sys.argv[1]
-
-        if arg == "-h" or arg == "--help" :
-            print (colors.BOLD + "HELP SECTION:" + colors.ENDC)
-            print ("Usage:" + colors.OKCYAN + '\trobotscraper.py domain' + colors.ENDC)
-            print ("Example:" + colors.OKCYAN + '\trobotscraper.py example.com -s output.txt' + colors.ENDC)
-            print ("-d,--domain" + colors.OKCYAN + "\tSpecifies the domain" + colors.ENDC)
-            print ("-h,--help" + colors.OKCYAN + "\tThis help" + colors.ENDC)
-            print ("-v,--version" + colors.OKCYAN + "\tShow version" + colors.ENDC)
-            print ("-s,--save" + colors.OKCYAN + "\tEnable save output and specifies the output file" + colors.ENDC)
-        elif arg == "-v" or arg == "--version":
-            print (colors.WARNING + "RobotScraper v.1.0" + colors.ENDC)
-        else:
-            print (colors.FAIL + "ERROR: Incorrect argument or sintaxis" + colors.ENDC)
-
-    elif len(sys.argv) > 2 and len(sys.argv) <= 5:
-
-        if sys.argv[1] == "-d" or sys.argv[1] == "--domain":
-
-            domain = sys.argv[2]
-
-            if(len(sys.argv) > 3):
-                if sys.argv[3] == "-s" or sys.argv[3] == "--save":
-                    enable_save = 1
-                    filename = sys.argv[4]
-
-            getRobots(domain,enable_save,filename)
-
-main()
+    banner()
+    enable_save = 0
+    robots_filename = ""
+    sitemap_filename = "sitemap.txt"
+    sitemap_mode = False
+
+    if len(sys.argv) == 1:
+        print(colors.FAIL + "ERROR: No domain or parameters found" + colors.ENDC)
+    elif len(sys.argv) == 2:
+        arg = sys.argv[1]
+
+        if arg == "-h" or arg == "--help":
+            print(colors.BOLD + "HELP SECTION:" + colors.ENDC)
+            print("Usage:" + colors.OKCYAN + '\trobotscraper.py -d domain [-s output.txt] [-m mode]' + colors.ENDC)
+            print("Example:" + colors.OKCYAN + '\trobotscraper.py -d example.com -s output.txt' + colors.ENDC)
+            print("-d,--domain" + colors.OKCYAN + "\tSpecifies the domain" + colors.ENDC)
+            print("-h,--help" + colors.OKCYAN + "\tThis help" + colors.ENDC)
+            print("-v,--version" + colors.OKCYAN + "\tShow version" + colors.ENDC)
+            print("-s,--save" + colors.OKCYAN + "\tEnable save output and specifies the output file for robots.txt results" + colors.ENDC)
+            print("-m,--mode" + colors.OKCYAN + "\tSpecify mode: robots (default) or sitemap" + colors.ENDC)
sitemap" + colors.ENDC) + print(colors.OKCYAN + "Note: When no mode is specified, both robots.txt and sitemap.xml will be checked" + colors.ENDC) + elif arg == "-v" or arg == "--version": + print(colors.WARNING + "RobotScraper v.1.1" + colors.ENDC) + else: + print(colors.FAIL + "ERROR: Incorrect argument or syntax" + colors.ENDC) + + elif len(sys.argv) > 2: + if sys.argv[1] == "-d" or sys.argv[1] == "--domain": + domain = sys.argv[2] + + for i in range(3, len(sys.argv)): + if sys.argv[i] == "-s" or sys.argv[i] == "--save": + if i+1 < len(sys.argv): + enable_save = 1 + robots_filename = sys.argv[i+1] + elif sys.argv[i] == "-m" or sys.argv[i] == "--mode": + if i+1 < len(sys.argv): + if sys.argv[i+1].lower() == "sitemap": + sitemap_mode = True + + if sitemap_mode: + # Sadece sitemap modu + getSitemap(domain, enable_save, robots_filename) + else: + # Hem robots.txt hem de sitemap.xml'i kontrol et + print(colors.OKGREENL + "\n[+] Checking robots.txt..." + colors.ENDC) + getRobots(domain, enable_save, robots_filename) + + print(colors.OKGREENL + "\n[+] Checking sitemap.xml..." + colors.ENDC) + # Sitemap sonuçlarını her zaman sitemap.txt'ye kaydet + getSitemap(domain, 1, sitemap_filename) +if __name__ == "__main__": + main()