crawler.py (forked from aadityakushwaha/DWCrawler)
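# Summary of what this script does (added for readability; behavior unchanged):
# crawls pages through a local Tor SOCKS5 proxy (localhost:9050), follows any
# links ending in ".onion", and records each discovered URL in the onion_urls
# table of a MySQL database. Seed URLs are read from urls.txt, one per line.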
import requests
from bs4 import BeautifulSoup
import mysql.connector

session = requests.session()
session.proxies = {
    'http': 'socks5h://localhost:9050',
    'https': 'socks5h://localhost:9050'
}


def deep_scan(url):
    # Define the starting URL
    start_url = url

    # Define the MySQL database configuration
    mysql_config = {
        "host": "34.220.243.94",
        "port": "3306",
        "user": "root",
        "password": "Girlactor@77",
        "database": "Crawler"
    }

    # Connect to the MySQL database
    try:
        db = mysql.connector.connect(**mysql_config)
    except mysql.connector.Error as err:
        print(f"Error connecting to MySQL database: {err}")
        return

    # Create a cursor to execute SQL queries
    cursor = db.cursor()

    # Create the table to store the URLs (if it doesn't exist already)
    try:
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS onion_urls (
                id INT AUTO_INCREMENT PRIMARY KEY,
                url VARCHAR(255) NOT NULL UNIQUE
            )
        """)
    except mysql.connector.Error as err:
        print(f"Error creating MySQL table: {err}")
        db.close()
        return

    # Define a set to keep track of visited URLs
    visited_urls = set()

    # Define a list to keep track of URLs to visit
    urls_to_visit = [start_url]

    # Loop until there are no more URLs to visit
    while urls_to_visit:
        # Pop the next URL from the list of URLs to visit
        url = urls_to_visit.pop(0)

        # Skip URLs that have already been visited
        if url in visited_urls:
            continue

        # Add the URL to the set of visited URLs
        visited_urls.add(url)

        # Make a request to the URL
        try:
            response = session.get(url, timeout=10)
        except requests.exceptions.RequestException as err:
            print(f"Error making request to URL: {err}")
            continue

        # Parse the HTML content of the response using BeautifulSoup
        soup = BeautifulSoup(response.content, "html.parser")

        # Find all links on the page
        links = soup.find_all("a")

        # Add any new links to the list of URLs to visit
        for link in links:
            href = link.get("href")
            try:
                if href and href.startswith("http") and href.endswith(".onion") and href not in visited_urls:
                    # Insert the new URL into the database (if it doesn't exist already)
                    try:
                        cursor.execute("INSERT IGNORE INTO onion_urls (url) VALUES (%s)", (href,))
                        db.commit()
                    except mysql.connector.Error as err:
                        print(f"Error inserting URL into MySQL database: {err}")
                        continue

                    # Add the new URL to the list of URLs to visit
                    urls_to_visit.append(href)
            except AttributeError as err:
                print(f"Something went wrong: {err}")

    print("done")
    db.close()


# Read the list of URLs from a file
with open('urls.txt', 'r') as f:
    urls = f.read().splitlines()

# Call the deep_scan function for each URL
for url in urls:
    deep_scan(url)