behance.py
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from time import sleep
from datetime import datetime
import csv
import re
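# Requires: pip install selenium webdriver-manager beautifulsoup4
# (webdriver_manager downloads a matching ChromeDriver binary automatically)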
# Start Selenium WebDriver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
# Load the Behance job list
url = 'https://www.behance.net/joblist?tracking_source=nav20'
driver.get(url)
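# Optional sketch (assumes a Chrome build recent enough for the new headless
# mode): to run without a visible browser window, create the driver like this
# instead of the webdriver.Chrome(...) call above.
# options = webdriver.ChromeOptions()
# options.add_argument('--headless=new')
# driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)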
# Configure how many jobs to review and how recent they must be
recent_days = 30
desired_number = 600  # Total number of jobs to review on the site
reviewed_links = []  # All reviewed links, regardless of the days filter
found_links = []  # Links that meet the days filter
saved_jobs = []
cycles_without_new_links = 0  # Counter to detect when no new jobs are loading

# Keyword to search for in the job description
keyword = ""  # Add the keyword you want to search for in the job description
try:
    # Wait up to 20 seconds for the first job link to appear
    WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.CLASS_NAME, "JobCard-jobCardLink-Ywm"))
    )

    # Scrolling loop to capture up to the desired number of jobs
    while len(reviewed_links) < desired_number and cycles_without_new_links < 5:
        # Scroll down and wait for new content to load
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        sleep(5)  # Increase this delay if new elements load slowly

        # Get the HTML of the current page
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')

        # Record how many links had been reviewed before this page
        previous_links = len(reviewed_links)

        # Process each job card and extract its link
        for job_card in soup.find_all('a', class_='JobCard-jobCardLink-Ywm'):
            link = job_card['href']
            full_url = f"https://www.behance.net{link}"

            # Skip links we have already reviewed
            if full_url not in reviewed_links:
                reviewed_links.append(full_url)  # Add to the reviewed list

                # Now check whether it meets the `recent_days` filter.
                # The text patterns below assume the site is displayed in
                # English; if it is not, adjust the patterns accordingly.
                aria_label = job_card.get('aria-label', '')
                days_published = None
                if "ago" in aria_label:
                    if re.search(r'(\d+)\s*days?', aria_label):  # Matches "X days ago"
                        days_published = int(re.search(r'(\d+)\s*days?', aria_label).group(1))
                    elif re.search(r'one\s*month', aria_label):  # Matches "one month ago"
                        days_published = 30
                    elif re.search(r'(\d+)\s*months?', aria_label):  # Matches "X months ago"
                        days_published = int(re.search(r'(\d+)\s*months?', aria_label).group(1)) * 30
                    elif re.search(r'(\d+)\s*hours?', aria_label):  # Matches "X hours ago"
                        days_published = 0  # Published today if the age is given in hours

                # If the aria-label gave no age, fall back to the card's time element
                if days_published is None:
                    time_element = job_card.find_next('span', class_='JobCard-time-Cvz')
                    if time_element:
                        time_text = time_element.text.strip()
                        if re.search(r'(\d+)\s*days?', time_text):
                            days_published = int(re.search(r'(\d+)\s*days?', time_text).group(1))
                        elif re.search(r'one\s*month', time_text):
                            days_published = 30
                        elif re.search(r'(\d+)\s*months?', time_text):
                            days_published = int(re.search(r'(\d+)\s*months?', time_text).group(1)) * 30
                        elif re.search(r'(\d+)\s*hours?', time_text):
                            days_published = 0

                # Add to `found_links` only if it meets the days filter
                if days_published is not None and days_published <= recent_days:
                    found_links.append(full_url)

        # Check whether this cycle produced any new links
        if len(reviewed_links) == previous_links:
            cycles_without_new_links += 1  # Increment if no new links were found
        else:
            cycles_without_new_links = 0  # Reset if new links were found
    # Display the total number of reviewed links and recent links found
    print(f"Total reviewed links: {len(reviewed_links)}")
    print(f"Total recent links found (published within the last {recent_days} days): {len(found_links)}")

    # Visit each recent link and check for the keyword
    for full_url in found_links:
        driver.get(full_url)
        sleep(2)  # Wait for the job page to load

        # Extract the HTML of the job page and parse the content
        job_html = driver.page_source
        job_soup = BeautifulSoup(job_html, 'html.parser')

        # Check for the keyword anywhere in the page text
        if keyword in job_soup.text:
            print(f"Job meets the condition: {full_url}")

            # Extract the company link and name
            saved_company_link = None
            company_name = None
            company_link = job_soup.find('a', class_='JobDetailContent-companyNameLink-EUx')
            if company_link:
                saved_company_link = company_link["href"]
                company_name = company_link.text.strip()
                # Remove the extra text "opens in a new tab" if present
                company_name = company_name.replace("opens in a new tab", "").strip()

            # Save the data to the job list
            saved_jobs.append({
                'job_link': full_url,
                'company_link': saved_company_link,
                'company_name': company_name
            })

    print(f'Saved job list: {saved_jobs}')
finally:
    # Close the browser at the end
    driver.quit()
# Save to CSV
csv_filename = 'behance_jobs.csv'  # Adjust to the local path where the CSV should live
existing_links = set()  # Job links already stored, used to skip duplicates

# Read any existing file so previously stored jobs are not written again
try:
    with open(csv_filename, mode='r', encoding='utf-8-sig') as file:
        reader = csv.DictReader(file, delimiter=',')
        for row in reader:
            existing_links.add(row['job_link'])
except FileNotFoundError:
    print('File not found. Creating a new file.')
    # Create the file with headers if it doesn't exist
    with open(csv_filename, mode='w', newline='', encoding='utf-8-sig') as file:
        fieldnames = ['job_link', 'company_name', 'company_link', 'stored_date']
        writer = csv.DictWriter(file, fieldnames=fieldnames, delimiter=',')
        writer.writeheader()  # Write the header to the new file

# Append the new data to the file
with open(csv_filename, mode='a', newline='', encoding='utf-8-sig') as file:
    fieldnames = ['job_link', 'company_name', 'company_link', 'stored_date']
    writer = csv.DictWriter(file, fieldnames=fieldnames, delimiter=',')
    if file.tell() == 0:
        writer.writeheader()  # Write the header if the file turned out to be empty
    for job in saved_jobs:
        if job['job_link'] not in existing_links:
            job['stored_date'] = datetime.now().strftime('%Y-%m-%d')
            writer.writerow(job)
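# Note: the class names used throughout (JobCard-jobCardLink-Ywm,
# JobCard-time-Cvz, JobDetailContent-companyNameLink-EUx) look like
# build-generated suffixes and may change whenever Behance redeploys;
# update these selectors if the script stops finding job cards.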