Dynamic web scraping and static web scraping #75

Merged 6 commits on Jun 17, 2020.
2 changes: 1 addition & 1 deletion jobfunnel/__init__.py
@@ -1 +1 @@
-__version__ = '2.1.7'
+__version__ = '2.1.8'
26 changes: 9 additions & 17 deletions jobfunnel/glassdoor_dynamic.py
@@ -121,21 +121,15 @@ def scrape(self):

         # get the html data, initialize bs4 with lxml
         self.driver.get(search)
-        print(
-            "It's very likely that Glassdoor might require you to fill out a CAPTCHA form. Follow these steps if it does ask you to complete a CAPTCHA: "
-            '\n 1.Refresh the glassdoor site in the new browser window that just popped up.\n'
-            ' 2.Then complete the CAPTCHA in the browser.\n 3.Press Enter to continue'
-        )
-        # wait for user to complete CAPTCHA
-        input()

         # create the soup base
         soup_base = BeautifulSoup(self.driver.page_source, self.bs4_parser)
-        num_res = soup_base.find('p', attrs={'class', 'jobsCount'})
-        while num_res is None:
-            print(
-                'Looks like something went wrong. \nMake sure you complete the CAPTCHA in the new browser window that just popped up. Try refreshing the page and attempt to complete the CAPTCHA again. '
-            )
+        num_res = soup_base.find('p', attrs={
+            'class', 'jobsCount'})
+        while(num_res is None):
+            print("It looks like that Glassdoor might require you to fill out a CAPTCHA form. Follow these steps if it does ask you to complete a CAPTCHA:"
+                  "\n 1.Refresh the glassdoor site in the new browser window that just popped up.\n" " 2.Then complete the CAPTCHA in the browser.\n 3.Press Enter to continue")
+            # wait for user to complete CAPTCHA
+            input()
             soup_base = BeautifulSoup(self.driver.page_source, self.bs4_parser)
             num_res = soup_base.find('p', attrs={'class', 'jobsCount'})
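With this change the scraper only prompts for a CAPTCHA when the jobs-count element is missing from the parsed page, then re-parses after the user confirms. Below is a minimal sketch of that wait loop as a standalone function, assuming a Selenium driver is already on the search page; the helper name is invented for illustration, and the dict form of `attrs` is used instead of the set literal that appears in the diff:

```python
from bs4 import BeautifulSoup


def wait_for_jobs_count(driver, bs4_parser='lxml'):
    """Re-parse the Glassdoor page until the jobs-count element appears.

    Illustrative only: the PR implements this inline in scrape(), not as a
    separate helper.
    """
    soup_base = BeautifulSoup(driver.page_source, bs4_parser)
    num_res = soup_base.find('p', attrs={'class': 'jobsCount'})
    while num_res is None:
        # Prompt only when the element is missing, which usually means a
        # CAPTCHA is blocking the page; the user solves it in the browser.
        print('Glassdoor may be showing a CAPTCHA. Complete it in the open '
              'browser window, then press Enter to continue.')
        input()
        soup_base = BeautifulSoup(driver.page_source, bs4_parser)
        num_res = soup_base.find('p', attrs={'class': 'jobsCount'})
    return soup_base, num_res
```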
@@ -221,11 +215,9 @@ def scrape(self):
             job['tags'] = ''

             try:
-                job['date'] = (
-                    s.find('div', attrs={'class', 'jobLabels'})
-                    .find('span', attrs={'class', 'jobLabel nowrap'})
-                    .text.strip()
-                )
+                # dynamic way of fetching date
+                job['date'] = s.find('div', attrs={
+                    'class', 'd-flex align-items-end pl-std minor css-65p68w'}).text.strip()
             except AttributeError:
                 job['date'] = ''

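The posting date is now read from a single div carrying Glassdoor's current utility-class string instead of the old jobLabels / jobLabel nowrap pair. A hedged sketch of that lookup as its own function follows, written with the documented `class_` keyword rather than the attrs set literal from the diff; the helper name is invented, and the class string is copied from the change above and may be rotated by Glassdoor at any time:

```python
from bs4 import Tag

# Class string copied from the diff; Glassdoor may change it without notice.
DATE_DIV_CLASS = 'd-flex align-items-end pl-std minor css-65p68w'


def extract_post_date(listing: Tag) -> str:
    """Return the raw posting-date text for one job listing, or '' if absent."""
    try:
        # find() returns None when the div is missing, so .text raises
        # AttributeError and we fall back to an empty date, as in the diff.
        return listing.find('div', class_=DATE_DIV_CLASS).text.strip()
    except AttributeError:
        return ''
```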
14 changes: 7 additions & 7 deletions jobfunnel/tools/filters.py
@@ -14,19 +14,19 @@ def date_filter(cur_dict: Dict[str, dict], number_of_days: int):
         cur_dict: today's job scrape dict
         number_of_days: how many days old a job can be
     """
-    if number_of_days<0 or cur_dict is None:
+    if number_of_days < 0 or cur_dict is None:
         return
     print("date_filter running")
     cur_job_ids = [job['id'] for job in cur_dict.values()]
-    #calculate the oldest date a job can be
-    threshold_date = datetime.now() - timedelta(days=number_of_days)
+    # calculate the oldest date a job can be
+    threshold_date = datetime.now() - timedelta(days=number_of_days)
     for job_id in cur_job_ids:
-        #get the date from job with job_id
+        # get the date from job with job_id
         job_date = datetime.strptime(cur_dict[job_id]['date'], '%Y-%m-%d')
-        #if this job is older than threshold_date, delete it from current scrape
-        if job_date<threshold_date:
+        # if this job is older than threshold_date, delete it from current scrape
+        if job_date < threshold_date:
             logging.info(f"{cur_dict[job_id]['link']} has been filtered out by date_filter because"
-                f" it is older than {number_of_days} days")
+                         f" it is older than {number_of_days} days")
             del cur_dict[job_id]

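Since date_filter prunes the scrape dict in place, a caller just passes the dict and a day threshold. A small usage sketch with made-up job entries that carry the fields the function reads (id, date, link):

```python
from datetime import datetime, timedelta

from jobfunnel.tools.filters import date_filter

today = datetime.now().strftime('%Y-%m-%d')
stale = (datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d')

# Hypothetical scrape dict keyed by job id.
cur_dict = {
    'job1': {'id': 'job1', 'date': today, 'link': 'https://example.com/job1'},
    'job2': {'id': 'job2', 'date': stale, 'link': 'https://example.com/job2'},
}

date_filter(cur_dict, number_of_days=14)  # deletes job2 in place
print(list(cur_dict))  # ['job1']
```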
3 changes: 2 additions & 1 deletion jobfunnel/tools/tools.py
@@ -79,6 +79,7 @@ def post_date_from_relative_post_age(job_list):
                 logging.error(f"unknown date for job {job['id']}")
         # format date in standard format e.g. 2020-01-01
         job['date'] = post_date.strftime('%Y-%m-%d')
+        # print('job['date']'')


 def split_url(url):
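For reference, post_date_from_relative_post_age converts each job's relative age into the '%Y-%m-%d' string that date_filter later parses back. A tiny round-trip illustration with a made-up three-day-old posting:

```python
from datetime import datetime, timedelta

# A job posted "3 days ago" becomes an absolute date...
post_date = datetime.now() - timedelta(days=3)

# ...stored in the standard string form that date_filter later parses back.
date_str = post_date.strftime('%Y-%m-%d')
parsed = datetime.strptime(date_str, '%Y-%m-%d')
assert parsed.date() == post_date.date()
```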
@@ -155,7 +156,7 @@ def get_webdriver():
             executable_path=GeckoDriverManager().install())
     except Exception:
         try:
-            webdriver.Chrome(ChromeDriverManager().install())
+            driver = webdriver.Chrome(ChromeDriverManager().install())
         except Exception:
             try:
                 driver = webdriver.Ie(IEDriverManager().install())
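The fix assigns the Chrome instance to `driver`, so the fallback chain actually hands back a usable webdriver instead of discarding it. A condensed sketch of that fallback pattern follows, assuming the webdriver_manager package is installed; the helper name get_any_webdriver is invented, and the project's real get_webdriver() continues the chain with further browsers (e.g. IE) in the same way:

```python
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.firefox import GeckoDriverManager


def get_any_webdriver():
    """Try Firefox first, then fall back to Chrome (illustrative sketch only)."""
    try:
        driver = webdriver.Firefox(
            executable_path=GeckoDriverManager().install())
    except Exception:
        # The bug fixed above: without `driver =`, the Chrome instance was
        # created and immediately discarded, leaving nothing to return.
        driver = webdriver.Chrome(ChromeDriverManager().install())
    return driver
```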
2 changes: 1 addition & 1 deletion readme.md
@@ -76,7 +76,7 @@ Filter undesired companies by providing your own `yaml` configuration and adding
 JobFunnel can be easily automated to run nightly with [crontab][cron] <br />
 For more information see the [crontab document][cron_doc].

-**NOTE ABOUT AUTOMATING:** As of right now, Glassdoor requires a human to complete a CAPTCHA. Therefore, in the case of automating with something like cron, you need to be **physically present** to complete the Glassdoor CAPTCHA.
+**NOTE ABOUT AUTOMATING:** As of right now, Glassdoor **might** require a human to complete a CAPTCHA. Therefore, in the case of automating with something like cron, you **might** need to be **physically present** to complete the Glassdoor CAPTCHA.

 You may also of course disable the Glassdoor scraper in your `settings.yaml` to not have to complete any CAPTCHA at all:
 ```
Empty file added tests/test_glassdoor.py