Skip to content

Commit

Permalink
Merge pull request #75 from thebigG/testing
Browse files Browse the repository at this point in the history
Dynamic web scraping and static web scraping
  • Loading branch information
thebigG authored Jun 17, 2020
2 parents f7e10d3 + df8b8a8 commit b30b284
Show file tree
Hide file tree
Showing 6 changed files with 20 additions and 27 deletions.
2 changes: 1 addition & 1 deletion jobfunnel/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '2.1.7'
__version__ = '2.1.8'
26 changes: 9 additions & 17 deletions jobfunnel/glassdoor_dynamic.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,21 +121,15 @@ def scrape(self):

# get the html data, initialize bs4 with lxml
self.driver.get(search)
print(
"It's very likely that Glassdoor might require you to fill out a CAPTCHA form. Follow these steps if it does ask you to complete a CAPTCHA: "
'\n 1.Refresh the glassdoor site in the new browser window that just popped up.\n'
' 2.Then complete the CAPTCHA in the browser.\n 3.Press Enter to continue'
)
# wait for user to complete CAPTCHA
input()

# create the soup base
soup_base = BeautifulSoup(self.driver.page_source, self.bs4_parser)
num_res = soup_base.find('p', attrs={'class', 'jobsCount'})
while num_res is None:
print(
'Looks like something went wrong. \nMake sure you complete the CAPTCHA in the new browser window that just popped up. Try refreshing the page and attempt to complete the CAPTCHA again. '
)
num_res = soup_base.find('p', attrs={
'class', 'jobsCount'})
while(num_res is None):
print("It looks like that Glassdoor might require you to fill out a CAPTCHA form. Follow these steps if it does ask you to complete a CAPTCHA:"
"\n 1.Refresh the glassdoor site in the new browser window that just popped up.\n" " 2.Then complete the CAPTCHA in the browser.\n 3.Press Enter to continue")
# wait for user to complete CAPTCHA
input()
soup_base = BeautifulSoup(self.driver.page_source, self.bs4_parser)
num_res = soup_base.find('p', attrs={'class', 'jobsCount'})
Expand Down Expand Up @@ -221,11 +215,9 @@ def scrape(self):
job['tags'] = ''

try:
job['date'] = (
s.find('div', attrs={'class', 'jobLabels'})
.find('span', attrs={'class', 'jobLabel nowrap'})
.text.strip()
)
# dynamic way of fetching date
job['date'] = s.find('div', attrs={
'class', 'd-flex align-items-end pl-std minor css-65p68w'}).text.strip()
except AttributeError:
job['date'] = ''

Expand Down
14 changes: 7 additions & 7 deletions jobfunnel/tools/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,19 +14,19 @@ def date_filter(cur_dict: Dict[str, dict], number_of_days: int):
cur_dict: today's job scrape dict
number_of_days: how many days old a job can be
"""
if number_of_days<0 or cur_dict is None:
if number_of_days < 0 or cur_dict is None:
return
print("date_filter running")
cur_job_ids = [job['id'] for job in cur_dict.values()]
#calculate the oldest date a job can be
threshold_date = datetime.now() - timedelta(days=number_of_days)
# calculate the oldest date a job can be
threshold_date = datetime.now() - timedelta(days=number_of_days)
for job_id in cur_job_ids:
#get the date from job with job_id
# get the date from job with job_id
job_date = datetime.strptime(cur_dict[job_id]['date'], '%Y-%m-%d')
#if this job is older than threshold_date, delete it from current scrape
if job_date<threshold_date:
# if this job is older than threshold_date, delete it from current scrape
if job_date < threshold_date:
logging.info(f"{cur_dict[job_id]['link']} has been filtered out by date_filter because"
f" it is older than {number_of_days} days")
f" it is older than {number_of_days} days")
del cur_dict[job_id]


Expand Down
3 changes: 2 additions & 1 deletion jobfunnel/tools/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ def post_date_from_relative_post_age(job_list):
logging.error(f"unknown date for job {job['id']}")
# format date in standard format e.g. 2020-01-01
job['date'] = post_date.strftime('%Y-%m-%d')
# print('job['date']'')


def split_url(url):
Expand Down Expand Up @@ -155,7 +156,7 @@ def get_webdriver():
executable_path=GeckoDriverManager().install())
except Exception:
try:
webdriver.Chrome(ChromeDriverManager().install())
driver = webdriver.Chrome(ChromeDriverManager().install())
except Exception:
try:
driver = webdriver.Ie(IEDriverManager().install())
Expand Down
2 changes: 1 addition & 1 deletion readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ Filter undesired companies by providing your own `yaml` configuration and adding
JobFunnel can be easily automated to run nightly with [crontab][cron] <br />
For more information see the [crontab document][cron_doc].

**NOTE ABOUT AUTOMATING:** As of right now, Glassdoor requires a human to complete a CAPTCHA. Therefore, in the case of automating with something like cron, you need to be **physically present** to complete the Glassdoor CAPTCHA.
**NOTE ABOUT AUTOMATING:** As of right now, Glassdoor **might** require a human to complete a CAPTCHA. Therefore, when automating with something like cron, you **might** need to be **physically present** to complete the Glassdoor CAPTCHA.

You can also disable the Glassdoor scraper in your `settings.yaml` so that you never have to complete a CAPTCHA:
```
Expand Down
Empty file added tests/test_glassdoor.py
Empty file.

0 comments on commit b30b284

Please sign in to comment.