From 01d92d01de58911fa2ac9e3ead763825262ddd81 Mon Sep 17 00:00:00 2001
From: Lorenzo Gomez
Date: Tue, 12 May 2020 18:09:25 -0400
Subject: [PATCH 1/5] -TODO: Investigate flaky test approach for glassdoor
 testing

---
 demo/settings.yaml      | 2 +-
 tests/test_glassdoor.py | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 create mode 100644 tests/test_glassdoor.py

diff --git a/demo/settings.yaml b/demo/settings.yaml
index a190446b..1ab3d05f 100644
--- a/demo/settings.yaml
+++ b/demo/settings.yaml
@@ -5,7 +5,7 @@ output_path: './'
 
 # providers from which to search (case insensitive)
 providers:
-  #- 'GlassDoor'
+  - 'GlassDoor'
   - 'Indeed'
   - 'Monster'
 
diff --git a/tests/test_glassdoor.py b/tests/test_glassdoor.py
new file mode 100644
index 00000000..e69de29b
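Patch 1 only re-enables the GlassDoor provider and adds an empty `tests/test_glassdoor.py`; the subject's TODO is about finding a workable approach to flaky Glassdoor tests. One minimal sketch of such an approach is to retry the scrape a few times before declaring failure. Everything below is hypothetical scaffolding, not code from this patch series; `run_glassdoor_scrape` stands in for whatever entry point the test ends up exercising.

```python
# Hypothetical sketch for tests/test_glassdoor.py: tolerate transient failures by retrying.
import pytest


def run_glassdoor_scrape():
    """Placeholder for the code under test; assumed to raise on transient failures."""
    raise NotImplementedError


def test_glassdoor_scrape_with_retries():
    max_attempts = 3
    last_error = None
    for _ in range(max_attempts):
        try:
            run_glassdoor_scrape()
            return  # any successful attempt passes the test
        except Exception as err:  # network hiccups, CAPTCHA pages, layout changes
            last_error = err
    pytest.fail(f"scrape failed {max_attempts} times; last error: {last_error!r}")
```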
From b6846fe89e87f51cefa5d856e0aab089f89af6c2 Mon Sep 17 00:00:00 2001
From: Lorenzo Gomez
Date: Wed, 13 May 2020 18:32:06 -0400
Subject: [PATCH 2/5] -Found possible bug in glassdoor.py's
 post_date_from_relative_post_age

-TODO: Investigate this issue further
-TODO: Implement a way to limit the number of pages being scraped to speed up
 the testing process
-It looks like the static way of web scraping is working again in glassdoor.py
-TODO: Implement a stateful switch for scrapers so that programmers know
 whether to use static web scraping or dynamic (Selenium-based) scraping
---
 jobfunnel/glassdoor.py     | 12 ++++++++++++
 jobfunnel/tools/filters.py | 20 +++++++++++++-------
 jobfunnel/tools/tools.py   |  1 +
 3 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/jobfunnel/glassdoor.py b/jobfunnel/glassdoor.py
index 2dbaab80..905f8789 100644
--- a/jobfunnel/glassdoor.py
+++ b/jobfunnel/glassdoor.py
@@ -33,6 +33,18 @@ def __init__(self, args):
             'Cache-Control': 'no-cache',
             'Connection': 'keep-alive'
         }
+        self.location_headers = {
+            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,'
+            'image/webp,*/*;q=0.01',
+            'accept-encoding': 'gzip, deflate, sdch, br',
+            'accept-language': 'en-GB,en-US;q=0.8,en;q=0.6',
+            'referer': 'https://www.glassdoor.{0}/'.format(
+                self.search_terms['region']['domain']),
+            'upgrade-insecure-requests': '1',
+            'user-agent': self.user_agent,
+            'Cache-Control': 'no-cache',
+            'Connection': 'keep-alive'
+        }
         self.query = '-'.join(self.search_terms['keywords'])
 
         # initialize the webdriver
diff --git a/jobfunnel/tools/filters.py b/jobfunnel/tools/filters.py
index 8833c87f..00bcb980 100644
--- a/jobfunnel/tools/filters.py
+++ b/jobfunnel/tools/filters.py
@@ -14,19 +14,25 @@ def date_filter(cur_dict: Dict[str, dict], number_of_days: int):
         cur_dict: today's job scrape dict
         number_of_days: how many days old a job can be
     """
-    if number_of_days<0 or cur_dict is None:
+    if number_of_days < 0 or cur_dict is None:
         return
     print("date_filter running")
+    print('number_of_days=', number_of_days)
     cur_job_ids = [job['id'] for job in cur_dict.values()]
-    #calculate the oldest date a job can be
-    threshold_date = datetime.now() - timedelta(days=number_of_days)
+    # calculate the oldest date a job can be
+    print('date_filter#1')
+    threshold_date = datetime.now() - timedelta(days=number_of_days)
+    print('date_filter#2')
     for job_id in cur_job_ids:
-        #get the date from job with job_id
+        # get the date from job with job_id
+        print('date_filter#3')
+        print("cur_dict[job_id]['date']=", cur_dict[job_id]['date'])
         job_date = datetime.strptime(cur_dict[job_id]['date'], '%Y-%m-%d')
-        #if this job is older than threshold_date, delete it from current scrape
-        if job_date<threshold_date:

Date: Thu, 14 May 2020 11:45:02 -0400
Subject: [PATCH 3/5] -Disable glassdoor testing for TravisCI

---
 demo/settings.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/demo/settings.yaml b/demo/settings.yaml
index 1ab3d05f..f8154521 100644
--- a/demo/settings.yaml
+++ b/demo/settings.yaml
@@ -5,7 +5,7 @@ output_path: './'
 
 # providers from which to search (case insensitive)
 providers:
-  - 'GlassDoor'
+  #- 'GlassDoor'
   - 'Indeed'
   - 'Monster'
 
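Patch 2's message flags a possible bug in `post_date_from_relative_post_age` and instruments `date_filter` with debug prints while investigating (patch 4 below removes them again). Stripped of the prints, the threshold logic being debugged amounts to the minimal sketch below; the job-dict shape (`id`, `date`, `link`) is the one visible in the diff, and the example data is made up.

```python
from datetime import datetime, timedelta
from typing import Dict


def date_filter(cur_dict: Dict[str, dict], number_of_days: int) -> None:
    """Drop every job posted more than `number_of_days` days ago (sketch of the diff's logic)."""
    if number_of_days < 0 or cur_dict is None:
        return
    # oldest posting date that survives the filter
    threshold_date = datetime.now() - timedelta(days=number_of_days)
    for job_id in list(cur_dict):
        job_date = datetime.strptime(cur_dict[job_id]['date'], '%Y-%m-%d')
        if job_date < threshold_date:
            del cur_dict[job_id]


# made-up usage: with a 14-day window, only recent postings survive
jobs = {
    'a1': {'id': 'a1', 'date': '2020-04-01', 'link': 'https://example.com/job/a1'},
    'b2': {'id': 'b2', 'date': '2020-05-18', 'link': 'https://example.com/job/b2'},
}
date_filter(jobs, 14)
print(sorted(jobs))  # ['b2'] when run around mid-May 2020
```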
From dc8ff18472392020c3aff9846f61e66fa44ab007 Mon Sep 17 00:00:00 2001
From: Lorenzo Gomez
Date: Sun, 17 May 2020 14:29:27 -0400
Subject: [PATCH 4/5] -Fixed an issue when fetching the date with dynamic web
 scraping in the glassdoor scraper

---
 demo/settings.yaml         |  2 +-
 jobfunnel/glassdoor.py     | 17 ++++++++++-------
 jobfunnel/tools/filters.py |  6 ------
 readme.md                  |  2 +-
 4 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/demo/settings.yaml b/demo/settings.yaml
index f8154521..1ab3d05f 100644
--- a/demo/settings.yaml
+++ b/demo/settings.yaml
@@ -5,7 +5,7 @@ output_path: './'
 
 # providers from which to search (case insensitive)
 providers:
-  #- 'GlassDoor'
+  - 'GlassDoor'
   - 'Indeed'
   - 'Monster'
 
diff --git a/jobfunnel/glassdoor.py b/jobfunnel/glassdoor.py
index 905f8789..59e38a6c 100644
--- a/jobfunnel/glassdoor.py
+++ b/jobfunnel/glassdoor.py
@@ -207,17 +207,15 @@ def scrape(self):
         # get the html data, initialize bs4 with lxml
         self.driver.get(search)
-        print("It's very likely that Glassdoor might require you to fill out a CAPTCHA form. Follow these steps if it does ask you to complete a CAPTCHA:"
-              "\n 1.Refresh the glassdoor site in the new browser window that just popped up.\n"
-              " 2.Then complete the CAPTCHA in the browser.\n 3.Press Enter to continue")
-        # wait for user to complete CAPTCHA
-        input()
 
         # create the soup base
         soup_base = BeautifulSoup(self.driver.page_source, self.bs4_parser)
         num_res = soup_base.find('p', attrs={
                                  'class', 'jobsCount'})
 
         while(num_res is None):
-            print('Looks like something went wrong. \nMake sure you complete the CAPTCHA in the new browser window that just popped up. Try refreshing the page and attempt to complete the CAPTCHA again. ')
+            print("It looks like Glassdoor might require you to fill out a CAPTCHA form. Follow these steps if it does ask you to complete a CAPTCHA:"
+                  "\n 1. Refresh the Glassdoor site in the new browser window that just popped up.\n"
+                  " 2. Then complete the CAPTCHA in the browser.\n 3. Press Enter to continue")
+            # wait for user to complete CAPTCHA
             input()
             soup_base = BeautifulSoup(self.driver.page_source, self.bs4_parser)
             num_res = soup_base.find('p', attrs={
                                      'class', 'jobsCount'})
@@ -290,8 +288,13 @@ def scrape(self):
                 job['tags'] = ''
 
             try:
-                job['date'] = s.find('div', attrs={'class', 'jobLabels'}).find(
-                    'span', attrs={'class', 'jobLabel nowrap'}).text.strip()
+                # static way of fetching date
+                # job['date'] = s.find('div', attrs={'class', 'jobLabels'}).find(
+                #     'span', attrs={'class', 'jobLabel nowrap'}).text.strip()
+
+                # dynamic way of fetching date
+                job['date'] = s.find('div', attrs={
+                    'class', 'd-flex align-items-end pl-std minor css-65p68w'}).text.strip()
             except AttributeError:
                 job['date'] = ''
 
diff --git a/jobfunnel/tools/filters.py b/jobfunnel/tools/filters.py
index 00bcb980..cd3fcfc1 100644
--- a/jobfunnel/tools/filters.py
+++ b/jobfunnel/tools/filters.py
@@ -17,18 +17,12 @@ def date_filter(cur_dict: Dict[str, dict], number_of_days: int):
     if number_of_days < 0 or cur_dict is None:
         return
     print("date_filter running")
-    print('number_of_days=', number_of_days)
     cur_job_ids = [job['id'] for job in cur_dict.values()]
     # calculate the oldest date a job can be
-    print('date_filter#1')
     threshold_date = datetime.now() - timedelta(days=number_of_days)
-    print('date_filter#2')
     for job_id in cur_job_ids:
         # get the date from job with job_id
-        print('date_filter#3')
-        print("cur_dict[job_id]['date']=", cur_dict[job_id]['date'])
         job_date = datetime.strptime(cur_dict[job_id]['date'], '%Y-%m-%d')
-        print('date_filter#4')
         # if this job is older than threshold_date, delete it from current scrape
         if job_date < threshold_date:
             logging.info(f"{cur_dict[job_id]['link']} has been filtered out by date_filter because"
diff --git a/readme.md b/readme.md
index 281c9844..c3c8b56e 100644
--- a/readme.md
+++ b/readme.md
@@ -76,7 +76,7 @@ Filter undesired companies by providing your own `yaml` configuration and adding
 JobFunnel can be easily automated to run nightly with [crontab][cron]
 For more information see the [crontab document][cron_doc].
 
-**NOTE ABOUT AUTOMATING:** As of right now, Glassdoor requires a human to complete a CAPTCHA. Therefore, in the case of automating with something like cron, you need to be **physically present** to complete the Glassdoor CAPTCHA.
+**NOTE ABOUT AUTOMATING:** As of right now, Glassdoor **might** require a human to complete a CAPTCHA. Therefore, when automating with something like cron, you **might** need to be **physically present** to complete the Glassdoor CAPTCHA.
 
 You may also of course disable the Glassdoor scraper in your `settings.yaml` to not have to complete any CAPTCHA at all:
 ```
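Patch 4 moves the CAPTCHA prompt inside the retry loop (so the user is only prompted when the `jobsCount` element really is missing) and reads the posting date from the markup the Selenium-rendered page serves. A condensed sketch of that flow is below; the CSS classes are the ones the diff targets, `get_webdriver` is the helper from `jobfunnel/tools/tools.py`, and the search URL plus the `li`/`jl` listing selector are assumptions for illustration.

```python
# Condensed sketch of the dynamic-scraping flow set up in patch 4 (not a drop-in
# replacement for the scraper's scrape() method). Assumed: the placeholder search URL
# and the 'li'/'jl' listing selector; the other class names are taken from the diff.
from bs4 import BeautifulSoup
from jobfunnel.tools.tools import get_webdriver

driver = get_webdriver()
driver.get('https://www.glassdoor.ca/Job/jobs.htm')  # placeholder search URL

soup = BeautifulSoup(driver.page_source, 'lxml')
while soup.find('p', class_='jobsCount') is None:
    # Glassdoor may be serving a CAPTCHA page; let the user solve it, then re-parse.
    input('Solve the CAPTCHA in the browser window, then press Enter to continue...')
    soup = BeautifulSoup(driver.page_source, 'lxml')

for listing in soup.find_all('li', class_='jl'):  # per-listing container (assumed)
    try:
        # the dynamically rendered date element the patch switches to
        date = listing.find(
            'div', class_='d-flex align-items-end pl-std minor css-65p68w').text.strip()
    except AttributeError:
        date = ''  # listing without a visible date
    print(date)
```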
From 31be644e24ff5848bb199de4ff6eaf017616ed83 Mon Sep 17 00:00:00 2001
From: Lorenzo Gomez
Date: Tue, 19 May 2020 13:11:27 -0400
Subject: [PATCH 5/5] -Added Accept-Language 'en-CA' header to glassdoor.py

-Added missing assignment to get_webdriver
---
 jobfunnel/__init__.py    |  2 +-
 jobfunnel/glassdoor.py   | 18 +-----------------
 jobfunnel/tools/tools.py |  2 +-
 3 files changed, 3 insertions(+), 19 deletions(-)

diff --git a/jobfunnel/__init__.py b/jobfunnel/__init__.py
index bc6379c1..7042a396 100644
--- a/jobfunnel/__init__.py
+++ b/jobfunnel/__init__.py
@@ -1 +1 @@
-__version__ = '2.1.7'
+__version__ = '2.1.8'
diff --git a/jobfunnel/glassdoor.py b/jobfunnel/glassdoor.py
index 59e38a6c..ac85e464 100644
--- a/jobfunnel/glassdoor.py
+++ b/jobfunnel/glassdoor.py
@@ -25,19 +25,7 @@ def __init__(self, args):
             'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,'
             'image/webp,*/*;q=0.01',
             'accept-encoding': 'gzip, deflate, sdch, br',
-            'accept-language': 'en-GB,en-US;q=0.8,en;q=0.6',
-            'referer': 'https://www.glassdoor.{0}/'.format(
-                self.search_terms['region']['domain']),
-            'upgrade-insecure-requests': '1',
-            'user-agent': self.user_agent,
-            'Cache-Control': 'no-cache',
-            'Connection': 'keep-alive'
-        }
-        self.location_headers = {
-            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,'
-            'image/webp,*/*;q=0.01',
-            'accept-encoding': 'gzip, deflate, sdch, br',
+            'accept-language': 'en-GB,en-CA,en-US;q=0.8,en;q=0.6',
             'referer': 'https://www.glassdoor.{0}/'.format(
                 self.search_terms['region']['domain']),
             'upgrade-insecure-requests': '1',
             'user-agent': self.user_agent,
             'Cache-Control': 'no-cache',
             'Connection': 'keep-alive'
@@ -288,10 +276,6 @@ def scrape(self):
             job['tags'] = ''
 
             try:
-                # static way of fetching date
-                # job['date'] = s.find('div', attrs={'class', 'jobLabels'}).find(
-                #     'span', attrs={'class', 'jobLabel nowrap'}).text.strip()
-
                 # dynamic way of fetching date
                 job['date'] = s.find('div', attrs={
                     'class', 'd-flex align-items-end pl-std minor css-65p68w'}).text.strip()
diff --git a/jobfunnel/tools/tools.py b/jobfunnel/tools/tools.py
index efeb10b7..2412103d 100644
--- a/jobfunnel/tools/tools.py
+++ b/jobfunnel/tools/tools.py
@@ -156,7 +156,7 @@ def get_webdriver():
             executable_path=GeckoDriverManager().install())
     except Exception:
         try:
-            webdriver.Chrome(ChromeDriverManager().install())
+            driver = webdriver.Chrome(ChromeDriverManager().install())
         except Exception:
             try:
                 driver = webdriver.Ie(IEDriverManager().install())
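The tools.py change in patch 5 is small but important: `webdriver.Chrome(...)` was being constructed without binding the result, so a successful Chrome fallback was thrown away. A minimal sketch of the same try-each-browser pattern, with every branch assigning its driver, is below; only three of the fallbacks are shown, and the driver-manager classes are the same `webdriver_manager` ones the diff already imports.

```python
# Minimal sketch of the get_webdriver() fallback pattern with the fix from patch 5:
# every branch binds its driver before it is returned. Only three fallbacks are shown;
# the real helper in jobfunnel/tools/tools.py tries further browsers.
from selenium import webdriver
from webdriver_manager.firefox import GeckoDriverManager
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.microsoft import IEDriverManager


def get_webdriver_sketch():
    """Return the first webdriver that initializes successfully, trying browsers in order."""
    try:
        driver = webdriver.Firefox(
            executable_path=GeckoDriverManager().install())
    except Exception:
        try:
            # the line patch 5 fixes: the result must be assigned, not just constructed
            driver = webdriver.Chrome(ChromeDriverManager().install())
        except Exception:
            driver = webdriver.Ie(IEDriverManager().install())
    return driver
```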