From 01d92d01de58911fa2ac9e3ead763825262ddd81 Mon Sep 17 00:00:00 2001
From: Lorenzo Gomez
Date: Tue, 12 May 2020 18:09:25 -0400
Subject: [PATCH 1/5] -TODO: Investigate flaky test approach for glassdoor
 testing

---
 demo/settings.yaml      | 2 +-
 tests/test_glassdoor.py | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 create mode 100644 tests/test_glassdoor.py

diff --git a/demo/settings.yaml b/demo/settings.yaml
index a190446b..1ab3d05f 100644
--- a/demo/settings.yaml
+++ b/demo/settings.yaml
@@ -5,7 +5,7 @@ output_path: './'
 
 # providers from which to search (case insensitive)
 providers:
-  #- 'GlassDoor'
+  - 'GlassDoor'
   - 'Indeed'
   - 'Monster'
 
diff --git a/tests/test_glassdoor.py b/tests/test_glassdoor.py
new file mode 100644
index 00000000..e69de29b
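Patch 1 only re-enables the GlassDoor provider and adds an empty `tests/test_glassdoor.py`; the subject's TODO is about finding a workable approach to flaky Glassdoor tests. One minimal sketch of such an approach is to retry the scrape a few times before declaring failure. Everything below is hypothetical scaffolding, not code from this patch series; `run_glassdoor_scrape` stands in for whatever entry point the test ends up exercising.

```python
# Hypothetical sketch for tests/test_glassdoor.py: tolerate transient failures by retrying.
import pytest


def run_glassdoor_scrape():
    """Placeholder for the code under test; assumed to raise on transient failures."""
    raise NotImplementedError


def test_glassdoor_scrape_with_retries():
    max_attempts = 3
    last_error = None
    for _ in range(max_attempts):
        try:
            run_glassdoor_scrape()
            return  # any successful attempt passes the test
        except Exception as err:  # network hiccups, CAPTCHA pages, layout changes
            last_error = err
    pytest.fail(f"scrape failed {max_attempts} times; last error: {last_error!r}")
```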
From b6846fe89e87f51cefa5d856e0aab089f89af6c2 Mon Sep 17 00:00:00 2001
From: Lorenzo Gomez
Date: Wed, 13 May 2020 18:32:06 -0400
Subject: [PATCH 2/5] -Found possible bug in glassdoor.py's
 post_date_from_relative_post_age

-TODO: Investigate this issue further
-TODO: Implement a way to limit the number of pages being scraped to speed up
 the testing process
-It looks like the static way of web scraping is working again in glassdoor.py
-TODO: Implement a stateful switch for scrapers so that programmers know
 whether to use static web scraping or dynamic (Selenium-based) scraping
---
 jobfunnel/glassdoor.py     | 12 ++++++++++++
 jobfunnel/tools/filters.py | 20 +++++++++++++-------
 jobfunnel/tools/tools.py   |  1 +
 3 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/jobfunnel/glassdoor.py b/jobfunnel/glassdoor.py
index 2dbaab80..905f8789 100644
--- a/jobfunnel/glassdoor.py
+++ b/jobfunnel/glassdoor.py
@@ -33,6 +33,18 @@ def __init__(self, args):
             'Cache-Control': 'no-cache',
             'Connection': 'keep-alive'
         }
+        self.location_headers = {
+            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,'
+            'image/webp,*/*;q=0.01',
+            'accept-encoding': 'gzip, deflate, sdch, br',
+            'accept-language': 'en-GB,en-US;q=0.8,en;q=0.6',
+            'referer': 'https://www.glassdoor.{0}/'.format(
+                self.search_terms['region']['domain']),
+            'upgrade-insecure-requests': '1',
+            'user-agent': self.user_agent,
+            'Cache-Control': 'no-cache',
+            'Connection': 'keep-alive'
+        }
         self.query = '-'.join(self.search_terms['keywords'])
 
         # initialize the webdriver
diff --git a/jobfunnel/tools/filters.py b/jobfunnel/tools/filters.py
index 8833c87f..00bcb980 100644
--- a/jobfunnel/tools/filters.py
+++ b/jobfunnel/tools/filters.py
@@ -14,19 +14,25 @@ def date_filter(cur_dict: Dict[str, dict], number_of_days: int):
         cur_dict: today's job scrape dict
         number_of_days: how many days old a job can be
     """
-    if number_of_days<0 or cur_dict is None:
+    if number_of_days < 0 or cur_dict is None:
         return
     print("date_filter running")
+    print('number_of_days=', number_of_days)
     cur_job_ids = [job['id'] for job in cur_dict.values()]
-    #calculate the oldest date a job can be
-    threshold_date = datetime.now() - timedelta(days=number_of_days)
+    # calculate the oldest date a job can be
+    print('date_filter#1')
+    threshold_date = datetime.now() - timedelta(days=number_of_days)
+    print('date_filter#2')
     for job_id in cur_job_ids:
-        #get the date from job with job_id
+        # get the date from job with job_id
+        print('date_filter#3')
+        print("cur_dict[job_id]['date']=", cur_dict[job_id]['date'])
         job_date = datetime.strptime(cur_dict[job_id]['date'], '%Y-%m-%d')
-        #if this job is older than threshold_date, delete it from current scrape
-        if job_date<threshold_date:

Date: Thu, 14 May 2020 11:45:02 -0400
Subject: [PATCH 3/5] -Disable glassdoor testing for TravisCI

---
 demo/settings.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/demo/settings.yaml b/demo/settings.yaml
index 1ab3d05f..f8154521 100644
--- a/demo/settings.yaml
+++ b/demo/settings.yaml
@@ -5,7 +5,7 @@ output_path: './'
 
 # providers from which to search (case insensitive)
 providers:
-  - 'GlassDoor'
+  #- 'GlassDoor'
   - 'Indeed'
   - 'Monster'
 
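Patch 2's message flags a possible bug in `post_date_from_relative_post_age` and instruments `date_filter` with debug prints while investigating (patch 4 below removes them again). Stripped of the prints, the threshold logic being debugged amounts to the minimal sketch below; the job-dict shape (`id`, `date`, `link`) is the one visible in the diff, and the example data is made up.

```python
from datetime import datetime, timedelta
from typing import Dict


def date_filter(cur_dict: Dict[str, dict], number_of_days: int) -> None:
    """Drop every job posted more than `number_of_days` days ago (sketch of the diff's logic)."""
    if number_of_days < 0 or cur_dict is None:
        return
    # oldest posting date that survives the filter
    threshold_date = datetime.now() - timedelta(days=number_of_days)
    for job_id in list(cur_dict):
        job_date = datetime.strptime(cur_dict[job_id]['date'], '%Y-%m-%d')
        if job_date < threshold_date:
            del cur_dict[job_id]


# made-up usage: with a 14-day window, only recent postings survive
jobs = {
    'a1': {'id': 'a1', 'date': '2020-04-01', 'link': 'https://example.com/job/a1'},
    'b2': {'id': 'b2', 'date': '2020-05-18', 'link': 'https://example.com/job/b2'},
}
date_filter(jobs, 14)
print(sorted(jobs))  # ['b2'] when run around mid-May 2020
```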
From dc8ff18472392020c3aff9846f61e66fa44ab007 Mon Sep 17 00:00:00 2001
From: Lorenzo Gomez
Date: Sun, 17 May 2020 14:29:27 -0400
Subject: [PATCH 4/5] -Fixed an issue when fetching the date with dynamic web
 scraping in the glassdoor scraper

---
 demo/settings.yaml         |  2 +-
 jobfunnel/glassdoor.py     | 17 ++++++++++-------
 jobfunnel/tools/filters.py |  6 ------
 readme.md                  |  2 +-
 4 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/demo/settings.yaml b/demo/settings.yaml
index f8154521..1ab3d05f 100644
--- a/demo/settings.yaml
+++ b/demo/settings.yaml
@@ -5,7 +5,7 @@ output_path: './'
 
 # providers from which to search (case insensitive)
 providers:
-  #- 'GlassDoor'
+  - 'GlassDoor'
   - 'Indeed'
   - 'Monster'
 
diff --git a/jobfunnel/glassdoor.py b/jobfunnel/glassdoor.py
index 905f8789..59e38a6c 100644
--- a/jobfunnel/glassdoor.py
+++ b/jobfunnel/glassdoor.py
@@ -207,17 +207,15 @@ def scrape(self):
         # get the html data, initialize bs4 with lxml
         self.driver.get(search)
-        print("It's very likely that Glassdoor might require you to fill out a CAPTCHA form. Follow these steps if it does ask you to complete a CAPTCHA:"
-              "\n 1.Refresh the glassdoor site in the new browser window that just popped up.\n"
-              " 2.Then complete the CAPTCHA in the browser.\n 3.Press Enter to continue")
-        # wait for user to complete CAPTCHA
-        input()
 
         # create the soup base
         soup_base = BeautifulSoup(self.driver.page_source, self.bs4_parser)
         num_res = soup_base.find('p', attrs={
                                  'class', 'jobsCount'})
 
         while(num_res is None):
-            print('Looks like something went wrong. \nMake sure you complete the CAPTCHA in the new browser window that just popped up. Try refreshing the page and attempt to complete the CAPTCHA again. ')
+            print("It looks like Glassdoor might require you to fill out a CAPTCHA form. Follow these steps if it does ask you to complete a CAPTCHA:"
+                  "\n 1. Refresh the Glassdoor site in the new browser window that just popped up.\n"
+                  " 2. Then complete the CAPTCHA in the browser.\n 3. Press Enter to continue")
+            # wait for user to complete CAPTCHA
             input()
             soup_base = BeautifulSoup(self.driver.page_source, self.bs4_parser)
             num_res = soup_base.find('p', attrs={
                                      'class', 'jobsCount'})
@@ -290,8 +288,13 @@ def scrape(self):
                 job['tags'] = ''
 
             try:
-                job['date'] = s.find('div', attrs={'class', 'jobLabels'}).find(
-                    'span', attrs={'class', 'jobLabel nowrap'}).text.strip()
+                # static way of fetching date
+                # job['date'] = s.find('div', attrs={'class', 'jobLabels'}).find(
+                #     'span', attrs={'class', 'jobLabel nowrap'}).text.strip()
+
+                # dynamic way of fetching date
+                job['date'] = s.find('div', attrs={
+                    'class', 'd-flex align-items-end pl-std minor css-65p68w'}).text.strip()
             except AttributeError:
                 job['date'] = ''
 
diff --git a/jobfunnel/tools/filters.py b/jobfunnel/tools/filters.py
index 00bcb980..cd3fcfc1 100644
--- a/jobfunnel/tools/filters.py
+++ b/jobfunnel/tools/filters.py
@@ -17,18 +17,12 @@ def date_filter(cur_dict: Dict[str, dict], number_of_days: int):
     if number_of_days < 0 or cur_dict is None:
         return
     print("date_filter running")
-    print('number_of_days=', number_of_days)
     cur_job_ids = [job['id'] for job in cur_dict.values()]
     # calculate the oldest date a job can be
-    print('date_filter#1')
     threshold_date = datetime.now() - timedelta(days=number_of_days)
-    print('date_filter#2')
     for job_id in cur_job_ids:
         # get the date from job with job_id
-        print('date_filter#3')
-        print("cur_dict[job_id]['date']=", cur_dict[job_id]['date'])
         job_date = datetime.strptime(cur_dict[job_id]['date'], '%Y-%m-%d')
-        print('date_filter#4')
         # if this job is older than threshold_date, delete it from current scrape
         if job_date < threshold_date:
             logging.info(f"{cur_dict[job_id]['link']} has been filtered out by date_filter because"
diff --git a/readme.md b/readme.md
index 281c9844..c3c8b56e 100644
--- a/readme.md
+++ b/readme.md
@@ -76,7 +76,7 @@ Filter undesired companies by providing your own `yaml` configuration and adding
 JobFunnel can be easily automated to run nightly with [crontab][cron]
 For more information see the [crontab document][cron_doc].
 
-**NOTE ABOUT AUTOMATING:** As of right now, Glassdoor requires a human to complete a CAPTCHA. Therefore, in the case of automating with something like cron, you need to be **physically present** to complete the Glassdoor CAPTCHA.
+**NOTE ABOUT AUTOMATING:** As of right now, Glassdoor **might** require a human to complete a CAPTCHA. Therefore, when automating with something like cron, you **might** need to be **physically present** to complete the Glassdoor CAPTCHA.
 
 You may also of course disable the Glassdoor scraper in your `settings.yaml` to not have to complete any CAPTCHA at all:
 ```
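Patch 4 moves the CAPTCHA prompt inside the retry loop (so the user is only prompted when the `jobsCount` element really is missing) and reads the posting date from the markup the Selenium-rendered page serves. A condensed sketch of that flow is below; the CSS classes are the ones the diff targets, `get_webdriver` is the helper from `jobfunnel/tools/tools.py`, and the search URL plus the `li`/`jl` listing selector are assumptions for illustration.

```python
# Condensed sketch of the dynamic-scraping flow set up in patch 4 (not a drop-in
# replacement for the scraper's scrape() method). Assumed: the placeholder search URL
# and the 'li'/'jl' listing selector; the other class names are taken from the diff.
from bs4 import BeautifulSoup
from jobfunnel.tools.tools import get_webdriver

driver = get_webdriver()
driver.get('https://www.glassdoor.ca/Job/jobs.htm')  # placeholder search URL

soup = BeautifulSoup(driver.page_source, 'lxml')
while soup.find('p', class_='jobsCount') is None:
    # Glassdoor may be serving a CAPTCHA page; let the user solve it, then re-parse.
    input('Solve the CAPTCHA in the browser window, then press Enter to continue...')
    soup = BeautifulSoup(driver.page_source, 'lxml')

for listing in soup.find_all('li', class_='jl'):  # per-listing container (assumed)
    try:
        # the dynamically rendered date element the patch switches to
        date = listing.find(
            'div', class_='d-flex align-items-end pl-std minor css-65p68w').text.strip()
    except AttributeError:
        date = ''  # listing without a visible date
    print(date)
```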
From 31be644e24ff5848bb199de4ff6eaf017616ed83 Mon Sep 17 00:00:00 2001
From: Lorenzo Gomez
Date: Tue, 19 May 2020 13:11:27 -0400
Subject: [PATCH 5/5] -Added Accept-Language 'en-CA' header to glassdoor.py

-Added missing assignment to get_webdriver
---
 jobfunnel/__init__.py    |  2 +-
 jobfunnel/glassdoor.py   | 18 +-----------------
 jobfunnel/tools/tools.py |  2 +-
 3 files changed, 3 insertions(+), 19 deletions(-)

diff --git a/jobfunnel/__init__.py b/jobfunnel/__init__.py
index bc6379c1..7042a396 100644
--- a/jobfunnel/__init__.py
+++ b/jobfunnel/__init__.py
@@ -1 +1 @@
-__version__ = '2.1.7'
+__version__ = '2.1.8'
diff --git a/jobfunnel/glassdoor.py b/jobfunnel/glassdoor.py
index 59e38a6c..ac85e464 100644
--- a/jobfunnel/glassdoor.py
+++ b/jobfunnel/glassdoor.py
@@ -25,19 +25,7 @@ def __init__(self, args):
             'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,'
             'image/webp,*/*;q=0.01',
             'accept-encoding': 'gzip, deflate, sdch, br',
-            'accept-language': 'en-GB,en-US;q=0.8,en;q=0.6',
-            'referer': 'https://www.glassdoor.{0}/'.format(
-                self.search_terms['region']['domain']),
-            'upgrade-insecure-requests': '1',
-            'user-agent': self.user_agent,
-            'Cache-Control': 'no-cache',
-            'Connection': 'keep-alive'
-        }
-        self.location_headers = {
-            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,'
-            'image/webp,*/*;q=0.01',
-            'accept-encoding': 'gzip, deflate, sdch, br',
+            'accept-language': 'en-GB,en-CA,en-US;q=0.8,en;q=0.6',
             'referer': 'https://www.glassdoor.{0}/'.format(
                 self.search_terms['region']['domain']),
             'upgrade-insecure-requests': '1',
             'user-agent': self.user_agent,
             'Cache-Control': 'no-cache',
             'Connection': 'keep-alive'
@@ -288,10 +276,6 @@ def scrape(self):
             job['tags'] = ''
 
             try:
-                # static way of fetching date
-                # job['date'] = s.find('div', attrs={'class', 'jobLabels'}).find(
-                #     'span', attrs={'class', 'jobLabel nowrap'}).text.strip()
-
                 # dynamic way of fetching date
                 job['date'] = s.find('div', attrs={
                     'class', 'd-flex align-items-end pl-std minor css-65p68w'}).text.strip()
diff --git a/jobfunnel/tools/tools.py b/jobfunnel/tools/tools.py
index efeb10b7..2412103d 100644
--- a/jobfunnel/tools/tools.py
+++ b/jobfunnel/tools/tools.py
@@ -156,7 +156,7 @@ def get_webdriver():
             executable_path=GeckoDriverManager().install())
     except Exception:
         try:
-            webdriver.Chrome(ChromeDriverManager().install())
+            driver = webdriver.Chrome(ChromeDriverManager().install())
         except Exception:
             try:
                 driver = webdriver.Ie(IEDriverManager().install())
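The tools.py change in patch 5 is small but important: `webdriver.Chrome(...)` was being constructed without binding the result, so a successful Chrome fallback was thrown away. A minimal sketch of the same try-each-browser pattern, with every branch assigning its driver, is below; only three of the fallbacks are shown, and the driver-manager classes are the same `webdriver_manager` ones the diff already imports.

```python
# Minimal sketch of the get_webdriver() fallback pattern with the fix from patch 5:
# every branch binds its driver before it is returned. Only three fallbacks are shown;
# the real helper in jobfunnel/tools/tools.py tries further browsers.
from selenium import webdriver
from webdriver_manager.firefox import GeckoDriverManager
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.microsoft import IEDriverManager


def get_webdriver_sketch():
    """Return the first webdriver that initializes successfully, trying browsers in order."""
    try:
        driver = webdriver.Firefox(
            executable_path=GeckoDriverManager().install())
    except Exception:
        try:
            # the line patch 5 fixes: the result must be assigned, not just constructed
            driver = webdriver.Chrome(ChromeDriverManager().install())
        except Exception:
            driver = webdriver.Ie(IEDriverManager().install())
    return driver
```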