Merge pull request #75 from thebigG/testing

Dynamic web scraping and static web scraping
PaulMcInnis · Jun 17, 2020 · b30b284
2 parents f7e10d3 + df8b8a8
commit b30b284
Show file tree

Hide file tree

Showing 6 changed files with 20 additions and 27 deletions.
diff --git a/jobfunnel/__init__.py b/jobfunnel/__init__.py
@@ -1 +1 @@
-__version__ = '2.1.7'
+__version__ = '2.1.8'
diff --git a/jobfunnel/glassdoor_dynamic.py b/jobfunnel/glassdoor_dynamic.py
@@ -121,21 +121,15 @@ def scrape(self):
 
         # get the html data, initialize bs4 with lxml
         self.driver.get(search)
-        print(
-            "It's very likely that Glassdoor might require you to fill out a CAPTCHA form. Follow these steps if it does ask you to complete a CAPTCHA: "
-            '\n 1.Refresh the glassdoor site in the new browser window that just popped up.\n'
-            ' 2.Then complete the CAPTCHA in the browser.\n 3.Press Enter to continue'
-        )
-        # wait for user to complete CAPTCHA
-        input()
 
         # create the soup base
         soup_base = BeautifulSoup(self.driver.page_source, self.bs4_parser)
-        num_res = soup_base.find('p', attrs={'class', 'jobsCount'})
-        while num_res is None:
-            print(
-                'Looks like something went wrong. \nMake sure you complete the CAPTCHA in the new browser window that just popped up. Try refreshing the page and attempt to complete the CAPTCHA again. '
-            )
+        num_res = soup_base.find('p', attrs={
+            'class', 'jobsCount'})
+        while(num_res is None):
+            print("It looks like that Glassdoor might require you to fill out a CAPTCHA form. Follow these steps if it does ask you to complete a CAPTCHA:"
+                  "\n 1.Refresh the glassdoor site in the new browser window that just popped up.\n" " 2.Then complete the CAPTCHA in the browser.\n 3.Press Enter to continue")
+            # wait for user to complete CAPTCHA
             input()
             soup_base = BeautifulSoup(self.driver.page_source, self.bs4_parser)
             num_res = soup_base.find('p', attrs={'class', 'jobsCount'})
@@ -221,11 +215,9 @@ def scrape(self):
                 job['tags'] = ''
 
             try:
-                job['date'] = (
-                    s.find('div', attrs={'class', 'jobLabels'})
-                    .find('span', attrs={'class', 'jobLabel nowrap'})
-                    .text.strip()
-                )
+                # dynamic way of fetching date
+                job['date'] = s.find('div', attrs={
+                                     'class', 'd-flex align-items-end pl-std minor css-65p68w'}).text.strip()
             except AttributeError:
                 job['date'] = ''
 

diff --git a/jobfunnel/tools/filters.py b/jobfunnel/tools/filters.py
@@ -14,19 +14,19 @@ def date_filter(cur_dict: Dict[str, dict], number_of_days: int):
         cur_dict: today's job scrape dict
         number_of_days: how many days old a job can be
     """
-    if number_of_days<0 or cur_dict is None:
+    if number_of_days < 0 or cur_dict is None:
         return
     print("date_filter running")
     cur_job_ids = [job['id'] for job in cur_dict.values()]
-    #calculate the oldest date a job can be
-    threshold_date  = datetime.now() - timedelta(days=number_of_days)
+    # calculate the oldest date a job can be
+    threshold_date = datetime.now() - timedelta(days=number_of_days)
     for job_id in cur_job_ids:
-        #get the date from job with job_id
+        # get the date from job with job_id
         job_date = datetime.strptime(cur_dict[job_id]['date'], '%Y-%m-%d')
-        #if this job is older than threshold_date, delete it from current scrape
-        if job_date<threshold_date:
+        # if this job is older than threshold_date, delete it from current scrape
+        if job_date < threshold_date:
             logging.info(f"{cur_dict[job_id]['link']} has been filtered out by date_filter because"
-                    f" it is older than {number_of_days} days")
+                         f" it is older than {number_of_days} days")
             del cur_dict[job_id]
 
 

diff --git a/jobfunnel/tools/tools.py b/jobfunnel/tools/tools.py
@@ -79,6 +79,7 @@ def post_date_from_relative_post_age(job_list):
                             logging.error(f"unknown date for job {job['id']}")
         # format date in standard format e.g. 2020-01-01
         job['date'] = post_date.strftime('%Y-%m-%d')
+        # print('job['date']'')
 
 
 def split_url(url):
@@ -155,7 +156,7 @@ def get_webdriver():
             executable_path=GeckoDriverManager().install())
     except Exception:
         try:
-            webdriver.Chrome(ChromeDriverManager().install())
+            driver = webdriver.Chrome(ChromeDriverManager().install())
         except Exception:
             try:
                 driver = webdriver.Ie(IEDriverManager().install())

diff --git a/readme.md b/readme.md
@@ -76,7 +76,7 @@ Filter undesired companies by providing your own `yaml` configuration and adding
   JobFunnel can be easily automated to run nightly with [crontab][cron] <br />
   For more information see the [crontab document][cron_doc].
 
-  **NOTE ABOUT AUTOMATING:** As of right now, Glassdoor requires a human to complete a CAPTCHA. Therefore, in the case of automating with something like cron, you need to be **physically present** to complete the Glassdoor CAPTCHA. 
+  **NOTE ABOUT AUTOMATING:** As of right now, Glassdoor **might** require a human to complete a CAPTCHA. Therefore, in the case of automating with something like cron, you **might** need to be **physically present** to complete the Glassdoor CAPTCHA. 
 
   You may also of course disable the Glassdoor scraper in your `settings.yaml` to not have to complete any CAPTCHA at all:
 ``` 

diff --git a/tests/test_glassdoor.py b/tests/test_glassdoor.py