forked from mgierada/TGSE
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape_sgu_website.py
70 lines (55 loc) · 2.57 KB
/
scrape_sgu_website.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
from selenium import webdriver
import json
import re
import os
class SGU_scrape:
    """Scrape episode metadata from the Skeptic's Guide podcast page.

    Creating an instance starts a headless Chrome session and loads
    ``self.url``; the ``get_*`` methods then read from the loaded page.
    Call :meth:`close` (or use the instance in a ``with`` block) when done
    so the browser process is shut down.
    """

    # Chromedriver location used when ``driver_path`` is not supplied.
    DEFAULT_DRIVER_PATH = (
        '/Users/maciejgierada/Desktop/literature_survey/chromedriver')

    # Anchors whose ``href`` points at each episode's mp3 file.
    _AUDIO_LINKS_XPATH = (
        '//*[starts-with(@id, "mCSB_") and contains(@id, "_container")]'
        '/div[*]/ul/li[2]/a')

    # Heading link whose text carries the latest episode number,
    # e.g. "EPISODE #818".
    _LATEST_EPISODE_XPATH = (
        '/html/body/div[1]/section/div[2]/div[1]/div/div[2]/h1/a')

    # The text between "cast" and ".mp3" in the audio URL is used as the
    # publication date (e.g. ".../skepticast2021-03-13.mp3").
    _DATE_RE = re.compile(r'cast(.*)\.mp3')
    _EPISODE_NUMBER_RE = re.compile(r'\d+')

    def __init__(self, driver_path=None):
        """Start headless Chrome and load the podcast listing page.

        Args:
            driver_path: Path to the chromedriver executable. Defaults to
                ``DEFAULT_DRIVER_PATH``.
        """
        self.url = 'https://www.theskepticsguide.org/podcasts'
        options = webdriver.ChromeOptions()
        options.add_argument('--ignore-certificate-errors')
        options.add_argument('headless')
        options.add_argument('--test-type')
        if driver_path is None:
            driver_path = self.DEFAULT_DRIVER_PATH
        # NOTE(review): newer Selenium 4 releases expect a ``Service``
        # object instead of ``executable_path`` -- confirm against the
        # installed Selenium version.
        self.driver = webdriver.Chrome(
            executable_path=driver_path, options=options)
        try:
            self.driver.get(self.url)
        except Exception:
            # Do not leave an orphaned browser behind if the page load fails.
            self.driver.quit()
            raise

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """Shut down the browser session; safe to call more than once."""
        driver = getattr(self, 'driver', None)
        if driver is not None:
            driver.quit()
            self.driver = None

    def get_publication_date_list(self):
        """Return the publication date of every listed episode.

        Returns:
            list[str]: One date string per audio link, in page order.

        Raises:
            ValueError: If an audio link has no ``href`` or its URL does
                not contain the ``cast<date>.mp3`` pattern.
        """
        # The string locator 'xpath' is accepted by find_elements() in both
        # Selenium 3 and 4; the find_elements_by_* helpers are gone in 4.3+.
        links = self.driver.find_elements('xpath', self._AUDIO_LINKS_XPATH)
        publication_date_list = []
        for link in links:
            audio_url = link.get_attribute('href')
            match = self._DATE_RE.search(audio_url or '')
            if match is None:
                raise ValueError(
                    'cannot extract publication date from audio URL '
                    '{!r}'.format(audio_url))
            publication_date_list.append(match.group(1))
        return publication_date_list

    def get_latest_episode_number(self):
        """Return the number of the most recent episode as an ``int``.

        The number is the first run of digits in the heading text, so
        "EPISODE #818" yields ``818``.

        Raises:
            ValueError: If the heading text contains no digits.
        """
        heading = self.driver.find_element(
            'xpath', self._LATEST_EPISODE_XPATH).text
        match = self._EPISODE_NUMBER_RE.search(heading)
        if match is None:
            raise ValueError(
                'no episode number found in heading {!r}'.format(heading))
        return int(match.group())

    def get_all_podcasts_data(self):
        """Build a mapping of episode number to its date and links.

        Dates are paired, in page order, with episode numbers counting
        down from ``latest_episode_number - 1`` to 1; pairing stops at the
        shorter of the two sequences.

        Returns:
            dict[int, dict[str, str]]: For each episode number, a dict with
            keys ``date_published``, ``link_to_mp3`` and
            ``link_to_podcast``.
        """
        # NOTE(review): this assumes the scraped list starts at the episode
        # just before the latest one and has no gaps -- confirm against the
        # live page.
        publication_date_list = self.get_publication_date_list()
        latest_episode_number = self.get_latest_episode_number()
        episode_numbers = range(latest_episode_number - 1, 0, -1)

        podcast_info = {}
        for date, ep in zip(publication_date_list, episode_numbers):
            podcast_info[ep] = {
                'date_published': date,
                'link_to_mp3': (
                    'https://media.libsyn.com/media/skepticsguide/'
                    'skepticast{}.mp3'.format(date)),
                'link_to_podcast': (
                    'https://www.theskepticsguide.org/podcasts/'
                    'episode-{}'.format(ep)),
            }
        return podcast_info

    def get_json(self, json_fname='all_podcasts_data.json'):
        """Write all podcast data to ``<cwd>/source/<json_fname>``.

        The ``source`` directory is created if it does not exist.

        Args:
            json_fname: Name of the JSON file to write.

        Returns:
            str: Path of the file that was written.
        """
        podcast_info = self.get_all_podcasts_data()
        path_to_json_dir = os.path.join(os.getcwd(), 'source')
        os.makedirs(path_to_json_dir, exist_ok=True)
        path_to_json = os.path.join(path_to_json_dir, json_fname)
        with open(path_to_json, 'w', encoding='utf-8') as f:
            json.dump(podcast_info, f, indent=4)
        return path_to_json
# Script entry point: scrape the podcast page and write the JSON file.
if __name__ == '__main__':
    scraper = SGU_scrape()
    try:
        scraper.get_json()
    finally:
        # Always shut down the headless browser, even if scraping fails,
        # so no chromedriver/Chrome process is left running.
        scraper.driver.quit()