From 82138c0cbd665aa257c515430afd5deb9a806738 Mon Sep 17 00:00:00 2001
From: Benjamin Sherman
Date: Sat, 31 Oct 2020 16:29:47 -0400
Subject: [PATCH] Switch catalog and faculty scraper to aiohttp

---
 catalog_scraper/main.py           | 289 ++++++++++++++++--------
 faculty_directory_scraper/main.py |  62 +++++--
 requirements.txt                  |   1 +
 3 files changed, 195 insertions(+), 157 deletions(-)

diff --git a/catalog_scraper/main.py b/catalog_scraper/main.py
index 2face3d..f7aceb1 100644
--- a/catalog_scraper/main.py
+++ b/catalog_scraper/main.py
@@ -9,111 +9,114 @@ from typing import Tuple, List
 
 
-
-def scrapePage(url, data):
-    response = requests.get(url)
-    soup = BeautifulSoup(response.text.encode("utf8"), "lxml")
-
-    rows = soup.find(
-        "div", {"id": "advanced_filter_section"}
-    ).nextSibling.nextSibling.findAll("tr")
-    final_row = None
-    for row in rows:
-        final_row = row
-        if len(row.findAll("td")) <= 1:
-            continue
-        data_url_end = (
-            row.findAll("td")[1]
-            .findChildren("a", recursive=False)[0]["href"]
-            .split("?")[1]
-        )
-        data_url = f"http://catalog.rpi.edu/preview_course.php?{data_url_end}&print"
-        # print(data_url)
-
-        course_results = requests.get(data_url)
-        data_soup = BeautifulSoup(course_results.text.encode("utf8"), "lxml")
-        course = data_soup.find("h1").contents[0].split("-")
-        course_code = course[0].split()
-        key = course_code[0].strip() + "-" + course_code[1].strip()
-        data[key] = {}
-        data[key]["subj"] = course_code[0].strip()
-        data[key]["crse"] = course_code[1].strip()
-        data[key]["name"] = course[1].strip()
-        # data[key]['url'] = data_url
-        # data[key]['coid'] = data_url_end.split('=')[-1]
-
-        description = data_soup.find("hr")
-        if description:
-            description = description.parent.encode_contents().decode().strip()
-            description = re.split("<\/?hr ?\/?>", description)[1]
-            description = re.split("<\/?br ?\/?>\s*", description)[0]
-            description = re.sub("<.*?>", "", description)
-            data[key]["description"] = description.strip()
-
-        # when_offered = data_soup.find('strong', text='When Offered:')
-        # if when_offered:
-        #     data[key]['when_offered'] = when_offered.nextSibling.strip()
-        #
-        # cross_listed = data_soup.find('strong', text='Cross Listed:')
-        # if cross_listed:
-        #     data[key]['cross_listed'] = cross_listed.nextSibling.strip()
-        #
-        # pre_req = data_soup.find('strong', text='Prerequisites/Corequisites:')
-        # if pre_req:
-        #     data[key]['pre_req'] = pre_req.nextSibling.strip()
-        #
-        # credit_hours = data_soup.find('em', text='Credit Hours:')
-        # if credit_hours:
-        #     credit_hours = credit_hours.nextSibling.nextSibling.text.strip()
-        #     if(credit_hours == 'Variable'):
-        #         data[key]['credit_hours_max'] = 0
-        #         data[key]['credit_hours_min'] = 999
-        #     else:
-        #         data[key]['credit_hours'] = credit_hours
-
-    next_page = final_row.findChildren("strong")[0].findNext("a", recursive=False)
-    if next_page["href"] != "#" and next_page["href"] != "javascript:void(0);":
-        return BASE_URL + next_page["href"]
-    return None
+import asyncio
+import aiohttp
+
+
+async def scrapePage(s, url, data):
+    async with s.get(url) as response:
+        response_text = await response.text()
+        soup = BeautifulSoup(response_text.encode("utf8"), "lxml")
+
+        rows = soup.find(
+            "div", {"id": "advanced_filter_section"}
+        ).nextSibling.nextSibling.findAll("tr")
+        final_row = None
+        for row in rows:
+            final_row = row
+            if len(row.findAll("td")) <= 1:
+                continue
+            data_url_end = (
+                row.findAll("td")[1]
+                .findChildren("a", recursive=False)[0]["href"]
+                .split("?")[1]
+            )
+            data_url = f"http://catalog.rpi.edu/preview_course.php?{data_url_end}&print"
+            # print(data_url)
+
+            async with s.get(data_url) as course_results:
+                results_text = await course_results.text()
+                data_soup = BeautifulSoup(results_text.encode("utf8"), "lxml")
+                course = data_soup.find("h1").contents[0].split("-")
+                course_code = course[0].split()
+                key = course_code[0].strip() + "-" + course_code[1].strip()
+                data[key] = {}
+                data[key]["subj"] = course_code[0].strip()
+                data[key]["crse"] = course_code[1].strip()
+                data[key]["name"] = course[1].strip()
+                # data[key]['url'] = data_url
+                # data[key]['coid'] = data_url_end.split('=')[-1]
+
+                description = data_soup.find("hr")
+                if description:
+                    description = description.parent.encode_contents().decode().strip()
+                    description = re.split("<\/?hr ?\/?>", description)[1]
+                    description = re.split("<\/?br ?\/?>\s*", description)[0]
+                    description = re.sub("<.*?>", "", description)
+                    data[key]["description"] = description.strip()
+
+                # when_offered = data_soup.find('strong', text='When Offered:')
+                # if when_offered:
+                #     data[key]['when_offered'] = when_offered.nextSibling.strip()
+                #
+                # cross_listed = data_soup.find('strong', text='Cross Listed:')
+                # if cross_listed:
+                #     data[key]['cross_listed'] = cross_listed.nextSibling.strip()
+                #
+                # pre_req = data_soup.find('strong', text='Prerequisites/Corequisites:')
+                # if pre_req:
+                #     data[key]['pre_req'] = pre_req.nextSibling.strip()
+                #
+                # credit_hours = data_soup.find('em', text='Credit Hours:')
+                # if credit_hours:
+                #     credit_hours = credit_hours.nextSibling.nextSibling.text.strip()
+                #     if(credit_hours == 'Variable'):
+                #         data[key]['credit_hours_max'] = 0
+                #         data[key]['credit_hours_min'] = 999
+                #     else:
+                #         data[key]['credit_hours'] = credit_hours
+
+        next_page = final_row.findChildren("strong")[0].findNext("a", recursive=False)
+        if next_page["href"] != "#" and next_page["href"] != "javascript:void(0);":
+            return BASE_URL + next_page["href"]
+        return None
 
 
 BASE_URL = "http://catalog.rpi.edu"
-catalog_home = requests.get("http://catalog.rpi.edu/")
-catalog_home_soup = BeautifulSoup(catalog_home.text.encode("utf8"), "lxml")
-next_url = catalog_home_soup("a", text="Courses")[0]["href"]
-
-
-def get_schools(url):
-    homepage = requests.get(url)
-    soup = BeautifulSoup(homepage.text.encode("utf8"), "lxml")
-    schools = soup.find("h3", text="Four-Letter Subject Codes by School")
-    num_schools = len(
-        list(
-            filter(lambda x: str(x).strip(), schools.next_siblings),
+
+
+async def get_schools(s, url):
+    async with s.get(url) as homepage:
+        homepage_text = await homepage.text()
+        soup = BeautifulSoup(homepage_text.encode("utf8"), "lxml")
+        schools = soup.find("h3", text="Four-Letter Subject Codes by School")
+        num_schools = len(
+            list(
+                filter(lambda x: str(x).strip(), schools.next_siblings),
+            )
         )
-    )
-
-    school = schools
-    data = {}
-    departments = set()
-    for _ in range(num_schools):
-        school = school.findNext("p")
-
-        strings = list(school.stripped_strings)
-        school_title = strings[0]
-        school_name_end = school_title.index("(") - 1
-        school_name = school_title[:school_name_end]
-        if school_name not in data:
-            data[school_name] = []
-
-        for dept in strings[1:]:
-            first_space = dept.index(" ")
-            code = dept[:first_space]
-            name = dept[first_space + 1 :]
-            if code not in departments:
-                data[school_name].append({"code": code, "name": name})
-                departments.add(code)
-    return data
+
+        school = schools
+        data = {}
+        departments = set()
+        for _ in range(num_schools):
+            school = school.findNext("p")
+
+            strings = list(school.stripped_strings)
+            school_title = strings[0]
+            school_name_end = school_title.index("(") - 1
+            school_name = school_title[:school_name_end]
+            if school_name not in data:
+                data[school_name] = []
+
+            for dept in strings[1:]:
+                first_space = dept.index(" ")
+                code = dept[:first_space]
+                name = dept[first_space + 1 :]
+                if code not in departments:
+                    data[school_name].append({"code": code, "name": name})
+                    departments.add(code)
+        return data
 
 
 def calculate_score(columns):
@@ -178,36 +181,43 @@ def optimize_ordering(data, num_columns=3):
 }
 
 # returns [(year, courses url, schools url)]
-def get_years() -> List[Tuple[str, str, str]]:
-    homepage = requests.get(f"{BASE_URL}/index.php")
-    home_soup = BeautifulSoup(catalog_home.text.encode("utf8"), "lxml")
-    dropdown_entries = home_soup.find(
-        "select", {"title": "Select a Catalog"}
-    ).findChildren("option", recursive=False)
-
-    dropdown_mapped = map(lambda x: (x["value"], x.string), dropdown_entries)
-    dropdown_formatted = map(lambda x: (x[0], x[1].split(" [")[0]), dropdown_mapped)
-    dropdown_formatted = map(
-        lambda x: (x[0], x[1].split("Catalog ")[1]), dropdown_formatted
-    )
-
-    ret = []
-
-    for val, year in dropdown_formatted:
-        year_home = requests.post(
-            f"{BASE_URL}/index.php",
-            headers=HEADERS,
-            data={"catalog": val, "sel_cat_submit": "GO"},
-        )
-        year_home_soup = BeautifulSoup(year_home.text.encode("utf8"), "lxml")
-        courses_url = year_home_soup("a", text="Courses")[0]["href"]
-        schools_url = year_home_soup("a", text="Subject Codes")[0]["href"]
-        ret.append((year, BASE_URL + courses_url, BASE_URL + schools_url))
-
-    return ret
-
-
-def parse_year(year_data):
+async def get_years() -> List[Tuple[str, str, str]]:
+    async with aiohttp.ClientSession() as s:
+        async with s.get(f"{BASE_URL}/index.php") as homepage:
+            homepage_text = await homepage.text()
+            home_soup = BeautifulSoup(homepage_text.encode("utf8"), "lxml")
+            dropdown_entries = home_soup.find(
+                "select", {"title": "Select a Catalog"}
+            ).findChildren("option", recursive=False)
+
+            dropdown_mapped = map(lambda x: (x["value"], x.string), dropdown_entries)
+            dropdown_formatted = map(
+                lambda x: (x[0], x[1].split(" [")[0]), dropdown_mapped
+            )
+            dropdown_formatted = map(
+                lambda x: (x[0], x[1].split("Catalog ")[1]), dropdown_formatted
+            )
+
+            ret = []
+
+            for val, year in dropdown_formatted:
+                async with s.post(
+                    f"{BASE_URL}/index.php",
+                    headers=HEADERS,
+                    data={"catalog": val, "sel_cat_submit": "GO"},
+                ) as year_home:
+                    year_home_text = await year_home.text()
+                    year_home_soup = BeautifulSoup(
+                        year_home_text.encode("utf8"), "lxml"
+                    )
+                    courses_url = year_home_soup("a", text="Courses")[0]["href"]
+                    schools_url = year_home_soup("a", text="Subject Codes")[0]["href"]
+                    ret.append((year, BASE_URL + courses_url, BASE_URL + schools_url))
+
+            return ret
+
+
+async def parse_year(s, year_data):
     year, courses_url, schools_url = year_data
 
     if sys.argv[1] == "catalog":
@@ -215,9 +225,9 @@ def parse_year(year_data):
         while True:
             if courses_url is None:
                 break
-            courses_url = scrapePage(courses_url, data)
+            courses_url = await scrapePage(s, courses_url, data)
     else:
-        data = get_schools(schools_url)
+        data = await get_schools(s, schools_url)
 
         data = list(map(lambda x: {"name": x[0], "depts": x[1]}, data.items()))
         data = optimize_ordering(data)
@@ -229,7 +239,12 @@ def parse_year(year_data):
         json.dump(data, outfile, sort_keys=False, indent=2)
 
 
-years = get_years()
+async def parse_years(years_data):
+    async with aiohttp.ClientSession() as s:
+        await asyncio.gather(*(parse_year(s, year_data) for year_data in years_data))
+
+
+years = asyncio.run(get_years())
 
 if len(sys.argv) == 1:
     print(f"USAGE: python3 {sys.argv[0]} (catalog|schools)")
@@ -237,8 +252,6 @@ def parse_year(year_data):
 
 if sys.argv[-1] == "LATEST_YEAR":
     print("Parsing single year")
-    parse_year(years[0])
+    asyncio.run(parse_years(years[:1]))
 else:
-    for year in tqdm(years):
-        print(f"Parsing {year[0]}")
-        parse_year(year)
+    asyncio.run(parse_years(years))
diff --git a/faculty_directory_scraper/main.py b/faculty_directory_scraper/main.py
index c057ef0..70f3c58 100644
--- a/faculty_directory_scraper/main.py
+++ b/faculty_directory_scraper/main.py
@@ -1,28 +1,52 @@
+from concurrent import futures
+import asyncio
+import time
 import requests
+import aiohttp
 from bs4 import BeautifulSoup
 import json
-from tqdm import tqdm
 
-# response = requests.get(url='https://faculty.rpi.edu/data/peoplesearch')
-# soup = BeautifulSoup(response.text.encode('utf8'), "html")
 
+async def get_professor(s, professor, url, data):
+    async with s.get(url) as response:
+        soup = BeautifulSoup(await response.text("utf8"), "lxml")
+        data[professor["node"]["Path"]] = {}
+        professor_data = data[professor["node"]["Path"]]
+        professor_data["name"] = professor["node"]["title"].strip()
 
-faculty = requests.get('https://faculty.rpi.edu/data/peoplesearch').json()
+        # Find all fields on the professor page and append them to the data object
+        for item in soup.findAll("div", {"class": "views-field"}):
+            class_parts = item["class"][1].split("-")
+            if class_parts[len(class_parts) - 1] == "portrait":
+                professor_data[class_parts[len(class_parts) - 1]] = item.find("img")[
+                    "src"
+                ].split("?")[0]
+            elif item.find("div", {"class": "field-content"}):
+                professor_data[class_parts[len(class_parts) - 1]] = " ".join(
+                    item.find("div", {"class": "field-content"}).get_text(" ").split()
+                )
 
-data = {}
-for professor in tqdm(faculty['nodes']):
-    response = requests.get(url=f"https://faculty.rpi.edu{professor['node']['Path']}")
-    soup = BeautifulSoup(response.text.encode('utf8'), "lxml")
-    data[professor['node']['title'].strip()] = {}
-    professor_data = data[professor['node']['title'].strip()]
-    professor_data['url'] = professor['node']['Path']
-    for item in soup.findAll("div", {"class": "views-field"}):
-        class_parts = item['class'][1].split('-')
-        if class_parts[len(class_parts)-1] == 'portrait':
-            professor_data[class_parts[len(class_parts)-1]] = item.find("img")['src'].split('?')[0]
-        elif item.find("div", {"class": "field-content"}):
-            professor_data[class_parts[len(class_parts)-1]] = " ".join(item.find("div", {"class": "field-content"}).get_text(" ").split())
 
+async def main():
+    # Gets a json object that contains all faculty and their associated urls
+    faculty = requests.get("https://faculty.rpi.edu/data/peoplesearch").json()
 
-with open(f"faculty.json", "w") as outfile:
-    json.dump(data , outfile, sort_keys=False, indent=2)
+    data = {}
+    async with aiohttp.ClientSession() as s:
+        await asyncio.gather(
+            *(
+                get_professor(
+                    s,
+                    professor,
+                    f"https://faculty.rpi.edu{professor['node']['Path']}",
+                    data,
+                )
+                for professor in faculty["nodes"]
+            )
+        )
+
+    with open("faculty.json", "w") as outfile:
+        json.dump(data, outfile, sort_keys=True, indent=2)
+
+
+asyncio.run(main())
diff --git a/requirements.txt b/requirements.txt
index a4dc8df..64e5e75 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,3 +3,4 @@ tqdm
 requests
 python-dotenv
 lxml
+aiohttp
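Both scrapers in this patch follow the same concurrency pattern: open one shared aiohttp.ClientSession, pass it into every coroutine, and fan the requests out with asyncio.gather; that is also why parse_year and get_professor now take the session as their first argument instead of creating their own. Below is a minimal, self-contained sketch of that pattern, not the scrapers' actual code; the URL list and the fetch step are placeholders for illustration only.

import asyncio
import aiohttp

# Placeholder URLs for illustration; the real scrapers build their own URL lists.
URLS = ["http://catalog.rpi.edu/", "https://faculty.rpi.edu/"]


async def fetch(session, url, results):
    # Every coroutine reuses the shared session (and its connection pool).
    async with session.get(url) as response:
        results[url] = await response.text()


async def main():
    results = {}
    async with aiohttp.ClientSession() as session:
        # Schedule one task per URL and wait for all of them to finish.
        await asyncio.gather(*(fetch(session, url, results) for url in URLS))
    return results


if __name__ == "__main__":
    pages = asyncio.run(main())
    print({url: len(text) for url, text in pages.items()})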