This repository has been archived by the owner on Jan 8, 2021. It is now read-only.

Commit

Switch catalog and faculty scraper to aiohttp
benjaminrsherman committed Oct 31, 2020
1 parent a35afb6 commit 82138c0
Showing 3 changed files with 195 additions and 157 deletions.
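
In short, the commit swaps each blocking requests.get(...) call for a request made on a shared aiohttp.ClientSession, awaits the response body, and fans independent pages out with asyncio.gather so they download concurrently. A minimal sketch of that conversion pattern, using illustrative names rather than the scrapers' own functions:

    import asyncio

    import aiohttp
    import requests
    from bs4 import BeautifulSoup


    # Before: each fetch blocks the whole script until the response arrives.
    def fetch_page_sync(url):
        response = requests.get(url)
        return BeautifulSoup(response.text.encode("utf8"), "lxml")


    # After: the coroutine yields control while waiting on the network,
    # so other fetches can make progress on the same event loop.
    async def fetch_page(session, url):
        async with session.get(url) as response:
            text = await response.text()
            return BeautifulSoup(text.encode("utf8"), "lxml")


    async def fetch_all(urls):
        # One ClientSession is shared across every request, mirroring how the
        # scrapers pass a single session into scrapePage, get_schools, and
        # get_professor.
        async with aiohttp.ClientSession() as session:
            return await asyncio.gather(*(fetch_page(session, url) for url in urls))


    # Example usage (illustrative URL):
    # soups = asyncio.run(fetch_all(["http://catalog.rpi.edu/index.php"]))
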
289 changes: 151 additions & 138 deletions catalog_scraper/main.py
@@ -9,111 +9,114 @@

from typing import Tuple, List


def scrapePage(url, data):
response = requests.get(url)
soup = BeautifulSoup(response.text.encode("utf8"), "lxml")

rows = soup.find(
"div", {"id": "advanced_filter_section"}
).nextSibling.nextSibling.findAll("tr")
final_row = None
for row in rows:
final_row = row
if len(row.findAll("td")) <= 1:
continue
data_url_end = (
row.findAll("td")[1]
.findChildren("a", recursive=False)[0]["href"]
.split("?")[1]
)
data_url = f"http://catalog.rpi.edu/preview_course.php?{data_url_end}&print"
# print(data_url)

course_results = requests.get(data_url)
data_soup = BeautifulSoup(course_results.text.encode("utf8"), "lxml")
course = data_soup.find("h1").contents[0].split("-")
course_code = course[0].split()
key = course_code[0].strip() + "-" + course_code[1].strip()
data[key] = {}
data[key]["subj"] = course_code[0].strip()
data[key]["crse"] = course_code[1].strip()
data[key]["name"] = course[1].strip()
# data[key]['url'] = data_url
# data[key]['coid'] = data_url_end.split('=')[-1]

description = data_soup.find("hr")
if description:
description = description.parent.encode_contents().decode().strip()
description = re.split("<\/?hr ?\/?>", description)[1]
description = re.split("<\/?br ?\/?>\s*<strong>", description)[0]
description = re.sub("<.*?>", "", description)
data[key]["description"] = description.strip()

# when_offered = data_soup.find('strong', text='When Offered:')
# if when_offered:
# data[key]['when_offered'] = when_offered.nextSibling.strip()
#
# cross_listed = data_soup.find('strong', text='Cross Listed:')
# if cross_listed:
# data[key]['cross_listed'] = cross_listed.nextSibling.strip()
#
# pre_req = data_soup.find('strong', text='Prerequisites/Corequisites:')
# if pre_req:
# data[key]['pre_req'] = pre_req.nextSibling.strip()
#
# credit_hours = data_soup.find('em', text='Credit Hours:')
# if credit_hours:
# credit_hours = credit_hours.nextSibling.nextSibling.text.strip()
# if(credit_hours == 'Variable'):
# data[key]['credit_hours_max'] = 0
# data[key]['credit_hours_min'] = 999
# else:
# data[key]['credit_hours'] = credit_hours

next_page = final_row.findChildren("strong")[0].findNext("a", recursive=False)
if next_page["href"] != "#" and next_page["href"] != "javascript:void(0);":
return BASE_URL + next_page["href"]
return None
import asyncio
import aiohttp


async def scrapePage(s, url, data):
async with s.get(url) as response:
response_text = await response.text()
soup = BeautifulSoup(response_text.encode("utf8"), "lxml")

rows = soup.find(
"div", {"id": "advanced_filter_section"}
).nextSibling.nextSibling.findAll("tr")
final_row = None
for row in rows:
final_row = row
if len(row.findAll("td")) <= 1:
continue
data_url_end = (
row.findAll("td")[1]
.findChildren("a", recursive=False)[0]["href"]
.split("?")[1]
)
data_url = f"http://catalog.rpi.edu/preview_course.php?{data_url_end}&print"
# print(data_url)

async with s.get(data_url) as course_results:
results_text = await course_results.text()
data_soup = BeautifulSoup(results_text.encode("utf8"), "lxml")
course = data_soup.find("h1").contents[0].split("-")
course_code = course[0].split()
key = course_code[0].strip() + "-" + course_code[1].strip()
data[key] = {}
data[key]["subj"] = course_code[0].strip()
data[key]["crse"] = course_code[1].strip()
data[key]["name"] = course[1].strip()
# data[key]['url'] = data_url
# data[key]['coid'] = data_url_end.split('=')[-1]

description = data_soup.find("hr")
if description:
description = description.parent.encode_contents().decode().strip()
description = re.split("<\/?hr ?\/?>", description)[1]
description = re.split("<\/?br ?\/?>\s*<strong>", description)[0]
description = re.sub("<.*?>", "", description)
data[key]["description"] = description.strip()

# when_offered = data_soup.find('strong', text='When Offered:')
# if when_offered:
# data[key]['when_offered'] = when_offered.nextSibling.strip()
#
# cross_listed = data_soup.find('strong', text='Cross Listed:')
# if cross_listed:
# data[key]['cross_listed'] = cross_listed.nextSibling.strip()
#
# pre_req = data_soup.find('strong', text='Prerequisites/Corequisites:')
# if pre_req:
# data[key]['pre_req'] = pre_req.nextSibling.strip()
#
# credit_hours = data_soup.find('em', text='Credit Hours:')
# if credit_hours:
# credit_hours = credit_hours.nextSibling.nextSibling.text.strip()
# if(credit_hours == 'Variable'):
# data[key]['credit_hours_max'] = 0
# data[key]['credit_hours_min'] = 999
# else:
# data[key]['credit_hours'] = credit_hours

next_page = final_row.findChildren("strong")[0].findNext("a", recursive=False)
if next_page["href"] != "#" and next_page["href"] != "javascript:void(0);":
return BASE_URL + next_page["href"]
return None


BASE_URL = "http://catalog.rpi.edu"
catalog_home = requests.get("http://catalog.rpi.edu/")
catalog_home_soup = BeautifulSoup(catalog_home.text.encode("utf8"), "lxml")
next_url = catalog_home_soup("a", text="Courses")[0]["href"]


def get_schools(url):
homepage = requests.get(url)
soup = BeautifulSoup(homepage.text.encode("utf8"), "lxml")
schools = soup.find("h3", text="Four-Letter Subject Codes by School")
num_schools = len(
list(
filter(lambda x: str(x).strip(), schools.next_siblings),


async def get_schools(s, url):
async with s.get(url) as homepage:
homepage_text = await homepage.text()
soup = BeautifulSoup(homepage_text.encode("utf8"), "lxml")
schools = soup.find("h3", text="Four-Letter Subject Codes by School")
num_schools = len(
list(
filter(lambda x: str(x).strip(), schools.next_siblings),
)
)
)

school = schools
data = {}
departments = set()
for _ in range(num_schools):
school = school.findNext("p")

strings = list(school.stripped_strings)
school_title = strings[0]
school_name_end = school_title.index("(") - 1
school_name = school_title[:school_name_end]
if school_name not in data:
data[school_name] = []

for dept in strings[1:]:
first_space = dept.index(" ")
code = dept[:first_space]
name = dept[first_space + 1 :]
if code not in departments:
data[school_name].append({"code": code, "name": name})
departments.add(code)
return data

school = schools
data = {}
departments = set()
for _ in range(num_schools):
school = school.findNext("p")

strings = list(school.stripped_strings)
school_title = strings[0]
school_name_end = school_title.index("(") - 1
school_name = school_title[:school_name_end]
if school_name not in data:
data[school_name] = []

for dept in strings[1:]:
first_space = dept.index(" ")
code = dept[:first_space]
name = dept[first_space + 1 :]
if code not in departments:
data[school_name].append({"code": code, "name": name})
departments.add(code)
return data


def calculate_score(columns):
@@ -178,46 +181,53 @@ def optimize_ordering(data, num_columns=3):
}

# returns [(year, courses url, schools url)]
def get_years() -> List[Tuple[str, str, str]]:
homepage = requests.get(f"{BASE_URL}/index.php")
home_soup = BeautifulSoup(catalog_home.text.encode("utf8"), "lxml")
dropdown_entries = home_soup.find(
"select", {"title": "Select a Catalog"}
).findChildren("option", recursive=False)

dropdown_mapped = map(lambda x: (x["value"], x.string), dropdown_entries)
dropdown_formatted = map(lambda x: (x[0], x[1].split(" [")[0]), dropdown_mapped)
dropdown_formatted = map(
lambda x: (x[0], x[1].split("Catalog ")[1]), dropdown_formatted
)

ret = []

for val, year in dropdown_formatted:
year_home = requests.post(
f"{BASE_URL}/index.php",
headers=HEADERS,
data={"catalog": val, "sel_cat_submit": "GO"},
)
year_home_soup = BeautifulSoup(year_home.text.encode("utf8"), "lxml")
courses_url = year_home_soup("a", text="Courses")[0]["href"]
schools_url = year_home_soup("a", text="Subject Codes")[0]["href"]
ret.append((year, BASE_URL + courses_url, BASE_URL + schools_url))

return ret


def parse_year(year_data):
async def get_years() -> List[Tuple[str, str, str]]:
async with aiohttp.ClientSession() as s:
async with s.get(f"{BASE_URL}/index.php") as homepage:
homepage_text = await homepage.text()
home_soup = BeautifulSoup(homepage_text.encode("utf8"), "lxml")
dropdown_entries = home_soup.find(
"select", {"title": "Select a Catalog"}
).findChildren("option", recursive=False)

dropdown_mapped = map(lambda x: (x["value"], x.string), dropdown_entries)
dropdown_formatted = map(
lambda x: (x[0], x[1].split(" [")[0]), dropdown_mapped
)
dropdown_formatted = map(
lambda x: (x[0], x[1].split("Catalog ")[1]), dropdown_formatted
)

ret = []

for val, year in dropdown_formatted:
async with s.post(
f"{BASE_URL}/index.php",
headers=HEADERS,
data={"catalog": val, "sel_cat_submit": "GO"},
) as year_home:
year_home_text = await year_home.text()
year_home_soup = BeautifulSoup(
year_home_text.encode("utf8"), "lxml"
)
courses_url = year_home_soup("a", text="Courses")[0]["href"]
schools_url = year_home_soup("a", text="Subject Codes")[0]["href"]
ret.append((year, BASE_URL + courses_url, BASE_URL + schools_url))

return ret


async def parse_year(s, year_data):
year, courses_url, schools_url = year_data

if sys.argv[1] == "catalog":
data = {}
while True:
if courses_url is None:
break
courses_url = scrapePage(courses_url, data)
courses_url = await scrapePage(s, courses_url, data)
else:
data = get_schools(schools_url)
data = await get_schools(s, schools_url)
data = list(map(lambda x: {"name": x[0], "depts": x[1]}, data.items()))
data = optimize_ordering(data)

@@ -229,16 +239,19 @@ def parse_year(year_data):
json.dump(data, outfile, sort_keys=False, indent=2)


years = get_years()
async def parse_years(years_data):
async with aiohttp.ClientSession() as s:
await asyncio.gather(*(parse_year(s, year_data) for year_data in years_data))


years = asyncio.run(get_years())

if len(sys.argv) == 1:
print(f"USAGE: python3 {sys.argv[0]} (catalog|schools)")
sys.exit(1)

if sys.argv[-1] == "LATEST_YEAR":
print("Parsing single year")
parse_year(years[0])
asyncio.run(parse_years(years[:1]))
else:
for year in tqdm(years):
print(f"Parsing {year[0]}")
parse_year(year)
asyncio.run(parse_years(years))
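
One design point worth noting in the catalog scraper: pagination within a single catalog year stays sequential, because each call to scrapePage returns the URL of the next results page, while parse_years uses asyncio.gather to walk all of the catalog years concurrently over one shared ClientSession.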
62 changes: 43 additions & 19 deletions faculty_directory_scraper/main.py
@@ -1,28 +1,52 @@
from concurrent import futures
import asyncio
import time
import requests
import aiohttp
from bs4 import BeautifulSoup
import json
from tqdm import tqdm


# response = requests.get(url='https://faculty.rpi.edu/data/peoplesearch')
# soup = BeautifulSoup(response.text.encode('utf8'), "html")
async def get_professor(s, professor, url, data):
async with s.get(url) as response:
soup = BeautifulSoup(await response.text("utf8"), "lxml")
data[professor["node"]["Path"]] = {}
professor_data = data[professor["node"]["Path"]]
professor_data["name"] = professor["node"]["title"].strip()

faculty = requests.get('https://faculty.rpi.edu/data/peoplesearch').json()
# Find all fields on the professor page and append them to the data object
for item in soup.findAll("div", {"class": "views-field"}):
class_parts = item["class"][1].split("-")
if class_parts[len(class_parts) - 1] == "portrait":
professor_data[class_parts[len(class_parts) - 1]] = item.find("img")[
"src"
].split("?")[0]
elif item.find("div", {"class": "field-content"}):
professor_data[class_parts[len(class_parts) - 1]] = " ".join(
item.find("div", {"class": "field-content"}).get_text(" ").split()
)

data = {}
for professor in tqdm(faculty['nodes']):
response = requests.get(url=f"https://faculty.rpi.edu{professor['node']['Path']}")
soup = BeautifulSoup(response.text.encode('utf8'), "lxml")
data[professor['node']['title'].strip()] = {}
professor_data = data[professor['node']['title'].strip()]
professor_data['url'] = professor['node']['Path']
for item in soup.findAll("div", {"class": "views-field"}):
class_parts = item['class'][1].split('-')
if class_parts[len(class_parts)-1] == 'portrait':
professor_data[class_parts[len(class_parts)-1]] = item.find("img")['src'].split('?')[0]
elif item.find("div", {"class": "field-content"}):
professor_data[class_parts[len(class_parts)-1]] = " ".join(item.find("div", {"class": "field-content"}).get_text(" ").split())

async def main():
# Gets a json object that contains all faculty and their associated urls
faculty = requests.get("https://faculty.rpi.edu/data/peoplesearch").json()

with open(f"faculty.json", "w") as outfile:
json.dump(data , outfile, sort_keys=False, indent=2)
data = {}
async with aiohttp.ClientSession() as s:
await asyncio.gather(
*(
get_professor(
s,
professor,
f"https://faculty.rpi.edu{professor['node']['Path']}",
data,
)
for professor in faculty["nodes"]
)
)

with open("faculty.json", "w") as outfile:
json.dump(data, outfile, sort_keys=True, indent=2)


asyncio.run(main())
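
The faculty scraper now launches one get_professor task per entry in the peoplesearch feed, so every profile page is requested at essentially the same time. If that fan-out ever needs to be throttled, one common approach is to gate each fetch behind an asyncio.Semaphore; the sketch below is an illustrative assumption about how that could look, not something this commit does:

    import asyncio

    import aiohttp


    async def fetch_limited(session, semaphore, url):
        # At most `limit` coroutines hold the semaphore at once, so no more
        # than that many HTTP requests are in flight at any moment.
        async with semaphore:
            async with session.get(url) as response:
                return await response.text()


    async def fetch_many(urls, limit=10):
        semaphore = asyncio.Semaphore(limit)
        async with aiohttp.ClientSession() as session:
            return await asyncio.gather(
                *(fetch_limited(session, semaphore, url) for url in urls)
            )
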