diff --git a/catalog_scraper/main.py b/catalog_scraper/main.py index f7aceb1..72d74f1 100644 --- a/catalog_scraper/main.py +++ b/catalog_scraper/main.py @@ -5,7 +5,6 @@ import re import sys from tqdm import tqdm -from copy import deepcopy from typing import Tuple, List @@ -15,8 +14,7 @@ async def scrapePage(s, url, data): async with s.get(url) as response: - response_text = await response.text() - soup = BeautifulSoup(response_text.encode("utf8"), "lxml") + soup = BeautifulSoup(await response.text("utf8"), "lxml") rows = soup.find( "div", {"id": "advanced_filter_section"} @@ -35,8 +33,7 @@ async def scrapePage(s, url, data): # print(data_url) async with s.get(data_url) as course_results: - results_text = await course_results.text() - data_soup = BeautifulSoup(results_text.encode("utf8"), "lxml") + data_soup = BeautifulSoup(await course_results.text("utf8"), "lxml") course = data_soup.find("h1").contents[0].split("-") course_code = course[0].split() key = course_code[0].strip() + "-" + course_code[1].strip() @@ -87,8 +84,7 @@ async def scrapePage(s, url, data): async def get_schools(s, url): async with s.get(url) as homepage: - homepage_text = await homepage.text() - soup = BeautifulSoup(homepage_text.encode("utf8"), "lxml") + soup = BeautifulSoup(await homepage.text("utf8"), "lxml") schools = soup.find("h3", text="Four-Letter Subject Codes by School") num_schools = len( list( @@ -119,63 +115,6 @@ async def get_schools(s, url): return data -def calculate_score(columns): - if not columns: - return 99999999999 # some arbitrarily large number - - def column_sum(column): - return sum(map(lambda x: len(x["depts"]), column)) - - mean = sum(map(column_sum, columns)) / len(columns) - return sum(map(lambda x: abs(mean - column_sum(x)), columns)) / len(columns) - - -# Recursively finds the most balanced set of columns. -# Since `best` needs to be passed by reference, it's -# actually [best], so we only manipulate best[0]. -def optimize_ordering_inner(data, i, columns, best): - if i == len(data): - this_score = calculate_score(columns) - best_score = calculate_score(best[0]) - - if this_score < best_score: - best[0] = deepcopy(columns) - return - - for column in columns: - column.append(data[i]) - optimize_ordering_inner(data, i + 1, columns, best) - column.pop() - - -def optimize_ordering(data, num_columns=3): - """ - Because we want the QuACS homepage to be as "square-like" as possible, - we need to re-order departments in such a way that once they're laid out - in multiple columns, each column is a similar height. 
- """ - - columns = [[] for _ in range(num_columns)] - best_result = [[]] - - optimize_ordering_inner(data, 0, columns, best_result) - - best_result = best_result[0] - - for i in range(len(best_result)): - best_result[i] = sorted( - best_result[i], key=lambda s: len(s["depts"]), reverse=True - ) - - best_result = sorted(best_result, key=lambda c: len(c[0]["depts"]), reverse=True) - - flattened = [] - for column in best_result: - flattened.extend(column) - - return flattened - - HEADERS = { "Content-Type": "application/x-www-form-urlencoded", } @@ -229,7 +168,6 @@ async def parse_year(s, year_data): else: data = await get_schools(s, schools_url) data = list(map(lambda x: {"name": x[0], "depts": x[1]}, data.items())) - data = optimize_ordering(data) years = year.split("-") for directory in (f"{years[0]}09", f"{years[1]}01", f"{years[1]}05"): @@ -241,7 +179,8 @@ async def parse_year(s, year_data): async def parse_years(years_data): async with aiohttp.ClientSession() as s: - await asyncio.gather(*(parse_year(s, year_data) for year_data in years_data)) + for year_data in years_data: + await parse_year(s, year_data) years = asyncio.run(get_years()) diff --git a/faculty_directory_scraper/main.py b/faculty_directory_scraper/main.py index 70f3c58..6a1d272 100644 --- a/faculty_directory_scraper/main.py +++ b/faculty_directory_scraper/main.py @@ -33,17 +33,13 @@ async def main(): data = {} async with aiohttp.ClientSession() as s: - await asyncio.gather( - *( - get_professor( - s, - professor, - f"https://faculty.rpi.edu{professor['node']['Path']}", - data, - ) - for professor in faculty["nodes"] + for professor in faculty["nodes"]: + await get_professor( + s, + professor, + f"https://faculty.rpi.edu{professor['node']['Path']}", + data, ) - ) with open("faculty.json", "w") as outfile: json.dump(data, outfile, sort_keys=True, indent=2) diff --git a/sis_scraper/main.py b/sis_scraper/main.py index df79eef..a7760f5 100644 --- a/sis_scraper/main.py +++ b/sis_scraper/main.py @@ -8,6 +8,7 @@ import math from tqdm import tqdm import urllib.parse +from copy import deepcopy load_dotenv() @@ -139,9 +140,66 @@ def toTitle(text): return text +def calculate_score(columns): + if not columns: + return 99999999999 # some arbitrarily large number + + def column_sum(column): + return sum(map(lambda x: len(x["depts"]) + 3, column)) + + mean = sum(map(column_sum, columns)) / len(columns) + return sum(map(lambda x: abs(mean - column_sum(x)), columns)) / len(columns) + + +# Recursively finds the most balanced set of columns. +# Since `best` needs to be passed by reference, it's +# actually [best], so we only manipulate best[0]. +def optimize_ordering_inner(data, i, columns, best): + if i == len(data): + this_score = calculate_score(columns) + best_score = calculate_score(best[0]) + + if this_score < best_score: + best[0] = deepcopy(columns) + return + + for column in columns: + column.append(data[i]) + optimize_ordering_inner(data, i + 1, columns, best) + column.pop() + + +def optimize_column_ordering(data, num_columns=3): + """ + Because we want the QuACS homepage to be as "square-like" as possible, + we need to re-order departments in such a way that once they're laid out + in multiple columns, each column is a similar height. 
+ """ + + columns = [[] for _ in range(num_columns)] + best_result = [[]] + + optimize_ordering_inner(data, 0, columns, best_result) + + best_result = best_result[0] + + for i in range(len(best_result)): + best_result[i] = sorted( + best_result[i], key=lambda s: len(s["depts"]), reverse=True + ) + + best_result = sorted(best_result, key=lambda c: len(c[0]["depts"]), reverse=True) + + flattened = [] + for column in best_result: + flattened.extend(column) + + return flattened + + payload = f'sid={os.getenv("RIN")}&PIN={urllib.parse.quote(os.getenv("PASSWORD"))}' headers = {"Content-Type": "application/x-www-form-urlencoded"} -with requests.Session() as s: +with requests.Session() as s: # We purposefully don't use aiohttp here since SIS doesn't like multiple logged in connections s.get(url="https://sis.rpi.edu/rss/twbkwbis.P_WWWLogin") response = s.request( "POST", @@ -184,9 +242,6 @@ def toTitle(text): data = [] - print(response) - print(response.text) - print(term) # print(response.text.encode('utf8')) soup = BeautifulSoup(response.text.encode("utf8"), "html.parser") table = soup.findAll("table", {"class": "datadisplaytable"})[0] @@ -299,11 +354,29 @@ def toTitle(text): # data = reformatJson(data) # print(json.dumps(data,sort_keys=False,indent=2)) - with open( - f"data/{term}/courses.json", "w" - ) as outfile: # -{os.getenv("CURRENT_TERM")} + with open(f"data/{term}/courses.json", "w") as outfile: json.dump(data, outfile, sort_keys=False, indent=2) + # Remove schools which have no courses, then format it for the homepage + with open(f"data/{term}/schools.json", "r") as all_schools_f: + all_schools = json.load(all_schools_f) + + schools = [] + for possible_school in all_schools: + res_school = {"name": possible_school["name"], "depts": []} + for target_dept in possible_school["depts"]: + matching_depts = list( + filter(lambda d: d["code"] == target_dept["code"], data) + ) + if matching_depts: + res_school["depts"].append(target_dept) + if res_school["depts"]: + schools.append(res_school) + + school_columns = optimize_column_ordering(schools) + with open(f"data/{term}/schools.json", "w") as schools_f: + json.dump(school_columns, schools_f, sort_keys=False, indent=2) + # Generate binary conflict output # (32bit crn + 3*64bit conflicts 5am-midnight(by 30min))for every course TIME_START = 700
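A note on the column-balancing code moved into sis_scraper/main.py above: optimize_column_ordering brute-forces every assignment of schools to columns (via optimize_ordering_inner) and keeps the layout whose column heights deviate least from the mean, where a school's height is its department count plus a fixed header cost (the `+ 3` in calculate_score). The sketch below is a minimal, standalone illustration of that idea for reference only; the helper names (column_height, imbalance, balance) and the sample school data are invented for the example and are not part of this patch.

from itertools import product

def column_height(column):
    # A school's rendered height: one row per department plus a fixed
    # header cost, mirroring `len(x["depts"]) + 3` in calculate_score.
    return sum(len(school["depts"]) + 3 for school in column)

def imbalance(columns):
    # Mean absolute deviation of column heights; lower is more balanced.
    heights = [column_height(c) for c in columns]
    mean = sum(heights) / len(heights)
    return sum(abs(mean - h) for h in heights) / len(heights)

def balance(schools, num_columns=3):
    # Brute force: try every assignment of schools to columns and keep
    # the most balanced one (the patch performs the same search recursively).
    best, best_score = None, float("inf")
    for assignment in product(range(num_columns), repeat=len(schools)):
        columns = [[] for _ in range(num_columns)]
        for school, col in zip(schools, assignment):
            columns[col].append(school)
        score = imbalance(columns)
        if score < best_score:
            best, best_score = columns, score
    return best

# Toy input: each school has a name and a list of departments
# (simple stand-ins for the dept objects written to schools.json).
schools = [
    {"name": "Science", "depts": ["BIOL", "CHEM", "PHYS", "MATH"]},
    {"name": "Engineering", "depts": ["CSCI", "ECSE", "MANE"]},
    {"name": "HASS", "depts": ["ARTS", "COGS"]},
    {"name": "Architecture", "depts": ["ARCH"]},
]
for i, column in enumerate(balance(schools)):
    print(i, [s["name"] for s in column], column_height(column))

The search is exponential in the number of schools, which is acceptable here because a term only lists a handful of schools; the real function additionally sorts each resulting column by department count and flattens the columns before they are written out for the homepage.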