
Remove schools which have no courses associated with them
benjaminrsherman committed Nov 1, 2020
1 parent 82138c0 commit 5bb885a
Showing 3 changed files with 91 additions and 83 deletions.
71 changes: 5 additions & 66 deletions catalog_scraper/main.py
@@ -5,7 +5,6 @@
import re
import sys
from tqdm import tqdm
-from copy import deepcopy

from typing import Tuple, List

@@ -15,8 +14,7 @@

async def scrapePage(s, url, data):
async with s.get(url) as response:
-response_text = await response.text()
-soup = BeautifulSoup(response_text.encode("utf8"), "lxml")
+soup = BeautifulSoup(await response.text("utf8"), "lxml")

rows = soup.find(
"div", {"id": "advanced_filter_section"}
@@ -35,8 +33,7 @@ async def scrapePage(s, url, data):
# print(data_url)

async with s.get(data_url) as course_results:
-results_text = await course_results.text()
-data_soup = BeautifulSoup(results_text.encode("utf8"), "lxml")
+data_soup = BeautifulSoup(await course_results.text("utf8"), "lxml")
course = data_soup.find("h1").contents[0].split("-")
course_code = course[0].split()
key = course_code[0].strip() + "-" + course_code[1].strip()
@@ -87,8 +84,7 @@ async def scrapePage(s, url, data):

async def get_schools(s, url):
async with s.get(url) as homepage:
-homepage_text = await homepage.text()
-soup = BeautifulSoup(homepage_text.encode("utf8"), "lxml")
+soup = BeautifulSoup(await homepage.text("utf8"), "lxml")
schools = soup.find("h3", text="Four-Letter Subject Codes by School")
num_schools = len(
list(
@@ -119,63 +115,6 @@ async def get_schools(s, url):
return data


-def calculate_score(columns):
-if not columns:
-return 99999999999 # some arbitrarily large number
-
-def column_sum(column):
-return sum(map(lambda x: len(x["depts"]), column))
-
-mean = sum(map(column_sum, columns)) / len(columns)
-return sum(map(lambda x: abs(mean - column_sum(x)), columns)) / len(columns)
-
-
-# Recursively finds the most balanced set of columns.
-# Since `best` needs to be passed by reference, it's
-# actually [best], so we only manipulate best[0].
-def optimize_ordering_inner(data, i, columns, best):
-if i == len(data):
-this_score = calculate_score(columns)
-best_score = calculate_score(best[0])
-
-if this_score < best_score:
-best[0] = deepcopy(columns)
-return
-
-for column in columns:
-column.append(data[i])
-optimize_ordering_inner(data, i + 1, columns, best)
-column.pop()
-
-
-def optimize_ordering(data, num_columns=3):
-"""
-Because we want the QuACS homepage to be as "square-like" as possible,
-we need to re-order departments in such a way that once they're laid out
-in multiple columns, each column is a similar height.
-"""
-
-columns = [[] for _ in range(num_columns)]
-best_result = [[]]
-
-optimize_ordering_inner(data, 0, columns, best_result)
-
-best_result = best_result[0]
-
-for i in range(len(best_result)):
-best_result[i] = sorted(
-best_result[i], key=lambda s: len(s["depts"]), reverse=True
-)
-
-best_result = sorted(best_result, key=lambda c: len(c[0]["depts"]), reverse=True)
-
-flattened = []
-for column in best_result:
-flattened.extend(column)
-
-return flattened


HEADERS = {
"Content-Type": "application/x-www-form-urlencoded",
}
@@ -229,7 +168,6 @@ async def parse_year(s, year_data):
else:
data = await get_schools(s, schools_url)
data = list(map(lambda x: {"name": x[0], "depts": x[1]}, data.items()))
-data = optimize_ordering(data)

years = year.split("-")
for directory in (f"{years[0]}09", f"{years[1]}01", f"{years[1]}05"):
@@ -241,7 +179,8 @@

async def parse_years(years_data):
async with aiohttp.ClientSession() as s:
-await asyncio.gather(*(parse_year(s, year_data) for year_data in years_data))
+for year_data in years_data:
+await parse_year(s, year_data)


years = asyncio.run(get_years())
16 changes: 6 additions & 10 deletions faculty_directory_scraper/main.py
@@ -33,17 +33,13 @@ async def main():

data = {}
async with aiohttp.ClientSession() as s:
-await asyncio.gather(
-*(
-get_professor(
-s,
-professor,
-f"https://faculty.rpi.edu{professor['node']['Path']}",
-data,
-)
-for professor in faculty["nodes"]
+for professor in faculty["nodes"]:
+await get_professor(
+s,
+professor,
+f"https://faculty.rpi.edu{professor['node']['Path']}",
+data,
)
-)

with open("faculty.json", "w") as outfile:
json.dump(data, outfile, sort_keys=True, indent=2)
87 changes: 80 additions & 7 deletions sis_scraper/main.py
@@ -8,6 +8,7 @@
import math
from tqdm import tqdm
import urllib.parse
+from copy import deepcopy

load_dotenv()

@@ -139,9 +140,66 @@ def toTitle(text):
return text


+def calculate_score(columns):
+if not columns:
+return 99999999999 # some arbitrarily large number
+
+def column_sum(column):
+return sum(map(lambda x: len(x["depts"]) + 3, column))
+
+mean = sum(map(column_sum, columns)) / len(columns)
+return sum(map(lambda x: abs(mean - column_sum(x)), columns)) / len(columns)
+
+
+# Recursively finds the most balanced set of columns.
+# Since `best` needs to be passed by reference, it's
+# actually [best], so we only manipulate best[0].
+def optimize_ordering_inner(data, i, columns, best):
+if i == len(data):
+this_score = calculate_score(columns)
+best_score = calculate_score(best[0])
+
+if this_score < best_score:
+best[0] = deepcopy(columns)
+return
+
+for column in columns:
+column.append(data[i])
+optimize_ordering_inner(data, i + 1, columns, best)
+column.pop()
+
+
+def optimize_column_ordering(data, num_columns=3):
+"""
+Because we want the QuACS homepage to be as "square-like" as possible,
+we need to re-order departments in such a way that once they're laid out
+in multiple columns, each column is a similar height.
+"""
+
+columns = [[] for _ in range(num_columns)]
+best_result = [[]]
+
+optimize_ordering_inner(data, 0, columns, best_result)
+
+best_result = best_result[0]
+
+for i in range(len(best_result)):
+best_result[i] = sorted(
+best_result[i], key=lambda s: len(s["depts"]), reverse=True
+)
+
+best_result = sorted(best_result, key=lambda c: len(c[0]["depts"]), reverse=True)
+
+flattened = []
+for column in best_result:
+flattened.extend(column)
+
+return flattened
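
For context, here is a hypothetical call to optimize_column_ordering; it is not part of this diff, and the school and department names below are made up for illustration. The input shape mirrors the schools list that the new code at the bottom of this file builds from schools.json:

schools = [
    {"name": "Science", "depts": [{"code": "BIOL"}, {"code": "CHEM"}, {"code": "PHYS"}]},
    {"name": "Engineering", "depts": [{"code": "CSCI"}, {"code": "ECSE"}]},
    {"name": "HASS", "depts": [{"code": "ARTS"}]},
]
# The result is the same schools, flattened so that when the homepage lays
# them back out into three columns, each column has a similar total height.
ordered = optimize_column_ordering(schools, num_columns=3)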


payload = f'sid={os.getenv("RIN")}&PIN={urllib.parse.quote(os.getenv("PASSWORD"))}'
headers = {"Content-Type": "application/x-www-form-urlencoded"}
-with requests.Session() as s:
+with requests.Session() as s: # We purposefully don't use aiohttp here since SIS doesn't like multiple logged in connections
s.get(url="https://sis.rpi.edu/rss/twbkwbis.P_WWWLogin")
response = s.request(
"POST",
@@ -184,9 +242,6 @@ def toTitle(text):

data = []

-print(response)
-print(response.text)
-print(term)
# print(response.text.encode('utf8'))
soup = BeautifulSoup(response.text.encode("utf8"), "html.parser")
table = soup.findAll("table", {"class": "datadisplaytable"})[0]
@@ -299,11 +354,29 @@ def toTitle(text):
# data = reformatJson(data)

# print(json.dumps(data,sort_keys=False,indent=2))
-with open(
-f"data/{term}/courses.json", "w"
-) as outfile: # -{os.getenv("CURRENT_TERM")}
+with open(f"data/{term}/courses.json", "w") as outfile:
json.dump(data, outfile, sort_keys=False, indent=2)

+# Remove schools which have no courses, then format it for the homepage
+with open(f"data/{term}/schools.json", "r") as all_schools_f:
+all_schools = json.load(all_schools_f)
+
+schools = []
+for possible_school in all_schools:
+res_school = {"name": possible_school["name"], "depts": []}
+for target_dept in possible_school["depts"]:
+matching_depts = list(
+filter(lambda d: d["code"] == target_dept["code"], data)
+)
+if matching_depts:
+res_school["depts"].append(target_dept)
+if res_school["depts"]:
+schools.append(res_school)
+
+school_columns = optimize_column_ordering(schools)
+with open(f"data/{term}/schools.json", "w") as schools_f:
+json.dump(school_columns, schools_f, sort_keys=False, indent=2)

# Generate binary conflict output
# (32bit crn + 3*64bit conflicts 5am-midnight(by 30min))for every course
TIME_START = 700
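
The loop that fills this binary output lies outside the changed lines shown here. As a rough, hypothetical sketch of the record layout the comment describes, where the function name, slot indexing, and byte order are assumptions rather than code from this repo:

import struct

SLOT_WORDS = 3  # three 64-bit words of 30-minute slot bits per record

def pack_conflict_record(crn, busy_slots):
    # Hypothetical: set one bit per occupied 30-minute slot, then emit a
    # 32-bit CRN followed by the three 64-bit conflict words.
    bitmap = 0
    for slot in busy_slots:
        bitmap |= 1 << slot
    words = [(bitmap >> (64 * i)) & 0xFFFFFFFFFFFFFFFF for i in range(SLOT_WORDS)]
    return struct.pack(">I3Q", crn, *words)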
