
Remove schools which have no courses associated with them
benjaminrsherman committed Nov 1, 2020
1 parent 82138c0 commit 5bb885a
Showing 3 changed files with 91 additions and 83 deletions.
71 changes: 5 additions & 66 deletions catalog_scraper/main.py
@@ -5,7 +5,6 @@
import re
import sys
from tqdm import tqdm
-from copy import deepcopy

from typing import Tuple, List

@@ -15,8 +14,7 @@

async def scrapePage(s, url, data):
async with s.get(url) as response:
-response_text = await response.text()
-soup = BeautifulSoup(response_text.encode("utf8"), "lxml")
+soup = BeautifulSoup(await response.text("utf8"), "lxml")

rows = soup.find(
"div", {"id": "advanced_filter_section"}
@@ -35,8 +33,7 @@ async def scrapePage(s, url, data):
# print(data_url)

async with s.get(data_url) as course_results:
-results_text = await course_results.text()
-data_soup = BeautifulSoup(results_text.encode("utf8"), "lxml")
+data_soup = BeautifulSoup(await course_results.text("utf8"), "lxml")
course = data_soup.find("h1").contents[0].split("-")
course_code = course[0].split()
key = course_code[0].strip() + "-" + course_code[1].strip()
@@ -87,8 +84,7 @@ async def scrapePage(s, url, data):

async def get_schools(s, url):
async with s.get(url) as homepage:
-homepage_text = await homepage.text()
-soup = BeautifulSoup(homepage_text.encode("utf8"), "lxml")
+soup = BeautifulSoup(await homepage.text("utf8"), "lxml")
schools = soup.find("h3", text="Four-Letter Subject Codes by School")
num_schools = len(
list(
@@ -119,63 +115,6 @@ async def get_schools(s, url):
return data


-def calculate_score(columns):
-if not columns:
-return 99999999999 # some arbitrarily large number
-
-def column_sum(column):
-return sum(map(lambda x: len(x["depts"]), column))
-
-mean = sum(map(column_sum, columns)) / len(columns)
-return sum(map(lambda x: abs(mean - column_sum(x)), columns)) / len(columns)
-
-
-# Recursively finds the most balanced set of columns.
-# Since `best` needs to be passed by reference, it's
-# actually [best], so we only manipulate best[0].
-def optimize_ordering_inner(data, i, columns, best):
-if i == len(data):
-this_score = calculate_score(columns)
-best_score = calculate_score(best[0])
-
-if this_score < best_score:
-best[0] = deepcopy(columns)
-return
-
-for column in columns:
-column.append(data[i])
-optimize_ordering_inner(data, i + 1, columns, best)
-column.pop()
-
-
-def optimize_ordering(data, num_columns=3):
-"""
-Because we want the QuACS homepage to be as "square-like" as possible,
-we need to re-order departments in such a way that once they're laid out
-in multiple columns, each column is a similar height.
-"""
-
-columns = [[] for _ in range(num_columns)]
-best_result = [[]]
-
-optimize_ordering_inner(data, 0, columns, best_result)
-
-best_result = best_result[0]
-
-for i in range(len(best_result)):
-best_result[i] = sorted(
-best_result[i], key=lambda s: len(s["depts"]), reverse=True
-)
-
-best_result = sorted(best_result, key=lambda c: len(c[0]["depts"]), reverse=True)
-
-flattened = []
-for column in best_result:
-flattened.extend(column)
-
-return flattened


HEADERS = {
"Content-Type": "application/x-www-form-urlencoded",
}
@@ -229,7 +168,6 @@ async def parse_year(s, year_data):
else:
data = await get_schools(s, schools_url)
data = list(map(lambda x: {"name": x[0], "depts": x[1]}, data.items()))
-data = optimize_ordering(data)

years = year.split("-")
for directory in (f"{years[0]}09", f"{years[1]}01", f"{years[1]}05"):
@@ -241,7 +179,8 @@

async def parse_years(years_data):
async with aiohttp.ClientSession() as s:
-await asyncio.gather(*(parse_year(s, year_data) for year_data in years_data))
+for year_data in years_data:
+await parse_year(s, year_data)


years = asyncio.run(get_years())
16 changes: 6 additions & 10 deletions faculty_directory_scraper/main.py
@@ -33,17 +33,13 @@ async def main():

data = {}
async with aiohttp.ClientSession() as s:
-await asyncio.gather(
-*(
-get_professor(
-s,
-professor,
-f"https://faculty.rpi.edu{professor['node']['Path']}",
-data,
-)
-for professor in faculty["nodes"]
+for professor in faculty["nodes"]:
+await get_professor(
+s,
+professor,
+f"https://faculty.rpi.edu{professor['node']['Path']}",
+data,
)
-)

with open("faculty.json", "w") as outfile:
json.dump(data, outfile, sort_keys=True, indent=2)
87 changes: 80 additions & 7 deletions sis_scraper/main.py
@@ -8,6 +8,7 @@
import math
from tqdm import tqdm
import urllib.parse
+from copy import deepcopy

load_dotenv()

@@ -139,9 +140,66 @@ def toTitle(text):
return text


+def calculate_score(columns):
+if not columns:
+return 99999999999 # some arbitrarily large number
+
+def column_sum(column):
+return sum(map(lambda x: len(x["depts"]) + 3, column))
+
+mean = sum(map(column_sum, columns)) / len(columns)
+return sum(map(lambda x: abs(mean - column_sum(x)), columns)) / len(columns)
+
+
+# Recursively finds the most balanced set of columns.
+# Since `best` needs to be passed by reference, it's
+# actually [best], so we only manipulate best[0].
+def optimize_ordering_inner(data, i, columns, best):
+if i == len(data):
+this_score = calculate_score(columns)
+best_score = calculate_score(best[0])
+
+if this_score < best_score:
+best[0] = deepcopy(columns)
+return
+
+for column in columns:
+column.append(data[i])
+optimize_ordering_inner(data, i + 1, columns, best)
+column.pop()
+
+
+def optimize_column_ordering(data, num_columns=3):
+"""
+Because we want the QuACS homepage to be as "square-like" as possible,
+we need to re-order departments in such a way that once they're laid out
+in multiple columns, each column is a similar height.
+"""
+
+columns = [[] for _ in range(num_columns)]
+best_result = [[]]
+
+optimize_ordering_inner(data, 0, columns, best_result)
+
+best_result = best_result[0]
+
+for i in range(len(best_result)):
+best_result[i] = sorted(
+best_result[i], key=lambda s: len(s["depts"]), reverse=True
+)
+
+best_result = sorted(best_result, key=lambda c: len(c[0]["depts"]), reverse=True)
+
+flattened = []
+for column in best_result:
+flattened.extend(column)
+
+return flattened
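
For context, here is a hypothetical call to optimize_column_ordering; it is not part of this diff, and the school and department names below are made up for illustration. The input shape mirrors the schools list that the new code at the bottom of this file builds from schools.json:

schools = [
    {"name": "Science", "depts": [{"code": "BIOL"}, {"code": "CHEM"}, {"code": "PHYS"}]},
    {"name": "Engineering", "depts": [{"code": "CSCI"}, {"code": "ECSE"}]},
    {"name": "HASS", "depts": [{"code": "ARTS"}]},
]
# The result is the same schools, flattened so that when the homepage lays
# them back out into three columns, each column has a similar total height.
ordered = optimize_column_ordering(schools, num_columns=3)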


payload = f'sid={os.getenv("RIN")}&PIN={urllib.parse.quote(os.getenv("PASSWORD"))}'
headers = {"Content-Type": "application/x-www-form-urlencoded"}
-with requests.Session() as s:
+with requests.Session() as s: # We purposefully don't use aiohttp here since SIS doesn't like multiple logged in connections
s.get(url="https://sis.rpi.edu/rss/twbkwbis.P_WWWLogin")
response = s.request(
"POST",
@@ -184,9 +242,6 @@ def toTitle(text):

data = []

-print(response)
-print(response.text)
-print(term)
# print(response.text.encode('utf8'))
soup = BeautifulSoup(response.text.encode("utf8"), "html.parser")
table = soup.findAll("table", {"class": "datadisplaytable"})[0]
@@ -299,11 +354,29 @@ def toTitle(text):
# data = reformatJson(data)

# print(json.dumps(data,sort_keys=False,indent=2))
-with open(
-f"data/{term}/courses.json", "w"
-) as outfile: # -{os.getenv("CURRENT_TERM")}
+with open(f"data/{term}/courses.json", "w") as outfile:
json.dump(data, outfile, sort_keys=False, indent=2)

+# Remove schools which have no courses, then format it for the homepage
+with open(f"data/{term}/schools.json", "r") as all_schools_f:
+all_schools = json.load(all_schools_f)
+
+schools = []
+for possible_school in all_schools:
+res_school = {"name": possible_school["name"], "depts": []}
+for target_dept in possible_school["depts"]:
+matching_depts = list(
+filter(lambda d: d["code"] == target_dept["code"], data)
+)
+if matching_depts:
+res_school["depts"].append(target_dept)
+if res_school["depts"]:
+schools.append(res_school)
+
+school_columns = optimize_column_ordering(schools)
+with open(f"data/{term}/schools.json", "w") as schools_f:
+json.dump(school_columns, schools_f, sort_keys=False, indent=2)

# Generate binary conflict output
# (32bit crn + 3*64bit conflicts 5am-midnight(by 30min))for every course
TIME_START = 700
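
The loop that fills this binary output lies outside the changed lines shown here. As a rough, hypothetical sketch of the record layout the comment describes, where the function name, slot indexing, and byte order are assumptions rather than code from this repo:

import struct

SLOT_WORDS = 3  # three 64-bit words of 30-minute slot bits per record

def pack_conflict_record(crn, busy_slots):
    # Hypothetical: set one bit per occupied 30-minute slot, then emit a
    # 32-bit CRN followed by the three 64-bit conflict words.
    bitmap = 0
    for slot in busy_slots:
        bitmap |= 1 << slot
    words = [(bitmap >> (64 * i)) & 0xFFFFFFFFFFFFFFFF for i in range(SLOT_WORDS)]
    return struct.pack(">I3Q", crn, *words)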
