Skip to content
This repository has been archived by the owner on Jan 8, 2021. It is now read-only.

Commit

Permalink
Update SIS scraper with new login system
Browse files Browse the repository at this point in the history
  • Loading branch information
benjaminrsherman committed Dec 1, 2020
1 parent eda2c6e commit c4b3bc4
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 8 deletions.
5 changes: 5 additions & 0 deletions .github/workflows/scrape.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@ name: Scrape new data
on:
repository_dispatch:
types: scrape
push:
branches:
- master
schedule:
- cron: '0 * * * *'

jobs:
scrape-schools:
Expand Down
25 changes: 17 additions & 8 deletions sis_scraper/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,19 +124,29 @@ def optimize_column_ordering(data, num_columns=3):
return flattened


payload = f'sid={os.getenv("RIN")}&PIN={urllib.parse.quote(os.getenv("PASSWORD"))}'
headers = {"Content-Type": "application/x-www-form-urlencoded"}
LOGIN_BASE_PARAMS = f"username={os.getenv('RIN')}&password={urllib.parse.quote(os.getenv('PASSWORD'))}&_eventId=submit"
headers = {
# "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:83.0) Gecko/20100101 Firefox/83.0",
"Content-Type": "application/x-www-form-urlencoded",
}
with requests.Session() as s: # We purposefully don't use aiohttp here since SIS doesn't like multiple simultaneous logged-in connections
s.get(url="https://sis.rpi.edu/rss/twbkwbis.P_WWWLogin")
# We first need to get the hidden `execution` form field (used as the CSRF token) from the login page
login_url = "https://cas-auth-ent.rpi.edu/cas/login?service=https%3A%2F%2Fbannerapp04-bnrprd.server.rpi.edu%3A443%2Fssomanager%2Fc%2FSSB"
login_page = s.get(url=login_url)
login_soup = BeautifulSoup(login_page.text.encode("utf8"), "html.parser")
csrf = login_soup.find("input", attrs={"name": "execution"})["value"]
login_params = LOGIN_BASE_PARAMS + f"&execution={csrf}"

response = s.request(
"POST",
"https://sis.rpi.edu/rss/twbkwbis.P_ValLogin",
login_url,
headers=headers,
data=payload,
data=login_params,
)

if b"Welcome" not in response.text.encode("utf8"):
print("Failed to log into sis")
if b"Main Menu" not in response.text.encode("utf8"):
print(response.text.encode("utf8"))
print("Failed to authenticate CAS")
exit(1)

for term in tqdm(os.listdir("data")):
Expand Down Expand Up @@ -169,7 +179,6 @@ def optimize_column_ordering(data, num_columns=3):

data = []

# print(response.text.encode('utf8'))
soup = BeautifulSoup(response.text.encode("utf8"), "html.parser")
table = soup.findAll("table", {"class": "datadisplaytable"})[0]
rows = table.findAll("tr")
Expand Down

0 comments on commit c4b3bc4

Please sign in to comment.