This repository has been archived by the owner on Jan 8, 2021. It is now read-only.

Commit e0a7fbc: Various improvements

benjaminrsherman committed Nov 1, 2020
1 parent c9cace2
Showing 3 changed files with 52 additions and 40 deletions.
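In short: the scrape workflow gains a step that cancels superseded runs, and both scrapers (catalog_scraper and prerequisites_scraper) trade sequential await loops for asyncio.gather so independent years and terms are fetched concurrently.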
.github/workflows/scrape.yml (5 additions, 0 deletions)

@@ -13,6 +13,11 @@ jobs:
     name: Scrapes schools per year
     runs-on: ubuntu-latest
     steps:
+      - name: Cancel Previous Runs
+        uses: styfle/cancel-workflow-action@<version>
+        with:
+          access_token: ${{ github.token }}
+
       - name: Checkout scrapers
         uses: actions/checkout@v2
         with:
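A note on the new step: styfle/cancel-workflow-action cancels any earlier, still-running runs of the same workflow before the new run proceeds, so overlapping scheduled scrapes don't race each other while writing under data/. The `<version>` placeholder stands in for whatever release tag the commit pinned.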
catalog_scraper/main.py (24 additions, 20 deletions)

@@ -157,30 +157,34 @@ async def get_years() -> List[Tuple[str, str, str]]:
 
 
 async def parse_year(s, year_data):
-    year, courses_url, schools_url = year_data
-
-    if sys.argv[1] == "catalog":
-        data = {}
-        while True:
-            if courses_url is None:
-                break
-            courses_url = await scrapePage(s, courses_url, data)
-    else:
-        data = await get_schools(s, schools_url)
-        data = list(map(lambda x: {"name": x[0], "depts": x[1]}, data.items()))
-
-    years = year.split("-")
-    for directory in (f"{years[0]}09", f"{years[1]}01", f"{years[1]}05"):
-        directory = "data/" + directory
-        os.makedirs(directory, exist_ok=True)
-        with open(f"{directory}/{sys.argv[1]}.json", "w") as outfile:
-            json.dump(data, outfile, sort_keys=False, indent=2)
+    try:
+        year, courses_url, schools_url = year_data
+
+        if sys.argv[1] == "catalog":
+            data = {}
+            while True:
+                if courses_url is None:
+                    break
+                courses_url = await scrapePage(s, courses_url, data)
+        else:
+            data = await get_schools(s, schools_url)
+            data = list(map(lambda x: {"name": x[0], "depts": x[1]}, data.items()))
+
+        years = year.split("-")
+        for directory in (f"{years[0]}09", f"{years[1]}01", f"{years[1]}05"):
+            directory = "data/" + directory
+            os.makedirs(directory, exist_ok=True)
+            with open(f"{directory}/{sys.argv[1]}.json", "w") as outfile:
+                json.dump(data, outfile, sort_keys=False, indent=2)
+    except Exception as e:
+        print(year_data)
+        print(e)
+        raise e
 
 
 async def parse_years(years_data):
     async with aiohttp.ClientSession() as s:
-        for year_data in years_data:
-            await parse_year(s, year_data)
+        await asyncio.gather(*(parse_year(s, year_data) for year_data in years_data))
 
 
 years = asyncio.run(get_years())
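The substantive change is in parse_years: a sequential await loop becomes a single asyncio.gather call, so every year is scraped concurrently over the shared aiohttp session. The try/except added to parse_year supports that: when one of many gathered tasks fails, printing year_data before re-raising shows which input failed, context a bare traceback from inside gather would not give. A minimal sketch of the loop-to-gather pattern (fetch, sequential, and concurrent are illustrative placeholders, not this repo's code):

```python
import asyncio

async def fetch(item):
    # Stand-in for per-item work such as parse_year(s, year_data).
    await asyncio.sleep(0.1)
    return item * 2

async def sequential(items):
    # Old pattern: each await must finish before the next one starts.
    return [await fetch(item) for item in items]

async def concurrent(items):
    # New pattern: all coroutines are scheduled at once; gather returns
    # results in input order and propagates the first exception raised.
    return await asyncio.gather(*(fetch(item) for item in items))

print(asyncio.run(concurrent([1, 2, 3])))  # -> [2, 4, 6]
```

With I/O-bound work like HTTP requests, the concurrent version takes roughly as long as the slowest single item rather than the sum of all of them.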
prerequisites_scraper/main.py (23 additions, 20 deletions)

@@ -195,28 +195,31 @@ async def get_prereq_string(s, term, crn):
     return data
 
 
+async def parse_term(s, term):
+    prerequisites = {}
+
+    crns = []
+    with open(f"data/{term}/courses.json") as json_file:
+        courses = json.load(json_file)
+        for department in courses:
+            for course in department["courses"]:
+                for section in course["sections"]:
+                    crns.append(section["crn"])
+
+    for crn in tqdm(crns, desc=term):
+        try:
+            prerequisites[crn] = await get_prereq_string(s, term, crn)
+        except Exception as e:
+            print(f"CRN: {crn} - {e}")
+            prerequisites[crn] = {}
+
+    with open(f"data/{term}/prerequisites.json", "w") as outfile:
+        json.dump(prerequisites, outfile, indent=4)
+
+
 async def main():
     async with aiohttp.ClientSession() as s:
-        for term in os.listdir("data"):
-            prerequisites = {}
-
-            crns = []
-            with open(f"data/{term}/courses.json") as json_file:
-                courses = json.load(json_file)
-                for department in courses:
-                    for course in department["courses"]:
-                        for section in course["sections"]:
-                            crns.append(section["crn"])
-
-            for crn in tqdm(crns, desc=term):
-                try:
-                    prerequisites[crn] = await get_prereq_string(s, term, crn)
-                except Exception as e:
-                    print(f"CRN: {crn} - {e}")
-                    prerequisites[crn] = {}
-
-            with open(f"data/{term}/prerequisites.json", "w") as outfile:
-                json.dump(prerequisites, outfile, indent=4)
+        await asyncio.gather(*(parse_term(s, term) for term in os.listdir("data")))
 
 
 if __name__ == "__main__":
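The same refactor as in catalog_scraper: the per-term body moves out of main into parse_term, which is what lets main fan the terms out through asyncio.gather on one shared session. Each term still fetches its CRNs one at a time, but the terms now run in parallel with one another. If the number of term directories grows, the number of in-flight terms can be capped with a semaphore; a sketch of that idea under assumed names (scrape_term, limit, and the term strings are illustrative, not from this commit):

```python
import asyncio

async def scrape_term(term):
    # Stand-in for parse_term(s, term): one unit of per-term work.
    await asyncio.sleep(0.1)
    return term

async def main(terms, limit=4):
    sem = asyncio.Semaphore(limit)  # at most `limit` terms in flight at once

    async def bounded(term):
        async with sem:  # hold a slot while this term runs, free it after
            return await scrape_term(term)

    # gather still returns results in input order
    return await asyncio.gather(*(bounded(t) for t in terms))

print(asyncio.run(main(["202009", "202101", "202105"])))
```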
