Skip to content
This repository has been archived by the owner on Jan 8, 2021. It is now read-only.

Commit

Permalink
sis-scraper: smart minute granularity to automatically scale number o…
Browse files Browse the repository at this point in the history
…f buckets (#16)

Before we went to 1-minute granularity, we used to scale the number of buckets by only resolving conflicts within a certain accuracy (typically 10-minute buckets).

This optimization brings back minute granularity, computing the largest granularity we can use without losing accuracy (typically 5 minutes). The result is an 80% reduction in buckets for spring 2021, as 5-minute granularity works, speeding up the later passes.
  • Loading branch information
johnnyapol authored Dec 19, 2020
1 parent c4b3bc4 commit f49324b
Showing 1 changed file with 27 additions and 16 deletions.
43 changes: 27 additions & 16 deletions sis_scraper/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,27 +307,21 @@ def optimize_column_ordering(data, num_columns=3):
with open(f"data/{term}/schools.json", "w") as schools_f:
json.dump(school_columns, schools_f, sort_keys=False, indent=2)

# Generate binary conflict output
# day * 24 hours/day * 60minutes/hour = total buckets
offset = lambda x: x * 24 * 60

day_offsets = {
"M": offset(0),
"T": offset(1),
"W": offset(2),
"R": offset(3),
"F": offset(4),
"S": offset(5),
"U": offset(6),
}

unique_ranges = set()
get_date = lambda x: date(1, int(x[0]), int(x[1]))

divide = list(range(1, 61))

for dept in data:
for course in dept["courses"]:
for section in course["sections"]:
for time in section["timeslots"]:
if time["timeStart"] >= 0 and time["timeEnd"] >= 0:
divide = [
x
for x in divide
if (time["timeStart"] % x == time["timeEnd"] % x == 0)
]
start = time["dateStart"].split("/")

if len(start) < 2:
Expand All @@ -337,6 +331,23 @@ def optimize_column_ordering(data, num_columns=3):
unique_ranges = list(unique_ranges)
unique_ranges.sort(reverse=True)

MINUTE_GRANULARITY = max(divide)
NUM_MIN_PER_HOUR = 60 // MINUTE_GRANULARITY

# Generate binary conflict output
# day * 24 hours/day * NUM_MIN_PER_HOUR = total buckets
offset = lambda x: x * 24 * NUM_MIN_PER_HOUR

day_offsets = {
"M": offset(0),
"T": offset(1),
"W": offset(2),
"R": offset(3),
"F": offset(4),
"S": offset(5),
"U": offset(6),
}

BITS_PER_SLICE = offset(len(day_offsets))
BIT_VEC_SIZE = BITS_PER_SLICE * len(unique_ranges)

Expand Down Expand Up @@ -368,7 +379,7 @@ def optimize_column_ordering(data, num_columns=3):

for day in time["days"]:
for hour in range(0, 2400, 100):
for minute in range(60):
for minute in range(0, 60, MINUTE_GRANULARITY):
if (
time["timeStart"] <= hour + minute
and time["timeEnd"] > hour + minute
Expand All @@ -377,7 +388,7 @@ def optimize_column_ordering(data, num_columns=3):
hour_idx = hour // 100
index = BITS_PER_SLICE * i + (
day_offsets[day]
+ hour_idx * 60
+ hour_idx * NUM_MIN_PER_HOUR
+ minute_idx
)
conflict[index] = 1
Expand Down

0 comments on commit f49324b

Please sign in to comment.