From f49324b4e727fb5e2762702eb70458b50238822b Mon Sep 17 00:00:00 2001
From: "John C. Allwein" <5902494+johnnyapol@users.noreply.github.com>
Date: Sat, 19 Dec 2020 15:27:30 -0500
Subject: [PATCH] sis-scraper: smart minute granularity to automatically scale
 number of buckets (#16)

Before we went to 1 minute granularity, we used to scale the number of
buckets down by only resolving conflicts to a certain accuracy
(typically 10 minute buckets).

This optimization brings that scaling back: it computes the largest
granularity we can use without losing accuracy (typically 5 minutes).
The result is an 80% reduction in buckets for spring 2021, where
5 minute granularity works, speeding up the later passes.
---
 sis_scraper/main.py | 43 +++++++++++++++++++++++++++----------------
 1 file changed, 27 insertions(+), 16 deletions(-)

diff --git a/sis_scraper/main.py b/sis_scraper/main.py
index 33d698b..11323a1 100644
--- a/sis_scraper/main.py
+++ b/sis_scraper/main.py
@@ -307,27 +307,21 @@ def optimize_column_ordering(data, num_columns=3):
     with open(f"data/{term}/schools.json", "w") as schools_f:
         json.dump(school_columns, schools_f, sort_keys=False, indent=2)
 
-    # Generate binary conflict output
-    # day * 24 hours/day * 60minutes/hour = total buckets
-    offset = lambda x: x * 24 * 60
-
-    day_offsets = {
-        "M": offset(0),
-        "T": offset(1),
-        "W": offset(2),
-        "R": offset(3),
-        "F": offset(4),
-        "S": offset(5),
-        "U": offset(6),
-    }
-
     unique_ranges = set()
 
     get_date = lambda x: date(1, int(x[0]), int(x[1]))
+    divide = list(range(1, 61))
+
     for dept in data:
         for course in dept["courses"]:
             for section in course["sections"]:
                 for time in section["timeslots"]:
+                    if time["timeStart"] >= 0 and time["timeEnd"] >= 0:
+                        divide = [
+                            x
+                            for x in divide
+                            if (time["timeStart"] % x == time["timeEnd"] % x == 0)
+                        ]
                     start = time["dateStart"].split("/")
 
                     if len(start) < 2:
@@ -337,6 +331,23 @@ def optimize_column_ordering(data, num_columns=3):
     unique_ranges = list(unique_ranges)
     unique_ranges.sort(reverse=True)
 
+    MINUTE_GRANULARITY = max(divide)
+    NUM_MIN_PER_HOUR = 60 // MINUTE_GRANULARITY
+
+    # Generate binary conflict output
+    # day * 24 hours/day * NUM_MIN_PER_HOUR = total buckets
+    offset = lambda x: x * 24 * NUM_MIN_PER_HOUR
+
+    day_offsets = {
+        "M": offset(0),
+        "T": offset(1),
+        "W": offset(2),
+        "R": offset(3),
+        "F": offset(4),
+        "S": offset(5),
+        "U": offset(6),
+    }
+
     BITS_PER_SLICE = offset(len(day_offsets))
     BIT_VEC_SIZE = BITS_PER_SLICE * len(unique_ranges)
 
@@ -368,7 +379,7 @@ def optimize_column_ordering(data, num_columns=3):
 
             for day in time["days"]:
                 for hour in range(0, 2400, 100):
-                    for minute in range(60):
+                    for minute in range(0, 60, MINUTE_GRANULARITY):
                         if (
                             time["timeStart"] <= hour + minute
                             and time["timeEnd"] > hour + minute
@@ -377,7 +388,7 @@ def optimize_column_ordering(data, num_columns=3):
                             hour_idx = hour // 100
                             index = BITS_PER_SLICE * i + (
                                 day_offsets[day]
-                                + hour_idx * 60
+                                + hour_idx * NUM_MIN_PER_HOUR
                                 + minute_idx
                             )
                             conflict[index] = 1
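
Below is a minimal, self-contained sketch of the divisor filtering the
first hunk adds, using hypothetical timeslots rather than real SIS data.
It assumes times are HHMM-style integers (e.g. 1030 for 10:30), matching
the hour * 100 + minute encoding main.py iterates over, and that -1
marks an unscheduled slot, mirroring the >= 0 guard in the patch.

    def largest_granularity(timeslots):
        # Start with every candidate granularity from 1 to 60 minutes and
        # keep only those that evenly divide every start and end time.
        divide = list(range(1, 61))
        for start, end in timeslots:
            if start >= 0 and end >= 0:  # skip unscheduled (-1) slots
                divide = [x for x in divide if start % x == end % x == 0]
        return max(divide)

    # Hypothetical slots starting/ending on :00, :20, :50 boundaries.
    slots = [(1000, 1150), (1200, 1320), (900, 1020)]
    g = largest_granularity(slots)  # 10 -> one bucket per 10 minutes
    print(7 * 24 * (60 // g))       # 1008 buckets per slice, down from 10080

With the 5 minute granularity the commit message cites for spring 2021,
each slice shrinks from 7 * 24 * 60 = 10080 buckets to 7 * 24 * 12 =
2016, which is the 80% reduction.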
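
The bucket indexing the last two hunks adjust can be illustrated the
same way. This is a hedged reconstruction, not the file's actual
surroundings: it hard-codes MINUTE_GRANULARITY = 5, derives minute_idx
from the HHMM value directly, and leaves out the BITS_PER_SLICE * i
term that selects the date-range slice.

    MINUTE_GRANULARITY = 5
    NUM_MIN_PER_HOUR = 60 // MINUTE_GRANULARITY  # 12 buckets per hour

    # Each weekday owns a contiguous run of 24 * 12 = 288 buckets.
    offset = lambda x: x * 24 * NUM_MIN_PER_HOUR
    day_offsets = {d: offset(i) for i, d in enumerate("MTWRFSU")}

    def bucket_index(day, hhmm):
        hour_idx = hhmm // 100
        minute_idx = (hhmm % 100) // MINUTE_GRANULARITY
        return day_offsets[day] + hour_idx * NUM_MIN_PER_HOUR + minute_idx

    print(bucket_index("M", 0))     # 0: Monday midnight is bucket zero
    print(bucket_index("W", 1030))  # 2 * 288 + 10 * 12 + 6 = 702

This is why the final hunk replaces hour_idx * 60 with
hour_idx * NUM_MIN_PER_HOUR: once an hour holds NUM_MIN_PER_HOUR buckets
instead of 60, the stride between consecutive hours shrinks to match.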