Skip to content
This repository has been archived by the owner on Jan 8, 2021. It is now read-only.

Commit

Permalink
sis-scraper: smart minute granularity to automatically scale number o…
Browse files Browse the repository at this point in the history
…f buckets (#16)

Before we went to 1-minute granularity, we used to scale the number of buckets by only resolving conflicts within a certain accuracy (typically 10-minute buckets).

This optimization brings back minute granularity, computing the largest granularity we can use without losing accuracy (typically 5 minutes). The result is an 80% reduction in buckets for spring 2021, as 5-minute granularity works, speeding up the later passes.
  • Loading branch information
johnnyapol authored Dec 19, 2020
1 parent c4b3bc4 commit f49324b
Showing 1 changed file with 27 additions and 16 deletions.
43 changes: 27 additions & 16 deletions sis_scraper/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,27 +307,21 @@ def optimize_column_ordering(data, num_columns=3):
with open(f"data/{term}/schools.json", "w") as schools_f:
json.dump(school_columns, schools_f, sort_keys=False, indent=2)

# Generate binary conflict output
# day * 24 hours/day * 60minutes/hour = total buckets
offset = lambda x: x * 24 * 60

day_offsets = {
"M": offset(0),
"T": offset(1),
"W": offset(2),
"R": offset(3),
"F": offset(4),
"S": offset(5),
"U": offset(6),
}

unique_ranges = set()
get_date = lambda x: date(1, int(x[0]), int(x[1]))

divide = list(range(1, 61))

for dept in data:
for course in dept["courses"]:
for section in course["sections"]:
for time in section["timeslots"]:
if time["timeStart"] >= 0 and time["timeEnd"] >= 0:
divide = [
x
for x in divide
if (time["timeStart"] % x == time["timeEnd"] % x == 0)
]
start = time["dateStart"].split("/")

if len(start) < 2:
Expand All @@ -337,6 +331,23 @@ def optimize_column_ordering(data, num_columns=3):
unique_ranges = list(unique_ranges)
unique_ranges.sort(reverse=True)

MINUTE_GRANULARITY = max(divide)
NUM_MIN_PER_HOUR = 60 // MINUTE_GRANULARITY

# Generate binary conflict output
# day * 24 hours/day * NUM_MIN_PER_HOUR = total buckets
offset = lambda x: x * 24 * NUM_MIN_PER_HOUR

day_offsets = {
"M": offset(0),
"T": offset(1),
"W": offset(2),
"R": offset(3),
"F": offset(4),
"S": offset(5),
"U": offset(6),
}

BITS_PER_SLICE = offset(len(day_offsets))
BIT_VEC_SIZE = BITS_PER_SLICE * len(unique_ranges)

Expand Down Expand Up @@ -368,7 +379,7 @@ def optimize_column_ordering(data, num_columns=3):

for day in time["days"]:
for hour in range(0, 2400, 100):
for minute in range(60):
for minute in range(0, 60, MINUTE_GRANULARITY):
if (
time["timeStart"] <= hour + minute
and time["timeEnd"] > hour + minute
Expand All @@ -377,7 +388,7 @@ def optimize_column_ordering(data, num_columns=3):
hour_idx = hour // 100
index = BITS_PER_SLICE * i + (
day_offsets[day]
+ hour_idx * 60
+ hour_idx * NUM_MIN_PER_HOUR
+ minute_idx
)
conflict[index] = 1
Expand Down

0 comments on commit f49324b

Please sign in to comment.