From f49324b4e727fb5e2762702eb70458b50238822b Mon Sep 17 00:00:00 2001
From: "John C. Allwein" <5902494+johnnyapol@users.noreply.github.com>
Date: Sat, 19 Dec 2020 15:27:30 -0500
Subject: [PATCH] sis-scraper: smart minute granularity to automatically scale
 number of buckets (#16)

Before we went to 1 minute granularity, we used to scale the number of
buckets down by only resolving conflicts to a certain accuracy
(typically 10 minute buckets).

This optimization brings that scaling back: it computes the largest
granularity we can use without losing accuracy (typically 5 minutes).
The result is an 80% reduction in buckets for spring 2021, where
5 minute granularity works, speeding up the later passes.
---
 sis_scraper/main.py | 43 +++++++++++++++++++++++++++----------------
 1 file changed, 27 insertions(+), 16 deletions(-)

diff --git a/sis_scraper/main.py b/sis_scraper/main.py
index 33d698b..11323a1 100644
--- a/sis_scraper/main.py
+++ b/sis_scraper/main.py
@@ -307,27 +307,21 @@ def optimize_column_ordering(data, num_columns=3):
     with open(f"data/{term}/schools.json", "w") as schools_f:
         json.dump(school_columns, schools_f, sort_keys=False, indent=2)
 
-    # Generate binary conflict output
-    # day * 24 hours/day * 60minutes/hour = total buckets
-    offset = lambda x: x * 24 * 60
-
-    day_offsets = {
-        "M": offset(0),
-        "T": offset(1),
-        "W": offset(2),
-        "R": offset(3),
-        "F": offset(4),
-        "S": offset(5),
-        "U": offset(6),
-    }
-
     unique_ranges = set()
 
     get_date = lambda x: date(1, int(x[0]), int(x[1]))
+    divide = list(range(1, 61))
+
     for dept in data:
         for course in dept["courses"]:
             for section in course["sections"]:
                 for time in section["timeslots"]:
+                    if time["timeStart"] >= 0 and time["timeEnd"] >= 0:
+                        divide = [
+                            x
+                            for x in divide
+                            if (time["timeStart"] % x == time["timeEnd"] % x == 0)
+                        ]
                     start = time["dateStart"].split("/")
 
                     if len(start) < 2:
@@ -337,6 +331,23 @@ def optimize_column_ordering(data, num_columns=3):
     unique_ranges = list(unique_ranges)
     unique_ranges.sort(reverse=True)
 
+    MINUTE_GRANULARITY = max(divide)
+    NUM_MIN_PER_HOUR = 60 // MINUTE_GRANULARITY
+
+    # Generate binary conflict output
+    # day * 24 hours/day * NUM_MIN_PER_HOUR = total buckets
+    offset = lambda x: x * 24 * NUM_MIN_PER_HOUR
+
+    day_offsets = {
+        "M": offset(0),
+        "T": offset(1),
+        "W": offset(2),
+        "R": offset(3),
+        "F": offset(4),
+        "S": offset(5),
+        "U": offset(6),
+    }
+
     BITS_PER_SLICE = offset(len(day_offsets))
     BIT_VEC_SIZE = BITS_PER_SLICE * len(unique_ranges)
 
@@ -368,7 +379,7 @@ def optimize_column_ordering(data, num_columns=3):
 
             for day in time["days"]:
                 for hour in range(0, 2400, 100):
-                    for minute in range(60):
+                    for minute in range(0, 60, MINUTE_GRANULARITY):
                         if (
                             time["timeStart"] <= hour + minute
                             and time["timeEnd"] > hour + minute
@@ -377,7 +388,7 @@ def optimize_column_ordering(data, num_columns=3):
                             hour_idx = hour // 100
                             index = BITS_PER_SLICE * i + (
                                 day_offsets[day]
-                                + hour_idx * 60
+                                + hour_idx * NUM_MIN_PER_HOUR
                                 + minute_idx
                             )
                             conflict[index] = 1
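
Below is a minimal, self-contained sketch of the divisor filtering the
first hunk adds, using hypothetical timeslots rather than real SIS data.
It assumes times are HHMM-style integers (e.g. 1030 for 10:30), matching
the hour * 100 + minute encoding main.py iterates over, and that -1
marks an unscheduled slot, mirroring the >= 0 guard in the patch.

    def largest_granularity(timeslots):
        # Start with every candidate granularity from 1 to 60 minutes and
        # keep only those that evenly divide every start and end time.
        divide = list(range(1, 61))
        for start, end in timeslots:
            if start >= 0 and end >= 0:  # skip unscheduled (-1) slots
                divide = [x for x in divide if start % x == end % x == 0]
        return max(divide)

    # Hypothetical slots starting/ending on :00, :20, :50 boundaries.
    slots = [(1000, 1150), (1200, 1320), (900, 1020)]
    g = largest_granularity(slots)  # 10 -> one bucket per 10 minutes
    print(7 * 24 * (60 // g))       # 1008 buckets per slice, down from 10080

With the 5 minute granularity the commit message cites for spring 2021,
each slice shrinks from 7 * 24 * 60 = 10080 buckets to 7 * 24 * 12 =
2016, which is the 80% reduction.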
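
The bucket indexing the last two hunks adjust can be illustrated the
same way. This is a hedged reconstruction, not the file's actual
surroundings: it hard-codes MINUTE_GRANULARITY = 5, derives minute_idx
from the HHMM value directly, and leaves out the BITS_PER_SLICE * i
term that selects the date-range slice.

    MINUTE_GRANULARITY = 5
    NUM_MIN_PER_HOUR = 60 // MINUTE_GRANULARITY  # 12 buckets per hour

    # Each weekday owns a contiguous run of 24 * 12 = 288 buckets.
    offset = lambda x: x * 24 * NUM_MIN_PER_HOUR
    day_offsets = {d: offset(i) for i, d in enumerate("MTWRFSU")}

    def bucket_index(day, hhmm):
        hour_idx = hhmm // 100
        minute_idx = (hhmm % 100) // MINUTE_GRANULARITY
        return day_offsets[day] + hour_idx * NUM_MIN_PER_HOUR + minute_idx

    print(bucket_index("M", 0))     # 0: Monday midnight is bucket zero
    print(bucket_index("W", 1030))  # 2 * 288 + 10 * 12 + 6 = 702

This is why the final hunk replaces hour_idx * 60 with
hour_idx * NUM_MIN_PER_HOUR: once an hour holds NUM_MIN_PER_HOUR buckets
instead of 60, the stride between consecutive hours shrinks to match.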