chore(format): format with ruff

SplusEins · Sep 8, 2024 · 4c3fd27 · 4c3fd27
1 parent b91b53a
commit 4c3fd27
Show file tree

Hide file tree

Showing 6 changed files with 131 additions and 81 deletions.
diff --git a/.editorconfig b/.editorconfig
@@ -9,6 +9,7 @@ trim_trailing_whitespace = true
 insert_final_newline = true
 charset = utf-8
 end_of_line = lf
+max_line_length=120
 
 [*.bat]
 indent_style = tab

diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,15 @@
+[project]
+name = "sked-parser"
+version = "0.1.0"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.12"
+dependencies = []
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.ruff]
+# Allow lines to be as long as 120.
+line-length = 120
diff --git a/sked_parser/__main__.py b/sked_parser/__main__.py
@@ -1,4 +1,5 @@
 """Console script for sked_parser."""
+
 import argparse
 import logging
 import os
@@ -15,7 +16,7 @@
 
 def load_yaml_conf(yaml_file):
     """Helper function to load the configuration yaml files"""
-    with open(yaml_file, 'r') as stream:
+    with open(yaml_file, "r") as stream:
         return yaml.safe_load(stream)
 
 
@@ -24,17 +25,35 @@ def main():
     log.setLevel(logging.DEBUG)
     ch = logging.StreamHandler()
     ch.setLevel(logging.DEBUG)
-    formatter = logging.Formatter('%(levelname)s (%(filename)s:%(lineno)d) %(message)s')
+    formatter = logging.Formatter("%(levelname)s (%(filename)s:%(lineno)d) %(message)s")
     ch.setFormatter(formatter)
     log.addHandler(ch)
 
     # Add argparse for help text and future enhancements
-    parser = argparse.ArgumentParser(description='Convert sked timetables from overview URLs into a readable format for spluseins.de',
-                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    parser.add_argument("-c", "--config-file", type=str, help="Path to the main yaml configuration file. Defaults to the provided `sked_parser/config.yaml`")
-    parser.add_argument("-s", "--secrets-file", type=str, default="secrets.yaml", help="Path to the yaml secrets file containing ostfalia user and password")
-    parser.add_argument("-o", "--out-file", type=str, action='append',
-                        help="Where to store the resulting json file. Can be specified multiple times.")
+    parser = argparse.ArgumentParser(
+        description="Convert sked timetables from overview URLs into a readable format for spluseins.de",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    parser.add_argument(
+        "-c",
+        "--config-file",
+        type=str,
+        help="Path to the main yaml configuration file. Defaults to the provided `sked_parser/config.yaml`",
+    )
+    parser.add_argument(
+        "-s",
+        "--secrets-file",
+        type=str,
+        default="secrets.yaml",
+        help="Path to the yaml secrets file containing ostfalia user and password",
+    )
+    parser.add_argument(
+        "-o",
+        "--out-file",
+        type=str,
+        action="append",
+        help="Where to store the resulting json file. Can be specified multiple times.",
+    )
     args = parser.parse_args()
 
     # Config contains the urls and other configuration.
@@ -47,15 +66,17 @@ def main():
 
     # Load username and password to access ostfalia sked URLs either from yaml if exists or from environment
     secrets = {}
-    secrets['user'] = os.environ.get('OSTFALIA_USER')
-    secrets['pass'] = os.environ.get('OSTFALIA_PASS')
+    secrets["user"] = os.environ.get("OSTFALIA_USER")
+    secrets["pass"] = os.environ.get("OSTFALIA_PASS")
     secrets_path = Path(args.secrets_file).resolve()
     if secrets_path.exists():
         secrets = load_yaml_conf(secrets_path)
-        secrets['user'] = secrets['sked']['user']
-        secrets['pass'] = secrets['sked']['pass']
-    if secrets['user'] is None or secrets['pass'] is None:
-        raise Exception("Please specify your Ostalia credentials either via a secrets.yaml file or via environment variables.")
+        secrets["user"] = secrets["sked"]["user"]
+        secrets["pass"] = secrets["sked"]["pass"]
+    if secrets["user"] is None or secrets["pass"] is None:
+        raise Exception(
+            "Please specify your Ostalia credentials either via a secrets.yaml file or via environment variables."
+        )
 
     #
     if args.out_file is None:

diff --git a/sked_parser/app.py b/sked_parser/app.py
@@ -8,60 +8,69 @@
 
 
 def write_timetable_json(tables, file_path):
-    with open(file_path, 'w') as f:
+    with open(file_path, "w") as f:
         json.dump(tables, f, indent=2, ensure_ascii=False)
-        f.write('\n')
+        f.write("\n")
 
 
 def raise_for_duplicated_ids(dict_to_check):
     """Helper function that prints an error if key `id` of that dict list has duplicated values.
-        Also raises if dict key 'id' does not exist."""
-    ids = [item['id'] for item in dict_to_check]
+    Also raises if dict key 'id' does not exist."""
+    ids = [item["id"] for item in dict_to_check]
     duplicated_ids = set([x for x in ids if ids.count(x) > 1])
     if len(duplicated_ids) > 0:
         log.critical(f"Zwei oder mehr Pläne haben die gleiche ID bekommen: {duplicated_ids}")
 
 
 def is_valid_item(table, blacklist):
     """Returns whether a table is allowed in spluseins. Used for filtering some unwanted items (Klausurenpläne)"""
-    if table['faculty'] == 'Elektrotechnik' and "block" in table['skedPath'].lower():
+    if table["faculty"] == "Elektrotechnik" and "block" in table["skedPath"].lower():
         # Blockveranstaltungen (Fakultät E) erstmal raus
         return False
-    if table['faculty'] == 'Soziale Arbeit' and "fernstudiengang" in table['label'].lower():
+    if table["faculty"] == "Soziale Arbeit" and "fernstudiengang" in table["label"].lower():
         # schlechte formatierung, wird ignoriert
         return False
     for forbidden in blacklist:
-        if forbidden.lower() in table['skedPath'].lower():
-            log.info("Skipping timetable with forbidden path: " + table['skedPath'])
+        if forbidden.lower() in table["skedPath"].lower():
+            log.info("Skipping timetable with forbidden path: " + table["skedPath"])
             return False
-        if forbidden.lower() in table['label'].lower():
-            log.info("Skipping timetable with forbidden label: " + table['label'])
+        if forbidden.lower() in table["label"].lower():
+            log.info("Skipping timetable with forbidden label: " + table["label"])
             return False
     return True
 
 
 def main(config, secrets, out_files):
     tables = []
     for plan in config["plans"]:
-        tuples = scraper.get_links(plan['url'], secrets, plan['faculty'])
+        tuples = scraper.get_links(plan["url"], secrets, plan["faculty"])
         if len(tuples) == 0:
             log.warning(f"URL {plan['url']} hat keine Pläne.")
         for label, sked_path in tuples:
             label = label.replace("\n", " ").replace("\r", " ")  # for logging purposes
             faculty_short = scraper.get_faculty_shortcode(sked_path)
             degree = scraper.guess_degree(label, sked_path)
             semester = scraper.extract_semester(label, sked_path) or "Sonstige"
-            sked_id = scraper.create_id(sked_path, faculty_short, config['current_sem'], semester)
-            label = scraper.optimize_label(label, plan.get('shorthand_syntax', False))
-            plan_type = plan.get('type', 'graphical')
+            sked_id = scraper.create_id(sked_path, faculty_short, config["current_sem"], semester)
+            label = scraper.optimize_label(label, plan.get("shorthand_syntax", False))
+            plan_type = plan.get("type", "graphical")
             if "alt" in sked_path:
                 label += " alt"
-            tables.append(dict(skedPath=sked_path, label=label, faculty=plan['faculty'],
-                               type=plan_type, id=sked_id, semester=semester, degree=degree))
+            tables.append(
+                dict(
+                    skedPath=sked_path,
+                    label=label,
+                    faculty=plan["faculty"],
+                    type=plan_type,
+                    id=sked_id,
+                    semester=semester,
+                    degree=degree,
+                )
+            )
         sleep(1)
     tables = [table for table in tables if is_valid_item(table, set(config["timetable_blacklist"]))]
     # Sort by faculty, then by degree (Bachelor/Master), then by semester, then alphabetically by label; the id is the final tie-breaker
-    tables = sorted(tables, key=lambda x: (x['faculty'], x['degree'], str(x['semester']), x['label'], x['id']))
+    tables = sorted(tables, key=lambda x: (x["faculty"], x["degree"], str(x["semester"]), x["label"], x["id"]))
     raise_for_duplicated_ids(tables)
     for out_file in out_files:
         write_timetable_json(tables, out_file)

diff --git a/sked_parser/scraper.py b/sked_parser/scraper.py
@@ -15,8 +15,7 @@
 
 # Create a requests session with our own user agent, so it's clear who manages the automated requests.
 session = requests.Session()
-session.headers.update({'User-Agent': 'Sked parser for spluseins.de',
-                        'From': '[email protected]'})
+session.headers.update({"User-Agent": "Sked parser for spluseins.de", "From": "[email protected]"})
 
 
 def get_links(overview_url, auth, faculty=""):
@@ -30,20 +29,20 @@ def get_links(overview_url, auth, faculty=""):
     Returns:
         Set[Tuple]: Set of tuples with (url description, sked path)
     """
-    resp = session.get(overview_url, auth=HTTPBasicAuth(auth['user'], auth['pass']))
-    soup = BeautifulSoup(resp.content, 'lxml')
+    resp = session.get(overview_url, auth=HTTPBasicAuth(auth["user"], auth["pass"]))
+    soup = BeautifulSoup(resp.content, "lxml")
     tables = set()
-    valid_url_regex = re.compile(r'^\w/.+\.(html|csv)$', re.IGNORECASE)
-    for this_url in soup.find_all('a', href=True):
-        absolute_url = urljoin(overview_url, this_url['href'])
+    valid_url_regex = re.compile(r"^\w/.+\.(html|csv)$", re.IGNORECASE)
+    for this_url in soup.find_all("a", href=True):
+        absolute_url = urljoin(overview_url, this_url["href"])
         part_url = absolute_url.removeprefix("https://stundenplan.ostfalia.de/")
-        if part_url.endswith('index.html'):
+        if part_url.endswith("index.html"):
             continue
         if valid_url_regex.match(part_url):
             desc = this_url.text.strip()
             if "Recht" in faculty:
                 # Prepend the content of the previous paragraph to the description because it contains the real name of the plan
-                if (this_url.parent.parent.name == 'ol'):
+                if this_url.parent.parent.name == "ol":
                     desc = this_url.parent.parent.previous + " " + desc
             tables.add((desc, part_url))
     return tables
@@ -54,40 +53,40 @@ def create_id(sked_path, faculty_short, current_sem_str, extracted_semester):
     # Unquote the URL first
     sked_path = unquote(sked_path)
     # Get a basic id from the url page, which is the last part excluding the .extension
-    id_re = re.compile(r'\w/(?:.*/)?(.+?)\.+(html|csv)', re.IGNORECASE)
+    id_re = re.compile(r"\w/(?:.*/)?(.+?)\.+(html|csv)", re.IGNORECASE)
     m = id_re.search(sked_path)
     if not m:
         raise Exception(f"Path {sked_path} did not match to ID regex, so we can't extract an ID")
     sked_id = m.group(1).lower().strip()
 
     # Replace any non alphanumeric chars with underscore and remove duplicated underscores
-    sked_id = re.sub(r'\W+', '_', sked_id, flags=re.ASCII)
-    sked_id = sked_id = re.sub(r'_+(?=_|$)', '', sked_id)
+    sked_id = re.sub(r"\W+", "_", sked_id, flags=re.ASCII)
+    sked_id = sked_id = re.sub(r"_+(?=_|$)", "", sked_id)
 
     # Remove any strings like SoSe, WS, SS including the year from the id
-    sked_id = re.sub(r'((s|w)s|(so|w)se)(_?\d+)(_\d+)?_?', '', sked_id)
+    sked_id = re.sub(r"((s|w)s|(so|w)se)(_?\d+)(_\d+)?_?", "", sked_id)
 
     # Remove some faculty specific stuff to shorten the id:
-    sked_id = sked_id.replace('semester_', '')
-    sked_id = sked_id.replace('_semester', '')
-    sked_id = sked_id.replace('_sem', '')
-    sked_id = sked_id.replace('soziale_arbeit', '')
-    sked_id = sked_id.replace('wirtschaftsingenieur_', '')
-    sked_id = sked_id.replace('energie_und_gebaeudetechnik_', '')
-    sked_id = sked_id.replace('bio_und_umwelttechnik_', '')
-    sked_id = sked_id.replace('bachelor', '')
-    sked_id = sked_id.replace('b_sc', '')
-    sked_id = sked_id.replace('m_sc', 'm')
-    sked_id = sked_id.replace('energie_', '')
-    sked_id = sked_id.replace('umwelt_', '')
-    sked_id = sked_id.replace('stdgrp_', '')  # weird faculty S specific string
-    sked_id = sked_id.replace('stjg_', '')  # weird faculty K specific string
+    sked_id = sked_id.replace("semester_", "")
+    sked_id = sked_id.replace("_semester", "")
+    sked_id = sked_id.replace("_sem", "")
+    sked_id = sked_id.replace("soziale_arbeit", "")
+    sked_id = sked_id.replace("wirtschaftsingenieur_", "")
+    sked_id = sked_id.replace("energie_und_gebaeudetechnik_", "")
+    sked_id = sked_id.replace("bio_und_umwelttechnik_", "")
+    sked_id = sked_id.replace("bachelor", "")
+    sked_id = sked_id.replace("b_sc", "")
+    sked_id = sked_id.replace("m_sc", "m")
+    sked_id = sked_id.replace("energie_", "")
+    sked_id = sked_id.replace("umwelt_", "")
+    sked_id = sked_id.replace("stdgrp_", "")  # weird faculty S specific string
+    sked_id = sked_id.replace("stjg_", "")  # weird faculty K specific string
     # Remove unnecessary chars at end or beginning of string
     sked_id = sked_id.strip("_ ")
 
-    if (isinstance(extracted_semester, int)):
+    if isinstance(extracted_semester, int):
         # If the semester was successfully extracted, remove standalone occurrences of that number from the ID and append it once at the end
-        sked_id = re.sub(r'(?<!\d)' + f"{extracted_semester}" + r'(?=_|$)', '', sked_id)
+        sked_id = re.sub(r"(?<!\d)" + f"{extracted_semester}" + r"(?=_|$)", "", sked_id)
         sked_id = f"{sked_id}_{extracted_semester}"
 
     # Prefix the label with the faculty shortcut
@@ -98,7 +97,7 @@ def create_id(sked_path, faculty_short, current_sem_str, extracted_semester):
     # Append the current semester string (sth like ws20) at the end
     sked_id = f"{sked_id}_{current_sem_str}"
     # Again remove duplicated underscores that have been introduced by the removals before
-    sked_id = re.sub(r'_+(?=_|$)', '', sked_id)
+    sked_id = re.sub(r"_+(?=_|$)", "", sked_id)
     return sked_id
 
 
@@ -113,7 +112,7 @@ def extract_semester(desc, url):
         if keyword in desc.lower() or keyword in url.lower():
             return None
     # Try to extract the semester by finding a number followed by non word characters and something starting with Sem
-    sem_regex = re.compile(r'(?:^|\D)(\d)\W+(Sem|html$)', re.IGNORECASE)
+    sem_regex = re.compile(r"(?:^|\D)(\d)\W+(Sem|html$)", re.IGNORECASE)
     m_desc = sem_regex.search(desc)
     m_url = sem_regex.search(url)
     if m_desc:
@@ -122,7 +121,7 @@ def extract_semester(desc, url):
         # Use the semester from URL if description search was unsuccessful
         return int(m_url.group(1))
     else:
-        log.warning(f"Kein Semester bestimmbar bei \"{desc}\" mit sked path \"{url}\"")
+        log.warning(f'Kein Semester bestimmbar bei "{desc}" mit sked path "{url}"')
         return None
 
 
@@ -136,31 +135,31 @@ def get_faculty_shortcode(sked_path):
 
 def optimize_label(desc, uses_shorthand_syntax):
     """Optimize the user visible label by removing faculty names and try to use only the shorthand of that course if possible"""
-    desc = desc.replace('S-', '')
-    desc = desc.replace('I-', '')
-    desc = desc.replace('B.Sc.', '')
-    desc = desc.replace('I-M.Sc.', '')
-    desc = desc.replace('Bachelor', '')
-    desc = desc.replace('Master', '')
-    desc = desc.replace('- WiSe 21/22', '')
-    desc = desc.replace('.csv', '')
-    desc = re.sub(r'\s+', ' ', desc)  # replace all (even duplicated) whitespaces by single space
+    desc = desc.replace("S-", "")
+    desc = desc.replace("I-", "")
+    desc = desc.replace("B.Sc.", "")
+    desc = desc.replace("I-M.Sc.", "")
+    desc = desc.replace("Bachelor", "")
+    desc = desc.replace("Master", "")
+    desc = desc.replace("- WiSe 21/22", "")
+    desc = desc.replace(".csv", "")
+    desc = re.sub(r"\s+", " ", desc)  # replace all (even duplicated) whitespaces by single space
     if uses_shorthand_syntax:
         # These faculties write their modules as "long name (shorthand) additional info"
         # So discard the long name and use only the shorthand but keep the info
-        shorthand_re = re.compile(r'^.*?\((\D+?)\)(.*)$')
+        shorthand_re = re.compile(r"^.*?\((\D+?)\)(.*)$")
         m = shorthand_re.search(desc)
         if m:
             shorthand = m.group(1).strip()
             additional_stuff = m.group(2).strip()
             desc = f"{shorthand} {additional_stuff}"
     # Remove any semester related information
-    desc = re.sub(r'(\d\. ?-)?-? ?\d\.?\W+Sem(?:ester|\.)?', '', desc)
-    desc = desc.replace('Semester', '')
+    desc = re.sub(r"(\d\. ?-)?-? ?\d\.?\W+Sem(?:ester|\.)?", "", desc)
+    desc = desc.replace("Semester", "")
     # Strip any remaining single digits
-    desc = re.sub(r'[_-]\d(?=_|$)', '', desc)
+    desc = re.sub(r"[_-]\d(?=_|$)", "", desc)
     # Remove duplicated spaces
-    desc = desc.replace('  ', ' ')
+    desc = desc.replace("  ", " ")
     return desc.strip("-_ ")
 
 

diff --git a/tests/test_scraper.py b/tests/test_scraper.py
@@ -117,7 +117,9 @@ def test_extract_id():
     faculty_short = "e"
     current_sem_str = "ws"
     extracted_semester = 1
-    def sked_path(part_str): return f"e/semester/{part_str}.html"
+
+    def sked_path(part_str):
+        return f"e/semester/{part_str}.html"
 
     # Simple string
     in_str = "eit"
@@ -138,10 +140,13 @@ def sked_path(part_str): return f"e/semester/{part_str}.html"
     assert create_id("e/E-IST.html", faculty_short, current_sem_str, extracted_semester) == "e_ist_1_ws"
     # Complicated semester specification
     in_str = "PSA_M_1. Semester_Schwerpunkt"
-    assert create_id(sked_path(in_str), faculty_short, current_sem_str,
-                     extracted_semester) == "e_psa_m_schwerpunkt_1_ws"
+    assert (
+        create_id(sked_path(in_str), faculty_short, current_sem_str, extracted_semester) == "e_psa_m_schwerpunkt_1_ws"
+    )
 
 
 def test_is_master():
-    def sked_path(part_str): return f"e/semester/{part_str}.html"
+    def sked_path(part_str):
+        return f"e/semester/{part_str}.html"
+
     assert guess_degree("", sked_path("b_stgrp_ma_glob_1")) == "Master"