diff --git a/.editorconfig b/.editorconfig
index d4a2c44..f1711d4 100644
--- a/.editorconfig
+++ b/.editorconfig
@@ -9,6 +9,7 @@ trim_trailing_whitespace = true
 insert_final_newline = true
 charset = utf-8
 end_of_line = lf
+max_line_length=120
 
 [*.bat]
 indent_style = tab
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..98ec64a
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,15 @@
+[project]
+name = "sked-parser"
+version = "0.1.0"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.12"
+dependencies = []
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.ruff]
+# Allow lines to be as long as 120.
+line-length = 120
diff --git a/sked_parser/__main__.py b/sked_parser/__main__.py
index 2d1f246..ac76935 100644
--- a/sked_parser/__main__.py
+++ b/sked_parser/__main__.py
@@ -1,4 +1,5 @@
 """Console script for sked_parser."""
+
 import argparse
 import logging
 import os
@@ -15,7 +16,7 @@
 
 def load_yaml_conf(yaml_file):
     """Helper function to load the configuration yaml files"""
-    with open(yaml_file, 'r') as stream:
+    with open(yaml_file, "r") as stream:
         return yaml.safe_load(stream)
 
 
@@ -24,17 +25,35 @@ def main():
     log.setLevel(logging.DEBUG)
     ch = logging.StreamHandler()
     ch.setLevel(logging.DEBUG)
-    formatter = logging.Formatter('%(levelname)s (%(filename)s:%(lineno)d) %(message)s')
+    formatter = logging.Formatter("%(levelname)s (%(filename)s:%(lineno)d) %(message)s")
     ch.setFormatter(formatter)
     log.addHandler(ch)
 
     # Add argparse for help text and future enhancements
-    parser = argparse.ArgumentParser(description='Convert sked timetables from overview URLs into a readable format for spluseins.de',
-                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    parser.add_argument("-c", "--config-file", type=str, help="Path to the main yaml configuration file. Defaults to the provided `sked_parser/config.yaml`")
-    parser.add_argument("-s", "--secrets-file", type=str, default="secrets.yaml", help="Path to the yaml secrets file containing ostfalia user and password")
-    parser.add_argument("-o", "--out-file", type=str, action='append',
-                        help="Where to store the resulting json file. Can be specified multiple times.")
+    parser = argparse.ArgumentParser(
+        description="Convert sked timetables from overview URLs into a readable format for spluseins.de",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    parser.add_argument(
+        "-c",
+        "--config-file",
+        type=str,
+        help="Path to the main yaml configuration file. Defaults to the provided `sked_parser/config.yaml`",
+    )
+    parser.add_argument(
+        "-s",
+        "--secrets-file",
+        type=str,
+        default="secrets.yaml",
+        help="Path to the yaml secrets file containing ostfalia user and password",
+    )
+    parser.add_argument(
+        "-o",
+        "--out-file",
+        type=str,
+        action="append",
+        help="Where to store the resulting json file. Can be specified multiple times.",
+    )
     args = parser.parse_args()
 
     # Config contains the urls and other configuration.
@@ -47,15 +66,17 @@ def main():
 
     # Load username and password to access ostfalia sked URLs either from yaml if exists or from environment
     secrets = {}
-    secrets['user'] = os.environ.get('OSTFALIA_USER')
-    secrets['pass'] = os.environ.get('OSTFALIA_PASS')
+    secrets["user"] = os.environ.get("OSTFALIA_USER")
+    secrets["pass"] = os.environ.get("OSTFALIA_PASS")
     secrets_path = Path(args.secrets_file).resolve()
     if secrets_path.exists():
         secrets = load_yaml_conf(secrets_path)
-        secrets['user'] = secrets['sked']['user']
-        secrets['pass'] = secrets['sked']['pass']
-    if secrets['user'] is None or secrets['pass'] is None:
-        raise Exception("Please specify your Ostalia credentials either via a secrets.yaml file or via environment variables.")
+        secrets["user"] = secrets["sked"]["user"]
+        secrets["pass"] = secrets["sked"]["pass"]
+    if secrets["user"] is None or secrets["pass"] is None:
+        raise Exception(
+            "Please specify your Ostalia credentials either via a secrets.yaml file or via environment variables."
+        )
 
     # if args.out_file is None:
diff --git a/sked_parser/app.py b/sked_parser/app.py
index 5addbf4..66f538d 100644
--- a/sked_parser/app.py
+++ b/sked_parser/app.py
@@ -8,15 +8,15 @@
 
 
 def write_timetable_json(tables, file_path):
-    with open(file_path, 'w') as f:
+    with open(file_path, "w") as f:
         json.dump(tables, f, indent=2, ensure_ascii=False)
-        f.write('\n')
+        f.write("\n")
 
 
 def raise_for_duplicated_ids(dict_to_check):
     """Helper function that prints an error if key `id` of that dict list has duplicated values.
-       Also raises if dict key 'id' does not exist."""
-    ids = [item['id'] for item in dict_to_check]
+    Also raises if dict key 'id' does not exist."""
+    ids = [item["id"] for item in dict_to_check]
     duplicated_ids = set([x for x in ids if ids.count(x) > 1])
     if len(duplicated_ids) > 0:
         log.critical(f"Zwei oder mehr Pläne haben die gleiche ID bekommen: {duplicated_ids}")
@@ -24,18 +24,18 @@ def raise_for_duplicated_ids(dict_to_check):
 def is_valid_item(table, blacklist):
     """Returns whether a table is allowed in spluseins.
     Used for filtering some unwanted items (Klausurenpläne)"""
-    if table['faculty'] == 'Elektrotechnik' and "block" in table['skedPath'].lower():
+    if table["faculty"] == "Elektrotechnik" and "block" in table["skedPath"].lower():
         # Blockveranstaltungen (Fakultät E) erstmal raus
         return False
-    if table['faculty'] == 'Soziale Arbeit' and "fernstudiengang" in table['label'].lower():
+    if table["faculty"] == "Soziale Arbeit" and "fernstudiengang" in table["label"].lower():
         # schlechte formatierung, wird ignoriert
         return False
     for forbidden in blacklist:
-        if forbidden.lower() in table['skedPath'].lower():
-            log.info("Skipping timetable with forbidden path: " + table['skedPath'])
+        if forbidden.lower() in table["skedPath"].lower():
+            log.info("Skipping timetable with forbidden path: " + table["skedPath"])
             return False
-        if forbidden.lower() in table['label'].lower():
-            log.info("Skipping timetable with forbidden label: " + table['label'])
+        if forbidden.lower() in table["label"].lower():
+            log.info("Skipping timetable with forbidden label: " + table["label"])
             return False
     return True
 
@@ -43,7 +43,7 @@ def is_valid_item(table, blacklist):
 def main(config, secrets, out_files):
     tables = []
     for plan in config["plans"]:
-        tuples = scraper.get_links(plan['url'], secrets, plan['faculty'])
+        tuples = scraper.get_links(plan["url"], secrets, plan["faculty"])
         if len(tuples) == 0:
             log.warning(f"URL {plan['url']} hat keine Pläne.")
         for label, sked_path in tuples:
@@ -51,17 +51,26 @@ def main(config, secrets, out_files):
             faculty_short = scraper.get_faculty_shortcode(sked_path)
             degree = scraper.guess_degree(label, sked_path)
             semester = scraper.extract_semester(label, sked_path) or "Sonstige"
-            sked_id = scraper.create_id(sked_path, faculty_short, config['current_sem'], semester)
-            label = scraper.optimize_label(label, plan.get('shorthand_syntax', False))
-            plan_type = plan.get('type', 'graphical')
+            sked_id = scraper.create_id(sked_path, faculty_short, config["current_sem"], semester)
+            label = scraper.optimize_label(label, plan.get("shorthand_syntax", False))
+            plan_type = plan.get("type", "graphical")
             if "alt" in sked_path:
                 label += " alt"
-            tables.append(dict(skedPath=sked_path, label=label, faculty=plan['faculty'],
-                               type=plan_type, id=sked_id, semester=semester, degree=degree))
+            tables.append(
+                dict(
+                    skedPath=sked_path,
+                    label=label,
+                    faculty=plan["faculty"],
+                    type=plan_type,
+                    id=sked_id,
+                    semester=semester,
+                    degree=degree,
+                )
+            )
         sleep(1)
     tables = [table for table in tables if is_valid_item(table, set(config["timetable_blacklist"]))]
     # Sort first by faculty, then by master/bachelor, then by semester and last by alphabetical label
-    tables = sorted(tables, key=lambda x: (x['faculty'], x['degree'], str(x['semester']), x['label'], x['id']))
+    tables = sorted(tables, key=lambda x: (x["faculty"], x["degree"], str(x["semester"]), x["label"], x["id"]))
     raise_for_duplicated_ids(tables)
     for out_file in out_files:
         write_timetable_json(tables, out_file)
diff --git a/sked_parser/scraper.py b/sked_parser/scraper.py
index dc4bab9..ee691a5 100644
--- a/sked_parser/scraper.py
+++ b/sked_parser/scraper.py
@@ -15,8 +15,7 @@
 
 # Create a requests session with our own user agent, so it's clear who manages the automated requests.
 session = requests.Session()
-session.headers.update({'User-Agent': 'Sked parser for spluseins.de',
-                        'From': 'team@spluseins.de'})
+session.headers.update({"User-Agent": "Sked parser for spluseins.de", "From": "team@spluseins.de"})
 
 
 def get_links(overview_url, auth, faculty=""):
@@ -30,20 +29,20 @@
     Returns:
         Set[Tuple]: List of tuples with (url description, sked path)
     """
-    resp = session.get(overview_url, auth=HTTPBasicAuth(auth['user'], auth['pass']))
-    soup = BeautifulSoup(resp.content, 'lxml')
+    resp = session.get(overview_url, auth=HTTPBasicAuth(auth["user"], auth["pass"]))
+    soup = BeautifulSoup(resp.content, "lxml")
     tables = set()
-    valid_url_regex = re.compile(r'^\w/.+\.(html|csv)$', re.IGNORECASE)
-    for this_url in soup.find_all('a', href=True):
-        absolute_url = urljoin(overview_url, this_url['href'])
+    valid_url_regex = re.compile(r"^\w/.+\.(html|csv)$", re.IGNORECASE)
+    for this_url in soup.find_all("a", href=True):
+        absolute_url = urljoin(overview_url, this_url["href"])
         part_url = absolute_url.removeprefix("https://stundenplan.ostfalia.de/")
-        if part_url.endswith('index.html'):
+        if part_url.endswith("index.html"):
             continue
         if valid_url_regex.match(part_url):
             desc = this_url.text.strip()
             if "Recht" in faculty:
                 # Prepend the content of the previous paragraph to the description because it contains the real name of the plan
-                if (this_url.parent.parent.name == 'ol'):
+                if this_url.parent.parent.name == "ol":
                     desc = this_url.parent.parent.previous + " " + desc
             tables.add((desc, part_url))
     return tables
@@ -54,40 +53,40 @@ def create_id(sked_path, faculty_short, current_sem_str, extracted_semester):
     # Unqoute the URL first
     sked_path = unquote(sked_path)
     # Get a basic id from the url page, which is the last part excluding the .extension
-    id_re = re.compile(r'\w/(?:.*/)?(.+?)\.+(html|csv)', re.IGNORECASE)
+    id_re = re.compile(r"\w/(?:.*/)?(.+?)\.+(html|csv)", re.IGNORECASE)
     m = id_re.search(sked_path)
     if not m:
         raise Exception(f"Path {sked_path} did not match to ID regex, so we can't extract an ID")
     sked_id = m.group(1).lower().strip()
     # Replace any non alphanumeric chars with underscore and remove duplicated underscores
-    sked_id = re.sub(r'\W+', '_', sked_id, flags=re.ASCII)
-    sked_id = sked_id = re.sub(r'_+(?=_|$)', '', sked_id)
+    sked_id = re.sub(r"\W+", "_", sked_id, flags=re.ASCII)
+    sked_id = sked_id = re.sub(r"_+(?=_|$)", "", sked_id)
     # Remove any strings like SoSe, WS, SS including the year from the id
-    sked_id = re.sub(r'((s|w)s|(so|w)se)(_?\d+)(_\d+)?_?', '', sked_id)
+    sked_id = re.sub(r"((s|w)s|(so|w)se)(_?\d+)(_\d+)?_?", "", sked_id)
     # Remove some faculty specific stuff to shorten the id:
-    sked_id = sked_id.replace('semester_', '')
-    sked_id = sked_id.replace('_semester', '')
-    sked_id = sked_id.replace('_sem', '')
-    sked_id = sked_id.replace('soziale_arbeit', '')
-    sked_id = sked_id.replace('wirtschaftsingenieur_', '')
-    sked_id = sked_id.replace('energie_und_gebaeudetechnik_', '')
-    sked_id = sked_id.replace('bio_und_umwelttechnik_', '')
-    sked_id = sked_id.replace('bachelor', '')
-    sked_id = sked_id.replace('b_sc', '')
-    sked_id = sked_id.replace('m_sc', 'm')
-    sked_id = sked_id.replace('energie_', '')
-    sked_id = sked_id.replace('umwelt_', '')
-    sked_id = sked_id.replace('stdgrp_', '')  # weird faculty S specific string
-    sked_id = sked_id.replace('stjg_', '')  # weird faculty K specific string
+    sked_id = sked_id.replace("semester_", "")
+    sked_id = sked_id.replace("_semester", "")
+    sked_id = sked_id.replace("_sem", "")
+    sked_id = sked_id.replace("soziale_arbeit", "")
+    sked_id = sked_id.replace("wirtschaftsingenieur_", "")
+    sked_id = sked_id.replace("energie_und_gebaeudetechnik_", "")
+    sked_id = sked_id.replace("bio_und_umwelttechnik_", "")
+    sked_id = sked_id.replace("bachelor", "")
+    sked_id = sked_id.replace("b_sc", "")
+    sked_id = sked_id.replace("m_sc", "m")
+    sked_id = sked_id.replace("energie_", "")
+    sked_id = sked_id.replace("umwelt_", "")
+    sked_id = sked_id.replace("stdgrp_", "")  # weird faculty S specific string
+    sked_id = sked_id.replace("stjg_", "")  # weird faculty K specific string
     # Remove unneccessary chars at end or beginning of string
     sked_id = sked_id.strip("_ ")
-    if (isinstance(extracted_semester, int)):
+    if isinstance(extracted_semester, int):
         # If semester was successfully extracted, scrape all single digits from ID and add extracted semester back
-        sked_id = re.sub(r'(?