chore(format): format with ruff
l3d00m committed Sep 8, 2024
1 parent b91b53a commit 4c3fd27
Showing 6 changed files with 131 additions and 81 deletions.
1 change: 1 addition & 0 deletions .editorconfig
@@ -9,6 +9,7 @@ trim_trailing_whitespace = true
insert_final_newline = true
charset = utf-8
end_of_line = lf
max_line_length=120

[*.bat]
indent_style = tab
15 changes: 15 additions & 0 deletions pyproject.toml
@@ -0,0 +1,15 @@
[project]
name = "sked-parser"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.12"
dependencies = []

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.ruff]
# Allow lines to be as long as 120.
line-length = 120
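
For reference, a minimal sketch of how this configuration would typically be exercised (assuming ruff is installed, e.g. via pip install ruff, and run from the repository root):

    # Rewrite all Python files in place, honoring the 120-character line length from pyproject.toml
    ruff format .
    # Verify formatting without modifying files (useful in CI)
    ruff format --check .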
49 changes: 35 additions & 14 deletions sked_parser/__main__.py
@@ -1,4 +1,5 @@
"""Console script for sked_parser."""

import argparse
import logging
import os
@@ -15,7 +16,7 @@

def load_yaml_conf(yaml_file):
"""Helper function to load the configuration yaml files"""
with open(yaml_file, 'r') as stream:
with open(yaml_file, "r") as stream:
return yaml.safe_load(stream)


@@ -24,17 +25,35 @@ def main():
log.setLevel(logging.DEBUG)
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(levelname)s (%(filename)s:%(lineno)d) %(message)s')
formatter = logging.Formatter("%(levelname)s (%(filename)s:%(lineno)d) %(message)s")
ch.setFormatter(formatter)
log.addHandler(ch)

# Add argparse for help text and future enhancements
parser = argparse.ArgumentParser(description='Convert sked timetables from overview URLs into a readable format for spluseins.de',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("-c", "--config-file", type=str, help="Path to the main yaml configuration file. Defaults to the provided `sked_parser/config.yaml`")
parser.add_argument("-s", "--secrets-file", type=str, default="secrets.yaml", help="Path to the yaml secrets file containing ostfalia user and password")
parser.add_argument("-o", "--out-file", type=str, action='append',
help="Where to store the resulting json file. Can be specified multiple times.")
parser = argparse.ArgumentParser(
description="Convert sked timetables from overview URLs into a readable format for spluseins.de",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
parser.add_argument(
"-c",
"--config-file",
type=str,
help="Path to the main yaml configuration file. Defaults to the provided `sked_parser/config.yaml`",
)
parser.add_argument(
"-s",
"--secrets-file",
type=str,
default="secrets.yaml",
help="Path to the yaml secrets file containing ostfalia user and password",
)
parser.add_argument(
"-o",
"--out-file",
type=str,
action="append",
help="Where to store the resulting json file. Can be specified multiple times.",
)
args = parser.parse_args()

# Config contains the urls and other configuration.
@@ -47,15 +66,17 @@ def main():

# Load username and password to access ostfalia sked URLs either from yaml if exists or from environment
secrets = {}
secrets['user'] = os.environ.get('OSTFALIA_USER')
secrets['pass'] = os.environ.get('OSTFALIA_PASS')
secrets["user"] = os.environ.get("OSTFALIA_USER")
secrets["pass"] = os.environ.get("OSTFALIA_PASS")
secrets_path = Path(args.secrets_file).resolve()
if secrets_path.exists():
secrets = load_yaml_conf(secrets_path)
secrets['user'] = secrets['sked']['user']
secrets['pass'] = secrets['sked']['pass']
if secrets['user'] is None or secrets['pass'] is None:
raise Exception("Please specify your Ostalia credentials either via a secrets.yaml file or via environment variables.")
secrets["user"] = secrets["sked"]["user"]
secrets["pass"] = secrets["sked"]["pass"]
if secrets["user"] is None or secrets["pass"] is None:
raise Exception(
"Please specify your Ostalia credentials either via a secrets.yaml file or via environment variables."
)

#
if args.out_file is None:
43 changes: 26 additions & 17 deletions sked_parser/app.py
@@ -8,60 +8,69 @@


def write_timetable_json(tables, file_path):
with open(file_path, 'w') as f:
with open(file_path, "w") as f:
json.dump(tables, f, indent=2, ensure_ascii=False)
f.write('\n')
f.write("\n")


def raise_for_duplicated_ids(dict_to_check):
"""Helper function that prints an error if key `id` of that dict list has duplicated values.
Also raises if dict key 'id' does not exist."""
ids = [item['id'] for item in dict_to_check]
Also raises if dict key 'id' does not exist."""
ids = [item["id"] for item in dict_to_check]
duplicated_ids = set([x for x in ids if ids.count(x) > 1])
if len(duplicated_ids) > 0:
log.critical(f"Zwei oder mehr Pläne haben die gleiche ID bekommen: {duplicated_ids}")


def is_valid_item(table, blacklist):
"""Returns whether a table is allowed in spluseins. Used for filtering some unwanted items (Klausurenpläne)"""
if table['faculty'] == 'Elektrotechnik' and "block" in table['skedPath'].lower():
if table["faculty"] == "Elektrotechnik" and "block" in table["skedPath"].lower():
# Exclude Blockveranstaltungen (block courses, Fakultät E) for now
return False
if table['faculty'] == 'Soziale Arbeit' and "fernstudiengang" in table['label'].lower():
if table["faculty"] == "Soziale Arbeit" and "fernstudiengang" in table["label"].lower():
# badly formatted, so it is ignored
return False
for forbidden in blacklist:
if forbidden.lower() in table['skedPath'].lower():
log.info("Skipping timetable with forbidden path: " + table['skedPath'])
if forbidden.lower() in table["skedPath"].lower():
log.info("Skipping timetable with forbidden path: " + table["skedPath"])
return False
if forbidden.lower() in table['label'].lower():
log.info("Skipping timetable with forbidden label: " + table['label'])
if forbidden.lower() in table["label"].lower():
log.info("Skipping timetable with forbidden label: " + table["label"])
return False
return True


def main(config, secrets, out_files):
tables = []
for plan in config["plans"]:
tuples = scraper.get_links(plan['url'], secrets, plan['faculty'])
tuples = scraper.get_links(plan["url"], secrets, plan["faculty"])
if len(tuples) == 0:
log.warning(f"URL {plan['url']} hat keine Pläne.")
for label, sked_path in tuples:
label = label.replace("\n", " ").replace("\r", " ") # for logging purposes
faculty_short = scraper.get_faculty_shortcode(sked_path)
degree = scraper.guess_degree(label, sked_path)
semester = scraper.extract_semester(label, sked_path) or "Sonstige"
sked_id = scraper.create_id(sked_path, faculty_short, config['current_sem'], semester)
label = scraper.optimize_label(label, plan.get('shorthand_syntax', False))
plan_type = plan.get('type', 'graphical')
sked_id = scraper.create_id(sked_path, faculty_short, config["current_sem"], semester)
label = scraper.optimize_label(label, plan.get("shorthand_syntax", False))
plan_type = plan.get("type", "graphical")
if "alt" in sked_path:
label += " alt"
tables.append(dict(skedPath=sked_path, label=label, faculty=plan['faculty'],
type=plan_type, id=sked_id, semester=semester, degree=degree))
tables.append(
dict(
skedPath=sked_path,
label=label,
faculty=plan["faculty"],
type=plan_type,
id=sked_id,
semester=semester,
degree=degree,
)
)
sleep(1)
tables = [table for table in tables if is_valid_item(table, set(config["timetable_blacklist"]))]
# Sort first by faculty, then by master/bachelor, then by semester and last by alphabetical label
tables = sorted(tables, key=lambda x: (x['faculty'], x['degree'], str(x['semester']), x['label'], x['id']))
tables = sorted(tables, key=lambda x: (x["faculty"], x["degree"], str(x["semester"]), x["label"], x["id"]))
raise_for_duplicated_ids(tables)
for out_file in out_files:
write_timetable_json(tables, out_file)
91 changes: 45 additions & 46 deletions sked_parser/scraper.py
@@ -15,8 +15,7 @@

# Create a requests session with our own user agent, so it's clear who manages the automated requests.
session = requests.Session()
session.headers.update({'User-Agent': 'Sked parser for spluseins.de',
'From': '[email protected]'})
session.headers.update({"User-Agent": "Sked parser for spluseins.de", "From": "[email protected]"})


def get_links(overview_url, auth, faculty=""):
@@ -30,20 +29,20 @@ def get_links(overview_url, auth, faculty=""):
Returns:
Set[Tuple]: List of tuples with (url description, sked path)
"""
resp = session.get(overview_url, auth=HTTPBasicAuth(auth['user'], auth['pass']))
soup = BeautifulSoup(resp.content, 'lxml')
resp = session.get(overview_url, auth=HTTPBasicAuth(auth["user"], auth["pass"]))
soup = BeautifulSoup(resp.content, "lxml")
tables = set()
valid_url_regex = re.compile(r'^\w/.+\.(html|csv)$', re.IGNORECASE)
for this_url in soup.find_all('a', href=True):
absolute_url = urljoin(overview_url, this_url['href'])
valid_url_regex = re.compile(r"^\w/.+\.(html|csv)$", re.IGNORECASE)
for this_url in soup.find_all("a", href=True):
absolute_url = urljoin(overview_url, this_url["href"])
part_url = absolute_url.removeprefix("https://stundenplan.ostfalia.de/")
if part_url.endswith('index.html'):
if part_url.endswith("index.html"):
continue
if valid_url_regex.match(part_url):
desc = this_url.text.strip()
if "Recht" in faculty:
# Prepend the content of the previous paragraph to the description because it contains the real name of the plan
if (this_url.parent.parent.name == 'ol'):
if this_url.parent.parent.name == "ol":
desc = this_url.parent.parent.previous + " " + desc
tables.add((desc, part_url))
return tables
@@ -54,40 +53,40 @@ def create_id(sked_path, faculty_short, current_sem_str, extracted_semester):
# Unquote the URL first
sked_path = unquote(sked_path)
# Get a basic id from the url page, which is the last part excluding the .extension
id_re = re.compile(r'\w/(?:.*/)?(.+?)\.+(html|csv)', re.IGNORECASE)
id_re = re.compile(r"\w/(?:.*/)?(.+?)\.+(html|csv)", re.IGNORECASE)
m = id_re.search(sked_path)
if not m:
raise Exception(f"Path {sked_path} did not match to ID regex, so we can't extract an ID")
sked_id = m.group(1).lower().strip()

# Replace any non-alphanumeric chars with underscore and remove duplicated underscores
sked_id = re.sub(r'\W+', '_', sked_id, flags=re.ASCII)
sked_id = sked_id = re.sub(r'_+(?=_|$)', '', sked_id)
sked_id = re.sub(r"\W+", "_", sked_id, flags=re.ASCII)
sked_id = sked_id = re.sub(r"_+(?=_|$)", "", sked_id)

# Remove any strings like SoSe, WS, SS including the year from the id
sked_id = re.sub(r'((s|w)s|(so|w)se)(_?\d+)(_\d+)?_?', '', sked_id)
sked_id = re.sub(r"((s|w)s|(so|w)se)(_?\d+)(_\d+)?_?", "", sked_id)

# Remove some faculty specific stuff to shorten the id:
sked_id = sked_id.replace('semester_', '')
sked_id = sked_id.replace('_semester', '')
sked_id = sked_id.replace('_sem', '')
sked_id = sked_id.replace('soziale_arbeit', '')
sked_id = sked_id.replace('wirtschaftsingenieur_', '')
sked_id = sked_id.replace('energie_und_gebaeudetechnik_', '')
sked_id = sked_id.replace('bio_und_umwelttechnik_', '')
sked_id = sked_id.replace('bachelor', '')
sked_id = sked_id.replace('b_sc', '')
sked_id = sked_id.replace('m_sc', 'm')
sked_id = sked_id.replace('energie_', '')
sked_id = sked_id.replace('umwelt_', '')
sked_id = sked_id.replace('stdgrp_', '') # weird faculty S specific string
sked_id = sked_id.replace('stjg_', '') # weird faculty K specific string
sked_id = sked_id.replace("semester_", "")
sked_id = sked_id.replace("_semester", "")
sked_id = sked_id.replace("_sem", "")
sked_id = sked_id.replace("soziale_arbeit", "")
sked_id = sked_id.replace("wirtschaftsingenieur_", "")
sked_id = sked_id.replace("energie_und_gebaeudetechnik_", "")
sked_id = sked_id.replace("bio_und_umwelttechnik_", "")
sked_id = sked_id.replace("bachelor", "")
sked_id = sked_id.replace("b_sc", "")
sked_id = sked_id.replace("m_sc", "m")
sked_id = sked_id.replace("energie_", "")
sked_id = sked_id.replace("umwelt_", "")
sked_id = sked_id.replace("stdgrp_", "") # weird faculty S specific string
sked_id = sked_id.replace("stjg_", "") # weird faculty K specific string
# Remove unnecessary chars at end or beginning of string
sked_id = sked_id.strip("_ ")

if (isinstance(extracted_semester, int)):
if isinstance(extracted_semester, int):
# If semester was successfully extracted, strip all single digits from ID and add extracted semester back
sked_id = re.sub(r'(?<!\d)' + f"{extracted_semester}" + r'(?=_|$)', '', sked_id)
sked_id = re.sub(r"(?<!\d)" + f"{extracted_semester}" + r"(?=_|$)", "", sked_id)
sked_id = f"{sked_id}_{extracted_semester}"

# Prefix the label with the faculty shortcut
Expand All @@ -98,7 +97,7 @@ def create_id(sked_path, faculty_short, current_sem_str, extracted_semester):
# Append the current semester string (sth like ws20) at the end
sked_id = f"{sked_id}_{current_sem_str}"
# Again remove duplicated underscores that have been introduced by the removals before
sked_id = re.sub(r'_+(?=_|$)', '', sked_id)
sked_id = re.sub(r"_+(?=_|$)", "", sked_id)
return sked_id


@@ -113,7 +112,7 @@ def extract_semester(desc, url):
if keyword in desc.lower() or keyword in url.lower():
return None
# Try to extract the semester by finding a number followed by non-word characters and something starting with Sem
sem_regex = re.compile(r'(?:^|\D)(\d)\W+(Sem|html$)', re.IGNORECASE)
sem_regex = re.compile(r"(?:^|\D)(\d)\W+(Sem|html$)", re.IGNORECASE)
m_desc = sem_regex.search(desc)
m_url = sem_regex.search(url)
if m_desc:
Expand All @@ -122,7 +121,7 @@ def extract_semester(desc, url):
# Use the semester from URL if description search was unsuccessful
return int(m_url.group(1))
else:
log.warning(f"Kein Semester bestimmbar bei \"{desc}\" mit sked path \"{url}\"")
log.warning(f'Kein Semester bestimmbar bei "{desc}" mit sked path "{url}"')
return None


@@ -136,31 +135,31 @@ def get_faculty_shortcode(sked_path):

def optimize_label(desc, uses_shorthand_syntax):
"""Optimize the user visible label by removing faculty names and try to use only the shorthand of that course if possible"""
desc = desc.replace('S-', '')
desc = desc.replace('I-', '')
desc = desc.replace('B.Sc.', '')
desc = desc.replace('I-M.Sc.', '')
desc = desc.replace('Bachelor', '')
desc = desc.replace('Master', '')
desc = desc.replace('- WiSe 21/22', '')
desc = desc.replace('.csv', '')
desc = re.sub(r'\s+', ' ', desc) # replace all (even duplicated) whitespaces by single space
desc = desc.replace("S-", "")
desc = desc.replace("I-", "")
desc = desc.replace("B.Sc.", "")
desc = desc.replace("I-M.Sc.", "")
desc = desc.replace("Bachelor", "")
desc = desc.replace("Master", "")
desc = desc.replace("- WiSe 21/22", "")
desc = desc.replace(".csv", "")
desc = re.sub(r"\s+", " ", desc) # replace all (even duplicated) whitespaces by single space
if uses_shorthand_syntax:
# Those faculties write their modules as "long name (shorthand) additional info"
# So discard the long name and use only the shorthand but keep the info
shorthand_re = re.compile(r'^.*?\((\D+?)\)(.*)$')
shorthand_re = re.compile(r"^.*?\((\D+?)\)(.*)$")
m = shorthand_re.search(desc)
if m:
shorthand = m.group(1).strip()
additional_stuff = m.group(2).strip()
desc = f"{shorthand} {additional_stuff}"
# Remove any semester related information
desc = re.sub(r'(\d\. ?-)?-? ?\d\.?\W+Sem(?:ester|\.)?', '', desc)
desc = desc.replace('Semester', '')
desc = re.sub(r"(\d\. ?-)?-? ?\d\.?\W+Sem(?:ester|\.)?", "", desc)
desc = desc.replace("Semester", "")
# Strip any remaining single digits
desc = re.sub(r'[_-]\d(?=_|$)', '', desc)
desc = re.sub(r"[_-]\d(?=_|$)", "", desc)
# Remove duplicated spaces
desc = desc.replace(' ', ' ')
desc = desc.replace(" ", " ")
return desc.strip("-_ ")


13 changes: 9 additions & 4 deletions tests/test_scraper.py
@@ -117,7 +117,9 @@ def test_extract_id():
faculty_short = "e"
current_sem_str = "ws"
extracted_semester = 1
def sked_path(part_str): return f"e/semester/{part_str}.html"

def sked_path(part_str):
return f"e/semester/{part_str}.html"

# Simple string
in_str = "eit"
@@ -138,10 +140,13 @@ def sked_path(part_str): return f"e/semester/{part_str}.html"
assert create_id("e/E-IST.html", faculty_short, current_sem_str, extracted_semester) == "e_ist_1_ws"
# Complicated semester specification
in_str = "PSA_M_1. Semester_Schwerpunkt"
assert create_id(sked_path(in_str), faculty_short, current_sem_str,
extracted_semester) == "e_psa_m_schwerpunkt_1_ws"
assert (
create_id(sked_path(in_str), faculty_short, current_sem_str, extracted_semester) == "e_psa_m_schwerpunkt_1_ws"
)


def test_is_master():
def sked_path(part_str): return f"e/semester/{part_str}.html"
def sked_path(part_str):
return f"e/semester/{part_str}.html"

assert guess_degree("", sked_path("b_stgrp_ma_glob_1")) == "Master"
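
A minimal sketch for running this test suite locally (assuming pytest as the test runner, which the tests/ layout suggests but this diff does not confirm):

    pip install pytest
    pytest tests/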
