-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
6 changed files
with
131 additions
and
81 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
[project] | ||
name = "sked-parser" | ||
version = "0.1.0" | ||
description = "Add your description here" | ||
readme = "README.md" | ||
requires-python = ">=3.12" | ||
dependencies = [] | ||
|
||
[build-system] | ||
requires = ["hatchling"] | ||
build-backend = "hatchling.build" | ||
|
||
[tool.ruff] | ||
# Allow lines to be as long as 120. | ||
line-length = 120 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,8 +15,7 @@ | |
|
||
# Create a requests session with our own user agent, so it's clear who manages the automated requests. | ||
session = requests.Session() | ||
session.headers.update({'User-Agent': 'Sked parser for spluseins.de', | ||
'From': '[email protected]'}) | ||
session.headers.update({"User-Agent": "Sked parser for spluseins.de", "From": "[email protected]"}) | ||
|
||
|
||
def get_links(overview_url, auth, faculty=""): | ||
|
@@ -30,20 +29,20 @@ def get_links(overview_url, auth, faculty=""): | |
Returns: | ||
Set[Tuple]: Set of tuples with (url description, sked path) | ||
""" | ||
resp = session.get(overview_url, auth=HTTPBasicAuth(auth['user'], auth['pass'])) | ||
soup = BeautifulSoup(resp.content, 'lxml') | ||
resp = session.get(overview_url, auth=HTTPBasicAuth(auth["user"], auth["pass"])) | ||
soup = BeautifulSoup(resp.content, "lxml") | ||
tables = set() | ||
valid_url_regex = re.compile(r'^\w/.+\.(html|csv)$', re.IGNORECASE) | ||
for this_url in soup.find_all('a', href=True): | ||
absolute_url = urljoin(overview_url, this_url['href']) | ||
valid_url_regex = re.compile(r"^\w/.+\.(html|csv)$", re.IGNORECASE) | ||
for this_url in soup.find_all("a", href=True): | ||
absolute_url = urljoin(overview_url, this_url["href"]) | ||
part_url = absolute_url.removeprefix("https://stundenplan.ostfalia.de/") | ||
if part_url.endswith('index.html'): | ||
if part_url.endswith("index.html"): | ||
continue | ||
if valid_url_regex.match(part_url): | ||
desc = this_url.text.strip() | ||
if "Recht" in faculty: | ||
# Prepend the content of the previous paragraph to the description because it contains the real name of the plan | ||
if (this_url.parent.parent.name == 'ol'): | ||
if this_url.parent.parent.name == "ol": | ||
desc = this_url.parent.parent.previous + " " + desc | ||
tables.add((desc, part_url)) | ||
return tables | ||
|
@@ -54,40 +53,40 @@ def create_id(sked_path, faculty_short, current_sem_str, extracted_semester): | |
# Unquote the URL first | ||
sked_path = unquote(sked_path) | ||
# Get a basic id from the url page, which is the last part excluding the .extension | ||
id_re = re.compile(r'\w/(?:.*/)?(.+?)\.+(html|csv)', re.IGNORECASE) | ||
id_re = re.compile(r"\w/(?:.*/)?(.+?)\.+(html|csv)", re.IGNORECASE) | ||
m = id_re.search(sked_path) | ||
if not m: | ||
raise Exception(f"Path {sked_path} did not match to ID regex, so we can't extract an ID") | ||
sked_id = m.group(1).lower().strip() | ||
|
||
# Replace any non alphanumeric chars with underscore and remove duplicated underscores | ||
sked_id = re.sub(r'\W+', '_', sked_id, flags=re.ASCII) | ||
sked_id = sked_id = re.sub(r'_+(?=_|$)', '', sked_id) | ||
sked_id = re.sub(r"\W+", "_", sked_id, flags=re.ASCII) | ||
sked_id = sked_id = re.sub(r"_+(?=_|$)", "", sked_id) | ||
|
||
# Remove any strings like SoSe, WS, SS including the year from the id | ||
sked_id = re.sub(r'((s|w)s|(so|w)se)(_?\d+)(_\d+)?_?', '', sked_id) | ||
sked_id = re.sub(r"((s|w)s|(so|w)se)(_?\d+)(_\d+)?_?", "", sked_id) | ||
|
||
# Remove some faculty specific stuff to shorten the id: | ||
sked_id = sked_id.replace('semester_', '') | ||
sked_id = sked_id.replace('_semester', '') | ||
sked_id = sked_id.replace('_sem', '') | ||
sked_id = sked_id.replace('soziale_arbeit', '') | ||
sked_id = sked_id.replace('wirtschaftsingenieur_', '') | ||
sked_id = sked_id.replace('energie_und_gebaeudetechnik_', '') | ||
sked_id = sked_id.replace('bio_und_umwelttechnik_', '') | ||
sked_id = sked_id.replace('bachelor', '') | ||
sked_id = sked_id.replace('b_sc', '') | ||
sked_id = sked_id.replace('m_sc', 'm') | ||
sked_id = sked_id.replace('energie_', '') | ||
sked_id = sked_id.replace('umwelt_', '') | ||
sked_id = sked_id.replace('stdgrp_', '') # weird faculty S specific string | ||
sked_id = sked_id.replace('stjg_', '') # weird faculty K specific string | ||
sked_id = sked_id.replace("semester_", "") | ||
sked_id = sked_id.replace("_semester", "") | ||
sked_id = sked_id.replace("_sem", "") | ||
sked_id = sked_id.replace("soziale_arbeit", "") | ||
sked_id = sked_id.replace("wirtschaftsingenieur_", "") | ||
sked_id = sked_id.replace("energie_und_gebaeudetechnik_", "") | ||
sked_id = sked_id.replace("bio_und_umwelttechnik_", "") | ||
sked_id = sked_id.replace("bachelor", "") | ||
sked_id = sked_id.replace("b_sc", "") | ||
sked_id = sked_id.replace("m_sc", "m") | ||
sked_id = sked_id.replace("energie_", "") | ||
sked_id = sked_id.replace("umwelt_", "") | ||
sked_id = sked_id.replace("stdgrp_", "") # weird faculty S specific string | ||
sked_id = sked_id.replace("stjg_", "") # weird faculty K specific string | ||
# Remove unnecessary chars at end or beginning of string | ||
sked_id = sked_id.strip("_ ") | ||
|
||
if (isinstance(extracted_semester, int)): | ||
if isinstance(extracted_semester, int): | ||
# If semester was successfully extracted, scrape all single digits from ID and add extracted semester back | ||
sked_id = re.sub(r'(?<!\d)' + f"{extracted_semester}" + r'(?=_|$)', '', sked_id) | ||
sked_id = re.sub(r"(?<!\d)" + f"{extracted_semester}" + r"(?=_|$)", "", sked_id) | ||
sked_id = f"{sked_id}_{extracted_semester}" | ||
|
||
# Prefix the label with the faculty shortcut | ||
|
@@ -98,7 +97,7 @@ def create_id(sked_path, faculty_short, current_sem_str, extracted_semester): | |
# Append the current semester string (sth like ws20) at the end | ||
sked_id = f"{sked_id}_{current_sem_str}" | ||
# Again remove duplicated underscores that have been introduced by the removals before | ||
sked_id = re.sub(r'_+(?=_|$)', '', sked_id) | ||
sked_id = re.sub(r"_+(?=_|$)", "", sked_id) | ||
return sked_id | ||
|
||
|
||
|
@@ -113,7 +112,7 @@ def extract_semester(desc, url): | |
if keyword in desc.lower() or keyword in url.lower(): | ||
return None | ||
# Try to extract the semester by finding a number followed by non word characters and something starting with Sem | ||
sem_regex = re.compile(r'(?:^|\D)(\d)\W+(Sem|html$)', re.IGNORECASE) | ||
sem_regex = re.compile(r"(?:^|\D)(\d)\W+(Sem|html$)", re.IGNORECASE) | ||
m_desc = sem_regex.search(desc) | ||
m_url = sem_regex.search(url) | ||
if m_desc: | ||
|
@@ -122,7 +121,7 @@ def extract_semester(desc, url): | |
# Use the semester from URL if description search was unsuccessful | ||
return int(m_url.group(1)) | ||
else: | ||
log.warning(f"Kein Semester bestimmbar bei \"{desc}\" mit sked path \"{url}\"") | ||
log.warning(f'Kein Semester bestimmbar bei "{desc}" mit sked path "{url}"') | ||
return None | ||
|
||
|
||
|
@@ -136,31 +135,31 @@ def get_faculty_shortcode(sked_path): | |
|
||
def optimize_label(desc, uses_shorthand_syntax): | ||
"""Optimize the user-visible label by removing faculty names and using only the shorthand of that course if possible""" | ||
desc = desc.replace('S-', '') | ||
desc = desc.replace('I-', '') | ||
desc = desc.replace('B.Sc.', '') | ||
desc = desc.replace('I-M.Sc.', '') | ||
desc = desc.replace('Bachelor', '') | ||
desc = desc.replace('Master', '') | ||
desc = desc.replace('- WiSe 21/22', '') | ||
desc = desc.replace('.csv', '') | ||
desc = re.sub(r'\s+', ' ', desc) # replace all (even duplicated) whitespaces by single space | ||
desc = desc.replace("S-", "") | ||
desc = desc.replace("I-", "") | ||
desc = desc.replace("B.Sc.", "") | ||
desc = desc.replace("I-M.Sc.", "") | ||
desc = desc.replace("Bachelor", "") | ||
desc = desc.replace("Master", "") | ||
desc = desc.replace("- WiSe 21/22", "") | ||
desc = desc.replace(".csv", "") | ||
desc = re.sub(r"\s+", " ", desc) # replace all (even duplicated) whitespaces by single space | ||
if uses_shorthand_syntax: | ||
# These faculties write their modules as "long name (shorthand) additional info" | ||
# So discard the long name and use only the shorthand but keep the info | ||
shorthand_re = re.compile(r'^.*?\((\D+?)\)(.*)$') | ||
shorthand_re = re.compile(r"^.*?\((\D+?)\)(.*)$") | ||
m = shorthand_re.search(desc) | ||
if m: | ||
shorthand = m.group(1).strip() | ||
additional_stuff = m.group(2).strip() | ||
desc = f"{shorthand} {additional_stuff}" | ||
# Remove any semester related information | ||
desc = re.sub(r'(\d\. ?-)?-? ?\d\.?\W+Sem(?:ester|\.)?', '', desc) | ||
desc = desc.replace('Semester', '') | ||
desc = re.sub(r"(\d\. ?-)?-? ?\d\.?\W+Sem(?:ester|\.)?", "", desc) | ||
desc = desc.replace("Semester", "") | ||
# Strip any remaining single digits | ||
desc = re.sub(r'[_-]\d(?=_|$)', '', desc) | ||
desc = re.sub(r"[_-]\d(?=_|$)", "", desc) | ||
# Remove duplicated spaces | ||
desc = desc.replace(' ', ' ') | ||
desc = desc.replace(" ", " ") | ||
return desc.strip("-_ ") | ||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters