mirror of
https://github.com/vale981/vertiefungs_scraper
synced 2025-03-04 09:11:39 -05:00
127 lines
3.3 KiB
Python
127 lines
3.3 KiB
Python
from bs4 import BeautifulSoup
|
|
import requests
|
|
from uritools import urijoin
|
|
import re
|
|
from enum import Enum
|
|
import json
|
|
|
|
# Root URL that links taken from the catalog pages are resolved against.
base = "https://tu-dresden.de/"

# Weekday abbreviations accepted by `time_regex`; a day's position in this
# list is the integer stored for it by `parse_dates`.
days = ["MO", "DI", "MI", "DO", "FR"]

# Matches one schedule entry such as "MO(2)" or "DI(3) ugW", ignoring case:
#   group 1 - weekday abbreviation,
#   group 2 - the single digit written in parentheses,
#   group 3 - week marker ("ugW" or "gw") if one follows the entry.
time_regex = re.compile(
    r"(MO|DI|MI|DO|FR)\(([0-9])\)\s*(ugW|gw)*",
    flags=re.MULTILINE | re.IGNORECASE,
)
|
|
|
|
|
|
class Week(str, Enum):
    """Week marker attached to a parsed schedule entry.

    The ``str`` mix-in makes every member an ordinary string as well, so the
    members can be handed to ``json.dumps`` without a custom encoder.
    """

    # Entry carried a "gw" marker.
    GW = "gw"
    # Entry carried a "ugW" marker.
    UGW = "ugw"
    # Entry carried no week marker at all.
    BOTH = "both"
|
|
|
|
|
|
def parse_dates(text):
    """Extract every schedule entry that ``time_regex`` finds in ``text``.

    Args:
        text: Free text containing entries such as ``"MO(2)"`` or
            ``"DI(3) ugW"``.

    Returns:
        A list with one dict per match, in order of appearance, with keys

        * ``day``  - index of the weekday abbreviation within ``days``,
        * ``week`` - ``Week.GW`` or ``Week.UGW`` according to the week
          marker, ``Week.BOTH`` when the entry has no marker,
        * ``time`` - the digit in parentheses converted to ``int``.

        The list is empty when nothing matches.
    """
    parsed = []
    for day, time, week in time_regex.findall(text):
        if not week:
            which = Week.BOTH
        elif week.lower() == "gw":
            which = Week.GW
        else:
            which = Week.UGW

        # The pattern is case-insensitive, so the captured weekday may be
        # "Mo" or "mo"; normalise it before the case-sensitive list lookup,
        # which would otherwise raise ValueError.
        day_index = days.index(day.upper())

        parsed.append(dict(day=day_index, week=which, time=int(time)))

    return parsed
|
|
|
|
|
|
def parse_lecture(lect_link, name):
    """Download one lecture page and return its schedule entries.

    Args:
        lect_link: Anchor tag pointing to the lecture page; its ``href`` is
            resolved against ``base`` and its text is used as the lecture
            name.
        name: Stored unchanged as ``vert_name`` in every returned entry.

    Returns:
        A list of dicts. Entries read from the "Zeit/Ort:" row have
        ``type="lect"``; entries read from the "Übungen:" row have
        ``type="tut"`` and additionally carry the raw parsed entry under
        ``date``. Parts of the page that cannot be found are skipped, so
        the list may be empty.

    Raises:
        requests.RequestException: If the page cannot be fetched (network
            error, timeout, or an HTTP error status).
    """
    href = lect_link.get("href")
    if not href:
        # An anchor without a target has no page to scrape.
        return []

    # Without a timeout a stalled server would block the scrape forever.
    response = requests.get(urijoin(base, href), timeout=30)
    response.raise_for_status()
    lect = BeautifulSoup(response.text, features="html.parser")

    box = lect.find("div", class_="tudbox")
    if box is None:
        return []

    title = lect_link.text
    dates = []

    # Lecture slots: the cell that follows the "Zeit/Ort:" label cell.
    time_label = box.find(lambda tag: tag.name == "td" and "Zeit/Ort:" in tag.text)
    time_cell = time_label.find_next_sibling("td") if time_label is not None else None
    if time_cell is not None:
        dates += [
            dict(
                name=title,
                vert_name=name,
                week=date["week"],
                time=date["time"],
                day=date["day"],
                type="lect",
            )
            for date in parse_dates(time_cell.text.strip())
        ]

    # Tutorial slots: the last nested cell of the cell that follows the
    # "Übungen:" label cell.
    tuts_row = box.find(lambda tag: tag.name == "td" and "Übungen:" in tag.text)
    tuts_cell = tuts_row.find_next_sibling("td") if tuts_row is not None else None
    inner_cells = tuts_cell.find_all("td") if tuts_cell is not None else []
    if inner_cells:
        dates += [
            dict(
                name=title,
                vert_name=name,
                date=date,  # raw parsed entry, kept so the output shape is unchanged
                week=date["week"],
                time=date["time"],
                day=date["day"],
                type="tut",
            )
            for date in parse_dates(inner_cells[-1].text)
        ]

    return dates
|
|
|
|
|
|
def get_lectures(vert_table, vert_name):
    """Collect the schedule entries of every lecture linked from a table.

    Each anchor inside ``vert_table`` is passed to ``parse_lecture`` together
    with ``vert_name``; the per-lecture lists are concatenated in document
    order into one flat list.
    """
    events = []
    for anchor in vert_table.findAll("a"):
        events.extend(parse_lecture(anchor, vert_name))
    return events
|
|
|
|
|
|
def get_vert_tables(
    url="https://tu-dresden.de/mn/physik/studium/lehrveranstaltungen/vertiefungsgebiete-bachelor-und-master/katalog_wintersemester",
    timeout=30,
):
    """Scrape a catalog page and return every schedule entry linked from it.

    Args:
        url: Catalog page to download. Defaults to the winter-semester
            catalog that was previously hard-coded.
        timeout: Seconds to wait for the catalog server before giving up.

    Returns:
        A flat list of the dicts produced by ``get_lectures`` for every
        ``table`` element with class ``BodyTable`` on the page.

    Raises:
        requests.RequestException: If the catalog page cannot be fetched
            (network error, timeout, or an HTTP error status).
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of silently returning an empty result for an
    # error page that contains no tables.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, features="html.parser")

    verts = []
    for vert in soup.find_all("table", class_="BodyTable"):
        # NOTE(review): assumes the node directly before each table holds
        # the heading text for that table - verify against the live page.
        verts.extend(get_lectures(vert, vert.previous_sibling.text))
    return verts
|
|
|
|
|
|
def get_lectures_for_time(verts, time, tut=False, week=None):
    """Group the lectures that have a slot at ``time`` by weekday.

    Args:
        verts: Mapping whose values each hold a ``"lectures"`` list; every
            lecture is a dict with ``"name"``, ``"lecture_times"`` and
            ``"tutorial_times"``.
            NOTE(review): this nested shape is not what ``get_vert_tables``
            returns (a flat list) - confirm where callers get it from.
        time: Value compared against each slot's ``"time"``.
        tut: Look at ``"tutorial_times"`` instead of ``"lecture_times"``.
        week: Optional filter; when given, only slots whose ``"week"`` has
            the same ``.value`` are kept.
            NOTE(review): slots tagged ``Week.BOTH`` therefore never pass a
            ``GW``/``UGW`` filter - confirm this is intended.

    Returns:
        One list per entry of ``days``; each holds the matching lecture
        dicts, with at most one lecture per name and day.
    """
    times_key = "tutorial_times" if tut else "lecture_times"
    by_day = [[] for _ in days]
    seen_names = [[] for _ in days]

    for _, vert in verts.items():
        for lect in vert["lectures"]:
            # A missing/empty slot list simply contributes nothing.
            for slot in lect[times_key] or ():
                if slot["time"] != time:
                    continue

                day = slot["day"]
                if lect["name"] in seen_names[day]:
                    # Same lecture already recorded for this day.
                    continue
                if week is not None and slot["week"].value != week.value:
                    continue

                by_day[day].append(lect)
                seen_names[day].append(lect["name"])

    return by_day
|
|
|
|
|
|
if __name__ == "__main__":
    # Scrape the whole catalog and write all entries to stdout as one JSON
    # array. The result is named `events` rather than `all` so the builtin
    # `all()` is not shadowed at module level.
    events = get_vert_tables()
    print(json.dumps(events))
|