vertiefungs_scraper/main.py
2020-10-09 16:23:00 +02:00

127 lines
3.3 KiB
Python

from bs4 import BeautifulSoup
import requests
from uritools import urijoin
import re
from enum import Enum
import json
# Root URL against which the (possibly relative) lecture links are resolved.
base = "https://tu-dresden.de/"

# Weekday abbreviations as they appear in the timetable text. parse_dates()
# uses the list position as the numeric day index (MO -> 0 ... FR -> 4).
days = ["MO", "DI", "MI", "DO", "FR"]

# One timetable slot, e.g. "MO(2) ugW":
#   group 1 - weekday abbreviation
#   group 2 - single-digit number in parentheses (stored as "time")
#   group 3 - optional week marker "ugW"/"gw"
#             (presumably odd/even week - TODO confirm)
# Matching is case-insensitive.
time_regex = re.compile(
    r"(MO|DI|MI|DO|FR)\(([0-9])\)\s*(ugW|gw)*",
    flags=re.IGNORECASE | re.MULTILINE,
)
class Week(str, Enum):
    """Week marker attached to a timetable slot.

    The ``str`` mix-in makes every member compare equal to its plain string
    value and lets ``json.dumps`` serialise it as an ordinary JSON string.
    """

    # Slot carries the "gw" marker.
    GW = "gw"

    # Slot carries the "ugw" marker.
    UGW = "ugw"

    # Slot carries no marker at all.
    BOTH = "both"
def parse_dates(text):
    """Extract every timetable slot found in *text*.

    Args:
        text: Free text containing slots such as ``"MO(2) ugW, DI(4)"``.

    Returns:
        A list of dicts, one per match and in order of appearance, with the
        keys ``day`` (index into ``days``), ``week`` (a ``Week`` member;
        ``Week.BOTH`` when no week marker follows the slot) and ``time``
        (the digit in parentheses as ``int``).
    """
    parsed = []
    for day, time, week in time_regex.findall(text):
        parsed.append(
            dict(
                # time_regex matches case-insensitively while ``days`` only
                # holds upper-case labels, so normalise before the lookup;
                # otherwise e.g. "Mo(2)" would raise ValueError.
                day=days.index(day.upper()),
                # The captured marker is some casing of "gw" or "ugw", which
                # are exactly the values of Week.GW and Week.UGW.
                week=Week(week.lower()) if week else Week.BOTH,
                time=int(time),
            )
        )
    return parsed
def _find_labelled_cell(container, label):
    """Return the first ``<td>`` in *container* whose text contains *label*, else None."""
    if container is None:
        return None
    return container.find(lambda tag: tag.name == "td" and label in tag.text)


def parse_lecture(lect_link, name, timeout=30):
    """Fetch a lecture page and return its lecture and tutorial slots.

    Args:
        lect_link: Anchor element pointing to the lecture page; its ``href``
            is resolved against ``base`` and its text becomes the event name.
        name: Name of the specialisation the lecture belongs to; stored as
            ``vert_name`` on every event.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A list of event dicts. Lecture events (``type="lect"``) come from the
        cell next to the "Zeit/Ort:" label, tutorial events (``type="tut"``)
        from the last nested cell next to the "Übungen:" label. Parts of the
        page that are missing simply contribute no events.

    Raises:
        requests.RequestException: If the page cannot be fetched in time.
    """
    href = lect_link.get("href")
    if not href:
        # Anchor without a target: there is no lecture page to parse.
        return []

    # Without a timeout a single stalled connection would hang the scrape.
    lect_html = requests.get(urijoin(base, href), timeout=timeout).text
    lect = BeautifulSoup(lect_html, features="html.parser")
    box = lect.find("div", class_="tudbox")

    dates = []

    time_label = _find_labelled_cell(box, "Zeit/Ort:")
    time_cell = time_label.find_next_sibling("td") if time_label is not None else None
    if time_cell is not None:
        dates += [
            dict(
                name=lect_link.text,
                vert_name=name,
                week=date["week"],
                time=date["time"],
                day=date["day"],
                type="lect",
            )
            for date in parse_dates(time_cell.text.strip())
        ]

    tuts_label = _find_labelled_cell(box, "Übungen:")
    tuts_cell = tuts_label.find_next_sibling("td") if tuts_label is not None else None
    # The tutorial times are read from the last nested cell of the value cell.
    inner_cells = tuts_cell.find_all("td") if tuts_cell is not None else []
    if inner_cells:
        dates += [
            dict(
                name=lect_link.text,
                vert_name=name,
                # Raw parsed slot; kept so the emitted structure stays
                # unchanged for existing consumers of the output.
                date=date,
                week=date["week"],
                time=date["time"],
                day=date["day"],
                type="tut",
            )
            for date in parse_dates(inner_cells[-1].text)
        ]

    return dates
def get_lectures(vert_table, vert_name):
    """Collect the events of every lecture linked from one specialisation table.

    Args:
        vert_table: Parsed table element; each ``<a>`` inside it is handed to
            ``parse_lecture``.
        vert_name: Name passed through to ``parse_lecture`` for each link.

    Returns:
        One flat list with the events of all links, in document order.
    """
    events = []
    for anchor in vert_table.findAll("a"):
        events.extend(parse_lecture(anchor, vert_name))
    return events
# Catalogue page that lists the specialisation tables scraped by default.
_CATALOG_URL = (
    "https://tu-dresden.de/mn/physik/studium/lehrveranstaltungen/"
    "vertiefungsgebiete-bachelor-und-master/katalog_wintersemester"
)


def get_vert_tables(url=_CATALOG_URL, timeout=30):
    """Scrape the catalogue page and return the events of all listed lectures.

    Args:
        url: Address of the catalogue page; defaults to the winter-semester
            catalogue.
        timeout: Seconds to wait for the catalogue request before giving up.

    Returns:
        A flat list of the event dicts produced by ``get_lectures`` for every
        ``table`` with class ``BodyTable`` on the page, in document order.

    Raises:
        requests.RequestException: If the catalogue cannot be fetched in time.
    """
    # Without a timeout a stalled connection would block the script forever.
    vert_html = requests.get(url, timeout=timeout).text
    soup = BeautifulSoup(vert_html, features="html.parser")

    verts = []
    for vert in soup.find_all("table", class_="BodyTable"):
        # The node directly before the table supplies the specialisation name.
        # NOTE(review): this assumes that node is the heading element and not
        # a whitespace text node - confirm against the live page.
        verts.extend(get_lectures(vert, vert.previous_sibling.text))
    return verts
def get_lectures_for_time(verts, time, tut=False, week=None):
    """Group lectures that occupy a given time slot by weekday.

    Args:
        verts: Mapping whose values each hold a ``"lectures"`` list; every
            lecture needs ``"name"``, ``"lecture_times"`` and
            ``"tutorial_times"``, and every time entry needs ``"time"``,
            ``"day"`` and ``"week"`` (an object with ``.value``).
            NOTE(review): this is not the flat event list returned by
            ``get_vert_tables`` - confirm which producer feeds this function.
        time: Slot number to select.
        tut: Look at tutorial times instead of lecture times.
        week: Optional filter; when given, only entries whose ``week`` has
            the same ``.value`` are kept.

    Returns:
        A list with one sub-list per entry of ``days``; each sub-list holds
        the matching lectures for that weekday, with at most one lecture per
        name.
    """
    per_day = [[] for _ in days]
    names_per_day = [[] for _ in days]
    times_key = "tutorial_times" if tut else "lecture_times"

    for vert in verts.values():
        for lect in vert["lectures"]:
            slots = lect[times_key]
            if not slots:
                continue
            for slot in slots:
                if slot["time"] != time:
                    continue
                # A lecture name is listed only once per weekday.
                if lect["name"] in names_per_day[slot["day"]]:
                    continue
                if week is not None and slot["week"].value != week.value:
                    continue
                per_day[slot["day"]].append(lect)
                names_per_day[slot["day"]].append(lect["name"])

    return per_day
if __name__ == "__main__":
    # Scrape the whole catalogue and write it to stdout as JSON.
    # The result is bound to ``events`` rather than ``all`` so the builtin
    # ``all`` is not shadowed at module level.
    events = get_vert_tables()
    print(json.dumps(events))