In [3]:
import requests
import os
from bs4 import BeautifulSoup
import json

In [4]:
# --- crawl target -----------------------------------------------------------
# Search-result URL; the paging offset is appended as `offset_str + <number>`.
url = (
    "https://www.arztsuche-bw.de/index.php"
    "?suchen=1&sorting=name&direction=ASC"
    "&arztgruppe=psychotherapeut"
    "&landkreis=Karlsruhe+-+Stadt"
)
offset_str = "&offset="

# --- paging (values read off the website by hand) ---------------------------
npages = 14       # the website shows 14 result pages
ntherapists = 20  # 20 entries per result page

# --- output locations -------------------------------------------------------
datapath = "./data"
outpath = os.path.join(datapath, "raw")              # raw result pages
extractedpath = os.path.join(datapath, "extracted")  # one file per entry

In [None]:

# Download every result page and store its raw HTML as page_<n>.html in `outpath`.
os.makedirs(outpath, exist_ok=True)

# The later cells only read page_0 .. page_{npages - 1}; the previous
# range(npages + 1) issued one extra request whose result was never used.
for page in range(npages):
    print(f"Crawling page {page}", end="\r")

    # The offset counts entries, not pages. A timeout keeps a stalled
    # connection from blocking the notebook forever.
    res = requests.get(url + offset_str + str(page * ntherapists), timeout=30)

    if res.status_code == 200:
        with open(os.path.join(outpath, f"page_{page}.html"), "w") as outfile:
            outfile.write(res.text)
    else:
        # Report pages that could not be fetched instead of dropping them
        # silently; the extraction cell would otherwise fail on a missing file
        # without any hint why.
        print(f"Skipping page {page}: HTTP status {res.status_code}")

Crawling page 15

In [41]:
# Split every crawled result page into one HTML file per result entry
# (page_<page>_<index>.html inside `extractedpath`).

# No visible cell creates `extractedpath`, so a fresh run used to fail with
# FileNotFoundError on the first write.
os.makedirs(extractedpath, exist_ok=True)

for page in range(npages):
    with open(os.path.join(outpath, f"page_{page}.html"), "r") as crawled_page:
        # NOTE(review): no parser is named, so bs4 picks whichever one is
        # installed; consider pinning one to make the results reproducible.
        parsed = BeautifulSoup(crawled_page.read())

    # the first <ul class="resultlist"> holds the <li class="resultrow"> entries
    result = parsed.find_all("ul", {"class": "resultlist"})[0]
    entries = result.find_all("li", {"class": "resultrow"})

    for i, entry in enumerate(entries):
        with open(os.path.join(extractedpath, f"page_{page}_{i}.html"), "w") as extractedfile:
            extractedfile.write(entry.prettify())

In [8]:
def parse_appointment_table(tableid, parsed):
    """Read one appointment table into a ``{day: [times]}`` dict.

    Picks the ``tableid``-th ``<table class="termintabelle">`` found in
    ``parsed``. For every ``<tr>`` of that table, the stripped text of the
    first ``<td>`` becomes the key and the stripped, non-empty texts of the
    remaining ``<td>`` cells become the value list.

    Raises ``IndexError`` if there is no such table or a row has no ``<td>``.
    """
    table = parsed.find_all("table", {"class": "termintabelle"})[tableid]

    slots_by_day = {}
    for table_row in table.find_all("tr"):
        cells = table_row.find_all("td")
        label = cells[0].text.strip()
        # cells without any visible text are dropped
        stripped_texts = (cell.text.strip() for cell in cells[1:])
        slots_by_day[label] = [text for text in stripped_texts if text]

    return slots_by_day

In [9]:
def extract_qualifications(index, parsed):
    """Return the text items of the ``index``-th ``<dl class="bulletlist">``.

    Every child of that list contributes its text with line breaks removed
    (the lines are joined without a separator) and surrounding whitespace
    stripped; children that end up empty are skipped.
    """
    bullet_lists = parsed.find_all("dl", {"class": "bulletlist"})

    items = []
    for child in bullet_lists[index].children:
        single_line = "".join(child.text.splitlines()).strip()
        if single_line:
            items.append(single_line)
    return items

In [10]:
def parse_extracted_html(infilepath):
    """Parse one extracted result entry into a plain dict.

    Parameters
    ----------
    infilepath : str
        Path of an HTML file that holds a single result entry.

    Returns
    -------
    dict
        Always has the keys ``filepath``, ``name``, ``appointments``,
        ``resorts``, ``therapy_types`` and ``address``. The keys
        ``phone_appointments``, ``phone`` and ``mails`` are only set when the
        entry provides the corresponding data.

    Raises
    ------
    IndexError
        If one of the mandatory elements (name, appointment table, bullet
        lists, address) is missing from the entry.
    """
    doctor = {}
    doctor["filepath"] = infilepath

    # read in file
    with open(infilepath, "r") as infile:
        parsed = BeautifulSoup(infile.read())

    # extract name: text of the first <dt> below <dd class="name">,
    # with line breaks replaced by spaces
    name = " ".join(parsed.find_all("dd", {"class": "name"})[0].findChildren("dt")[0].text.splitlines())
    doctor["name"] = name.strip()

    ntables = len(parsed.find_all("table", {"class": "termintabelle"}))

    # With two tables the first one holds the phone appointments and the
    # second one the regular appointments; otherwise the first table is
    # treated as the regular appointments.
    if ntables == 2:
        appointments = parse_appointment_table(1, parsed)
        phone_appointments = parse_appointment_table(0, parsed)
        doctor["phone_appointments"] = phone_appointments
    else:
        appointments = parse_appointment_table(0, parsed)

    doctor["appointments"] = appointments

    # extract type of doctor (the first item of the list is skipped)
    resorts = extract_qualifications(0, parsed)
    doctor["resorts"] = resorts[1:]

    # extract therapy types of doctor (the first item of the list is skipped)
    therapy_types = extract_qualifications(1, parsed)
    doctor["therapy_types"] = therapy_types[1:]

    # Address: every line stripped, lines joined by a single space
    # (blank lines therefore show up as doubled spaces)
    addresslines = parsed.find_all("p", {"class": "anschrift-arzt"})[0].text.splitlines()
    doctor["address"] = " ".join(map(lambda t: t.strip(), addresslines))

    # contact: some entries have no contact block at all
    contactscol = parsed.find_all("dd", {"class": "adresse"})[0].findChildren("dd")
    if len(contactscol) != 0:
        contacts = contactscol[0]
        # NOTE(review): `.text` normally contains no literal "<br>", so this
        # split looks like a no-op -- confirm before removing it.
        contact_text = contacts.text.split("<br>")[0].splitlines()
        contact_lines = [line.strip() for line in contact_text if line.strip()]

        # The first non-blank line is read as "<label> <number>". A contact
        # block without any text, or a label without a following token, used
        # to raise IndexError here; such entries now simply get no "phone".
        pseudo_phone_list = contact_lines[0].split(" ") if contact_lines else []
        if len(pseudo_phone_list) > 1:
            label = pseudo_phone_list[0]
            if "Telefon" in label or "Mobil" in label:
                doctor["phone"] = pseudo_phone_list[1]

        # Email (only stored if at least one address is present)
        mails = contacts.findChildren("a", {"class": "obfuscatedEmail"})
        mails = [mail.text.strip() for mail in mails]

        if len(mails) != 0:
            doctor["mails"] = mails

    return doctor

In [38]:
# Parse every extracted entry, clean up the phone appointment times and
# export the result as JSON.
therapists = []
for page in range(npages):
    for therapist in range(ntherapists):
        entry_path = os.path.join(extractedpath, f"page_{page}_{therapist}.html")
        # A page may hold fewer than `ntherapists` entries (the last one
        # does). Stop at the first missing file instead of hard-coding how
        # many entries the last page had at crawl time.
        if not os.path.exists(entry_path):
            break
        print(f"parsing {page}|{therapist}…", end="\r")
        therapists.append(parse_extracted_html(entry_path))

# Without this check a missing/empty extraction would silently produce an
# empty JSON file.
if not therapists:
    raise FileNotFoundError(f"no extracted entries found in {extractedpath}")


# fix broken times
for therapist in therapists:
    # skip therapists w.o. phone appointments
    if 'phone_appointments' not in therapist:
        continue

    newphonetimes = {}
    for day, phonetimes in therapist['phone_appointments'].items():
        newphonetimes[day] = []

        for phonetime in phonetimes:
            if '\n' not in phonetime:
                # regular single-line entry: keep as is
                newphonetimes[day].append(phonetime)
            else:
                # entry contains line breaks: split it into its non-empty,
                # stripped parts
                newphonetimes[day] += [part.strip() for part in phonetime.split('\n') if part.strip()]
    therapist['phone_appointments'] = newphonetimes

# NOTE(review): this dumps every record (including phone numbers and e-mail
# addresses) into the cell output; consider printing len(therapists) instead.
print(therapists)

with open(os.path.join(datapath, "therapists.json"), "w") as outfile:
    json.dump(therapists, outfile, indent=2)

        

[{'filepath': './data/extracted/page_0_0.html', 'name': 'Dipl.-Psych. Katinka Alphei', 'appointments': {'Mo :': ['08:00 - 13:00', '14:00 - 19:00'], 'Di :': ['08:00 - 13:00', '14:00 - 19:00'], 'Mi :': ['08:00 - 13:00'], 'nach Vereinbarung': []}, 'resorts': ['Psychologischer Psychotherapeut'], 'therapy_types': ['Autogenes Training Erwachsene', 'Jacobsonsche Relaxationstherapie', 'Tiefenpsychologisch fundierte Einzeltherapie – Erwachsene', 'Tiefenpsychologisch fundierte Gruppentherapie – Erwachsene'], 'address': ' Dipl.-Psych. Katinka Alphei  Herrenstr. 18  76133 Karlsruhe  Ortsteil: Innenstadt-West  Landkreis: Karlsruhe - Stadt ', 'phone': '0721/50055048', 'mails': ['praxis-alphei@posteo.de']}, {'filepath': './data/extracted/page_0_1.html', 'name': 'Dipl.-Psych. Tobias Amberger', 'phone_appointments': {'Di :': ['16:20 - 19:40']}, 'appointments': {'Mo :': ['08:00 - 16:00'], 'Di :': ['08:00 - 16:00'], 'Mi :': ['08:00 - 16:00'], 'Do :': ['08:00 - 16:00'], 'Fr :': ['08:00 - 12:30'], 'nach Ve