{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "a1f0c3d2",
   "metadata": {},
   "source": [
    "# Psychotherapist listing scraper (arztsuche-bw.de, Karlsruhe - Stadt)\n",
    "\n",
    "Pipeline, top to bottom:\n",
    "\n",
    "1. download every result page of the search into `data/raw`,\n",
    "2. split each page into one HTML file per result entry in `data/extracted`,\n",
    "3. parse every entry into a dict and dump all of them to `data/therapists.json`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "83707cda",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "\n",
    "import requests\n",
    "from bs4 import BeautifulSoup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "72fb7570",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Search for psychotherapists in the district \"Karlsruhe - Stadt\", sorted by name.\n",
    "url = \"https://www.arztsuche-bw.de/index.php?suchen=1&sorting=name&direction=ASC&arztgruppe=psychotherapeut&landkreis=Karlsruhe+-+Stadt\"\n",
    "offset_str = \"&offset=\"\n",
    "datapath = \"./data\"\n",
    "outpath = os.path.join(datapath, \"raw\")  # one HTML file per crawled result page\n",
    "extractedpath = os.path.join(datapath, \"extracted\")  # one HTML file per result entry\n",
    "npages = 14  # we can see on the website there are 14 pages\n",
    "ntherapists = 20  # 20 items per page\n",
    "request_timeout = 30  # seconds; requests has no timeout unless one is passed"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b7e41a90",
   "metadata": {},
   "source": [
    "## 1. Crawl the result pages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8eff4531",
   "metadata": {},
   "outputs": [],
   "source": [
    "os.makedirs(outpath, exist_ok=True)\n",
    "\n",
    "# Pages are numbered 0 .. npages - 1, matching the loops further down;\n",
    "# the offset is the page number times the page size.\n",
    "for page in range(npages):\n",
    "    print(f\"Crawling page {page}\", end=\"\\r\")\n",
    "\n",
    "    res = requests.get(url + offset_str + str(page * ntherapists), timeout=request_timeout)\n",
    "\n",
    "    if res.status_code != 200:\n",
    "        # Keep crawling, but report it instead of silently leaving a gap.\n",
    "        print(f\"Skipping page {page}: HTTP status {res.status_code}\")\n",
    "        continue\n",
    "\n",
    "    with open(os.path.join(outpath, f\"page_{page}.html\"), \"w\", encoding=\"utf-8\") as outfile:\n",
    "        outfile.write(res.text)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c93d5f17",
   "metadata": {},
   "source": [
    "## 2. Split every page into one file per result entry"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7d7f9ca1",
   "metadata": {},
   "outputs": [],
   "source": [
    "os.makedirs(extractedpath, exist_ok=True)\n",
    "\n",
    "for page in range(npages):\n",
    "    pagepath = os.path.join(outpath, f\"page_{page}.html\")\n",
    "    if not os.path.exists(pagepath):\n",
    "        # the crawl skips pages that did not answer with HTTP 200\n",
    "        print(f\"No crawled file for page {page}, skipping\")\n",
    "        continue\n",
    "\n",
    "    with open(pagepath, \"r\", encoding=\"utf-8\") as crawled_page:\n",
    "        # NOTE(review): no parser is named, so bs4 picks whichever one is installed\n",
    "        # and the tree may differ between environments - consider pinning it.\n",
    "        parsed = BeautifulSoup(crawled_page.read())\n",
    "\n",
    "    result = parsed.find(\"ul\", {\"class\": \"resultlist\"})\n",
    "    if result is None:\n",
    "        print(f\"No result list on page {page}, skipping\")\n",
    "        continue\n",
    "\n",
    "    for i, entry in enumerate(result.find_all(\"li\", {\"class\": \"resultrow\"})):\n",
    "        with open(os.path.join(extractedpath, f\"page_{page}_{i}.html\"), \"w\", encoding=\"utf-8\") as extractedfile:\n",
    "            extractedfile.write(entry.prettify())"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d2a86b4e",
   "metadata": {},
   "source": [
    "## 3. Parsing helpers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0db40ad6",
   "metadata": {},
   "outputs": [],
   "source": [
    "def parse_appointment_table(tableid, parsed):\n",
    "    \"\"\"Turn one appointment table into a dict.\n",
    "\n",
    "    Args:\n",
    "        tableid: Index of the table among all ``table.termintabelle``\n",
    "            elements in ``parsed``.\n",
    "        parsed: BeautifulSoup tree of a single result entry.\n",
    "\n",
    "    Returns:\n",
    "        Dict mapping the stripped text of each row's first ``td`` (presumably\n",
    "        the weekday) to the list of non-empty, stripped texts of the row's\n",
    "        remaining ``td`` cells.\n",
    "\n",
    "    Raises:\n",
    "        IndexError: If ``parsed`` has no table at index ``tableid``.\n",
    "    \"\"\"\n",
    "    termintabelle = parsed.find_all(\"table\", {\"class\": \"termintabelle\"})[tableid]\n",
    "    appointments = {}\n",
    "\n",
    "    for row in termintabelle.find_all(\"tr\"):\n",
    "        cells = row.find_all(\"td\")\n",
    "        if not cells:\n",
    "            # a row without any <td> (e.g. only <th>) carries nothing to record\n",
    "            continue\n",
    "        day = cells[0].text.strip()\n",
    "        # empty cells are dropped so only real entries remain\n",
    "        cell_texts = (cell.text.strip() for cell in cells[1:])\n",
    "        appointments[day] = [text for text in cell_texts if text]\n",
    "\n",
    "    return appointments"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8558c73b",
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_qualifications(index, parsed):\n",
    "    \"\"\"Return the non-empty child texts of one ``dl.bulletlist`` element.\n",
    "\n",
    "    Args:\n",
    "        index: Index of the list among all ``dl.bulletlist`` elements in ``parsed``.\n",
    "        parsed: BeautifulSoup tree of a single result entry.\n",
    "\n",
    "    Returns:\n",
    "        One string per direct child of the list, with line breaks removed and\n",
    "        surrounding whitespace stripped; children that end up empty are dropped.\n",
    "\n",
    "    Raises:\n",
    "        IndexError: If ``parsed`` has no such list at ``index``.\n",
    "    \"\"\"\n",
    "    qualis = parsed.find_all(\"dl\", {\"class\": \"bulletlist\"})\n",
    "    texts = (\"\".join(child.text.splitlines()).strip() for child in qualis[index].children)\n",
    "    return [text for text in texts if text]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e5837d3f",
   "metadata": {},
   "outputs": [],
   "source": [
    "def parse_extracted_html(infilepath):\n",
    "    \"\"\"Parse one extracted result entry into a plain dict.\n",
    "\n",
    "    Args:\n",
    "        infilepath: Path of an HTML file holding a single result entry.\n",
    "\n",
    "    Returns:\n",
    "        Dict with the keys ``filepath``, ``name``, ``appointments``, ``resorts``,\n",
    "        ``therapy_types`` and ``address``. The keys ``phone_appointments``,\n",
    "        ``phone`` and ``mails`` are only present when the entry provides them.\n",
    "\n",
    "    Raises:\n",
    "        IndexError: If one of the elements that are looked up unconditionally\n",
    "            (name, first appointment table, bullet lists, address) is missing.\n",
    "    \"\"\"\n",
    "    doctor = {\"filepath\": infilepath}\n",
    "\n",
    "    # read in file\n",
    "    with open(infilepath, \"r\", encoding=\"utf-8\") as infile:\n",
    "        parsed = BeautifulSoup(infile.read())\n",
    "\n",
    "    # extract name (may be spread over several lines in the markup)\n",
    "    name_tag = parsed.find_all(\"dd\", {\"class\": \"name\"})[0].findChildren(\"dt\")[0]\n",
    "    doctor[\"name\"] = \" \".join(name_tag.text.splitlines()).strip()\n",
    "\n",
    "    ntables = len(parsed.find_all(\"table\", {\"class\": \"termintabelle\"}))\n",
    "\n",
    "    # table order changes if no phone appointments are available:\n",
    "    # with two tables the phone table comes first, otherwise table 0 is used\n",
    "    if ntables == 2:\n",
    "        appointments = parse_appointment_table(1, parsed)\n",
    "        doctor[\"phone_appointments\"] = parse_appointment_table(0, parsed)\n",
    "    else:\n",
    "        appointments = parse_appointment_table(0, parsed)\n",
    "\n",
    "    doctor[\"appointments\"] = appointments\n",
    "\n",
    "    # extract type of doctor; the first item is dropped (presumably the list heading)\n",
    "    resorts = extract_qualifications(0, parsed)\n",
    "    doctor[\"resorts\"] = resorts[1:]\n",
    "\n",
    "    # extract therapy types of doctor; first item dropped as above\n",
    "    therapy_types = extract_qualifications(1, parsed)\n",
    "    doctor[\"therapy_types\"] = therapy_types[1:]\n",
    "\n",
    "    # Address\n",
    "    addresslines = parsed.find_all(\"p\", {\"class\": \"anschrift-arzt\"})[0].text.splitlines()\n",
    "    doctor[\"address\"] = \" \".join(line.strip() for line in addresslines)\n",
    "\n",
    "    # contact\n",
    "    # some have *no* contact info??\n",
    "    contactscol = parsed.find_all(\"dd\", {\"class\": \"adresse\"})[0].findChildren(\"dd\")\n",
    "    if contactscol:\n",
    "        contacts = contactscol[0]\n",
    "\n",
    "        # The first non-empty line is expected to read \"<label> <number>\".\n",
    "        # NOTE(review): this used to pre-split the text on a literal, invisible\n",
    "        # line-break character; str.splitlines() is assumed to split on that\n",
    "        # character as well, which makes the pre-split redundant - confirm.\n",
    "        contact_lines = [line.strip() for line in contacts.text.splitlines()]\n",
    "        contact_lines = [line for line in contact_lines if line]\n",
    "        first_line_tokens = contact_lines[0].split(\" \") if contact_lines else []\n",
    "\n",
    "        # apparently some therapists don't have a phone…\n",
    "        if len(first_line_tokens) > 1 and first_line_tokens[0] in [\"Telefon\", \"Mobil\"]:\n",
    "            doctor[\"phone\"] = first_line_tokens[1]\n",
    "\n",
    "        # Email (check if it exists)\n",
    "        mail_tags = contacts.findChildren(\"a\", {\"class\": \"obfuscatedEmail\"})\n",
    "        mails = [tag.text.strip() for tag in mail_tags]\n",
    "\n",
    "        if mails:\n",
    "            doctor[\"mails\"] = mails\n",
    "\n",
    "    return doctor"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e6f19c05",
   "metadata": {},
   "source": [
    "## 4. Parse all entries and write `therapists.json`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8748c681",
   "metadata": {},
   "outputs": [],
   "source": [
    "therapists = []\n",
    "for page in range(npages):\n",
    "    for therapist in range(ntherapists):\n",
    "        entrypath = os.path.join(extractedpath, f\"page_{page}_{therapist}.html\")\n",
    "        # A page can hold fewer than `ntherapists` entries (e.g. the last one),\n",
    "        # so stop at the first entry that was not extracted.\n",
    "        if not os.path.exists(entrypath):\n",
    "            break\n",
    "        print(f\"parsing {page}|{therapist}…\", end=\"\\r\")\n",
    "        therapists.append(parse_extracted_html(entrypath))\n",
    "\n",
    "with open(os.path.join(datapath, \"therapists.json\"), \"w\", encoding=\"utf-8\") as outfile:\n",
    "    json.dump(therapists, outfile, indent=2)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "python-minimal kernel",
   "language": "python",
   "name": "python-minimal"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}