CRAWL ALLTHEM DATA ^^
This commit is contained in:
parent
b0ed32e0d2
commit
244b77ea13
292 changed files with 90674 additions and 86 deletions
233 Crawler.ipynb Normal file
@@ -0,0 +1,233 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 220,
   "id": "83707cda",
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "import os\n",
    "from bs4 import BeautifulSoup\n",
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 229,
   "id": "72fb7570",
   "metadata": {},
   "outputs": [],
   "source": [
    "url = \"https://www.arztsuche-bw.de/index.php?suchen=1&sorting=name&direction=ASC&arztgruppe=psychotherapeut&landkreis=Karlsruhe+-+Stadt\"\n",
    "offset_str = \"&offset=\"\n",
    "datapath = \"./data\"\n",
    "outpath = os.path.join(datapath, \"raw\")\n",
    "extractedpath = os.path.join(datapath, \"extracted\")\n",
    "npages = 14  # we can see on the website there are 14 pages\n",
    "ntherapists = 20  # 20 items per page"
   ]
  },
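  {
   "cell_type": "markdown",
   "id": "a0b1c2d3",
   "metadata": {},
   "source": [
    "Each results page is fetched by appending an offset to the base url: page `p` starts at item `p * ntherapists`. A quick illustration (not part of the original run):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d3c2b1a0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# page 2 starts at item 2 * 20 = 40, i.e. the url ends in \"&offset=40\"\n",
    "print(url + offset_str + str(2 * ntherapists))"
   ]
  },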
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8eff4531",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Crawling page 15\r"
     ]
    }
   ],
   "source": [
    "os.makedirs(outpath, exist_ok=True)\n",
    "\n",
    "# 14 pages, 0-indexed, 20 items each -> offsets 0, 20, ..., 260,\n",
    "# matching the parsing loops below\n",
    "for page in range(npages):\n",
    "    print(f\"Crawling page {page}\", end=\"\\r\")\n",
    "\n",
    "    res = requests.get(url + offset_str + str(page * ntherapists))\n",
    "\n",
    "    if res.status_code == 200:\n",
    "        with open(os.path.join(outpath, f\"page_{page}.html\"), \"w+\") as outfile:\n",
    "            outfile.write(res.text)"
   ]
  },
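  {
   "cell_type": "markdown",
   "id": "f1e2d3c4",
   "metadata": {},
   "source": [
    "If the site ever rate-limits the crawl, a gentler variant would wait between requests and send an explicit User-Agent. This is a sketch, not part of the original run; the delay and the header value are arbitrary assumptions."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c4d3e2f1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "\n",
    "# Hypothetical polite variant of the crawl loop above; reuses the same\n",
    "# url/offset_str/outpath globals. Delay and User-Agent are arbitrary choices.\n",
    "def crawl_politely(delay_seconds=2.0):\n",
    "    os.makedirs(outpath, exist_ok=True)\n",
    "    headers = {\"User-Agent\": \"therapist-crawler (personal research)\"}\n",
    "    for page in range(npages):\n",
    "        res = requests.get(url + offset_str + str(page * ntherapists), headers=headers, timeout=30)\n",
    "        if res.status_code == 200:\n",
    "            with open(os.path.join(outpath, f\"page_{page}.html\"), \"w+\") as outfile:\n",
    "                outfile.write(res.text)\n",
    "        time.sleep(delay_seconds)  # be nice to the server"
   ]
  },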
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "7d7f9ca1",
   "metadata": {},
   "outputs": [],
   "source": [
    "os.makedirs(extractedpath, exist_ok=True)\n",
    "\n",
    "# cut each result page into one file per therapist entry\n",
    "for page in range(npages):\n",
    "    with open(os.path.join(outpath, f\"page_{page}.html\"), \"r\") as crawled_page:\n",
    "        parsed = BeautifulSoup(crawled_page.read(), \"html.parser\")\n",
    "        result = parsed.find_all(\"ul\", {\"class\": \"resultlist\"})[0]\n",
    "        entries = result.find_all(\"li\", {\"class\": \"resultrow\"})\n",
    "\n",
    "        for i, entry in enumerate(entries):\n",
    "            with open(os.path.join(extractedpath, f\"page_{page}_{i}.html\"), \"w+\") as extractedfile:\n",
    "                extractedfile.write(entry.prettify())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "id": "0db40ad6",
   "metadata": {},
   "outputs": [],
   "source": [
    "def parse_appointment_table(tableid, parsed):\n",
    "    # extract all possible appointments from the tableid-th table\n",
    "    termintabelle = parsed.find_all(\"table\", {\"class\": \"termintabelle\"})[tableid]\n",
    "    rows = termintabelle.find_all(\"tr\")\n",
    "    appointments = {}\n",
    "\n",
    "    for row in rows:\n",
    "        entries = row.find_all(\"td\")\n",
    "        day = entries[0].text.strip()\n",
    "        # strip every cell, then drop the empty ones\n",
    "        times = list(filter(lambda e: e, map(lambda e: e.text.strip(), entries[1:])))\n",
    "        appointments[day] = times\n",
    "\n",
    "    return appointments"
   ]
  },
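  {
   "cell_type": "markdown",
   "id": "e7a8b9c0",
   "metadata": {},
   "source": [
    "A quick illustration of the returned shape, run against a hand-written snippet (the values are invented; only the `termintabelle` class matches the real markup):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c0b9a8e7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# invented sample; expected: {'Montag': ['08:00 - 09:00'], 'Dienstag': ['14:00 - 15:00']}\n",
    "sample_html = \"\"\"\n",
    "<table class=\"termintabelle\">\n",
    "  <tr><td>Montag</td><td>08:00 - 09:00</td><td></td></tr>\n",
    "  <tr><td>Dienstag</td><td></td><td>14:00 - 15:00</td></tr>\n",
    "</table>\n",
    "\"\"\"\n",
    "print(parse_appointment_table(0, BeautifulSoup(sample_html, \"html.parser\")))"
   ]
  },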
  {
   "cell_type": "code",
   "execution_count": 139,
   "id": "8558c73b",
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_qualifications(index, parsed):\n",
    "    # the index-th bullet list holds one block of qualifications\n",
    "    qualis = parsed.find_all(\"dl\", {\"class\": \"bulletlist\"})\n",
    "    # join multi-line items, strip whitespace, drop empty strings\n",
    "    cleaned = list(filter(lambda e: e, map(lambda e: \"\".join(e.text.splitlines()).strip(), qualis[index].children)))\n",
    "    return cleaned"
   ]
  },
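  {
   "cell_type": "markdown",
   "id": "b5c6d7e8",
   "metadata": {},
   "source": [
    "Same idea as the appointment check above: run the extractor against a hand-written snippet (content invented; only the `bulletlist` class matches the real markup). The first item is the list heading, which is why the callers below slice with `[1:]`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e8d7c6b5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# invented sample; expected: ['Fachgebiete', 'Psychologischer Psychotherapeut']\n",
    "quali_html = \"\"\"\n",
    "<dl class=\"bulletlist\">\n",
    "  <dt>Fachgebiete</dt>\n",
    "  <dd>Psychologischer Psychotherapeut</dd>\n",
    "</dl>\n",
    "\"\"\"\n",
    "print(extract_qualifications(0, BeautifulSoup(quali_html, \"html.parser\")))"
   ]
  },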
  {
   "cell_type": "code",
   "execution_count": 265,
   "id": "e5837d3f",
   "metadata": {},
   "outputs": [],
   "source": [
    "def parse_extracted_html(infilepath):\n",
    "    doctor = {}\n",
    "    doctor[\"filepath\"] = infilepath\n",
    "\n",
    "    # read in file\n",
    "    with open(infilepath, \"r\") as infile:\n",
    "        parsed = BeautifulSoup(infile.read(), \"html.parser\")\n",
    "\n",
    "    # extract name\n",
    "    name = \" \".join(parsed.find_all(\"dd\", {\"class\": \"name\"})[0].findChildren(\"dt\")[0].text.splitlines())\n",
    "    doctor[\"name\"] = name.strip()\n",
    "\n",
    "    ntables = len(parsed.find_all(\"table\", {\"class\": \"termintabelle\"}))\n",
    "\n",
    "    # table order changes if no phone appointments are available -_-\n",
    "    if ntables == 2:\n",
    "        appointments = parse_appointment_table(1, parsed)\n",
    "        phone_appointments = parse_appointment_table(0, parsed)\n",
    "        doctor[\"phone_appointments\"] = phone_appointments\n",
    "    else:\n",
    "        appointments = parse_appointment_table(0, parsed)\n",
    "\n",
    "    doctor[\"appointments\"] = appointments\n",
    "\n",
    "    # extract type of doctor (first list entry is the heading, so skip it)\n",
    "    resorts = extract_qualifications(0, parsed)\n",
    "    doctor[\"resorts\"] = resorts[1:]\n",
    "\n",
    "    # extract therapy types of doctor\n",
    "    therapy_types = extract_qualifications(1, parsed)\n",
    "    doctor[\"therapy_types\"] = therapy_types[1:]\n",
    "\n",
    "    # address\n",
    "    addresslines = parsed.find_all(\"p\", {\"class\": \"anschrift-arzt\"})[0].text.splitlines()\n",
    "    doctor[\"address\"] = \" \".join(map(lambda t: t.strip(), addresslines))\n",
    "\n",
    "    # contact -- some entries have *no* contact info at all\n",
    "    contactscol = parsed.find_all(\"dd\", {\"class\": \"adresse\"})[0].findChildren(\"dd\")\n",
    "    if len(contactscol) != 0:\n",
    "        contacts = contactscol[0]\n",
    "        contact_text = contacts.text.splitlines()\n",
    "        pseudo_phone_list = list(filter(lambda e: e, map(lambda e: e.strip(), contact_text)))[0].split(\" \")\n",
    "\n",
    "        # apparently some therapists don't have a phone…\n",
    "        if pseudo_phone_list[0] in [\"Telefon\", \"Mobil\"]:\n",
    "            doctor[\"phone\"] = pseudo_phone_list[1]\n",
    "\n",
    "        # email (check if it exists)\n",
    "        mails = contacts.findChildren(\"a\", {\"class\": \"obfuscatedEmail\"})\n",
    "        mails = list(map(lambda e: e.text.strip(), mails))\n",
    "\n",
    "        if len(mails) != 0:\n",
    "            doctor[\"mails\"] = mails\n",
    "\n",
    "    return doctor"
   ]
  },
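  {
   "cell_type": "markdown",
   "id": "9a8b7c6d",
   "metadata": {},
   "source": [
    "Each record ends up as a plain dict, roughly shaped like the example below. All values here are made up; `phone_appointments`, `phone` and `mails` only appear when the source entry has them."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d6c7b8a9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# entirely made-up example of the shape produced by parse_extracted_html\n",
    "example_doctor = {\n",
    "    \"filepath\": \"./data/extracted/page_0_0.html\",\n",
    "    \"name\": \"Dr. Example Person\",\n",
    "    \"phone_appointments\": {\"Montag\": [\"08:00 - 08:30\"]},\n",
    "    \"appointments\": {\"Dienstag\": [\"10:00 - 11:00\"]},\n",
    "    \"resorts\": [\"Psychologischer Psychotherapeut\"],\n",
    "    \"therapy_types\": [\"Verhaltenstherapie\"],\n",
    "    \"address\": \"Musterstraße 1 76133 Karlsruhe\",\n",
    "    \"phone\": \"0721 000000\",\n",
    "    \"mails\": [\"praxis@example.org\"]\n",
    "}"
   ]
  },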
  {
   "cell_type": "code",
   "execution_count": 269,
   "id": "8748c681",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "parsing 13|14…\r"
     ]
    }
   ],
   "source": [
    "therapists = []\n",
    "for page in range(npages):\n",
    "    for therapist in range(ntherapists):\n",
    "        # only 15 therapists on the last page\n",
    "        if page == npages - 1 and therapist == 15:\n",
    "            break\n",
    "        print(f\"parsing {page}|{therapist}…\", end=\"\\r\")\n",
    "        therapists.append(parse_extracted_html(os.path.join(extractedpath, f\"page_{page}_{therapist}.html\")))\n",
    "\n",
    "with open(os.path.join(datapath, \"therapists.json\"), \"w+\") as outfile:\n",
    "    json.dump(therapists, outfile, indent=2)"
   ]
  },
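  {
   "cell_type": "markdown",
   "id": "1f2e3d4c",
   "metadata": {},
   "source": [
    "A final sanity check (sketch, not part of the original run): reload the dump and confirm the expected count of 13 full pages of 20 plus 15 on the last page, i.e. 275 records."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4c3d2e1f",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(os.path.join(datapath, \"therapists.json\"), \"r\") as infile:\n",
    "    reloaded = json.load(infile)\n",
    "\n",
    "assert len(reloaded) == 13 * 20 + 15  # 275\n",
    "print(len(reloaded), \"therapists parsed\")"
   ]
  }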
],
 "metadata": {
  "kernelspec": {
   "display_name": "python-minimal kernel",
   "language": "python",
   "name": "python-minimal"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}