86 lines
1.9 KiB
Text
86 lines
1.9 KiB
Text
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"id": "83707cda",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import requests\n",
|
|
"import os"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"id": "72fb7570",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"url = \"https://www.arztsuche-bw.de/index.php?suchen=1&sorting=name&direction=ASC&arztgruppe=psychotherapeut&landkreis=Karlsruhe+-+Stadt\"\n",
|
|
"offset_str = \"&offset=\""
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 11,
|
|
"id": "8eff4531",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Crawling page 13\r"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Download every result page of the search and store each one as\n",
"# <outpath>/page_<n>.html. Pages that cannot be fetched are collected and\n",
"# reported at the end instead of being skipped silently.\n",
"outpath = \"./data\"\n",
"NUM_PAGES = 14  # we can see on the website there are 14 pages\n",
"PAGE_SIZE = 20  # 20 items per page; the offset parameter counts items\n",
"REQUEST_TIMEOUT = 30  # seconds; without a timeout requests can block forever\n",
"\n",
"# exist_ok avoids the check-then-create race of a separate os.path.exists test\n",
"os.makedirs(outpath, exist_ok=True)\n",
"\n",
"failed_pages = []  # (page, reason) for every page that could not be saved\n",
"\n",
"for page in range(NUM_PAGES):\n",
"    print(f\"Crawling page {page}\", end=\"\\r\")\n",
"\n",
"    try:\n",
"        res = requests.get(url + offset_str + str(page * PAGE_SIZE), timeout=REQUEST_TIMEOUT)\n",
"    except requests.RequestException as exc:\n",
"        # Keep crawling the remaining pages; the failure is reported below.\n",
"        failed_pages.append((page, repr(exc)))\n",
"        continue\n",
"\n",
"    if res.status_code != 200:\n",
"        failed_pages.append((page, f\"HTTP {res.status_code}\"))\n",
"        continue\n",
"\n",
"    # Write as UTF-8 explicitly: the platform default encoding may not be able\n",
"    # to represent every character of the page text.\n",
"    with open(os.path.join(outpath, f\"page_{page}.html\"), \"w\", encoding=\"utf-8\") as outfile:\n",
"        outfile.write(res.text)\n",
"\n",
"if failed_pages:\n",
"    print(f\"\\nFailed to fetch {len(failed_pages)} page(s): {failed_pages}\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "7d7f9ca1",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "python-minimal kernel",
|
|
"language": "python",
|
|
"name": "python-minimal"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.12.7"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|