Crawled all the data :3
This commit is contained in:
parent
f2400c541a
commit
b0ed32e0d2
15 changed files with 6872 additions and 6 deletions
77
Main.ipynb
77
Main.ipynb
|
|
@ -2,20 +2,83 @@
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 5,
|
||||||
"id": "72fb7570",
|
"id": "83707cda",
|
||||||
"metadata": {
|
"metadata": {},
|
||||||
"vscode": {
|
"outputs": [],
|
||||||
"languageId": "plaintext"
|
"source": [
|
||||||
}
|
"import requests\n",
|
||||||
|
"import os"
|
||||||
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"id": "72fb7570",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"url = \"https://www.arztsuche-bw.de/index.php?suchen=1&sorting=name&direction=ASC&arztgruppe=psychotherapeut&landkreis=Karlsruhe+-+Stadt\"\n",
|
||||||
|
"offset_str = \"&offset=\""
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"id": "8eff4531",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Crawling page 13\r"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"outpath = \"./data\"\n",
|
||||||
|
"\n",
|
||||||
|
"if not os.path.exists(outpath):\n",
|
||||||
|
" os.makedirs(outpath)\n",
|
||||||
|
"\n",
|
||||||
|
"# we can see on the website there are 14 pages\n",
|
||||||
|
"for page in range(14):\n",
|
||||||
|
" print(f\"Crawling page {page}\", end=\"\\r\")\n",
|
||||||
|
" \n",
|
||||||
|
" res = requests.get(url + offset_str + str(page * 20)) # 20 items per page\n",
|
||||||
|
"\n",
|
||||||
|
" if res.status_code == 200:\n",
|
||||||
|
" with open(os.path.join(outpath, f\"page_{page}.html\"), \"w+\") as outfile:\n",
|
||||||
|
" outfile.write(res.text)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "7d7f9ca1",
|
||||||
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": []
|
"source": []
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "python-minimal kernel",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python-minimal"
|
||||||
|
},
|
||||||
"language_info": {
|
"language_info": {
|
||||||
"name": "python"
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.12.7"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
|
|
||||||
495
data/page_0.html
Normal file
495
data/page_0.html
Normal file
File diff suppressed because one or more lines are too long
495
data/page_1.html
Normal file
495
data/page_1.html
Normal file
File diff suppressed because one or more lines are too long
516
data/page_10.html
Normal file
516
data/page_10.html
Normal file
File diff suppressed because one or more lines are too long
499
data/page_11.html
Normal file
499
data/page_11.html
Normal file
File diff suppressed because one or more lines are too long
466
data/page_12.html
Normal file
466
data/page_12.html
Normal file
File diff suppressed because one or more lines are too long
432
data/page_13.html
Normal file
432
data/page_13.html
Normal file
File diff suppressed because one or more lines are too long
511
data/page_2.html
Normal file
511
data/page_2.html
Normal file
File diff suppressed because one or more lines are too long
486
data/page_3.html
Normal file
486
data/page_3.html
Normal file
File diff suppressed because one or more lines are too long
491
data/page_4.html
Normal file
491
data/page_4.html
Normal file
File diff suppressed because one or more lines are too long
454
data/page_5.html
Normal file
454
data/page_5.html
Normal file
File diff suppressed because one or more lines are too long
454
data/page_6.html
Normal file
454
data/page_6.html
Normal file
File diff suppressed because one or more lines are too long
512
data/page_7.html
Normal file
512
data/page_7.html
Normal file
File diff suppressed because one or more lines are too long
496
data/page_8.html
Normal file
496
data/page_8.html
Normal file
File diff suppressed because one or more lines are too long
496
data/page_9.html
Normal file
496
data/page_9.html
Normal file
File diff suppressed because one or more lines are too long
Loading…
Add table
Add a link
Reference in a new issue