From ae7c9b306f5fddbe58fa2cd0b7f0280d5eb8d597 Mon Sep 17 00:00:00 2001 From: Felix Martin Date: Wed, 19 Oct 2022 22:08:35 -0400 Subject: [PATCH] Begin to scrape Pokemon --- .gitignore | 2 ++ Pipfile | 2 ++ Pipfile.lock | 44 ++++++++++++++++++++++++++++++++++++++++++-- src/main.py | 32 +++++++++++++++++++++++++++++++- 4 files changed, 77 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 5d381cc..835c994 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,8 @@ __pycache__/ *.py[cod] *$py.class +.vscode +pokemon # C extensions *.so diff --git a/Pipfile b/Pipfile index 76bc136..d4294e0 100644 --- a/Pipfile +++ b/Pipfile @@ -9,6 +9,8 @@ python_version = "3.10" [packages] bs4 = "*" ebooklib = "*" +lxml = "*" +requests = "*" [dev-packages] black = "*" diff --git a/Pipfile.lock b/Pipfile.lock index 068582f..b17b019 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "cda2eded9d5e4bc251bfebd3c4ca1abb72405bf9d35187ca794466dff1b9bff8" + "sha256": "1efb2d47545e7351ff8f5cd6187b3f583170ff7f5a8df26c4232fcf2cab59837" }, "pipfile-spec": 6, "requires": { @@ -31,6 +31,22 @@ "index": "pypi", "version": "==0.0.1" }, + "certifi": { + "hashes": [ + "sha256:0d9c601124e5a6ba9712dbc60d9c53c21e34f5f641fe83002317394311bdce14", + "sha256:90c1a32f1d68f940488354e36370f6cca89f0f106db09518524c88d6ed83f382" + ], + "markers": "python_version >= '3.6'", + "version": "==2022.9.24" + }, + "charset-normalizer": { + "hashes": [ + "sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845", + "sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f" + ], + "markers": "python_full_version >= '3.6.0'", + "version": "==2.1.1" + }, "ebooklib": { "hashes": [ "sha256:fe23e22c28050196c68db3e7b13b257bf39426d927cb395c6f2cc13ac11327f1" @@ -38,6 +54,14 @@ "index": "pypi", "version": "==0.17.1" }, + "idna": { + "hashes": [ + "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4", + "sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2" + ], + "markers": "python_version >= '3.5'", + "version": "==3.4" + }, "lxml": { "hashes": [ "sha256:04da965dfebb5dac2619cb90fcf93efdb35b3c6994fea58a157a834f2f94b318", @@ -114,12 +138,20 @@ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", "version": "==4.9.1" }, + "requests": { + "hashes": [ + "sha256:7c5599b102feddaa661c826c56ab4fee28bfd17f5abca1ebbe3e7f19d7c97983", + "sha256:8fefa2a1a1365bf5520aac41836fbee479da67864514bdb821f31ce07ce65349" + ], + "index": "pypi", + "version": "==2.28.1" + }, "six": { "hashes": [ "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926", "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'", + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", "version": "==1.16.0" }, "soupsieve": { @@ -129,6 +161,14 @@ ], "markers": "python_version >= '3.6'", "version": "==2.3.2.post1" + }, + "urllib3": { + "hashes": [ + "sha256:3fa96cf423e6987997fc326ae8df396db2a8b7c667747d47ddd8ecba91f4a74e", + "sha256:b930dd878d5a8afb066a637fbb35144fe7901e3b209d1cd4f524bd0e9deee997" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5' and python_version < '4'", + "version": "==1.26.12" } }, "develop": { diff --git a/src/main.py b/src/main.py index 90b83f1..e193db8 100644 --- a/src/main.py +++ b/src/main.py @@ -1,4 +1,34 @@ +import requests +import sys +from bs4 import BeautifulSoup +from typing import List +POKEMON_FILE = "pokemon/pokedex.html" +POKEMON_URL = "https://bulbapedia.bulbagarden.net/wiki/List_of_Pok%C3%A9mon_by_National_Pok%C3%A9dex_number" + + +def get_pokedex(): + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:91.0) Gecko/20100101 Firefox/91.0' + } + r = requests.get(POKEMON_URL, headers=headers) + with open(POKEMON_FILE, "w") as f: + f.write(r.text) + +def pokemon_for_generation_soup(generation_soup: BeautifulSoup): + print(generation_soup) + table_soup: BeautifulSoup = generation_soup.find_next_sibling("table") + tbody_soup: BeautifulSoup = generation_soup.find_next("tbody") + table_row_soups: List[BeautifulSoup()] = tbody_soup.find_all_next("tr") + for table_row_soup in table_row_soups: + print(table_row_soup.find_next("th").next_element.attrs["title"]) + sys.exit(0) + return tbody_soup def main(): - print("hello, world!") + with open(POKEMON_FILE, "r") as r: + soup = BeautifulSoup(r, "html.parser") + pokemon_list_soup: BeautifulSoup = soup.find(id="List_of_Pokémon_by_National_Pokédex_number").parent + generation_soups: BeautifulSoup = pokemon_list_soup.find_next_siblings("h3")[0:1] + pokemon = map(pokemon_for_generation_soup, generation_soups) + print(list(pokemon))