Begin to scrape Pokemon

This commit is contained in:
2022-10-19 22:08:35 -04:00
parent ea85e210fb
commit ae7c9b306f
4 changed files with 77 additions and 3 deletions

2
.gitignore vendored
View File

@@ -3,6 +3,8 @@
__pycache__/ __pycache__/
*.py[cod] *.py[cod]
*$py.class *$py.class
.vscode
pokemon
# C extensions # C extensions
*.so *.so

View File

@@ -9,6 +9,8 @@ python_version = "3.10"
[packages] [packages]
bs4 = "*" bs4 = "*"
ebooklib = "*" ebooklib = "*"
lxml = "*"
requests = "*"
[dev-packages] [dev-packages]
black = "*" black = "*"

44
Pipfile.lock generated
View File

@@ -1,7 +1,7 @@
{ {
"_meta": { "_meta": {
"hash": { "hash": {
"sha256": "cda2eded9d5e4bc251bfebd3c4ca1abb72405bf9d35187ca794466dff1b9bff8" "sha256": "1efb2d47545e7351ff8f5cd6187b3f583170ff7f5a8df26c4232fcf2cab59837"
}, },
"pipfile-spec": 6, "pipfile-spec": 6,
"requires": { "requires": {
@@ -31,6 +31,22 @@
"index": "pypi", "index": "pypi",
"version": "==0.0.1" "version": "==0.0.1"
}, },
"certifi": {
"hashes": [
"sha256:0d9c601124e5a6ba9712dbc60d9c53c21e34f5f641fe83002317394311bdce14",
"sha256:90c1a32f1d68f940488354e36370f6cca89f0f106db09518524c88d6ed83f382"
],
"markers": "python_version >= '3.6'",
"version": "==2022.9.24"
},
"charset-normalizer": {
"hashes": [
"sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845",
"sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f"
],
"markers": "python_full_version >= '3.6.0'",
"version": "==2.1.1"
},
"ebooklib": { "ebooklib": {
"hashes": [ "hashes": [
"sha256:fe23e22c28050196c68db3e7b13b257bf39426d927cb395c6f2cc13ac11327f1" "sha256:fe23e22c28050196c68db3e7b13b257bf39426d927cb395c6f2cc13ac11327f1"
@@ -38,6 +54,14 @@
"index": "pypi", "index": "pypi",
"version": "==0.17.1" "version": "==0.17.1"
}, },
"idna": {
"hashes": [
"sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4",
"sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2"
],
"markers": "python_version >= '3.5'",
"version": "==3.4"
},
"lxml": { "lxml": {
"hashes": [ "hashes": [
"sha256:04da965dfebb5dac2619cb90fcf93efdb35b3c6994fea58a157a834f2f94b318", "sha256:04da965dfebb5dac2619cb90fcf93efdb35b3c6994fea58a157a834f2f94b318",
@@ -114,12 +138,20 @@
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==4.9.1" "version": "==4.9.1"
}, },
"requests": {
"hashes": [
"sha256:7c5599b102feddaa661c826c56ab4fee28bfd17f5abca1ebbe3e7f19d7c97983",
"sha256:8fefa2a1a1365bf5520aac41836fbee479da67864514bdb821f31ce07ce65349"
],
"index": "pypi",
"version": "==2.28.1"
},
"six": { "six": {
"hashes": [ "hashes": [
"sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926", "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",
"sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254" "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"
], ],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'", "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==1.16.0" "version": "==1.16.0"
}, },
"soupsieve": { "soupsieve": {
@@ -129,6 +161,14 @@
], ],
"markers": "python_version >= '3.6'", "markers": "python_version >= '3.6'",
"version": "==2.3.2.post1" "version": "==2.3.2.post1"
},
"urllib3": {
"hashes": [
"sha256:3fa96cf423e6987997fc326ae8df396db2a8b7c667747d47ddd8ecba91f4a74e",
"sha256:b930dd878d5a8afb066a637fbb35144fe7901e3b209d1cd4f524bd0e9deee997"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5' and python_version < '4'",
"version": "==1.26.12"
} }
}, },
"develop": { "develop": {

View File

@@ -1,4 +1,34 @@
import requests
import sys
from bs4 import BeautifulSoup
from typing import List
# Local path where get_pokedex() stores the downloaded page and from which
# main() reads it back.
POKEMON_FILE = "pokemon/pokedex.html"
# Bulbapedia "List of Pokémon by National Pokédex number" page; the accented
# characters in the URL are percent-encoded.
POKEMON_URL = "https://bulbapedia.bulbagarden.net/wiki/List_of_Pok%C3%A9mon_by_National_Pok%C3%A9dex_number"
def get_pokedex():
    """Download the Pokédex list page and save it to ``POKEMON_FILE``.

    The page is fetched from ``POKEMON_URL`` with a browser-like User-Agent
    header and written to disk as UTF-8 text.

    Raises:
        requests.HTTPError: If the server answers with an error status, so an
            error page is never saved in place of the real document.
        requests.RequestException: On connection problems or timeout.
        OSError: If the output file or its directory cannot be written.
    """
    import os

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:91.0) Gecko/20100101 Firefox/91.0'
    }
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    r = requests.get(POKEMON_URL, headers=headers, timeout=30)
    r.raise_for_status()

    # The output directory is not guaranteed to exist (e.g. on a fresh checkout).
    output_dir = os.path.dirname(POKEMON_FILE)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Explicit encoding: the page contains non-ASCII text, and the platform
    # default encoding is not UTF-8 everywhere.
    with open(POKEMON_FILE, "w", encoding="utf-8") as f:
        f.write(r.text)
def pokemon_for_generation_soup(generation_soup: BeautifulSoup):
    """Print the link titles found in one generation's table and return its body.

    Args:
        generation_soup: The heading element that precedes a generation's
            table (main() passes ``h3`` elements).

    Returns:
        The first ``tbody`` element that follows ``generation_soup``.

    Raises:
        ValueError: If no ``tbody`` element follows the heading.
    """
    print(generation_soup)
    tbody_soup: BeautifulSoup = generation_soup.find_next("tbody")
    if tbody_soup is None:
        raise ValueError("no <tbody> found after the generation heading")

    # find_all() only searches inside this table body; find_all_next() would
    # keep walking through every later table in the document.
    table_row_soups: List[BeautifulSoup] = tbody_soup.find_all("tr")
    for table_row_soup in table_row_soups:
        # Search within the row so a row without <th> cannot pick up a cell
        # belonging to a later row.
        header_cell = table_row_soup.find("th")
        if header_cell is None:
            continue
        # The first node inside the cell may be plain text (e.g. in a column
        # header row), which carries no attributes; skip such rows.
        first_node = header_cell.next_element
        title = (getattr(first_node, "attrs", None) or {}).get("title")
        if title is None:
            continue
        print(title)
    return tbody_soup
def main():
    """Parse the saved Pokédex page and print the result for the first generation.

    Reads ``POKEMON_FILE``, locates the element whose id is
    ``List_of_Pokémon_by_National_Pokédex_number``, takes the first ``h3``
    sibling that follows that element's parent, passes it to
    ``pokemon_for_generation_soup`` and prints the collected results.

    Raises:
        FileNotFoundError: If ``POKEMON_FILE`` does not exist.
        ValueError: If the expected heading id is not present in the page.
    """
    # Explicit encoding: the id looked up below contains non-ASCII characters,
    # so the file must be decoded as UTF-8 regardless of the platform default.
    with open(POKEMON_FILE, "r", encoding="utf-8") as pokedex_file:
        soup = BeautifulSoup(pokedex_file, "html.parser")

    heading_anchor = soup.find(id="List_of_Pokémon_by_National_Pokédex_number")
    if heading_anchor is None:
        raise ValueError(
            "heading 'List_of_Pokémon_by_National_Pokédex_number' not found in "
            + POKEMON_FILE
        )
    pokemon_list_soup: BeautifulSoup = heading_anchor.parent

    # The [0:1] slice limits processing to the first h3 sibling only.
    generation_soups: List[BeautifulSoup] = pokemon_list_soup.find_next_siblings("h3")[0:1]
    pokemon = [
        pokemon_for_generation_soup(generation_soup)
        for generation_soup in generation_soups
    ]
    print(pokemon)