Implement Pokemon description and image download
parent
ae7c9b306f
commit
10884ce073
1
Pipfile
1
Pipfile
|
@ -10,6 +10,7 @@ python_version = "3.10"
|
|||
bs4 = "*"
|
||||
ebooklib = "*"
|
||||
lxml = "*"
|
||||
pydantic = "*"
|
||||
requests = "*"
|
||||
|
||||
[dev-packages]
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
{
|
||||
"_meta": {
|
||||
"hash": {
|
||||
"sha256": "1efb2d47545e7351ff8f5cd6187b3f583170ff7f5a8df26c4232fcf2cab59837"
|
||||
"sha256": "5e5d63b2697bac028104473e63e0cfee2967b7aa93c011800ea85523c22c3f99"
|
||||
},
|
||||
"pipfile-spec": 6,
|
||||
"requires": {
|
||||
|
@ -135,9 +135,51 @@
|
|||
"sha256:fe17d10b97fdf58155f858606bddb4e037b805a60ae023c009f760d8361a4eb8",
|
||||
"sha256:fe749b052bb7233fe5d072fcb549221a8cb1a16725c47c37e42b0b9cb3ff2c3f"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"index": "pypi",
|
||||
"version": "==4.9.1"
|
||||
},
|
||||
"pydantic": {
|
||||
"hashes": [
|
||||
"sha256:05e00dbebbe810b33c7a7362f231893183bcc4251f3f2ff991c31d5c08240c42",
|
||||
"sha256:06094d18dd5e6f2bbf93efa54991c3240964bb663b87729ac340eb5014310624",
|
||||
"sha256:0b959f4d8211fc964772b595ebb25f7652da3f22322c007b6fed26846a40685e",
|
||||
"sha256:19b3b9ccf97af2b7519c42032441a891a5e05c68368f40865a90eb88833c2559",
|
||||
"sha256:1b6ee725bd6e83ec78b1aa32c5b1fa67a3a65badddde3976bca5fe4568f27709",
|
||||
"sha256:1ee433e274268a4b0c8fde7ad9d58ecba12b069a033ecc4645bb6303c062d2e9",
|
||||
"sha256:216f3bcbf19c726b1cc22b099dd409aa371f55c08800bcea4c44c8f74b73478d",
|
||||
"sha256:2d0567e60eb01bccda3a4df01df677adf6b437958d35c12a3ac3e0f078b0ee52",
|
||||
"sha256:2e05aed07fa02231dbf03d0adb1be1d79cabb09025dd45aa094aa8b4e7b9dcda",
|
||||
"sha256:352aedb1d71b8b0736c6d56ad2bd34c6982720644b0624462059ab29bd6e5912",
|
||||
"sha256:355639d9afc76bcb9b0c3000ddcd08472ae75318a6eb67a15866b87e2efa168c",
|
||||
"sha256:37c90345ec7dd2f1bcef82ce49b6235b40f282b94d3eec47e801baf864d15525",
|
||||
"sha256:4b8795290deaae348c4eba0cebb196e1c6b98bdbe7f50b2d0d9a4a99716342fe",
|
||||
"sha256:5760e164b807a48a8f25f8aa1a6d857e6ce62e7ec83ea5d5c5a802eac81bad41",
|
||||
"sha256:6eb843dcc411b6a2237a694f5e1d649fc66c6064d02b204a7e9d194dff81eb4b",
|
||||
"sha256:7b5ba54d026c2bd2cb769d3468885f23f43710f651688e91f5fb1edcf0ee9283",
|
||||
"sha256:7c2abc4393dea97a4ccbb4ec7d8658d4e22c4765b7b9b9445588f16c71ad9965",
|
||||
"sha256:81a7b66c3f499108b448f3f004801fcd7d7165fb4200acb03f1c2402da73ce4c",
|
||||
"sha256:91b8e218852ef6007c2b98cd861601c6a09f1aa32bbbb74fab5b1c33d4a1e410",
|
||||
"sha256:9300fcbebf85f6339a02c6994b2eb3ff1b9c8c14f502058b5bf349d42447dcf5",
|
||||
"sha256:9cabf4a7f05a776e7793e72793cd92cc865ea0e83a819f9ae4ecccb1b8aa6116",
|
||||
"sha256:a1f5a63a6dfe19d719b1b6e6106561869d2efaca6167f84f5ab9347887d78b98",
|
||||
"sha256:a4c805731c33a8db4b6ace45ce440c4ef5336e712508b4d9e1aafa617dc9907f",
|
||||
"sha256:ae544c47bec47a86bc7d350f965d8b15540e27e5aa4f55170ac6a75e5f73b644",
|
||||
"sha256:b97890e56a694486f772d36efd2ba31612739bc6f3caeee50e9e7e3ebd2fdd13",
|
||||
"sha256:bb6ad4489af1bac6955d38ebcb95079a836af31e4c4f74aba1ca05bb9f6027bd",
|
||||
"sha256:bedf309630209e78582ffacda64a21f96f3ed2e51fbf3962d4d488e503420254",
|
||||
"sha256:c1ba1afb396148bbc70e9eaa8c06c1716fdddabaf86e7027c5988bae2a829ab6",
|
||||
"sha256:c33602f93bfb67779f9c507e4d69451664524389546bacfe1bee13cae6dc7488",
|
||||
"sha256:c4aac8e7103bf598373208f6299fa9a5cfd1fc571f2d40bf1dd1955a63d6eeb5",
|
||||
"sha256:c6f981882aea41e021f72779ce2a4e87267458cc4d39ea990729e21ef18f0f8c",
|
||||
"sha256:cc78cc83110d2f275ec1970e7a831f4e371ee92405332ebfe9860a715f8336e1",
|
||||
"sha256:d49f3db871575e0426b12e2f32fdb25e579dea16486a26e5a0474af87cb1ab0a",
|
||||
"sha256:dd3f9a40c16daf323cf913593083698caee97df2804aa36c4b3175d5ac1b92a2",
|
||||
"sha256:e0bedafe4bc165ad0a56ac0bd7695df25c50f76961da29c050712596cf092d6d",
|
||||
"sha256:e9069e1b01525a96e6ff49e25876d90d5a563bc31c658289a8772ae186552236"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==1.10.2"
|
||||
},
|
||||
"requests": {
|
||||
"hashes": [
|
||||
"sha256:7c5599b102feddaa661c826c56ab4fee28bfd17f5abca1ebbe3e7f19d7c97983",
|
||||
|
@ -162,6 +204,14 @@
|
|||
"markers": "python_version >= '3.6'",
|
||||
"version": "==2.3.2.post1"
|
||||
},
|
||||
"typing-extensions": {
|
||||
"hashes": [
|
||||
"sha256:1511434bb92bf8dd198c12b1cc812e800d4181cfcb867674e0f8279cc93087aa",
|
||||
"sha256:16fa4864408f655d35ec496218b85f79b3437c829e93320c7c9215ccfd92489e"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==4.4.0"
|
||||
},
|
||||
"urllib3": {
|
||||
"hashes": [
|
||||
"sha256:3fa96cf423e6987997fc326ae8df396db2a8b7c667747d47ddd8ecba91f4a74e",
|
||||
|
|
35
src/main.py
35
src/main.py
|
@ -1,34 +1,11 @@
|
|||
import requests
|
||||
import sys
|
||||
from bs4 import BeautifulSoup
|
||||
from typing import List
|
||||
|
||||
POKEMON_FILE = "pokemon/pokedex.html"
|
||||
POKEMON_URL = "https://bulbapedia.bulbagarden.net/wiki/List_of_Pok%C3%A9mon_by_National_Pok%C3%A9dex_number"
|
||||
import logging
|
||||
import src.pokemon
|
||||
|
||||
|
||||
def get_pokedex():
    """Download the Bulbapedia national Pokedex page and cache it locally.

    Fetches POKEMON_URL and writes the raw HTML to POKEMON_FILE.
    A browser-like User-Agent is sent — presumably to avoid the wiki
    rejecting default client agents (TODO confirm).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:91.0) Gecko/20100101 Firefox/91.0'
    }
    r = requests.get(POKEMON_URL, headers=headers)
    with open(POKEMON_FILE, "w") as f:
        f.write(r.text)
|
||||
def init_logging():
    """Configure root logging at DEBUG level for the whole application."""
    logging.basicConfig(level=logging.DEBUG)
|
||||
|
||||
def pokemon_for_generation_soup(generation_soup: BeautifulSoup):
    """Extract the dex table body following a generation heading.

    NOTE(review): this looks like work-in-progress debug scaffolding — it
    prints the first row's title and exits the process on the first
    iteration (sys.exit(0)), so the loop never processes more than one row.
    """
    print(generation_soup)
    table_soup: BeautifulSoup = generation_soup.find_next_sibling("table")
    tbody_soup: BeautifulSoup = generation_soup.find_next("tbody")
    table_row_soups: List[BeautifulSoup] = tbody_soup.find_all_next("tr")
    for table_row_soup in table_row_soups:
        print(table_row_soup.find_next("th").next_element.attrs["title"])
        sys.exit(0)
    return tbody_soup
|
||||
|
||||
def main():
    """Program entry point: set up logging, then scrape the Pokemon list.

    The scraping itself (download, parse, per-Pokemon caching) lives in
    src.pokemon.get_pokemon().
    """
    init_logging()
    p = src.pokemon.get_pokemon()
|
||||
|
|
|
@ -0,0 +1,123 @@
|
|||
import requests
|
||||
import sys
|
||||
import os
|
||||
import logging
|
||||
from pydantic import BaseModel
|
||||
from bs4 import BeautifulSoup
|
||||
from typing import List
|
||||
|
||||
|
||||
# Local directory where downloaded HTML pages, images and per-Pokemon JSON
# files are cached between runs.
POKEMON_CACHE_DIRECTORY = "pokemon"
BULBAPEDIA_BASE_URL = "https://bulbapedia.bulbagarden.net"
# Index page listing every Pokemon by national dex number.
NATIONAL_INDEX_URL = BULBAPEDIA_BASE_URL + "/wiki/List_of_Pok%C3%A9mon_by_National_Pok%C3%A9dex_number"
||||
|
||||
|
||||
class Pokemon(BaseModel):
    """A single Pokemon scraped from Bulbapedia, plus its local cache paths."""

    name: str  # taken from the dex table row's "title" attribute
    index: str  # national dex number as shown in the table (string — TODO confirm format)
    html_url: str  # absolute URL of the Pokemon's wiki page
    img_url: str  # URL of the Pokemon's image
    html_filepath: str  # cached wiki page under POKEMON_CACHE_DIRECTORY
    img_filepath: str  # cached image (.png) under POKEMON_CACHE_DIRECTORY
    json_filepath: str  # serialized form of this model under POKEMON_CACHE_DIRECTORY
    description: str = ""  # introductory paragraphs; filled in by extend_pokemon()
|
||||
|
||||
|
||||
def download_to_file(url: str, filepath: str, override=False):
    """Download *url* into *filepath*.

    Skips the download when *filepath* already exists unless *override* is
    True. Non-200 responses are logged as a warning and otherwise ignored
    (best-effort download, matching the caller's cache-and-continue flow).
    """
    if os.path.isfile(filepath) and override is False:
        logging.debug(f"'{filepath}' exists.")
        return

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:91.0) Gecko/20100101 Firefox/91.0'
    }
    # stream=True avoids buffering the whole body in memory before writing;
    # without it, iterating the response only re-chunks an already-downloaded
    # payload in 128-byte pieces.
    r = requests.get(url, headers=headers, stream=True)
    if r.status_code != 200:
        logging.warning(f"Could not download '{filepath}'")
        return

    # Binary mode works for both text (HTML pages) and images.
    with open(filepath, "wb") as f:
        for chunk in r.iter_content(chunk_size=65536):
            f.write(chunk)
    logging.debug(f"'{filepath}' downloaded.")
||||
|
||||
|
||||
def get_pokemon() -> List[Pokemon]:
    """Scrape Pokemon from the Bulbapedia national dex.

    Downloads (or reuses) the cached index page, walks every generation's
    table, and builds one Pokemon per dex entry. Each Pokemon is persisted
    as JSON under POKEMON_CACHE_DIRECTORY; cached entries are loaded
    instead of re-scraped on later runs.
    """
    NATIONAL_INDEX_FILEPATH = os.path.join(POKEMON_CACHE_DIRECTORY, "pokedex.html")
    download_to_file(NATIONAL_INDEX_URL, NATIONAL_INDEX_FILEPATH)
    with open(NATIONAL_INDEX_FILEPATH, "r") as r:
        soup = BeautifulSoup(r, "html.parser")
    pokemon_list_soup: BeautifulSoup = soup.find(id="List_of_Pokémon_by_National_Pokédex_number").parent
    generation_soups: BeautifulSoup = pokemon_list_soup.find_next_siblings("h3")

    # Collect the data rows of every generation's table.
    table_row_soups = []
    for generation_soup in generation_soups:
        tbody_soup: BeautifulSoup = generation_soup.find_next("tbody")
        # skip first row because it is the header
        table_row_soups += tbody_soup.find_all("tr", recursive=False)[1:]

    pokemon = []
    for table_row_soup in table_row_soups:
        name = table_row_soup.find_next("th").next_element.attrs["title"]

        # Skip rows repeating the previous name (regional forms such as
        # Galarian/Alolan variants share the dex entry).
        if pokemon and pokemon[-1].name == name:
            continue

        # load Pokemon from JSON if it already exists
        json_filepath = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".json")
        if os.path.isfile(json_filepath):
            p = Pokemon.parse_file(json_filepath)
            pokemon.append(p)
            logging.info(f"Loaded {p.json_filepath}.")
            continue

        index = table_row_soup.find_next("td").next_sibling.next_sibling.text.strip()
        html_url = BULBAPEDIA_BASE_URL + table_row_soup.find_next("th").next_element.attrs["href"]
        img_url = table_row_soup.find("img").attrs["src"]
        html_filepath = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".html")
        img_filepath = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".png")
        p = Pokemon(name=name,
                    index=index,
                    html_url=html_url,
                    img_url=img_url,
                    html_filepath=html_filepath,
                    img_filepath=img_filepath,
                    json_filepath=json_filepath)
        pokemon.append(p)
        extend_pokemon(p)
        with open(p.json_filepath, 'w') as f:
            f.write(p.json())
        logging.info(f"Saved {p.json_filepath}.")

    # Filter out speculative Pokemon
    pokemon = [p for p in pokemon if not p.description.startswith("This article's contents will change")]

    return pokemon
|
||||
|
||||
|
||||
def extend_pokemon(p: Pokemon):
    """Add description and download Pokemon image.

    Downloads (or reuses) the Pokemon's wiki page, extracts the leading run
    of <p> paragraphs as the description, and fetches the infobox image.
    Mutates *p* in place.
    """
    download_to_file(p.html_url, p.html_filepath)
    with open(p.html_filepath, "r") as r:
        soup = BeautifulSoup(r, "html.parser")
    content_soup: BeautifulSoup = soup.find(id='mw-content-text').contents[0]

    # description: concatenate consecutive <p> siblings starting at the
    # first paragraph. Guard against next_sibling running off the end.
    p_soup = content_soup.find("p")
    description = []
    while p_soup is not None and p_soup.name == 'p':
        description.append(p_soup.get_text())
        p_soup = p_soup.next_sibling
    p.description = "".join(description)

    # image
    img_url = content_soup.find("table").find_next_sibling("table").find("img").attrs["src"]
    # The wiki serves protocol-relative URLs ("//host/path"). Prefix only the
    # scheme; a blanket replace("//", "https://") would corrupt any URL that
    # already carries a scheme or contains "//" elsewhere.
    if img_url.startswith("//"):
        img_url = "https:" + img_url
    p.img_url = img_url
    download_to_file(img_url, p.img_filepath)
||||
|
Loading…
Reference in New Issue