Implement Pokemon description and image download

2022-10-22 10:18:17 -04:00
parent ae7c9b306f
commit 10884ce073
4 changed files with 182 additions and 31 deletions
--- a/src/main.py
+++ b/src/main.py
@@ -1,34 +1,11 @@
-import requests
-import sys
-from bs4 import BeautifulSoup
-from typing import List
-
-POKEMON_FILE = "pokemon/pokedex.html"
-POKEMON_URL = "https://bulbapedia.bulbagarden.net/wiki/List_of_Pok%C3%A9mon_by_National_Pok%C3%A9dex_number"
+import logging
+import src.pokemon


-def get_pokedex():
-    headers = {
-    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:91.0) Gecko/20100101 Firefox/91.0'
-    }
-    r = requests.get(POKEMON_URL, headers=headers)
-    with open(POKEMON_FILE, "w") as f:
-        f.write(r.text)
+def init_logging():
+    logging.basicConfig(level=logging.DEBUG)

-def pokemon_for_generation_soup(generation_soup: BeautifulSoup):
-    print(generation_soup)
-    table_soup: BeautifulSoup = generation_soup.find_next_sibling("table")
-    tbody_soup: BeautifulSoup = generation_soup.find_next("tbody")
-    table_row_soups: List[BeautifulSoup()] = tbody_soup.find_all_next("tr")
-    for table_row_soup in table_row_soups:
-        print(table_row_soup.find_next("th").next_element.attrs["title"])
-    sys.exit(0)
-    return tbody_soup

 def main():
-    with open(POKEMON_FILE, "r") as r:
-        soup = BeautifulSoup(r, "html.parser")
-    pokemon_list_soup: BeautifulSoup = soup.find(id="List_of_Pokémon_by_National_Pokédex_number").parent
-    generation_soups: BeautifulSoup = pokemon_list_soup.find_next_siblings("h3")[0:1]
-    pokemon = map(pokemon_for_generation_soup, generation_soups)
-    print(list(pokemon))
+    init_logging()
+    p = src.pokemon.get_pokemon()
--- a/src/pokemon.py
+++ b/src/pokemon.py
@@ -0,0 +1,123 @@
+import requests
+import sys
+import os
+import logging
+from pydantic import BaseModel
+from bs4 import BeautifulSoup
+from typing import List
+
+
+POKEMON_CACHE_DIRECTORY = "pokemon"
+BULBAPEDIA_BASE_URL = "https://bulbapedia.bulbagarden.net"
+NATIONAL_INDEX_URL = BULBAPEDIA_BASE_URL + "/wiki/List_of_Pok%C3%A9mon_by_National_Pok%C3%A9dex_number"
+
+
+class Pokemon(BaseModel):
+    name: str
+    index: str
+    html_url: str
+    img_url: str
+    html_filepath: str
+    img_filepath: str
+    json_filepath: str
+    description: str = ""
+
+
+def download_to_file(url: str, filepath: str, override=False):
+    """ Downloads url into filepath. """
+    if os.path.isfile(filepath) and override is False:
+        logging.debug(f"'{filepath}' exists.")
+        return
+
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:91.0) Gecko/20100101 Firefox/91.0'
+    }
+    r = requests.get(url, headers=headers)
+    if r.status_code != 200:
+        logging.warning(f"Could not download '{filepath}'")
+        return
+
+    # Works for text and images
+    with open(filepath, "wb") as f:
+        for c in r:
+            f.write(c)
+    logging.debug(f"'{filepath}' downloaded.")
+
+
+def get_pokemon() -> List[Pokemon]:
+    """ Scrape Pokemon from the Bulbapedia national dex """
+    NATIONAL_INDEX_FILEPATH = os.path.join(POKEMON_CACHE_DIRECTORY, "pokedex.html")
+    download_to_file(NATIONAL_INDEX_URL, NATIONAL_INDEX_FILEPATH)
+    with open(NATIONAL_INDEX_FILEPATH, "r") as r:
+        soup = BeautifulSoup(r, "html.parser")
+    pokemon_list_soup: BeautifulSoup = soup.find(id="List_of_Pokémon_by_National_Pokédex_number").parent
+    generation_soups: BeautifulSoup = pokemon_list_soup.find_next_siblings("h3")
+
+    table_row_soups = []
+    for generation_soup in generation_soups:
+        table_soup: BeautifulSoup = generation_soup.find_next_sibling("table")
+        tbody_soup: BeautifulSoup = generation_soup.find_next("tbody")
+        # skip first row because it is the header
+        table_row_soups += tbody_soup.find_all("tr", recursive=False)[1:]
+
+    pokemon = []
+    for table_row_soup in table_row_soups:
+        name = table_row_soup.find_next("th").next_element.attrs["title"]
+
+        # ignore Galarian and Alolan Pokemon so
+        if pokemon and pokemon[-1].name == name:
+            continue
+
+        # load Pokemon from JSON if it already exists
+        json_filepath = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".json")
+        if os.path.isfile(json_filepath):
+            p = Pokemon.parse_file(json_filepath)
+            pokemon.append(p)
+            logging.info(f"Loaded {p.json_filepath}.")
+            continue
+
+        index = table_row_soup.find_next("td").next_sibling.next_sibling.text.strip()
+        html_url = BULBAPEDIA_BASE_URL + table_row_soup.find_next("th").next_element.attrs["href"]
+        img_url = table_row_soup.find("img").attrs["src"]
+        html_filepath = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".html")
+        img_filepath = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".png")
+        p = Pokemon(name=name,
+                    index=index,
+                    html_url=html_url,
+                    img_url=img_url,
+                    html_filepath=html_filepath,
+                    img_filepath=img_filepath,
+                    json_filepath=json_filepath)
+        pokemon.append(p)
+        extend_pokemon(p)
+        with open(p.json_filepath, 'w') as f:
+            f.write(p.json())
+            logging.info(f"Saved {p.json_filepath}.")
+
+    # Filter out speculative Pokemon
+    pokemon = [p for p in pokemon if not p.description.startswith("This article's contents will change")]
+
+    return pokemon
+
+
+def extend_pokemon(p: Pokemon):
+    """ Add description and download Pokemon image """
+    download_to_file(p.html_url, p.html_filepath)
+    with open(p.html_filepath, "r") as r:
+        soup = BeautifulSoup(r, "html.parser")
+    content_soup: BeautifulSoup = soup.find(id='mw-content-text').contents[0]
+
+    # description
+    p_soup = content_soup.find("p")
+    description = []
+    while p_soup.name == 'p':
+        description.append(p_soup.get_text())
+        p_soup = p_soup.next_sibling
+    p.description = "".join(description)
+
+    # image
+    img_url = content_soup.find("table").find_next_sibling("table").find("img").attrs["src"]
+    img_url = img_url.replace("//", "https://")
+    p.img_url = img_url
+    download_to_file(img_url, p.img_filepath)
+