From 7d9209d52e5f8cc28120d55c59c39df56d1ec38b Mon Sep 17 00:00:00 2001 From: felixm Date: Fri, 28 Oct 2022 13:44:21 -0400 Subject: [PATCH] Handle Mr. Mime, Nidoran, farfetch'd, and sirfetch'd to fix #1 --- src/epub.py | 60 +++++++++++++++++++++++++++++++++++++------- src/pokemon.py | 32 ++++++++++++----------- test/test_pokemon.py | 1 + 3 files changed, 70 insertions(+), 23 deletions(-) diff --git a/src/epub.py b/src/epub.py index a02f986..5161641 100644 --- a/src/epub.py +++ b/src/epub.py @@ -23,7 +23,8 @@ def create_pokedex_chapter(pokemon: List[Pokemon]) -> epub.EpubHtml: content = ["

Pokedex

"] for p in pokemon: - content.append(f'

{p.name}

') + p_id = p.name.lower().replace(". ", "") + content.append(f'') content.append( f'

[Pokemon {p.name}]

' ) @@ -39,25 +40,59 @@ def patch_chapter(chapter: epub.EpubHtml, pokemon_lookup: Dict[str, Pokemon]): r = re.compile("([:,.!?“”‘’… ]+)") soup: BeautifulSoup = BeautifulSoup(chapter.content, "html.parser") - def pokemon_name_to_link(key: str, word: str) -> Tag: + def pokemon_name_to_link(p: Pokemon, name_as_in_book: str) -> Tag: tag = soup.new_tag("a") - tag.string = word - tag.attrs["href"] = f"np_pokedex.xhtml#{POKEMON_ID_PREFIX}{key}" - tag.attrs["style"] = "color:black;text-decoration:none" + tag.string = name_as_in_book + tag.attrs["href"] = f"np_pokedex.xhtml#{POKEMON_ID_PREFIX}{p.link_id}" + # tag.attrs["style"] = "color:black;text-decoration:none" return tag def patch_string(section: NavigableString) -> List: """Replace Pokemon with link to Pokemon; requires splitting up the NavigableString into a list of NavigableStrings and Tags.""" result = [[]] - for word in r.split(str(section)): + index, chunks = 0, r.split(str(section)) + while index < len(chunks): + word = chunks[index] if word.lower() in pokemon_lookup: - pokemon_lookup[word.lower()].appears_in_book = True - link = pokemon_name_to_link(word.lower(), word) + p = pokemon_lookup[word.lower()] + p.appears_in_book = True + link = pokemon_name_to_link(p, word) + result.append(link) + result.append([]) + elif word == "Mr" and index + 2 < len(chunks) and \ + chunks[index + 1] == ". " and chunks[index + 2] == "Mime": + # Handle "Mr. Mime" which is split into ["Mr", ". ", "Mime"] + p = pokemon_lookup["mr. mime"] + p.appears_in_book = True + name = "".join(chunks[index:index + 3]) + link = pokemon_name_to_link(p, name) + index += 2 + result.append(link) + result.append([]) + elif word.lower() == "farfetch" and index + 2 < len(chunks) and \ + chunks[index + 1] == "’" and chunks[index + 2] == "d": + # Handle "farfetch'ed" + p = pokemon_lookup["farfetch'd"] + p.appears_in_book = True + name = "".join(chunks[index:index + 3]) + link = pokemon_name_to_link(p, name) + index += 2 + result.append(link) + result.append([]) + elif word.lower() == "sirfetch" and index + 2 < len(chunks) and \ + chunks[index + 1] == "’" and chunks[index + 2] == "d": + # Handle "sirfetch'ed" + p = pokemon_lookup["sirfetch'd"] + p.appears_in_book = True + name = "".join(chunks[index:index + 3]) + link = pokemon_name_to_link(p, name) + index += 2 result.append(link) result.append([]) else: result[-1].append(word) + index += 1 # convert words back into strings for i in range(len(result)): @@ -81,6 +116,13 @@ def patch_chapter(chapter: epub.EpubHtml, pokemon_lookup: Dict[str, Pokemon]): chapter.content = str(soup) +def get_pokemon_lookup(pokemon: List[Pokemon]) -> Dict[str, Pokemon]: + pokemon_lookup = {p.name.lower(): p for p in pokemon} + pokemon_lookup["nidoran"] = pokemon_lookup["nidoran♂"] + pokemon_lookup["barrierd"] = pokemon_lookup["mr. mime"] + return pokemon_lookup + + def patch(epub_filename: str, pokemon: List[Pokemon]): try: book = epub.read_epub(epub_filename) @@ -88,7 +130,7 @@ def patch(epub_filename: str, pokemon: List[Pokemon]): logging.exception("Failed to open epub.") sys.exit(1) - pokemon_lookup = {p.name.lower(): p for p in pokemon} + pokemon_lookup = get_pokemon_lookup(pokemon) chapters = [ b for b in book.get_items() diff --git a/src/pokemon.py b/src/pokemon.py index 8ff4dac..acbf5a6 100644 --- a/src/pokemon.py +++ b/src/pokemon.py @@ -2,6 +2,7 @@ import requests import sys import os import logging +import re from rich.progress import track from pydantic import BaseModel from bs4 import BeautifulSoup @@ -17,6 +18,7 @@ NATIONAL_INDEX_URL = ( class Pokemon(BaseModel): name: str + link_id: str index: str html_url: str img_url: str @@ -68,6 +70,7 @@ def get_pokemon_table_row_soups(national_index_filename: str) -> List[BeautifulS def extract_pokemon_from_table_row(table_row_soup: BeautifulSoup) -> Pokemon: name = table_row_soup.find_next("th").next_element.attrs["title"] + link_id = re.sub("[^a-z]", "", name.lower()) # load Pokemon from JSON if it already exists json_filename = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".json") @@ -86,6 +89,7 @@ def extract_pokemon_from_table_row(table_row_soup: BeautifulSoup) -> Pokemon: img_filename = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".png") return Pokemon( name=name, + link_id=link_id, index=index, html_url=html_url, img_url=img_url, @@ -138,18 +142,18 @@ def extend_pokemon(p: Pokemon): soup = BeautifulSoup(r, "html.parser") content_soup: BeautifulSoup = soup.find(id="mw-content-text").contents[0] - # description - p_soup = content_soup.find("p") - description = [] - while p_soup.name == "p": - description.append(p_soup.get_text()) - p_soup = p_soup.next_sibling - p.description = "".join(description) + if not p.description: + p_soup = content_soup.find("p") + description = [] + while p_soup.name == "p": + description.append(p_soup.get_text()) + p_soup = p_soup.next_sibling + p.description = "".join(description) - # image - img_url = ( - content_soup.find("table").find_next_sibling("table").find("img").attrs["src"] - ) - img_url = img_url.replace("//", "https://") - p.img_url = img_url - download_to_file(img_url, p.img_filename) + if not os.path.isfile(p.img_filename): + img_url = ( + content_soup.find("table").find_next_sibling("table").find("img").attrs["src"] + ) + img_url = img_url.replace("//", "https://") + p.img_url = img_url + download_to_file(img_url, p.img_filename) diff --git a/test/test_pokemon.py b/test/test_pokemon.py index 601cfd0..fd32736 100644 --- a/test/test_pokemon.py +++ b/test/test_pokemon.py @@ -19,6 +19,7 @@ def test_extract_pokemon_from_table_row(tmp_path): row_soups = pokemon.get_pokemon_table_row_soups(national_index) p = pokemon.extract_pokemon_from_table_row(row_soups[42]) assert p.name == 'Vulpix' + assert p.link_id == 'vulpix' assert p.index == '#037' assert p.html_url == 'https://bulbapedia.bulbagarden.net/wiki/Vulpix_(Pok%C3%A9mon)' assert p.img_url == '//archives.bulbagarden.net/media/upload/thumb/3/35/037Vulpix-Alola.png/70px-037Vulpix-Alola.png'