Handle Mr. Mime, Nidoran, Farfetch'd, and Sirfetch'd to fix #1

This commit is contained in:
2022-10-28 13:44:21 -04:00
parent 9c200a1246
commit 7d9209d52e
3 changed files with 70 additions and 23 deletions

View File

@@ -23,7 +23,8 @@ def create_pokedex_chapter(pokemon: List[Pokemon]) -> epub.EpubHtml:
content = ["<h1>Pokedex</h1>"] content = ["<h1>Pokedex</h1>"]
for p in pokemon: for p in pokemon:
content.append(f'<h2 id="{POKEMON_ID_PREFIX}{p.name.lower()}">{p.name}</h2>') p_id = p.name.lower().replace(". ", "")
content.append(f'<h2 id="{POKEMON_ID_PREFIX}{p.link_id}">{p.name}</h2>')
content.append( content.append(
f' <p><img alt="[Pokemon {p.name}]" src="../{p.img_filename}"/><br/></p>' f' <p><img alt="[Pokemon {p.name}]" src="../{p.img_filename}"/><br/></p>'
) )
@@ -39,25 +40,59 @@ def patch_chapter(chapter: epub.EpubHtml, pokemon_lookup: Dict[str, Pokemon]):
r = re.compile("([:,.!?“”‘’… ]+)") r = re.compile("([:,.!?“”‘’… ]+)")
soup: BeautifulSoup = BeautifulSoup(chapter.content, "html.parser") soup: BeautifulSoup = BeautifulSoup(chapter.content, "html.parser")
def pokemon_name_to_link(key: str, word: str) -> Tag: def pokemon_name_to_link(p: Pokemon, name_as_in_book: str) -> Tag:
tag = soup.new_tag("a") tag = soup.new_tag("a")
tag.string = word tag.string = name_as_in_book
tag.attrs["href"] = f"np_pokedex.xhtml#{POKEMON_ID_PREFIX}{key}" tag.attrs["href"] = f"np_pokedex.xhtml#{POKEMON_ID_PREFIX}{p.link_id}"
tag.attrs["style"] = "color:black;text-decoration:none" # tag.attrs["style"] = "color:black;text-decoration:none"
return tag return tag
def patch_string(section: NavigableString) -> List: def patch_string(section: NavigableString) -> List:
"""Replace Pokemon with link to Pokemon; requires splitting up the """Replace Pokemon with link to Pokemon; requires splitting up the
NavigableString into a list of NavigableStrings and Tags.""" NavigableString into a list of NavigableStrings and Tags."""
result = [[]] result = [[]]
for word in r.split(str(section)): index, chunks = 0, r.split(str(section))
while index < len(chunks):
word = chunks[index]
if word.lower() in pokemon_lookup: if word.lower() in pokemon_lookup:
pokemon_lookup[word.lower()].appears_in_book = True p = pokemon_lookup[word.lower()]
link = pokemon_name_to_link(word.lower(), word) p.appears_in_book = True
link = pokemon_name_to_link(p, word)
result.append(link)
result.append([])
elif word == "Mr" and index + 2 < len(chunks) and \
chunks[index + 1] == ". " and chunks[index + 2] == "Mime":
# Handle "Mr. Mime" which is split into ["Mr", ". ", "Mime"]
p = pokemon_lookup["mr. mime"]
p.appears_in_book = True
name = "".join(chunks[index:index + 3])
link = pokemon_name_to_link(p, name)
index += 2
result.append(link)
result.append([])
elif word.lower() == "farfetch" and index + 2 < len(chunks) and \
chunks[index + 1] == "" and chunks[index + 2] == "d":
# Handle "Farfetch'd", whose apostrophe splits it across chunks
p = pokemon_lookup["farfetch'd"]
p.appears_in_book = True
name = "".join(chunks[index:index + 3])
link = pokemon_name_to_link(p, name)
index += 2
result.append(link)
result.append([])
elif word.lower() == "sirfetch" and index + 2 < len(chunks) and \
chunks[index + 1] == "" and chunks[index + 2] == "d":
# Handle "Sirfetch'd", whose apostrophe splits it across chunks
p = pokemon_lookup["sirfetch'd"]
p.appears_in_book = True
name = "".join(chunks[index:index + 3])
link = pokemon_name_to_link(p, name)
index += 2
result.append(link) result.append(link)
result.append([]) result.append([])
else: else:
result[-1].append(word) result[-1].append(word)
index += 1
# convert words back into strings # convert words back into strings
for i in range(len(result)): for i in range(len(result)):
@@ -81,6 +116,13 @@ def patch_chapter(chapter: epub.EpubHtml, pokemon_lookup: Dict[str, Pokemon]):
chapter.content = str(soup) chapter.content = str(soup)
def get_pokemon_lookup(pokemon: List[Pokemon]) -> Dict[str, Pokemon]:
pokemon_lookup = {p.name.lower(): p for p in pokemon}
pokemon_lookup["nidoran"] = pokemon_lookup["nidoran♂"]
pokemon_lookup["barrierd"] = pokemon_lookup["mr. mime"]
return pokemon_lookup
def patch(epub_filename: str, pokemon: List[Pokemon]): def patch(epub_filename: str, pokemon: List[Pokemon]):
try: try:
book = epub.read_epub(epub_filename) book = epub.read_epub(epub_filename)
@@ -88,7 +130,7 @@ def patch(epub_filename: str, pokemon: List[Pokemon]):
logging.exception("Failed to open epub.") logging.exception("Failed to open epub.")
sys.exit(1) sys.exit(1)
pokemon_lookup = {p.name.lower(): p for p in pokemon} pokemon_lookup = get_pokemon_lookup(pokemon)
chapters = [ chapters = [
b b
for b in book.get_items() for b in book.get_items()

View File

@@ -2,6 +2,7 @@ import requests
import sys import sys
import os import os
import logging import logging
import re
from rich.progress import track from rich.progress import track
from pydantic import BaseModel from pydantic import BaseModel
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
@@ -17,6 +18,7 @@ NATIONAL_INDEX_URL = (
class Pokemon(BaseModel): class Pokemon(BaseModel):
name: str name: str
link_id: str
index: str index: str
html_url: str html_url: str
img_url: str img_url: str
@@ -68,6 +70,7 @@ def get_pokemon_table_row_soups(national_index_filename: str) -> List[BeautifulS
def extract_pokemon_from_table_row(table_row_soup: BeautifulSoup) -> Pokemon: def extract_pokemon_from_table_row(table_row_soup: BeautifulSoup) -> Pokemon:
name = table_row_soup.find_next("th").next_element.attrs["title"] name = table_row_soup.find_next("th").next_element.attrs["title"]
link_id = re.sub("[^a-z]", "", name.lower())
# load Pokemon from JSON if it already exists # load Pokemon from JSON if it already exists
json_filename = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".json") json_filename = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".json")
@@ -86,6 +89,7 @@ def extract_pokemon_from_table_row(table_row_soup: BeautifulSoup) -> Pokemon:
img_filename = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".png") img_filename = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".png")
return Pokemon( return Pokemon(
name=name, name=name,
link_id=link_id,
index=index, index=index,
html_url=html_url, html_url=html_url,
img_url=img_url, img_url=img_url,
@@ -138,18 +142,18 @@ def extend_pokemon(p: Pokemon):
soup = BeautifulSoup(r, "html.parser") soup = BeautifulSoup(r, "html.parser")
content_soup: BeautifulSoup = soup.find(id="mw-content-text").contents[0] content_soup: BeautifulSoup = soup.find(id="mw-content-text").contents[0]
# description if not p.description:
p_soup = content_soup.find("p") p_soup = content_soup.find("p")
description = [] description = []
while p_soup.name == "p": while p_soup.name == "p":
description.append(p_soup.get_text()) description.append(p_soup.get_text())
p_soup = p_soup.next_sibling p_soup = p_soup.next_sibling
p.description = "".join(description) p.description = "".join(description)
# image if not os.path.isfile(p.img_filename):
img_url = ( img_url = (
content_soup.find("table").find_next_sibling("table").find("img").attrs["src"] content_soup.find("table").find_next_sibling("table").find("img").attrs["src"]
) )
img_url = img_url.replace("//", "https://") img_url = img_url.replace("//", "https://")
p.img_url = img_url p.img_url = img_url
download_to_file(img_url, p.img_filename) download_to_file(img_url, p.img_filename)

View File

@@ -19,6 +19,7 @@ def test_extract_pokemon_from_table_row(tmp_path):
row_soups = pokemon.get_pokemon_table_row_soups(national_index) row_soups = pokemon.get_pokemon_table_row_soups(national_index)
p = pokemon.extract_pokemon_from_table_row(row_soups[42]) p = pokemon.extract_pokemon_from_table_row(row_soups[42])
assert p.name == 'Vulpix' assert p.name == 'Vulpix'
assert p.link_id == 'vulpix'
assert p.index == '#037' assert p.index == '#037'
assert p.html_url == 'https://bulbapedia.bulbagarden.net/wiki/Vulpix_(Pok%C3%A9mon)' assert p.html_url == 'https://bulbapedia.bulbagarden.net/wiki/Vulpix_(Pok%C3%A9mon)'
assert p.img_url == '//archives.bulbagarden.net/media/upload/thumb/3/35/037Vulpix-Alola.png/70px-037Vulpix-Alola.png' assert p.img_url == '//archives.bulbagarden.net/media/upload/thumb/3/35/037Vulpix-Alola.png/70px-037Vulpix-Alola.png'