Compare commits

...

5 Commits

11 changed files with 24546 additions and 92 deletions

1
.gitignore vendored

@@ -7,6 +7,7 @@ __pycache__/
 pokemon
 tmp
 ptoos.epub
+ptoos-with-links.epub
 
 # C extensions
 *.so

README.md

@@ -5,8 +5,28 @@ to descriptions and pictures of the Pokemon within the e-book itself.
 It works with the epub that you can download from [Daystar Eld's Patreon](https://www.patreon.com/daystareld/).
 
 ## Usage
 
 ```shell
+pip install --user pipenv
+pipenv install
+pipenv shell
 python ptoos-xray.py "DaystarEld - Pokemon The Origin of Species.epub"
 ```
+
+## Run tests
+
+```shell
+pipenv install --dev
+pipenv run pytest
+```
+
+## Compress Pokemon PNGs
+
+Use `pngquant` to compress the PNGs and get a smaller epub file.
+
+## Credits
+
+Full credit for the Pokemon names, images, and descriptions goes to
+[Bulbapedia](https://bulbapedia.bulbagarden.net) under
+[Attribution-NonCommercial-ShareAlike 2.5](https://creativecommons.org/licenses/by-nc-sa/2.5/).

2
pytest.ini Normal file

@@ -0,0 +1,2 @@
+[pytest]
+pythonpath = src
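The `pythonpath = src` option (added in pytest 7) prepends `src` to `sys.path` during collection, which is what lets the new test module below import `pokemon` directly instead of `src.pokemon`. A minimal, hypothetical smoke test to illustrate the effect:

```python
# test/test_import.py (hypothetical): the bare import only works
# because pytest.ini sets `pythonpath = src`.
import pokemon


def test_module_importable():
    assert hasattr(pokemon, "get_pokemon")
```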

src/epub.py

@@ -1,30 +1,46 @@
 import ebooklib
+import logging
 import re
+import sys
+from dataclasses import dataclass
 from bs4 import BeautifulSoup, Tag
 from bs4.element import NavigableString
 from ebooklib import epub
 from src.pokemon import Pokemon
-from typing import List, Dict
+from typing import List, Dict, Optional
 from rich.progress import track
 from rich.console import Console
 
 POKEMON_ID_PREFIX = "pokemon-id-"
+POKEDEX_UID = "np_pokedex"
+
+
+@dataclass
+class AnnoyingPokemon:
+    name_chunks: List[str]
+    length_chunks: int
+    name_in_pokedex: str
+
+
+ANNOYING_POKEMON = [
+    AnnoyingPokemon(["mr", ".", "mime"], 3, "mr. mime"),
+    AnnoyingPokemon(["farfetch", "’", "d"], 3, "farfetch'd"),
+    AnnoyingPokemon(["sirfetch", "’", "d"], 3, "sirfetch'd"),
+]
 
 
 def create_pokedex_chapter(pokemon: List[Pokemon]) -> epub.EpubHtml:
     POKEDEX_TITLE = "Pokedex"
     POKEDEX_FILE = "content/np_pokedex.xhtml"
-    POKEDEX_UID = "np_pokedex"
     chapter = epub.EpubHtml(
         title=POKEDEX_TITLE, file_name=POKEDEX_FILE, uid=POKEDEX_UID
     )
     content = ["<h1>Pokedex</h1>"]
     for p in pokemon:
-        content.append(f'<h2 id="{POKEMON_ID_PREFIX}{p.name.lower()}">{p.name}</h2>')
+        content.append(f'<h2 id="{POKEMON_ID_PREFIX}{p.link_id}">{p.name}</h2>')
         content.append(
-            f' <p><img alt="[Pokemon {p.name}]" src="../{p.img_filepath}"/><br/></p>'
+            f' <p><img alt="[Pokemon {p.name}]" src="../{p.img_filename}"/><br/></p>'
         )
         for paragraph in p.description.split("\n"):
             content.append(f" <p>{paragraph}</p>")
@@ -35,28 +51,58 @@ def create_pokedex_chapter(pokemon: List[Pokemon]) -> epub.EpubHtml:
 def patch_chapter(chapter: epub.EpubHtml, pokemon_lookup: Dict[str, Pokemon]):
-    r = re.compile("([:,.!?“”‘’… ]+)")
+    special_chars_regex = re.compile("([:,.!?“”‘’… ]+)")
     soup: BeautifulSoup = BeautifulSoup(chapter.content, "html.parser")
 
-    def pokemon_name_to_link(key: str, word: str) -> Tag:
+    # Set to remember which Pokemon have already gotten a link for that
+    # chapter.
+    pokemon_added_for_chapter = set()
+
+    def pokemon_to_link(p: Pokemon, name_as_in_book: str) -> Tag:
         tag = soup.new_tag("a")
-        tag.string = word
-        tag.attrs["href"] = f"np_pokedex.xhtml#{POKEMON_ID_PREFIX}{key}"
-        tag.attrs["style"] = "color:black;text-decoration:none"
+        tag.string = name_as_in_book
+        tag.attrs["href"] = f"np_pokedex.xhtml#{POKEMON_ID_PREFIX}{p.link_id}"
+        # tag.attrs["style"] = "color:black;text-decoration:none"
         return tag
 
+    def is_annoying_pokemon(index: int, chunks: List[str]) -> Optional[AnnoyingPokemon]:
+        for p in ANNOYING_POKEMON:
+            if p.name_chunks == list(
+                map(lambda s: s.lower(), chunks[index : index + p.length_chunks])
+            ):
+                return p
+        return None
+
     def patch_string(section: NavigableString) -> List:
         """Replace Pokemon with link to Pokemon; requires splitting up the
         NavigableString into a list of NavigableStrings and Tags."""
         result = [[]]
-        for word in r.split(str(section)):
+        index, chunks = 0, special_chars_regex.split(str(section))
+        while index < len(chunks):
+            word = chunks[index]
+            pokemon: Optional[Pokemon] = None
+            increment: int = 1
+
             if word.lower() in pokemon_lookup:
-                pokemon_lookup[word.lower()].appears_in_book = True
-                link = pokemon_name_to_link(word.lower(), word)
+                pokemon = pokemon_lookup[word.lower()]
+            elif annoying_pokemon := is_annoying_pokemon(index, chunks):
+                pokemon = pokemon_lookup[annoying_pokemon.name_in_pokedex]
+                increment = annoying_pokemon.length_chunks
+
+            if pokemon is not None and pokemon.name in pokemon_added_for_chapter:
+                pokemon = None
+
+            if pokemon is not None:
+                pokemon_added_for_chapter.add(pokemon.name)
+                pokemon.appears_in_book = True
+                name = "".join(chunks[index : index + increment])
+                link = pokemon_to_link(pokemon, name)
                 result.append(link)
                 result.append([])
+                index += increment
             else:
                 result[-1].append(word)
+                index += 1
 
         # convert words back into strings
         for i in range(len(result)):
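The switch from a per-word loop to an index over chunks exists because a few names span several regex chunks. A standalone sketch of what `special_chars_regex.split` produces:

```python
import re

special_chars_regex = re.compile("([:,.!?“”‘’… ]+)")

# A name with a typographic apostrophe is split across three chunks, so a
# single-word lookup can never match it; ANNOYING_POKEMON instead matches
# the chunk sequence ["farfetch", "’", "d"] as one unit.
print(special_chars_regex.split("Farfetch’d flew off"))
# -> ['Farfetch', '’', 'd', ' ', 'flew', ' ', 'off']
```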
@@ -80,16 +126,32 @@ def patch_chapter(chapter: epub.EpubHtml, pokemon_lookup: Dict[str, Pokemon]):
     chapter.content = str(soup)
 
 
-def patch(epub_filepath: str, pokemon: List[Pokemon]):
-    book = epub.read_epub(epub_filepath)
+def get_pokemon_lookup(pokemon: List[Pokemon]) -> Dict[str, Pokemon]:
     pokemon_lookup = {p.name.lower(): p for p in pokemon}
     pokemon_lookup["nidoran"] = pokemon_lookup["nidoran♂"]
     pokemon_lookup["barrierd"] = pokemon_lookup["mr. mime"]
+    return pokemon_lookup
+
+
+def patch(epub_filename: str, pokemon: List[Pokemon]):
+    try:
+        book = epub.read_epub(epub_filename)
+    except Exception:
+        logging.exception("Failed to open epub.")
+        sys.exit(1)
+
+    pokemon_lookup = get_pokemon_lookup(pokemon)
     chapters = [
         b
         for b in book.get_items()
         if isinstance(b, epub.EpubHtml)
         if b.id.startswith("np_")
     ]
+    if [c for c in chapters if c.id == POKEDEX_UID]:
+        logging.warning(f"It looks like '{epub_filename}' already has a Pokedex.")
+        sys.exit(1)
+
     for c in track(chapters, description="Add Pokemon links to chapters"):
         patch_chapter(c, pokemon_lookup)
@@ -103,17 +165,17 @@ def patch(epub_filepath: str, pokemon: List[Pokemon]):
     book.spine.append((chapter.id, "yes"))
 
     for p in pokemon:
-        image_content = open(p.img_filepath, "rb").read()
+        image_content = open(p.img_filename, "rb").read()
         img = epub.EpubItem(
             uid=p.name,
-            file_name=p.img_filepath,
+            file_name=p.img_filename,
             media_type="image/png",
             content=image_content,
         )
         book.add_item(img)
 
     console = Console()
-    epub_out = epub_filepath.replace(".", "-with-links.")
+    epub_out = epub_filename.replace(".", "-with-links.")
     with console.status(f"Writing {epub_out}"):
         epub.write_epub(epub_out, book, {})
     console.print(f"[green]✓[/green] [orange1]{epub_out}[/orange1] written")
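One caveat worth noting: `epub_filename.replace(".", "-with-links.")` rewrites every dot, so the scheme only behaves for names with a single extension dot ("ptoos.epub" becomes "ptoos-with-links.epub"). A more defensive variant (hypothetical, not part of this diff) would split off the extension instead:

```python
import os


def output_name(epub_filename: str) -> str:
    # "ptoos.epub" -> "ptoos-with-links.epub"; unlike str.replace, a name
    # such as "ptoos.v2.epub" keeps its inner dot intact.
    root, ext = os.path.splitext(epub_filename)
    return f"{root}-with-links{ext}"
```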

ptoos-xray.py

@@ -11,11 +11,14 @@ def main():
         level=logging.INFO,
         format="%(message)s",
         datefmt="[%X]",
-        handlers=[RichHandler()],
+        handlers=[RichHandler(rich_tracebacks=True)],
     )
     try:
         ptoos_epub = sys.argv[1]
     except IndexError:
         ptoos_epub = "ptoos.epub"
+        logging.warning(f"No epub file provided. Defaulting to '{ptoos_epub}'.")
     pokemon = src.pokemon.get_pokemon()
+    # for p in pokemon:
+    #     p.img_filename = p.img_filename.replace(".png", "-fs8.png")
     src.epub.patch(ptoos_epub, pokemon)
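The commented-out loop matches pngquant's default output naming: `pngquant foo.png` writes the quantized copy to `foo-fs8.png`. A hypothetical helper connecting the README's "Compress Pokemon PNGs" step to this hook (assumes `pngquant` is on PATH):

```python
import subprocess


def use_compressed_images(pokemon):
    # Quantize each cached PNG with pngquant, then point the Pokemon at
    # the "-fs8.png" copy that pngquant writes next to the original.
    for p in pokemon:
        subprocess.run(["pngquant", "--force", p.img_filename], check=True)
        p.img_filename = p.img_filename.replace(".png", "-fs8.png")
```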

src/pokemon.py

@@ -2,6 +2,7 @@ import requests
 import sys
 import os
 import logging
+import re
 from rich.progress import track
 from pydantic import BaseModel
 from bs4 import BeautifulSoup
@@ -17,20 +18,21 @@ NATIONAL_INDEX_URL = (
 class Pokemon(BaseModel):
     name: str
+    link_id: str
     index: str
     html_url: str
     img_url: str
-    html_filepath: str
-    img_filepath: str
-    json_filepath: str
+    html_filename: str
+    img_filename: str
+    json_filename: str
     description: str = ""
     appears_in_book: bool = False
 
 
-def download_to_file(url: str, filepath: str, override=False):
-    """Downloads url into filepath."""
-    if os.path.isfile(filepath) and override is False:
-        logging.debug(f"'{filepath}' exists.")
+def download_to_file(url: str, filename: str, override=False):
+    """Downloads url into filename."""
+    if os.path.isfile(filename) and override is False:
+        logging.debug(f"'{filename}' exists.")
         return
 
     headers = {
@@ -38,72 +40,91 @@
     }
     r = requests.get(url, headers=headers)
     if r.status_code != 200:
-        logging.warning(f"Could not download '{filepath}'")
-        return
+        logging.critical(f"Could not download '{filename}'.")
+        sys.exit(1)
 
     # Works for text and images
-    with open(filepath, "wb") as f:
+    with open(filename, "wb") as f:
         for c in r:
             f.write(c)
-    logging.debug(f"'{filepath}' downloaded.")
+    logging.debug(f"'{filename}' downloaded.")
 
 
+def download_national_index_html(national_index_filename: str):
+    download_to_file(NATIONAL_INDEX_URL, national_index_filename)
+
+
+def get_pokemon_table_row_soups(national_index_filename: str) -> List[BeautifulSoup]:
+    with open(national_index_filename, "r") as r:
+        soup = BeautifulSoup(r, "html.parser")
+    pokemon_list_soup = soup.find(
+        id="List_of_Pokémon_by_National_Pokédex_number"
+    ).parent
+    generation_soups = pokemon_list_soup.find_next_siblings("h3")
+    table_row_soups = []
+    for generation_soup in generation_soups:
+        table_soup = generation_soup.find_next_sibling("table")
+        tbody_soup = generation_soup.find_next("tbody")
+        # skip first row because it is the header
+        table_row_soups += tbody_soup.find_all("tr", recursive=False)[1:]
+    return table_row_soups
+
+
+def extract_pokemon_from_table_row(table_row_soup: BeautifulSoup) -> Pokemon:
+    name = table_row_soup.find_next("th").next_element.attrs["title"]
+    link_id = re.sub("[^a-z]", "", name.lower())
+
+    # load Pokemon from JSON if it already exists
+    json_filename = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".json")
+    if os.path.isfile(json_filename):
+        p = Pokemon.parse_file(json_filename)
+        logging.debug(f"Loaded '{p.json_filename}'.")
+        return p
+
+    index = table_row_soup.find_next("td").next_sibling.next_sibling.text.strip()
+    html_url = (
+        BULBAPEDIA_BASE_URL + table_row_soup.find_next("th").next_element.attrs["href"]
+    )
+    img_url = table_row_soup.find("img").attrs["src"]
+    html_filename = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".html")
+    img_filename = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".png")
+    return Pokemon(
+        name=name,
+        link_id=link_id,
+        index=index,
+        html_url=html_url,
+        img_url=img_url,
+        html_filename=html_filename,
+        img_filename=img_filename,
+        json_filename=json_filename,
+    )
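The new `link_id` field is the piece shared by the Pokedex anchors and the chapter links; it normalizes away the punctuation that makes some names awkward. A standalone illustration of the `re.sub` call above:

```python
import re


def link_id(name: str) -> str:
    # Same normalization as extract_pokemon_from_table_row: lowercase,
    # then drop every character outside a-z.
    return re.sub("[^a-z]", "", name.lower())


assert link_id("Vulpix") == "vulpix"  # checked by the new tests below
assert link_id("Mr. Mime") == "mrmime"
assert link_id("Farfetch'd") == "farfetchd"
assert link_id("Nidoran♀") == "nidoran"
```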
+
+
 def get_pokemon() -> List[Pokemon]:
     """Scrape Pokemon from the Bulbapedia national dex"""
-    NATIONAL_INDEX_FILEPATH = os.path.join(POKEMON_CACHE_DIRECTORY, "pokedex.html")
-    download_to_file(NATIONAL_INDEX_URL, NATIONAL_INDEX_FILEPATH)
-    with open(NATIONAL_INDEX_FILEPATH, "r") as r:
-        soup = BeautifulSoup(r, "html.parser")
-    pokemon_list_soup: BeautifulSoup = soup.find(
-        id="List_of_Pokémon_by_National_Pokédex_number"
-    ).parent
-    generation_soups: BeautifulSoup = pokemon_list_soup.find_next_siblings("h3")
-    table_row_soups = []
-    for generation_soup in generation_soups:
-        table_soup: BeautifulSoup = generation_soup.find_next_sibling("table")
-        tbody_soup: BeautifulSoup = generation_soup.find_next("tbody")
-        # skip first row because it is the header
-        table_row_soups += tbody_soup.find_all("tr", recursive=False)[1:]
+    if not os.path.isdir(POKEMON_CACHE_DIRECTORY):
+        os.mkdir(POKEMON_CACHE_DIRECTORY)
+
+    national_index_filename = os.path.join(POKEMON_CACHE_DIRECTORY, "pokedex.html")
+    download_national_index_html(national_index_filename)
+    table_row_soups = get_pokemon_table_row_soups(national_index_filename)
 
     pokemon = []
     for table_row_soup in track(table_row_soups, description="Download Pokemon"):
-        name = table_row_soup.find_next("th").next_element.attrs["title"]
+        p = extract_pokemon_from_table_row(table_row_soup)
 
-        # ignore Galarian and Alolan Pokemon so
-        if pokemon and pokemon[-1].name == name:
+        # Ignore Galarian and Alolan Pokemon (Pokemon with the same name)
+        if pokemon and pokemon[-1].name == p.name:
             continue
 
-        # load Pokemon from JSON if it already exists
-        json_filepath = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".json")
-        if os.path.isfile(json_filepath):
-            p = Pokemon.parse_file(json_filepath)
-            pokemon.append(p)
-            logging.debug(f"Loaded {p.json_filepath}.")
-            continue
-
-        index = table_row_soup.find_next("td").next_sibling.next_sibling.text.strip()
-        html_url = (
-            BULBAPEDIA_BASE_URL
-            + table_row_soup.find_next("th").next_element.attrs["href"]
-        )
-        img_url = table_row_soup.find("img").attrs["src"]
-        html_filepath = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".html")
-        img_filepath = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".png")
-        p = Pokemon(
-            name=name,
-            index=index,
-            html_url=html_url,
-            img_url=img_url,
-            html_filepath=html_filepath,
-            img_filepath=img_filepath,
-            json_filepath=json_filepath,
-        )
         pokemon.append(p)
+
+        # Pokemon has already been downloaded
+        if p.description and os.path.isfile(p.img_filename):
+            continue
+
         extend_pokemon(p)
-        with open(p.json_filepath, "w") as f:
+        with open(p.json_filename, "w") as f:
             f.write(p.json())
-        logging.debug(f"Saved {p.json_filepath}.")
+        logging.debug(f"Saved {p.json_filename}.")
 
     # Filter out speculative Pokemon
     pokemon = [
@@ -117,23 +138,26 @@ def get_pokemon() -> List[Pokemon]:
 def extend_pokemon(p: Pokemon):
     """Add description and download Pokemon image"""
-    download_to_file(p.html_url, p.html_filepath)
-    with open(p.html_filepath, "r") as r:
+    download_to_file(p.html_url, p.html_filename)
+    with open(p.html_filename, "r") as r:
         soup = BeautifulSoup(r, "html.parser")
     content_soup: BeautifulSoup = soup.find(id="mw-content-text").contents[0]
 
     # description
-    p_soup = content_soup.find("p")
-    description = []
-    while p_soup.name == "p":
-        description.append(p_soup.get_text())
-        p_soup = p_soup.next_sibling
-    p.description = "".join(description)
+    if not p.description:
+        p_soup = content_soup.find("p")
+        description = []
+        while p_soup.name == "p":
+            description.append(p_soup.get_text())
+            p_soup = p_soup.next_sibling
+        p.description = "".join(description)
 
     # image
-    img_url = (
-        content_soup.find("table").find_next_sibling("table").find("img").attrs["src"]
-    )
-    img_url = img_url.replace("//", "https://")
-    p.img_url = img_url
-    download_to_file(img_url, p.img_filepath)
+    if not os.path.isfile(p.img_filename):
+        img_url = (
+            content_soup.find("table")
+            .find_next_sibling("table")
+            .find("img")
+            .attrs["src"]
+        )
+        img_url = img_url.replace("//", "https://")
+        p.img_url = img_url
+        download_to_file(img_url, p.img_filename)
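Net effect of the refactor: `get_pokemon()` now caches per Pokemon at three levels (parsed JSON, downloaded HTML, and the PNG image), and `extend_pokemon` only re-parses or re-downloads what is missing. A minimal usage sketch (assuming network access and the repo root as the working directory):

```python
import src.pokemon

# The first run scrapes Bulbapedia into POKEMON_CACHE_DIRECTORY; later
# runs are served from the cached <name>.json / .html / .png files.
pokemon = src.pokemon.get_pokemon()
vulpix = next(p for p in pokemon if p.name == "Vulpix")
print(vulpix.index)         # "#037", per the new tests below
print(vulpix.img_filename)  # "pokemon/vulpix.png" if the cache dir is "pokemon"
```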

12149
test/pokedex.html Normal file

File diff suppressed because one or more lines are too long

0
test/test_epub.py Normal file

12149
test/test_pokedex.html Normal file

File diff suppressed because one or more lines are too long

44
test/test_pokemon.py Normal file

@@ -0,0 +1,44 @@
+import pokemon
+import os
+import filecmp
+
+
+def test_download_national_index_html(tmp_path):
+    pokemon_html = tmp_path / "pokedex.html"
+    pokemon.download_national_index_html(pokemon_html)
+    assert os.path.getsize(pokemon_html) > 500000
+
+
+def test_get_pokemon_table_row_soups():
+    national_index = "test/test_pokedex.html"
+    row_soups = pokemon.get_pokemon_table_row_soups(national_index)
+    assert len(row_soups) == 994
+
+
+def test_extract_pokemon_from_table_row(tmp_path):
+    national_index = "test/test_pokedex.html"
+    pokemon.POKEMON_CACHE_DIRECTORY = tmp_path
+    row_soups = pokemon.get_pokemon_table_row_soups(national_index)
+    p = pokemon.extract_pokemon_from_table_row(row_soups[42])
+    assert p.name == "Vulpix"
+    assert p.link_id == "vulpix"
+    assert p.index == "#037"
+    assert p.html_url == "https://bulbapedia.bulbagarden.net/wiki/Vulpix_(Pok%C3%A9mon)"
+    assert (
+        p.img_url
+        == "//archives.bulbagarden.net/media/upload/thumb/3/35/037Vulpix-Alola.png/70px-037Vulpix-Alola.png"
+    )
+    assert p.img_filename.endswith("vulpix.png")
+    assert p.json_filename.endswith("vulpix.json")
+    assert p.description == ""
+    assert p.appears_in_book == False
+
+
+def test_extend_pokemon(tmp_path):
+    national_index = "test/test_pokedex.html"
+    row_soups = pokemon.get_pokemon_table_row_soups(national_index)
+    p = pokemon.extract_pokemon_from_table_row(row_soups[42])
+    p.img_filename = tmp_path / "vulpix.png"
+    pokemon.extend_pokemon(p)
+    assert filecmp.cmp(p.img_filename, "test/test_vulpix.png")
+    assert p.description.startswith("Vulpix (Japanese: \u30ed\u30b3\u30f3 Rokon)")

BIN
test/test_vulpix.png Normal file

Binary file not shown.

Size: 52 KiB