Update readme and format scripts

2022-10-22 21:37:01 -04:00
parent 1248d9d750
commit 8200225780
4 changed files with 78 additions and 46 deletions
@@ -1,9 +1,12 @@
 # poos-xray
-Script that annotates Pokemon: the Origin of the Species epub with links to
+Script that annotates the Pokemon: The Origin of Species e-book with links
-descriptions of the Pokemon.
+to descriptions and pictures of the Pokemon within the e-book itself.
 It works with the epub that you can download from [Daystar Eld's Patreon](https://www.patreon.com/daystareld/).
 ```shell
 pipenv install
 pipenv shell
 python poos-xray "DaystarEld - Pokemon The Origin of Species.epub"
 ```
@@ -9,26 +9,29 @@ from typing import List, Dict
 POKEMON_ID_PREFIX = "pokemon-id-"
 def create_pokedex_chapter(pokemon: List[Pokemon]) -> epub.EpubHtml:
    POKEDEX_TITLE = "Pokedex"
    POKEDEX_FILE = "content/np_pokedex.xhtml"
    POKEDEX_UID = "np_pokedex"
-    chapter = epub.EpubHtml(title=POKEDEX_TITLE, file_name=POKEDEX_FILE, uid=POKEDEX_UID)
+    chapter = epub.EpubHtml(
-    content = ['<h1>Pokedex</h1>']
+        title=POKEDEX_TITLE, file_name=POKEDEX_FILE, uid=POKEDEX_UID
    )
    content = ["<h1>Pokedex</h1>"]
    for p in pokemon:
        content.append(f'<h2 id="{POKEMON_ID_PREFIX}{p.name.lower()}">{p.name}</h2>')
-        content.append(f'  <p><img alt="[Pokemon {p.name}]" src="../{p.img_filepath}"/><br/></p>')
+        content.append(
            f'  <p><img alt="[Pokemon {p.name}]" src="../{p.img_filepath}"/><br/></p>'
        )
        for paragraph in p.description.split("\n"):
-            content.append(f'  <p>{paragraph}</p>')
+            content.append(f"  <p>{paragraph}</p>")
-        content.append('')
+        content.append("")
    chapter.content = "\n".join(content)
    return chapter
 def patch_chapter(chapter: epub.EpubHtml, pokemon_lookup: Dict[str, Pokemon]):
    r = re.compile("([:,.!?“”‘’…])")
    soup: BeautifulSoup = BeautifulSoup(chapter.content, "html.parser")
@@ -41,8 +44,8 @@ def patch_chapter(chapter: epub.EpubHtml, pokemon_lookup: Dict[str, Pokemon]):
        return tag
    def patch_string(section: NavigableString) -> List:
-        """ Replace Pokemon with link to Pokemon; requires splitting up the
+        """Replace Pokemon with link to Pokemon; requires splitting up the
-            NavigableString into a list of NavigableStrings and Tags. """
+        NavigableString into a list of NavigableStrings and Tags."""
        result = [[]]
        for word in str(section).split(" "):
            word_stripped = r.sub("", word)
@@ -55,7 +58,9 @@ def patch_chapter(chapter: epub.EpubHtml, pokemon_lookup: Dict[str, Pokemon]):
                else:
                    # add other chars before pokemon if there are any
                    result[-1].append("".join(word_split[:i]))
-                pokemon_link = pokemon_name_to_link(word_stripped.lower(), word_stripped)
+                pokemon_link = pokemon_name_to_link(
                    word_stripped.lower(), word_stripped
                )
                result.append(pokemon_link)
                result.append([])
                if i + 1 == len(word_split):
@@ -63,7 +68,7 @@ def patch_chapter(chapter: epub.EpubHtml, pokemon_lookup: Dict[str, Pokemon]):
                    result[-1].append(" ")
                else:
                    # add other chars after pokemon if there are any
-                    result[-1].append("".join(word_split[i + 1:]))
+                    result[-1].append("".join(word_split[i + 1 :]))
            else:
                result[-1].append(word)
@@ -96,20 +101,28 @@ def patch(epub_filepath: str, pokemon: List[Pokemon]):
    book.add_item(chapter)
    link = epub.Link(chapter.file_name, chapter.title, chapter.id)
    book.toc.append(link)
-    book.spine.append((chapter.id, 'yes'))
+    book.spine.append((chapter.id, "yes"))
    for p in pokemon:
-        image_content = open(p.img_filepath, 'rb').read()
+        image_content = open(p.img_filepath, "rb").read()
-        img = epub.EpubItem(uid=p.name, file_name=p.img_filepath, media_type='image/png', content=image_content)
+        img = epub.EpubItem(
            uid=p.name,
            file_name=p.img_filepath,
            media_type="image/png",
            content=image_content,
        )
        book.add_item(img)
    pokemon_lookup = {p.name.lower(): p for p in pokemon}
-    chapters = [b for b in book.get_items()
+    chapters = [
-                if isinstance(b, epub.EpubHtml)
+        b
-                if b.id.startswith("np_")]
+        for b in book.get_items()
        if isinstance(b, epub.EpubHtml)
        if b.id.startswith("np_")
    ]
    for c in chapters:
        patch_chapter(c, pokemon_lookup)
    epub_out = epub_filepath.replace(".", "-with-links.")
    epub.write_epub(epub_out, book, {})
-    logging.info(f"{epub_out} written.")
+    logging.info(f"Write '{epub_out}'.")
@@ -1,13 +1,15 @@
 import sys
 import logging
 import src.pokemon
 import src.epub
 def init_logging():
    logging.basicConfig(level=logging.INFO)
 def main():
-    init_logging()
+    logging.basicConfig(format="%(message)s", level=logging.INFO)
    try:
        ptoos_epub = sys.argv[1]
    except IndexError:
        ptoos_epub = "poos.epub"
    logging.info(f"Patching '{ptoos_epub}'.")
    pokemon = src.pokemon.get_pokemon()
-    src.epub.patch("poos.epub", pokemon)
+    src.epub.patch(ptoos_epub, pokemon)
@@ -9,7 +9,9 @@ from typing import List
 POKEMON_CACHE_DIRECTORY = "pokemon"
 BULBAPEDIA_BASE_URL = "https://bulbapedia.bulbagarden.net"
-NATIONAL_INDEX_URL = BULBAPEDIA_BASE_URL + "/wiki/List_of_Pok%C3%A9mon_by_National_Pok%C3%A9dex_number"
+NATIONAL_INDEX_URL = (
    BULBAPEDIA_BASE_URL + "/wiki/List_of_Pok%C3%A9mon_by_National_Pok%C3%A9dex_number"
 )
 class Pokemon(BaseModel):
@@ -24,13 +26,13 @@ class Pokemon(BaseModel):
 def download_to_file(url: str, filepath: str, override=False):
-    """ Downloads url into filepath. """
+    """Downloads url into filepath."""
    if os.path.isfile(filepath) and override is False:
        logging.debug(f"'{filepath}' exists.")
        return
    headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:91.0) Gecko/20100101 Firefox/91.0'
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; rv:91.0) Gecko/20100101 Firefox/91.0"
    }
    r = requests.get(url, headers=headers)
    if r.status_code != 200:
@@ -45,12 +47,14 @@ def download_to_file(url: str, filepath: str, override=False):
 def get_pokemon() -> List[Pokemon]:
-    """ Scrape Pokemon from the Bulbapedia national dex """
+    """Scrape Pokemon from the Bulbapedia national dex"""
    NATIONAL_INDEX_FILEPATH = os.path.join(POKEMON_CACHE_DIRECTORY, "pokedex.html")
    download_to_file(NATIONAL_INDEX_URL, NATIONAL_INDEX_FILEPATH)
    with open(NATIONAL_INDEX_FILEPATH, "r") as r:
        soup = BeautifulSoup(r, "html.parser")
-    pokemon_list_soup: BeautifulSoup = soup.find(id="List_of_Pokémon_by_National_Pokédex_number").parent
+    pokemon_list_soup: BeautifulSoup = soup.find(
        id="List_of_Pokémon_by_National_Pokédex_number"
    ).parent
    generation_soups: BeautifulSoup = pokemon_list_soup.find_next_siblings("h3")
    table_row_soups = []
@@ -77,48 +81,58 @@ def get_pokemon() -> List[Pokemon]:
            continue
        index = table_row_soup.find_next("td").next_sibling.next_sibling.text.strip()
-        html_url = BULBAPEDIA_BASE_URL + table_row_soup.find_next("th").next_element.attrs["href"]
+        html_url = (
            BULBAPEDIA_BASE_URL
            + table_row_soup.find_next("th").next_element.attrs["href"]
        )
        img_url = table_row_soup.find("img").attrs["src"]
        html_filepath = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".html")
        img_filepath = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".png")
-        p = Pokemon(name=name,
+        p = Pokemon(
-                    index=index,
+            name=name,
-                    html_url=html_url,
+            index=index,
-                    img_url=img_url,
+            html_url=html_url,
-                    html_filepath=html_filepath,
+            img_url=img_url,
-                    img_filepath=img_filepath,
+            html_filepath=html_filepath,
-                    json_filepath=json_filepath)
+            img_filepath=img_filepath,
            json_filepath=json_filepath,
        )
        pokemon.append(p)
        extend_pokemon(p)
-        with open(p.json_filepath, 'w') as f:
+        with open(p.json_filepath, "w") as f:
            f.write(p.json())
            logging.info(f"Saved {p.json_filepath}.")
    # Filter out speculative Pokemon
-    pokemon = [p for p in pokemon if not p.description.startswith("This article's contents will change")]
+    pokemon = [
        p
        for p in pokemon
        if not p.description.startswith("This article's contents will change")
    ]
    logging.info("Pokemon loaded.")
    return pokemon
 def extend_pokemon(p: Pokemon):
-    """ Add description and download Pokemon image """
+    """Add description and download Pokemon image"""
    download_to_file(p.html_url, p.html_filepath)
    with open(p.html_filepath, "r") as r:
        soup = BeautifulSoup(r, "html.parser")
-    content_soup: BeautifulSoup = soup.find(id='mw-content-text').contents[0]
+    content_soup: BeautifulSoup = soup.find(id="mw-content-text").contents[0]
    # description
    p_soup = content_soup.find("p")
    description = []
-    while p_soup.name == 'p':
+    while p_soup.name == "p":
        description.append(p_soup.get_text())
        p_soup = p_soup.next_sibling
    p.description = "".join(description)
    # image
-    img_url = content_soup.find("table").find_next_sibling("table").find("img").attrs["src"]
+    img_url = (
        content_soup.find("table").find_next_sibling("table").find("img").attrs["src"]
    )
    img_url = img_url.replace("//", "https://")
    p.img_url = img_url
    download_to_file(img_url, p.img_filepath)