Update readme and format scripts

main
Felix Martin 2022-10-22 21:37:01 -04:00
parent 1248d9d750
commit 8200225780
4 changed files with 78 additions and 46 deletions


@@ -1,9 +1,12 @@
 # poos-xray

-Script that annotates Pokemon: the Origin of the Species epub with links to
-descriptions of the Pokemon.
+Script that annotates the Pokemon: the Origin of the Species e-book with links
+to descriptions and pictures of the Pokemon within the e-book itself.
+It works with the epub that you can download from [Daystar Eld's Patreon](https://www.patreon.com/daystareld/).

 ```shell
 pipenv install
 pipenv shell
 python poos-xray "DaystarEld - Pokemon The Origin of Species.epub"
 ```
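For reference, the annotated copy is written next to the input; `patch()` below derives its name. A minimal sketch of that naming:

```python
# Sketch of the output naming used by src.epub.patch() further down.
epub_in = "DaystarEld - Pokemon The Origin of Species.epub"
epub_out = epub_in.replace(".", "-with-links.")
print(epub_out)  # DaystarEld - Pokemon The Origin of Species-with-links.epub
```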


@@ -9,26 +9,29 @@ from typing import List, Dict

 POKEMON_ID_PREFIX = "pokemon-id-"


 def create_pokedex_chapter(pokemon: List[Pokemon]) -> epub.EpubHtml:
     POKEDEX_TITLE = "Pokedex"
     POKEDEX_FILE = "content/np_pokedex.xhtml"
     POKEDEX_UID = "np_pokedex"
-    chapter = epub.EpubHtml(title=POKEDEX_TITLE, file_name=POKEDEX_FILE, uid=POKEDEX_UID)
-    content = ['<h1>Pokedex</h1>']
+    chapter = epub.EpubHtml(
+        title=POKEDEX_TITLE, file_name=POKEDEX_FILE, uid=POKEDEX_UID
+    )
+    content = ["<h1>Pokedex</h1>"]
     for p in pokemon:
         content.append(f'<h2 id="{POKEMON_ID_PREFIX}{p.name.lower()}">{p.name}</h2>')
-        content.append(f'    <p><img alt="[Pokemon {p.name}]" src="../{p.img_filepath}"/><br/></p>')
+        content.append(
+            f'    <p><img alt="[Pokemon {p.name}]" src="../{p.img_filepath}"/><br/></p>'
+        )
         for paragraph in p.description.split("\n"):
-            content.append(f'    <p>{paragraph}</p>')
-        content.append('')
+            content.append(f"    <p>{paragraph}</p>")
+        content.append("")
     chapter.content = "\n".join(content)
     return chapter


 def patch_chapter(chapter: epub.EpubHtml, pokemon_lookup: Dict[str, Pokemon]):
     r = re.compile("([:,.!?“”‘’…])")
     soup: BeautifulSoup = BeautifulSoup(chapter.content, "html.parser")
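To make the assembled markup concrete, here is roughly what `chapter.content` holds for a single entry; the name, image path, and description line are hypothetical stand-ins:

```python
# Roughly what create_pokedex_chapter() builds for one (hypothetical) entry.
content = "\n".join([
    "<h1>Pokedex</h1>",
    '<h2 id="pokemon-id-bulbasaur">Bulbasaur</h2>',
    '    <p><img alt="[Pokemon Bulbasaur]" src="../pokemon/bulbasaur.png"/><br/></p>',
    "    <p>Bulbasaur is a Grass/Poison-type Pokemon.</p>",  # placeholder text
    "",
])
```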
@@ -41,8 +44,8 @@ def patch_chapter(chapter: epub.EpubHtml, pokemon_lookup: Dict[str, Pokemon]):
         return tag

     def patch_string(section: NavigableString) -> List:
-        """ Replace Pokemon with link to Pokemon; requires splitting up the
-        NavigableString into a list of NavigableStrings and Tags. """
+        """Replace Pokemon with link to Pokemon; requires splitting up the
+        NavigableString into a list of NavigableStrings and Tags."""
         result = [[]]
         for word in str(section).split(" "):
             word_stripped = r.sub("", word)
@@ -55,7 +58,9 @@ def patch_chapter(chapter: epub.EpubHtml, pokemon_lookup: Dict[str, Pokemon]):
                else:
                    # add other chars before pokemon if there are any
                    result[-1].append("".join(word_split[:i]))
-                   pokemon_link = pokemon_name_to_link(word_stripped.lower(), word_stripped)
+                   pokemon_link = pokemon_name_to_link(
+                       word_stripped.lower(), word_stripped
+                   )
                    result.append(pokemon_link)
                    result.append([])
                    if i + 1 == len(word_split):
@@ -63,7 +68,7 @@ def patch_chapter(chapter: epub.EpubHtml, pokemon_lookup: Dict[str, Pokemon]):
                        result[-1].append(" ")
                    else:
                        # add other chars after pokemon if there are any
-                       result[-1].append("".join(word_split[i + 1:]))
+                       result[-1].append("".join(word_split[i + 1 :]))
            else:
                result[-1].append(word)
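A short illustration of the punctuation handling above, assuming `word_split` comes from `r.split(word)` (that assignment sits outside this hunk):

```python
import re

# The regex from patch_chapter(): the capturing group makes re.split() keep
# the punctuation, so it can be re-emitted around the generated link.
r = re.compile("([:,.!?“”‘’…])")
word = "Pikachu,"
print(r.sub("", word))  # Pikachu  -> stripped name used for the lookup
print(r.split(word))    # ['Pikachu', ',', '']  -> pieces kept around the link
```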
@@ -96,20 +101,28 @@ def patch(epub_filepath: str, pokemon: List[Pokemon]):
     book.add_item(chapter)
     link = epub.Link(chapter.file_name, chapter.title, chapter.id)
     book.toc.append(link)
-    book.spine.append((chapter.id, 'yes'))
+    book.spine.append((chapter.id, "yes"))

     for p in pokemon:
-        image_content = open(p.img_filepath, 'rb').read()
-        img = epub.EpubItem(uid=p.name, file_name=p.img_filepath, media_type='image/png', content=image_content)
+        image_content = open(p.img_filepath, "rb").read()
+        img = epub.EpubItem(
+            uid=p.name,
+            file_name=p.img_filepath,
+            media_type="image/png",
+            content=image_content,
+        )
         book.add_item(img)

     pokemon_lookup = {p.name.lower(): p for p in pokemon}
-    chapters = [b for b in book.get_items()
-                if isinstance(b, epub.EpubHtml)
-                if b.id.startswith("np_")]
+    chapters = [
+        b
+        for b in book.get_items()
+        if isinstance(b, epub.EpubHtml)
+        if b.id.startswith("np_")
+    ]
     for c in chapters:
         patch_chapter(c, pokemon_lookup)
     epub_out = epub_filepath.replace(".", "-with-links.")
     epub.write_epub(epub_out, book, {})
-    logging.info(f"{epub_out} written.")
+    logging.info(f"Write '{epub_out}'.")


@@ -1,13 +1,15 @@
 import sys
 import logging
 import src.pokemon
 import src.epub


-def init_logging():
-    logging.basicConfig(level=logging.INFO)
-
-
 def main():
-    init_logging()
+    logging.basicConfig(format="%(message)s", level=logging.INFO)
+    try:
+        ptoos_epub = sys.argv[1]
+    except IndexError:
+        ptoos_epub = "poos.epub"
+    logging.info(f"Patching '{ptoos_epub}'.")
     pokemon = src.pokemon.get_pokemon()
-    src.epub.patch("poos.epub", pokemon)
+    src.epub.patch(ptoos_epub, pokemon)
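One note on the logging change: the custom format drops the default `INFO:root:` prefix, so progress lines print bare. A tiny demo:

```python
import logging

# With format="%(message)s", an info line prints as
#   Patching 'poos.epub'.
# instead of the default
#   INFO:root:Patching 'poos.epub'.
logging.basicConfig(format="%(message)s", level=logging.INFO)
logging.info("Patching 'poos.epub'.")
```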


@@ -9,7 +9,9 @@ from typing import List

 POKEMON_CACHE_DIRECTORY = "pokemon"
 BULBAPEDIA_BASE_URL = "https://bulbapedia.bulbagarden.net"
-NATIONAL_INDEX_URL = BULBAPEDIA_BASE_URL + "/wiki/List_of_Pok%C3%A9mon_by_National_Pok%C3%A9dex_number"
+NATIONAL_INDEX_URL = (
+    BULBAPEDIA_BASE_URL + "/wiki/List_of_Pok%C3%A9mon_by_National_Pok%C3%A9dex_number"
+)


 class Pokemon(BaseModel):
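The `Pokemon` model's fields fall outside this hunk; judging from the constructor call and attribute access elsewhere in the file, it is roughly the following (a sketch assuming pydantic, not the actual definition):

```python
from pydantic import BaseModel

# Fields inferred from Pokemon(...) and p.<attr> usage in this file.
class Pokemon(BaseModel):
    name: str
    index: str
    html_url: str
    img_url: str
    html_filepath: str
    img_filepath: str
    json_filepath: str
    description: str = ""  # filled in later by extend_pokemon()
```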
@@ -24,13 +26,13 @@ class Pokemon(BaseModel):


 def download_to_file(url: str, filepath: str, override=False):
-    """ Downloads url into filepath. """
+    """Downloads url into filepath."""
     if os.path.isfile(filepath) and override is False:
         logging.debug(f"'{filepath}' exists.")
         return
     headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:91.0) Gecko/20100101 Firefox/91.0'
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; rv:91.0) Gecko/20100101 Firefox/91.0"
     }
     r = requests.get(url, headers=headers)
     if r.status_code != 200:
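Usage-wise, `download_to_file` is idempotent unless `override` is set; a small sketch, assuming this file is importable as `src.pokemon` (per the entry script's imports) and that the `pokemon/` cache directory exists:

```python
from src.pokemon import NATIONAL_INDEX_URL, download_to_file

# First call fetches and writes the file; the second returns early because
# the file already exists and override stays False.
download_to_file(NATIONAL_INDEX_URL, "pokemon/pokedex.html")
download_to_file(NATIONAL_INDEX_URL, "pokemon/pokedex.html")  # debug-logs "'...' exists."
```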
@@ -45,12 +47,14 @@ def download_to_file(url: str, filepath: str, override=False):


 def get_pokemon() -> List[Pokemon]:
-    """ Scrape Pokemon from the Bulbapedia national dex """
+    """Scrape Pokemon from the Bulbapedia national dex"""
     NATIONAL_INDEX_FILEPATH = os.path.join(POKEMON_CACHE_DIRECTORY, "pokedex.html")
     download_to_file(NATIONAL_INDEX_URL, NATIONAL_INDEX_FILEPATH)
     with open(NATIONAL_INDEX_FILEPATH, "r") as r:
         soup = BeautifulSoup(r, "html.parser")
-    pokemon_list_soup: BeautifulSoup = soup.find(id="List_of_Pokémon_by_National_Pokédex_number").parent
+    pokemon_list_soup: BeautifulSoup = soup.find(
+        id="List_of_Pokémon_by_National_Pokédex_number"
+    ).parent
     generation_soups: BeautifulSoup = pokemon_list_soup.find_next_siblings("h3")

     table_row_soups = []
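The heading-to-sibling traversal above relies on two BeautifulSoup moves; a self-contained illustration with made-up HTML:

```python
from bs4 import BeautifulSoup

# Find an anchor span by id, step up to its parent heading, then walk the
# headings that follow it as siblings -- the same pattern as get_pokemon().
html = """
<h2><span id="List_of_Pokemon"></span>Heading</h2>
<h3>Generation I</h3>
<h3>Generation II</h3>
"""
soup = BeautifulSoup(html, "html.parser")
parent = soup.find(id="List_of_Pokemon").parent
print(parent.name)                                         # h2
print([h.text for h in parent.find_next_siblings("h3")])   # ['Generation I', 'Generation II']
```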
@@ -77,48 +81,58 @@ def get_pokemon() -> List[Pokemon]:
             continue

         index = table_row_soup.find_next("td").next_sibling.next_sibling.text.strip()
-        html_url = BULBAPEDIA_BASE_URL + table_row_soup.find_next("th").next_element.attrs["href"]
+        html_url = (
+            BULBAPEDIA_BASE_URL
+            + table_row_soup.find_next("th").next_element.attrs["href"]
+        )
         img_url = table_row_soup.find("img").attrs["src"]
         html_filepath = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".html")
         img_filepath = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".png")
-        p = Pokemon(name=name,
-                    index=index,
-                    html_url=html_url,
-                    img_url=img_url,
-                    html_filepath=html_filepath,
-                    img_filepath=img_filepath,
-                    json_filepath=json_filepath)
+        p = Pokemon(
+            name=name,
+            index=index,
+            html_url=html_url,
+            img_url=img_url,
+            html_filepath=html_filepath,
+            img_filepath=img_filepath,
+            json_filepath=json_filepath,
+        )
         pokemon.append(p)
         extend_pokemon(p)
-        with open(p.json_filepath, 'w') as f:
+        with open(p.json_filepath, "w") as f:
             f.write(p.json())
         logging.info(f"Saved {p.json_filepath}.")

     # Filter out speculative Pokemon
-    pokemon = [p for p in pokemon if not p.description.startswith("This article's contents will change")]
+    pokemon = [
+        p
+        for p in pokemon
+        if not p.description.startswith("This article's contents will change")
+    ]
     logging.info("Pokemon loaded.")
     return pokemon
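Since each entry is persisted with pydantic's `.json()`, a cached entry could in principle be reloaded without re-scraping; a hypothetical sketch (the script itself has no such reload step, and the path assumes `json_filepath` follows the same `pokemon/<name>.json` convention as the other cache files):

```python
# Hypothetical: reload an entry written by the loop above (pydantic v1 API).
p = Pokemon.parse_file("pokemon/bulbasaur.json")
print(p.name, p.index)
```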

 def extend_pokemon(p: Pokemon):
-    """ Add description and download Pokemon image """
+    """Add description and download Pokemon image"""
     download_to_file(p.html_url, p.html_filepath)
     with open(p.html_filepath, "r") as r:
         soup = BeautifulSoup(r, "html.parser")
-    content_soup: BeautifulSoup = soup.find(id='mw-content-text').contents[0]
+    content_soup: BeautifulSoup = soup.find(id="mw-content-text").contents[0]

     # description
     p_soup = content_soup.find("p")
     description = []
-    while p_soup.name == 'p':
+    while p_soup.name == "p":
         description.append(p_soup.get_text())
         p_soup = p_soup.next_sibling
     p.description = "".join(description)

     # image
-    img_url = content_soup.find("table").find_next_sibling("table").find("img").attrs["src"]
+    img_url = (
+        content_soup.find("table").find_next_sibling("table").find("img").attrs["src"]
+    )
     img_url = img_url.replace("//", "https://")
     p.img_url = img_url
     download_to_file(img_url, p.img_filepath)
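Finally, the `replace("//", "https://")` handles image URLs that are apparently served protocol-relative; a tiny demo with a hypothetical host and path:

```python
# A protocol-relative URL ("//host/path") pinned to https, as above.
img_url = "//archives.bulbagarden.net/media/upload/a/a1/bulbasaur.png"  # hypothetical
print(img_url.replace("//", "https://"))
# https://archives.bulbagarden.net/media/upload/a/a1/bulbasaur.png
```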