Compare commits

..

5 Commits

11 changed files with 24546 additions and 92 deletions

1
.gitignore vendored
View File

@@ -7,6 +7,7 @@ __pycache__/
pokemon pokemon
tmp tmp
ptoos.epub ptoos.epub
ptoos-with-links.epub
# C extensions # C extensions
*.so *.so

View File

@@ -5,8 +5,28 @@ to descriptions and pictures of the Pokemon within the e-book itself.
It works with the epub that you can download from [Daystar Eld's Patreon](https://www.patreon.com/daystareld/). It works with the epub that you can download from [Daystar Eld's Patreon](https://www.patreon.com/daystareld/).
## Usage
```shell ```shell
pip install --user pipenv
pipenv install pipenv install
pipenv shell pipenv shell
python ptoos-xray.py "DaystarEld - Pokemon The Origin of Species.epub" python ptoos-xray.py "DaystarEld - Pokemon The Origin of Species.epub"
``` ```
## Run tests
```shell
pipenv install --dev
pipenv run pytest
```
## Compress Pokemon PNGs
Use `pngquant` to compress the PNGs and get a smaller epub file.
## Credits
Full credit for the Pokemon names, images, and descriptions goes to
[Bulbapedia](https://bulbapedia.bulbagarden.net) under
[Attribution-NonCommercial-ShareAlike 2.5](https://creativecommons.org/licenses/by-nc-sa/2.5/).

2
pytest.ini Normal file
View File

@@ -0,0 +1,2 @@
[pytest]
pythonpath = src

View File

@@ -1,30 +1,46 @@
import ebooklib import ebooklib
import logging import logging
import re import re
import sys
from dataclasses import dataclass
from bs4 import BeautifulSoup, Tag from bs4 import BeautifulSoup, Tag
from bs4.element import NavigableString from bs4.element import NavigableString
from ebooklib import epub from ebooklib import epub
from src.pokemon import Pokemon from src.pokemon import Pokemon
from typing import List, Dict from typing import List, Dict, Optional
from rich.progress import track from rich.progress import track
from rich.console import Console from rich.console import Console
POKEMON_ID_PREFIX = "pokemon-id-" POKEMON_ID_PREFIX = "pokemon-id-"
POKEDEX_UID = "np_pokedex"
@dataclass
class AnnoyingPokemon:
    """A Pokemon whose name is torn apart by the special-character split.

    The chapter text is split on runs of punctuation/whitespace, so names
    containing such characters arrive as several consecutive chunks instead of
    one word and need to be recognised as a chunk sequence.
    """

    # Lower-cased chunks exactly as the split produces them (the matcher
    # lower-cases the book text before comparing): word, separator run, word.
    name_chunks: List[str]
    # Number of consecutive chunks the name occupies in the split text.
    length_chunks: int
    # Key under which the Pokemon is stored in the Pokemon lookup.
    name_in_pokedex: str


# The separator chunk is the full captured run of special characters:
# "Mr. Mime" splits into "Mr", ". ", "Mime" and "Farfetch’d" (typographic
# apostrophe) into "Farfetch", "’", "d". A plain ASCII apostrophe is not a
# split character, so "farfetch'd" stays one word and needs no entry here.
ANNOYING_POKEMON = [
    AnnoyingPokemon(["mr", ". ", "mime"], 3, "mr. mime"),
    AnnoyingPokemon(["farfetch", "’", "d"], 3, "farfetch'd"),
    AnnoyingPokemon(["sirfetch", "’", "d"], 3, "sirfetch'd"),
]
def create_pokedex_chapter(pokemon: List[Pokemon]) -> epub.EpubHtml: def create_pokedex_chapter(pokemon: List[Pokemon]) -> epub.EpubHtml:
POKEDEX_TITLE = "Pokedex" POKEDEX_TITLE = "Pokedex"
POKEDEX_FILE = "content/np_pokedex.xhtml" POKEDEX_FILE = "content/np_pokedex.xhtml"
POKEDEX_UID = "np_pokedex"
chapter = epub.EpubHtml( chapter = epub.EpubHtml(
title=POKEDEX_TITLE, file_name=POKEDEX_FILE, uid=POKEDEX_UID title=POKEDEX_TITLE, file_name=POKEDEX_FILE, uid=POKEDEX_UID
) )
content = ["<h1>Pokedex</h1>"] content = ["<h1>Pokedex</h1>"]
for p in pokemon: for p in pokemon:
content.append(f'<h2 id="{POKEMON_ID_PREFIX}{p.name.lower()}">{p.name}</h2>') content.append(f'<h2 id="{POKEMON_ID_PREFIX}{p.link_id}">{p.name}</h2>')
content.append( content.append(
f' <p><img alt="[Pokemon {p.name}]" src="../{p.img_filepath}"/><br/></p>' f' <p><img alt="[Pokemon {p.name}]" src="../{p.img_filename}"/><br/></p>'
) )
for paragraph in p.description.split("\n"): for paragraph in p.description.split("\n"):
content.append(f" <p>{paragraph}</p>") content.append(f" <p>{paragraph}</p>")
@@ -35,28 +51,58 @@ def create_pokedex_chapter(pokemon: List[Pokemon]) -> epub.EpubHtml:
def patch_chapter(chapter: epub.EpubHtml, pokemon_lookup: Dict[str, Pokemon]): def patch_chapter(chapter: epub.EpubHtml, pokemon_lookup: Dict[str, Pokemon]):
r = re.compile("([:,.!?“”‘’… ]+)") special_chars_regex = re.compile("([:,.!?“”‘’… ]+)")
soup: BeautifulSoup = BeautifulSoup(chapter.content, "html.parser") soup: BeautifulSoup = BeautifulSoup(chapter.content, "html.parser")
def pokemon_name_to_link(key: str, word: str) -> Tag: # Set to remember which Pokemon have already gotten a link for that
# chapter.
pokemon_added_for_chapter = set()
def pokemon_to_link(p: Pokemon, name_as_in_book: str) -> Tag:
tag = soup.new_tag("a") tag = soup.new_tag("a")
tag.string = word tag.string = name_as_in_book
tag.attrs["href"] = f"np_pokedex.xhtml#{POKEMON_ID_PREFIX}{key}" tag.attrs["href"] = f"np_pokedex.xhtml#{POKEMON_ID_PREFIX}{p.link_id}"
tag.attrs["style"] = "color:black;text-decoration:none" # tag.attrs["style"] = "color:black;text-decoration:none"
return tag return tag
def is_annoying_pokemon(index: int, chunks: List[str]) -> Optional[AnnoyingPokemon]:
for p in ANNOYING_POKEMON:
if p.name_chunks == list(
map(lambda s: s.lower(), chunks[index : index + p.length_chunks])
):
return p
return None
def patch_string(section: NavigableString) -> List: def patch_string(section: NavigableString) -> List:
"""Replace Pokemon with link to Pokemon; requires splitting up the """Replace Pokemon with link to Pokemon; requires splitting up the
NavigableString into a list of NavigableStrings and Tags.""" NavigableString into a list of NavigableStrings and Tags."""
result = [[]] result = [[]]
for word in r.split(str(section)): index, chunks = 0, special_chars_regex.split(str(section))
while index < len(chunks):
word = chunks[index]
pokemon: Optional[Pokemon] = None
increment: int = 1
if word.lower() in pokemon_lookup: if word.lower() in pokemon_lookup:
pokemon_lookup[word.lower()].appears_in_book = True pokemon = pokemon_lookup[word.lower()]
link = pokemon_name_to_link(word.lower(), word) elif annoying_pokemon := is_annoying_pokemon(index, chunks):
pokemon = pokemon_lookup[annoying_pokemon.name_in_pokedex]
increment = annoying_pokemon.length_chunks
if pokemon is not None and pokemon.name in pokemon_added_for_chapter:
pokemon = None
if pokemon is not None:
pokemon_added_for_chapter.add(pokemon.name)
pokemon.appears_in_book = True
name = "".join(chunks[index : index + increment])
link = pokemon_to_link(pokemon, name)
result.append(link) result.append(link)
result.append([]) result.append([])
index += increment
else: else:
result[-1].append(word) result[-1].append(word)
index += 1
# convert words back into strings # convert words back into strings
for i in range(len(result)): for i in range(len(result)):
@@ -80,16 +126,32 @@ def patch_chapter(chapter: epub.EpubHtml, pokemon_lookup: Dict[str, Pokemon]):
chapter.content = str(soup) chapter.content = str(soup)
def patch(epub_filepath: str, pokemon: List[Pokemon]): def get_pokemon_lookup(pokemon: List[Pokemon]) -> Dict[str, Pokemon]:
book = epub.read_epub(epub_filepath)
pokemon_lookup = {p.name.lower(): p for p in pokemon} pokemon_lookup = {p.name.lower(): p for p in pokemon}
pokemon_lookup["nidoran"] = pokemon_lookup["nidoran♂"]
pokemon_lookup["barrierd"] = pokemon_lookup["mr. mime"]
return pokemon_lookup
def patch(epub_filename: str, pokemon: List[Pokemon]):
try:
book = epub.read_epub(epub_filename)
except Exception:
logging.exception("Failed to open epub.")
sys.exit(1)
pokemon_lookup = get_pokemon_lookup(pokemon)
chapters = [ chapters = [
b b
for b in book.get_items() for b in book.get_items()
if isinstance(b, epub.EpubHtml) if isinstance(b, epub.EpubHtml)
if b.id.startswith("np_") if b.id.startswith("np_")
] ]
if [c for c in chapters if c.id == POKEDEX_UID]:
logging.warning(f"It looks like '{epub_filename}' already has a Pokedex.")
sys.exit(1)
for c in track(chapters, description="Add Pokemon links to chapters"): for c in track(chapters, description="Add Pokemon links to chapters"):
patch_chapter(c, pokemon_lookup) patch_chapter(c, pokemon_lookup)
@@ -103,17 +165,17 @@ def patch(epub_filepath: str, pokemon: List[Pokemon]):
book.spine.append((chapter.id, "yes")) book.spine.append((chapter.id, "yes"))
for p in pokemon: for p in pokemon:
image_content = open(p.img_filepath, "rb").read() image_content = open(p.img_filename, "rb").read()
img = epub.EpubItem( img = epub.EpubItem(
uid=p.name, uid=p.name,
file_name=p.img_filepath, file_name=p.img_filename,
media_type="image/png", media_type="image/png",
content=image_content, content=image_content,
) )
book.add_item(img) book.add_item(img)
console = Console() console = Console()
epub_out = epub_filepath.replace(".", "-with-links.") epub_out = epub_filename.replace(".", "-with-links.")
with console.status(f"Writing {epub_out}"): with console.status(f"Writing {epub_out}"):
epub.write_epub(epub_out, book, {}) epub.write_epub(epub_out, book, {})
console.print(f"[green]✓[/green] [orange1]{epub_out}[/orange1] written") console.print(f"[green]✓[/green] [orange1]{epub_out}[/orange1] written")

View File

@@ -11,11 +11,14 @@ def main():
level=logging.INFO, level=logging.INFO,
format="%(message)s", format="%(message)s",
datefmt="[%X]", datefmt="[%X]",
handlers=[RichHandler()], handlers=[RichHandler(rich_tracebacks=True)],
) )
try: try:
ptoos_epub = sys.argv[1] ptoos_epub = sys.argv[1]
except IndexError: except IndexError:
ptoos_epub = "ptoos.epub" ptoos_epub = "ptoos.epub"
logging.warning(f"No epub file provided. Defaulting to '{ptoos_epub}'.")
pokemon = src.pokemon.get_pokemon() pokemon = src.pokemon.get_pokemon()
# for p in pokemon:
# p.img_filename = p.img_filename.replace(".png", "-fs8.png")
src.epub.patch(ptoos_epub, pokemon) src.epub.patch(ptoos_epub, pokemon)

View File

@@ -2,6 +2,7 @@ import requests
import sys import sys
import os import os
import logging import logging
import re
from rich.progress import track from rich.progress import track
from pydantic import BaseModel from pydantic import BaseModel
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
@@ -17,20 +18,21 @@ NATIONAL_INDEX_URL = (
class Pokemon(BaseModel): class Pokemon(BaseModel):
name: str name: str
link_id: str
index: str index: str
html_url: str html_url: str
img_url: str img_url: str
html_filepath: str html_filename: str
img_filepath: str img_filename: str
json_filepath: str json_filename: str
description: str = "" description: str = ""
appears_in_book: bool = False appears_in_book: bool = False
def download_to_file(url: str, filepath: str, override=False): def download_to_file(url: str, filename: str, override=False):
"""Downloads url into filepath.""" """Downloads url into filename."""
if os.path.isfile(filepath) and override is False: if os.path.isfile(filename) and override is False:
logging.debug(f"'{filepath}' exists.") logging.debug(f"'{filename}' exists.")
return return
headers = { headers = {
@@ -38,72 +40,91 @@ def download_to_file(url: str, filepath: str, override=False):
} }
r = requests.get(url, headers=headers) r = requests.get(url, headers=headers)
if r.status_code != 200: if r.status_code != 200:
logging.warning(f"Could not download '{filepath}'") logging.critical(f"Could not download '{filename}'.")
return sys.exit(1)
# Works for text and images # Works for text and images
with open(filepath, "wb") as f: with open(filename, "wb") as f:
for c in r: for c in r:
f.write(c) f.write(c)
logging.debug(f"'{filepath}' downloaded.") logging.debug(f"'{filename}' downloaded.")
def download_national_index_html(national_index_filename: str):
    """Store the national index page at `national_index_filename`.

    Delegates to `download_to_file`, which leaves an already existing file
    untouched.
    """
    download_to_file(url=NATIONAL_INDEX_URL, filename=national_index_filename)
def get_pokemon_table_row_soups(national_index_filename: str) -> List[BeautifulSoup]:
    """Return the table rows of all Pokemon listed in the national index page.

    The page is expected to contain a headline with the id
    "List_of_Pokémon_by_National_Pokédex_number" followed by one <h3> per
    generation, each followed by a table with one row per Pokemon.

    :param national_index_filename: path of the saved national index HTML.
    :return: all rows of all generation tables, header rows excluded.
    """
    # The headline id looked up below contains non-ASCII characters, so do not
    # rely on the platform default encoding when reading the file.
    with open(national_index_filename, "r", encoding="utf-8") as r:
        soup = BeautifulSoup(r, "html.parser")
    pokemon_list_soup = soup.find(
        id="List_of_Pokémon_by_National_Pokédex_number"
    ).parent
    generation_soups = pokemon_list_soup.find_next_siblings("h3")
    table_row_soups = []
    for generation_soup in generation_soups:
        tbody_soup = generation_soup.find_next("tbody")
        # skip first row because it is the header
        table_row_soups += tbody_soup.find_all("tr", recursive=False)[1:]
    return table_row_soups
def extract_pokemon_from_table_row(table_row_soup: BeautifulSoup) -> Pokemon:
    """Build a Pokemon from one row of a national index table.

    If a JSON file for the Pokemon already exists in the cache directory the
    Pokemon is loaded from there; otherwise it is created from the row with an
    empty description.

    :param table_row_soup: a <tr> of one of the generation tables.
    :return: the cached or newly created Pokemon.
    """
    # The first element inside the <th> is the link to the Pokemon's page; it
    # carries both the name (title) and the relative URL (href).
    name_link = table_row_soup.find_next("th").next_element
    name = name_link.attrs["title"]

    # The link id is used as an HTML anchor and must be unique per Pokemon.
    # Keep digits and spell out the gender symbols so that names which differ
    # only in those characters (Nidoran♀ / Nidoran♂, or e.g. Porygon /
    # Porygon2) do not collapse to the same id.
    normalized_name = name.lower().replace("♀", "f").replace("♂", "m")
    link_id = re.sub("[^a-z0-9]", "", normalized_name)

    # load Pokemon from JSON if it already exists
    json_filename = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".json")
    if os.path.isfile(json_filename):
        p = Pokemon.parse_file(json_filename)
        logging.debug(f"Loaded '{p.json_filename}'.")
        return p

    index = table_row_soup.find_next("td").next_sibling.next_sibling.text.strip()
    html_url = BULBAPEDIA_BASE_URL + name_link.attrs["href"]
    img_url = table_row_soup.find("img").attrs["src"]
    html_filename = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".html")
    img_filename = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".png")
    return Pokemon(
        name=name,
        link_id=link_id,
        index=index,
        html_url=html_url,
        img_url=img_url,
        html_filename=html_filename,
        img_filename=img_filename,
        json_filename=json_filename,
    )
def get_pokemon() -> List[Pokemon]: def get_pokemon() -> List[Pokemon]:
"""Scrape Pokemon from the Bulbapedia national dex""" """Scrape Pokemon from the Bulbapedia national dex"""
NATIONAL_INDEX_FILEPATH = os.path.join(POKEMON_CACHE_DIRECTORY, "pokedex.html") if not os.path.isdir(POKEMON_CACHE_DIRECTORY):
download_to_file(NATIONAL_INDEX_URL, NATIONAL_INDEX_FILEPATH) os.mkdir(POKEMON_CACHE_DIRECTORY)
with open(NATIONAL_INDEX_FILEPATH, "r") as r: national_index_filename = os.path.join(POKEMON_CACHE_DIRECTORY, "pokedex.html")
soup = BeautifulSoup(r, "html.parser") download_national_index_html(national_index_filename)
pokemon_list_soup: BeautifulSoup = soup.find( table_row_soups = get_pokemon_table_row_soups(national_index_filename)
id="List_of_Pokémon_by_National_Pokédex_number"
).parent
generation_soups: BeautifulSoup = pokemon_list_soup.find_next_siblings("h3")
table_row_soups = []
for generation_soup in generation_soups:
table_soup: BeautifulSoup = generation_soup.find_next_sibling("table")
tbody_soup: BeautifulSoup = generation_soup.find_next("tbody")
# skip first row because it is the header
table_row_soups += tbody_soup.find_all("tr", recursive=False)[1:]
pokemon = [] pokemon = []
for table_row_soup in track(table_row_soups, description="Download Pokemon"): for table_row_soup in track(table_row_soups, description="Download Pokemon"):
name = table_row_soup.find_next("th").next_element.attrs["title"] p = extract_pokemon_from_table_row(table_row_soup)
# ignore Galarian and Alolan Pokemon so # Ignore Galarian and Alolan Pokemon (Pokemon with the same name)
if pokemon and pokemon[-1].name == name: if pokemon and pokemon[-1].name == p.name:
continue continue
# load Pokemon from JSON if it already exists
json_filepath = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".json")
if os.path.isfile(json_filepath):
p = Pokemon.parse_file(json_filepath)
pokemon.append(p)
logging.debug(f"Loaded {p.json_filepath}.")
continue
index = table_row_soup.find_next("td").next_sibling.next_sibling.text.strip()
html_url = (
BULBAPEDIA_BASE_URL
+ table_row_soup.find_next("th").next_element.attrs["href"]
)
img_url = table_row_soup.find("img").attrs["src"]
html_filepath = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".html")
img_filepath = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".png")
p = Pokemon(
name=name,
index=index,
html_url=html_url,
img_url=img_url,
html_filepath=html_filepath,
img_filepath=img_filepath,
json_filepath=json_filepath,
)
pokemon.append(p) pokemon.append(p)
# Pokemon has already been downloaded
if p.description and os.path.isfile(p.img_filename):
continue
extend_pokemon(p) extend_pokemon(p)
with open(p.json_filepath, "w") as f: with open(p.json_filename, "w") as f:
f.write(p.json()) f.write(p.json())
logging.debug(f"Saved {p.json_filepath}.") logging.debug(f"Saved {p.json_filename}.")
# Filter out speculative Pokemon # Filter out speculative Pokemon
pokemon = [ pokemon = [
@@ -117,23 +138,26 @@ def get_pokemon() -> List[Pokemon]:
def extend_pokemon(p: Pokemon): def extend_pokemon(p: Pokemon):
"""Add description and download Pokemon image""" """Add description and download Pokemon image"""
download_to_file(p.html_url, p.html_filepath) download_to_file(p.html_url, p.html_filename)
with open(p.html_filepath, "r") as r: with open(p.html_filename, "r") as r:
soup = BeautifulSoup(r, "html.parser") soup = BeautifulSoup(r, "html.parser")
content_soup: BeautifulSoup = soup.find(id="mw-content-text").contents[0] content_soup: BeautifulSoup = soup.find(id="mw-content-text").contents[0]
# description if not p.description:
p_soup = content_soup.find("p") p_soup = content_soup.find("p")
description = [] description = []
while p_soup.name == "p": while p_soup.name == "p":
description.append(p_soup.get_text()) description.append(p_soup.get_text())
p_soup = p_soup.next_sibling p_soup = p_soup.next_sibling
p.description = "".join(description) p.description = "".join(description)
# image if not os.path.isfile(p.img_filename):
img_url = ( img_url = (
content_soup.find("table").find_next_sibling("table").find("img").attrs["src"] content_soup.find("table")
) .find_next_sibling("table")
img_url = img_url.replace("//", "https://") .find("img")
p.img_url = img_url .attrs["src"]
download_to_file(img_url, p.img_filepath) )
img_url = img_url.replace("//", "https://")
p.img_url = img_url
download_to_file(img_url, p.img_filename)

12149
test/pokedex.html Normal file

File diff suppressed because one or more lines are too long

0
test/test_epub.py Normal file
View File

12149
test/test_pokedex.html Normal file

File diff suppressed because one or more lines are too long

44
test/test_pokemon.py Normal file
View File

@@ -0,0 +1,44 @@
import pokemon
import os
import filecmp
def test_download_national_index_html(tmp_path):
    """Downloading the national index produces a sizeable file."""
    target = tmp_path / "pokedex.html"
    pokemon.download_national_index_html(target)
    size_in_bytes = os.path.getsize(target)
    assert size_in_bytes > 500000
def test_get_pokemon_table_row_soups():
    """The stored national index fixture yields the expected number of rows."""
    rows = pokemon.get_pokemon_table_row_soups("test/test_pokedex.html")
    assert len(rows) == 994
def test_extract_pokemon_from_table_row(tmp_path, monkeypatch):
    """A table row of the fixture index is turned into the expected Pokemon."""
    national_index = "test/test_pokedex.html"
    # Redirect the cache into the temporary directory. monkeypatch restores the
    # module global afterwards so the change cannot leak into other tests.
    monkeypatch.setattr(pokemon, "POKEMON_CACHE_DIRECTORY", str(tmp_path))
    row_soups = pokemon.get_pokemon_table_row_soups(national_index)
    p = pokemon.extract_pokemon_from_table_row(row_soups[42])
    assert p.name == "Vulpix"
    assert p.link_id == "vulpix"
    assert p.index == "#037"
    assert p.html_url == "https://bulbapedia.bulbagarden.net/wiki/Vulpix_(Pok%C3%A9mon)"
    assert (
        p.img_url
        == "//archives.bulbagarden.net/media/upload/thumb/3/35/037Vulpix-Alola.png/70px-037Vulpix-Alola.png"
    )
    assert p.img_filename.endswith("vulpix.png")
    assert p.json_filename.endswith("vulpix.json")
    assert p.description == ""
    assert p.appears_in_book is False


def test_extend_pokemon(tmp_path, monkeypatch):
    """Extending a Pokemon downloads its image and fills in the description."""
    national_index = "test/test_pokedex.html"
    # Use an empty, test-local cache: no previously cached JSON can be picked
    # up, and the HTML page and image are downloaded into tmp_path.
    monkeypatch.setattr(pokemon, "POKEMON_CACHE_DIRECTORY", str(tmp_path))
    row_soups = pokemon.get_pokemon_table_row_soups(national_index)
    p = pokemon.extract_pokemon_from_table_row(row_soups[42])
    pokemon.extend_pokemon(p)
    assert filecmp.cmp(p.img_filename, "test/test_vulpix.png")
    assert p.description.startswith("Vulpix (Japanese: \u30ed\u30b3\u30f3 Rokon)")

BIN
test/test_vulpix.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 52 KiB