Compare commits

...

5 Commits

11 changed files with 24546 additions and 92 deletions

1
.gitignore vendored

@@ -7,6 +7,7 @@ __pycache__/
 pokemon
 tmp
 ptoos.epub
+ptoos-with-links.epub
 
 # C extensions
 *.so

README.md

@@ -5,8 +5,28 @@ to descriptions and pictures of the Pokemon within the e-book itself.
 It works with the epub that you can download from [Daystar Eld's Patreon](https://www.patreon.com/daystareld/).
 
 ## Usage
 
 ```shell
+pip install --user pipenv
+pipenv install
+pipenv shell
 python ptoos-xray.py "DaystarEld - Pokemon The Origin of Species.epub"
 ```
+
+## Run tests
+
+```shell
+pipenv install --dev
+pipenv run pytest
+```
+
+## Compress Pokemon PNGs
+
+Use `pngquant` to compress the PNGs and get a smaller epub file.
+
+## Credits
+
+Full credit for the Pokemon names, images, and descriptions goes to
+[Bulbapedia](https://bulbapedia.bulbagarden.net) under
+[Attribution-NonCommercial-ShareAlike 2.5](https://creativecommons.org/licenses/by-nc-sa/2.5/).

2
pytest.ini Normal file

@@ -0,0 +1,2 @@
+[pytest]
+pythonpath = src
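The `pythonpath = src` option (added in pytest 7) prepends `src` to `sys.path` during collection, which is what lets the new test module below import `pokemon` directly instead of `src.pokemon`. A minimal, hypothetical smoke test to illustrate the effect:

```python
# test/test_import.py (hypothetical): the bare import only works
# because pytest.ini sets `pythonpath = src`.
import pokemon


def test_module_importable():
    assert hasattr(pokemon, "get_pokemon")
```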

src/epub.py

@@ -1,30 +1,46 @@
 import ebooklib
+import logging
 import re
+import sys
+from dataclasses import dataclass
 from bs4 import BeautifulSoup, Tag
 from bs4.element import NavigableString
 from ebooklib import epub
 from src.pokemon import Pokemon
-from typing import List, Dict
+from typing import List, Dict, Optional
 from rich.progress import track
 from rich.console import Console
 
 POKEMON_ID_PREFIX = "pokemon-id-"
+POKEDEX_UID = "np_pokedex"
+
+
+@dataclass
+class AnnoyingPokemon:
+    name_chunks: List[str]
+    length_chunks: int
+    name_in_pokedex: str
+
+
+ANNOYING_POKEMON = [
+    AnnoyingPokemon(["mr", ".", "mime"], 3, "mr. mime"),
+    AnnoyingPokemon(["farfetch", "’", "d"], 3, "farfetch'd"),
+    AnnoyingPokemon(["sirfetch", "’", "d"], 3, "sirfetch'd"),
+]
 
 
 def create_pokedex_chapter(pokemon: List[Pokemon]) -> epub.EpubHtml:
     POKEDEX_TITLE = "Pokedex"
     POKEDEX_FILE = "content/np_pokedex.xhtml"
-    POKEDEX_UID = "np_pokedex"
     chapter = epub.EpubHtml(
         title=POKEDEX_TITLE, file_name=POKEDEX_FILE, uid=POKEDEX_UID
     )
     content = ["<h1>Pokedex</h1>"]
     for p in pokemon:
-        content.append(f'<h2 id="{POKEMON_ID_PREFIX}{p.name.lower()}">{p.name}</h2>')
+        content.append(f'<h2 id="{POKEMON_ID_PREFIX}{p.link_id}">{p.name}</h2>')
         content.append(
-            f' <p><img alt="[Pokemon {p.name}]" src="../{p.img_filepath}"/><br/></p>'
+            f' <p><img alt="[Pokemon {p.name}]" src="../{p.img_filename}"/><br/></p>'
         )
         for paragraph in p.description.split("\n"):
             content.append(f" <p>{paragraph}</p>")
@@ -35,28 +51,58 @@ def create_pokedex_chapter(pokemon: List[Pokemon]) -> epub.EpubHtml:
 def patch_chapter(chapter: epub.EpubHtml, pokemon_lookup: Dict[str, Pokemon]):
-    r = re.compile("([:,.!?“”‘’… ]+)")
+    special_chars_regex = re.compile("([:,.!?“”‘’… ]+)")
     soup: BeautifulSoup = BeautifulSoup(chapter.content, "html.parser")
 
-    def pokemon_name_to_link(key: str, word: str) -> Tag:
+    # Set to remember which Pokemon have already gotten a link for that
+    # chapter.
+    pokemon_added_for_chapter = set()
+
+    def pokemon_to_link(p: Pokemon, name_as_in_book: str) -> Tag:
         tag = soup.new_tag("a")
-        tag.string = word
-        tag.attrs["href"] = f"np_pokedex.xhtml#{POKEMON_ID_PREFIX}{key}"
-        tag.attrs["style"] = "color:black;text-decoration:none"
+        tag.string = name_as_in_book
+        tag.attrs["href"] = f"np_pokedex.xhtml#{POKEMON_ID_PREFIX}{p.link_id}"
+        # tag.attrs["style"] = "color:black;text-decoration:none"
         return tag
 
+    def is_annoying_pokemon(index: int, chunks: List[str]) -> Optional[AnnoyingPokemon]:
+        for p in ANNOYING_POKEMON:
+            if p.name_chunks == list(
+                map(lambda s: s.lower(), chunks[index : index + p.length_chunks])
+            ):
+                return p
+        return None
+
     def patch_string(section: NavigableString) -> List:
         """Replace Pokemon with link to Pokemon; requires splitting up the
         NavigableString into a list of NavigableStrings and Tags."""
         result = [[]]
-        for word in r.split(str(section)):
+        index, chunks = 0, special_chars_regex.split(str(section))
+        while index < len(chunks):
+            word = chunks[index]
+            pokemon: Optional[Pokemon] = None
+            increment: int = 1
+
             if word.lower() in pokemon_lookup:
-                pokemon_lookup[word.lower()].appears_in_book = True
-                link = pokemon_name_to_link(word.lower(), word)
+                pokemon = pokemon_lookup[word.lower()]
+            elif annoying_pokemon := is_annoying_pokemon(index, chunks):
+                pokemon = pokemon_lookup[annoying_pokemon.name_in_pokedex]
+                increment = annoying_pokemon.length_chunks
+
+            if pokemon is not None and pokemon.name in pokemon_added_for_chapter:
+                pokemon = None
+
+            if pokemon is not None:
+                pokemon_added_for_chapter.add(pokemon.name)
+                pokemon.appears_in_book = True
+                name = "".join(chunks[index : index + increment])
+                link = pokemon_to_link(pokemon, name)
                 result.append(link)
                 result.append([])
+                index += increment
             else:
                 result[-1].append(word)
+                index += 1
 
         # convert words back into strings
         for i in range(len(result)):
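The switch from a per-word loop to an index over chunks exists because a few names span several regex chunks. A standalone sketch of what `special_chars_regex.split` produces:

```python
import re

special_chars_regex = re.compile("([:,.!?“”‘’… ]+)")

# A name with a typographic apostrophe is split across three chunks, so a
# single-word lookup can never match it; ANNOYING_POKEMON instead matches
# the chunk sequence ["farfetch", "’", "d"] as one unit.
print(special_chars_regex.split("Farfetch’d flew off"))
# -> ['Farfetch', '’', 'd', ' ', 'flew', ' ', 'off']
```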
@@ -80,16 +126,32 @@ def patch_chapter(chapter: epub.EpubHtml, pokemon_lookup: Dict[str, Pokemon]):
     chapter.content = str(soup)
 
 
-def patch(epub_filepath: str, pokemon: List[Pokemon]):
-    book = epub.read_epub(epub_filepath)
+def get_pokemon_lookup(pokemon: List[Pokemon]) -> Dict[str, Pokemon]:
     pokemon_lookup = {p.name.lower(): p for p in pokemon}
     pokemon_lookup["nidoran"] = pokemon_lookup["nidoran♂"]
     pokemon_lookup["barrierd"] = pokemon_lookup["mr. mime"]
+    return pokemon_lookup
+
+
+def patch(epub_filename: str, pokemon: List[Pokemon]):
+    try:
+        book = epub.read_epub(epub_filename)
+    except Exception:
+        logging.exception("Failed to open epub.")
+        sys.exit(1)
+
+    pokemon_lookup = get_pokemon_lookup(pokemon)
     chapters = [
         b
         for b in book.get_items()
         if isinstance(b, epub.EpubHtml)
         if b.id.startswith("np_")
     ]
+    if [c for c in chapters if c.id == POKEDEX_UID]:
+        logging.warning(f"It looks like '{epub_filename}' already has a Pokedex.")
+        sys.exit(1)
+
     for c in track(chapters, description="Add Pokemon links to chapters"):
         patch_chapter(c, pokemon_lookup)
@@ -103,17 +165,17 @@ def patch(epub_filepath: str, pokemon: List[Pokemon]):
     book.spine.append((chapter.id, "yes"))
 
     for p in pokemon:
-        image_content = open(p.img_filepath, "rb").read()
+        image_content = open(p.img_filename, "rb").read()
         img = epub.EpubItem(
             uid=p.name,
-            file_name=p.img_filepath,
+            file_name=p.img_filename,
             media_type="image/png",
             content=image_content,
         )
         book.add_item(img)
 
     console = Console()
-    epub_out = epub_filepath.replace(".", "-with-links.")
+    epub_out = epub_filename.replace(".", "-with-links.")
     with console.status(f"Writing {epub_out}"):
         epub.write_epub(epub_out, book, {})
     console.print(f"[green]✓[/green] [orange1]{epub_out}[/orange1] written")
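One caveat worth noting: `epub_filename.replace(".", "-with-links.")` rewrites every dot, so the scheme only behaves for names with a single extension dot ("ptoos.epub" becomes "ptoos-with-links.epub"). A more defensive variant (hypothetical, not part of this diff) would split off the extension instead:

```python
import os


def output_name(epub_filename: str) -> str:
    # "ptoos.epub" -> "ptoos-with-links.epub"; unlike str.replace, a name
    # such as "ptoos.v2.epub" keeps its inner dot intact.
    root, ext = os.path.splitext(epub_filename)
    return f"{root}-with-links{ext}"
```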

ptoos-xray.py

@@ -11,11 +11,14 @@ def main():
         level=logging.INFO,
         format="%(message)s",
         datefmt="[%X]",
-        handlers=[RichHandler()],
+        handlers=[RichHandler(rich_tracebacks=True)],
     )
     try:
         ptoos_epub = sys.argv[1]
     except IndexError:
         ptoos_epub = "ptoos.epub"
+        logging.warning(f"No epub file provided. Defaulting to '{ptoos_epub}'.")
     pokemon = src.pokemon.get_pokemon()
+    # for p in pokemon:
+    #     p.img_filename = p.img_filename.replace(".png", "-fs8.png")
     src.epub.patch(ptoos_epub, pokemon)
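The commented-out loop matches pngquant's default output naming: `pngquant foo.png` writes the quantized copy to `foo-fs8.png`. A hypothetical helper connecting the README's "Compress Pokemon PNGs" step to this hook (assumes `pngquant` is on PATH):

```python
import subprocess


def use_compressed_images(pokemon):
    # Quantize each cached PNG with pngquant, then point the Pokemon at
    # the "-fs8.png" copy that pngquant writes next to the original.
    for p in pokemon:
        subprocess.run(["pngquant", "--force", p.img_filename], check=True)
        p.img_filename = p.img_filename.replace(".png", "-fs8.png")
```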

src/pokemon.py

@@ -2,6 +2,7 @@ import requests
 import sys
 import os
 import logging
+import re
 from rich.progress import track
 from pydantic import BaseModel
 from bs4 import BeautifulSoup
@@ -17,20 +18,21 @@ NATIONAL_INDEX_URL = (
 class Pokemon(BaseModel):
     name: str
+    link_id: str
     index: str
     html_url: str
     img_url: str
-    html_filepath: str
-    img_filepath: str
-    json_filepath: str
+    html_filename: str
+    img_filename: str
+    json_filename: str
     description: str = ""
     appears_in_book: bool = False
 
 
-def download_to_file(url: str, filepath: str, override=False):
-    """Downloads url into filepath."""
-    if os.path.isfile(filepath) and override is False:
-        logging.debug(f"'{filepath}' exists.")
+def download_to_file(url: str, filename: str, override=False):
+    """Downloads url into filename."""
+    if os.path.isfile(filename) and override is False:
+        logging.debug(f"'{filename}' exists.")
         return
 
     headers = {
@@ -38,72 +40,91 @@
     }
     r = requests.get(url, headers=headers)
     if r.status_code != 200:
-        logging.warning(f"Could not download '{filepath}'")
-        return
+        logging.critical(f"Could not download '{filename}'.")
+        sys.exit(1)
 
     # Works for text and images
-    with open(filepath, "wb") as f:
+    with open(filename, "wb") as f:
         for c in r:
             f.write(c)
-    logging.debug(f"'{filepath}' downloaded.")
+    logging.debug(f"'{filename}' downloaded.")
 
 
+def download_national_index_html(national_index_filename: str):
+    download_to_file(NATIONAL_INDEX_URL, national_index_filename)
+
+
+def get_pokemon_table_row_soups(national_index_filename: str) -> List[BeautifulSoup]:
+    with open(national_index_filename, "r") as r:
+        soup = BeautifulSoup(r, "html.parser")
+    pokemon_list_soup = soup.find(
+        id="List_of_Pokémon_by_National_Pokédex_number"
+    ).parent
+    generation_soups = pokemon_list_soup.find_next_siblings("h3")
+    table_row_soups = []
+    for generation_soup in generation_soups:
+        table_soup = generation_soup.find_next_sibling("table")
+        tbody_soup = generation_soup.find_next("tbody")
+        # skip first row because it is the header
+        table_row_soups += tbody_soup.find_all("tr", recursive=False)[1:]
+    return table_row_soups
+
+
+def extract_pokemon_from_table_row(table_row_soup: BeautifulSoup) -> Pokemon:
+    name = table_row_soup.find_next("th").next_element.attrs["title"]
+    link_id = re.sub("[^a-z]", "", name.lower())
+
+    # load Pokemon from JSON if it already exists
+    json_filename = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".json")
+    if os.path.isfile(json_filename):
+        p = Pokemon.parse_file(json_filename)
+        logging.debug(f"Loaded '{p.json_filename}'.")
+        return p
+
+    index = table_row_soup.find_next("td").next_sibling.next_sibling.text.strip()
+    html_url = (
+        BULBAPEDIA_BASE_URL + table_row_soup.find_next("th").next_element.attrs["href"]
+    )
+    img_url = table_row_soup.find("img").attrs["src"]
+    html_filename = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".html")
+    img_filename = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".png")
+    return Pokemon(
+        name=name,
+        link_id=link_id,
+        index=index,
+        html_url=html_url,
+        img_url=img_url,
+        html_filename=html_filename,
+        img_filename=img_filename,
+        json_filename=json_filename,
+    )
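The new `link_id` field is the piece shared by the Pokedex anchors and the chapter links; it normalizes away the punctuation that makes some names awkward. A standalone illustration of the `re.sub` call above:

```python
import re


def link_id(name: str) -> str:
    # Same normalization as extract_pokemon_from_table_row: lowercase,
    # then drop every character outside a-z.
    return re.sub("[^a-z]", "", name.lower())


assert link_id("Vulpix") == "vulpix"  # checked by the new tests below
assert link_id("Mr. Mime") == "mrmime"
assert link_id("Farfetch'd") == "farfetchd"
assert link_id("Nidoran♀") == "nidoran"
```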
+
+
 def get_pokemon() -> List[Pokemon]:
     """Scrape Pokemon from the Bulbapedia national dex"""
-    NATIONAL_INDEX_FILEPATH = os.path.join(POKEMON_CACHE_DIRECTORY, "pokedex.html")
-    download_to_file(NATIONAL_INDEX_URL, NATIONAL_INDEX_FILEPATH)
-    with open(NATIONAL_INDEX_FILEPATH, "r") as r:
-        soup = BeautifulSoup(r, "html.parser")
-    pokemon_list_soup: BeautifulSoup = soup.find(
-        id="List_of_Pokémon_by_National_Pokédex_number"
-    ).parent
-    generation_soups: BeautifulSoup = pokemon_list_soup.find_next_siblings("h3")
-    table_row_soups = []
-    for generation_soup in generation_soups:
-        table_soup: BeautifulSoup = generation_soup.find_next_sibling("table")
-        tbody_soup: BeautifulSoup = generation_soup.find_next("tbody")
-        # skip first row because it is the header
-        table_row_soups += tbody_soup.find_all("tr", recursive=False)[1:]
+    if not os.path.isdir(POKEMON_CACHE_DIRECTORY):
+        os.mkdir(POKEMON_CACHE_DIRECTORY)
+
+    national_index_filename = os.path.join(POKEMON_CACHE_DIRECTORY, "pokedex.html")
+    download_national_index_html(national_index_filename)
+    table_row_soups = get_pokemon_table_row_soups(national_index_filename)
 
     pokemon = []
     for table_row_soup in track(table_row_soups, description="Download Pokemon"):
-        name = table_row_soup.find_next("th").next_element.attrs["title"]
+        p = extract_pokemon_from_table_row(table_row_soup)
 
-        # ignore Galarian and Alolan Pokemon so
-        if pokemon and pokemon[-1].name == name:
+        # Ignore Galarian and Alolan Pokemon (Pokemon with the same name)
+        if pokemon and pokemon[-1].name == p.name:
             continue
 
-        # load Pokemon from JSON if it already exists
-        json_filepath = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".json")
-        if os.path.isfile(json_filepath):
-            p = Pokemon.parse_file(json_filepath)
-            pokemon.append(p)
-            logging.debug(f"Loaded {p.json_filepath}.")
-            continue
-
-        index = table_row_soup.find_next("td").next_sibling.next_sibling.text.strip()
-        html_url = (
-            BULBAPEDIA_BASE_URL
-            + table_row_soup.find_next("th").next_element.attrs["href"]
-        )
-        img_url = table_row_soup.find("img").attrs["src"]
-        html_filepath = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".html")
-        img_filepath = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".png")
-        p = Pokemon(
-            name=name,
-            index=index,
-            html_url=html_url,
-            img_url=img_url,
-            html_filepath=html_filepath,
-            img_filepath=img_filepath,
-            json_filepath=json_filepath,
-        )
         pokemon.append(p)
+
+        # Pokemon has already been downloaded
+        if p.description and os.path.isfile(p.img_filename):
+            continue
+
         extend_pokemon(p)
-        with open(p.json_filepath, "w") as f:
+        with open(p.json_filename, "w") as f:
             f.write(p.json())
-        logging.debug(f"Saved {p.json_filepath}.")
+        logging.debug(f"Saved {p.json_filename}.")
 
     # Filter out speculative Pokemon
     pokemon = [
@@ -117,23 +138,26 @@ def get_pokemon() -> List[Pokemon]:
 def extend_pokemon(p: Pokemon):
     """Add description and download Pokemon image"""
-    download_to_file(p.html_url, p.html_filepath)
-    with open(p.html_filepath, "r") as r:
+    download_to_file(p.html_url, p.html_filename)
+    with open(p.html_filename, "r") as r:
         soup = BeautifulSoup(r, "html.parser")
     content_soup: BeautifulSoup = soup.find(id="mw-content-text").contents[0]
 
     # description
-    p_soup = content_soup.find("p")
-    description = []
-    while p_soup.name == "p":
-        description.append(p_soup.get_text())
-        p_soup = p_soup.next_sibling
-    p.description = "".join(description)
+    if not p.description:
+        p_soup = content_soup.find("p")
+        description = []
+        while p_soup.name == "p":
+            description.append(p_soup.get_text())
+            p_soup = p_soup.next_sibling
+        p.description = "".join(description)
 
     # image
-    img_url = (
-        content_soup.find("table").find_next_sibling("table").find("img").attrs["src"]
-    )
-    img_url = img_url.replace("//", "https://")
-    p.img_url = img_url
-    download_to_file(img_url, p.img_filepath)
+    if not os.path.isfile(p.img_filename):
+        img_url = (
+            content_soup.find("table")
+            .find_next_sibling("table")
+            .find("img")
+            .attrs["src"]
+        )
+        img_url = img_url.replace("//", "https://")
+        p.img_url = img_url
+        download_to_file(img_url, p.img_filename)
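Net effect of the refactor: `get_pokemon()` now caches per Pokemon at three levels (parsed JSON, downloaded HTML, and the PNG image), and `extend_pokemon` only re-parses or re-downloads what is missing. A minimal usage sketch (assuming network access and the repo root as the working directory):

```python
import src.pokemon

# The first run scrapes Bulbapedia into POKEMON_CACHE_DIRECTORY; later
# runs are served from the cached <name>.json / .html / .png files.
pokemon = src.pokemon.get_pokemon()
vulpix = next(p for p in pokemon if p.name == "Vulpix")
print(vulpix.index)         # "#037", per the new tests below
print(vulpix.img_filename)  # "pokemon/vulpix.png" if the cache dir is "pokemon"
```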

12149
test/pokedex.html Normal file

File diff suppressed because one or more lines are too long

0
test/test_epub.py Normal file

12149
test/test_pokedex.html Normal file

File diff suppressed because one or more lines are too long

44
test/test_pokemon.py Normal file

@@ -0,0 +1,44 @@
+import pokemon
+import os
+import filecmp
+
+
+def test_download_national_index_html(tmp_path):
+    pokemon_html = tmp_path / "pokedex.html"
+    pokemon.download_national_index_html(pokemon_html)
+    assert os.path.getsize(pokemon_html) > 500000
+
+
+def test_get_pokemon_table_row_soups():
+    national_index = "test/test_pokedex.html"
+    row_soups = pokemon.get_pokemon_table_row_soups(national_index)
+    assert len(row_soups) == 994
+
+
+def test_extract_pokemon_from_table_row(tmp_path):
+    national_index = "test/test_pokedex.html"
+    pokemon.POKEMON_CACHE_DIRECTORY = tmp_path
+    row_soups = pokemon.get_pokemon_table_row_soups(national_index)
+    p = pokemon.extract_pokemon_from_table_row(row_soups[42])
+    assert p.name == "Vulpix"
+    assert p.link_id == "vulpix"
+    assert p.index == "#037"
+    assert p.html_url == "https://bulbapedia.bulbagarden.net/wiki/Vulpix_(Pok%C3%A9mon)"
+    assert (
+        p.img_url
+        == "//archives.bulbagarden.net/media/upload/thumb/3/35/037Vulpix-Alola.png/70px-037Vulpix-Alola.png"
+    )
+    assert p.img_filename.endswith("vulpix.png")
+    assert p.json_filename.endswith("vulpix.json")
+    assert p.description == ""
+    assert p.appears_in_book == False
+
+
+def test_extend_pokemon(tmp_path):
+    national_index = "test/test_pokedex.html"
+    row_soups = pokemon.get_pokemon_table_row_soups(national_index)
+    p = pokemon.extract_pokemon_from_table_row(row_soups[42])
+    p.img_filename = tmp_path / "vulpix.png"
+    pokemon.extend_pokemon(p)
+    assert filecmp.cmp(p.img_filename, "test/test_vulpix.png")
+    assert p.description.startswith("Vulpix (Japanese: \u30ed\u30b3\u30f3 Rokon)")

BIN
test/test_vulpix.png Normal file

Binary file not shown.

Size: 52 KiB