Compare commits
5 Commits
bc962bd419
...
3634f10e81
Author | SHA1 | Date |
---|---|---|
Felix Martin | 3634f10e81 | |
felixm | 808ab57ea9 | |
felixm | 7d9209d52e | |
Felix Martin | 9c200a1246 | |
Felix Martin | d224776a9a |
|
@ -7,6 +7,7 @@ __pycache__/
|
|||
pokemon
|
||||
tmp
|
||||
ptoos.epub
|
||||
ptoos-with-links.epub
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
|
20
README.md
20
README.md
|
@ -5,8 +5,28 @@ to descriptions and pictures of the Pokemon within the e-book itself.
|
|||
|
||||
It works with the epub that you can download from [Daystar Eld's Patreon](https://www.patreon.com/daystareld/).
|
||||
|
||||
## Usage
|
||||
|
||||
```shell
|
||||
pip install --user pipenv
|
||||
pipenv install
|
||||
pipenv shell
|
||||
python ptoos-xray.py "DaystarEld - Pokemon The Origin of Species.epub"
|
||||
```
|
||||
|
||||
## Run tests
|
||||
|
||||
```shell
|
||||
pipenv install --dev
|
||||
pipenv run pytest
|
||||
```
|
||||
|
||||
## Compress Pokemon PNGs
|
||||
|
||||
Use `pngquant` to compress the PNGs and get a smaller epub file.
|
||||
|
||||
## Credits
|
||||
|
||||
Full credit for the Pokemon names, images, and descriptions goes to
|
||||
[Bulbapedia](https://bulbapedia.bulbagarden.net), whose content is licensed under
|
||||
[Creative Commons Attribution-NonCommercial-ShareAlike 2.5](https://creativecommons.org/licenses/by-nc-sa/2.5/).
|
||||
|
|
|
@ -0,0 +1,2 @@
|
|||
[pytest]
|
||||
pythonpath = src
|
98
src/epub.py
98
src/epub.py
|
@ -1,30 +1,46 @@
|
|||
import ebooklib
|
||||
import logging
|
||||
import re
|
||||
import sys
|
||||
from dataclasses import dataclass
|
||||
from bs4 import BeautifulSoup, Tag
|
||||
from bs4.element import NavigableString
|
||||
from ebooklib import epub
|
||||
from src.pokemon import Pokemon
|
||||
from typing import List, Dict
|
||||
from typing import List, Dict, Optional
|
||||
from rich.progress import track
|
||||
from rich.console import Console
|
||||
|
||||
POKEMON_ID_PREFIX = "pokemon-id-"
|
||||
POKEDEX_UID = "np_pokedex"
|
||||
|
||||
|
||||
@dataclass
|
||||
class AnnoyingPokemon:
|
||||
name_chunks: List[str]
|
||||
length_chunks: int
|
||||
name_in_pokedex: str
|
||||
|
||||
|
||||
ANNOYING_POKEMON = [
|
||||
AnnoyingPokemon(["Mr", ".", "Mime"], 3, "mr. mime"),
|
||||
AnnoyingPokemon(["farfetch", "’", "d"], 3, "farfetch'd"),
|
||||
AnnoyingPokemon(["sirfetch", "’", "d"], 3, "sirfetch'd"),
|
||||
]
|
||||
|
||||
|
||||
def create_pokedex_chapter(pokemon: List[Pokemon]) -> epub.EpubHtml:
|
||||
POKEDEX_TITLE = "Pokedex"
|
||||
POKEDEX_FILE = "content/np_pokedex.xhtml"
|
||||
POKEDEX_UID = "np_pokedex"
|
||||
chapter = epub.EpubHtml(
|
||||
title=POKEDEX_TITLE, file_name=POKEDEX_FILE, uid=POKEDEX_UID
|
||||
)
|
||||
content = ["<h1>Pokedex</h1>"]
|
||||
|
||||
for p in pokemon:
|
||||
content.append(f'<h2 id="{POKEMON_ID_PREFIX}{p.name.lower()}">{p.name}</h2>')
|
||||
content.append(f'<h2 id="{POKEMON_ID_PREFIX}{p.link_id}">{p.name}</h2>')
|
||||
content.append(
|
||||
f' <p><img alt="[Pokemon {p.name}]" src="../{p.img_filepath}"/><br/></p>'
|
||||
f' <p><img alt="[Pokemon {p.name}]" src="../{p.img_filename}"/><br/></p>'
|
||||
)
|
||||
for paragraph in p.description.split("\n"):
|
||||
content.append(f" <p>{paragraph}</p>")
|
||||
|
@ -35,28 +51,58 @@ def create_pokedex_chapter(pokemon: List[Pokemon]) -> epub.EpubHtml:
|
|||
|
||||
|
||||
def patch_chapter(chapter: epub.EpubHtml, pokemon_lookup: Dict[str, Pokemon]):
|
||||
r = re.compile("([:,.!?“”‘’… ]+)")
|
||||
special_chars_regex = re.compile("([:,.!?“”‘’… ]+)")
|
||||
soup: BeautifulSoup = BeautifulSoup(chapter.content, "html.parser")
|
||||
|
||||
def pokemon_name_to_link(key: str, word: str) -> Tag:
|
||||
# Set to remember which Pokemon have already gotten a link for that
|
||||
# chapter.
|
||||
pokemon_added_for_chapter = set()
|
||||
|
||||
def pokemon_to_link(p: Pokemon, name_as_in_book: str) -> Tag:
|
||||
tag = soup.new_tag("a")
|
||||
tag.string = word
|
||||
tag.attrs["href"] = f"np_pokedex.xhtml#{POKEMON_ID_PREFIX}{key}"
|
||||
tag.attrs["style"] = "color:black;text-decoration:none"
|
||||
tag.string = name_as_in_book
|
||||
tag.attrs["href"] = f"np_pokedex.xhtml#{POKEMON_ID_PREFIX}{p.link_id}"
|
||||
# tag.attrs["style"] = "color:black;text-decoration:none"
|
||||
return tag
|
||||
|
||||
def is_annoying_pokemon(index: int, chunks: List[str]) -> Optional[AnnoyingPokemon]:
|
||||
for p in ANNOYING_POKEMON:
|
||||
if p.name_chunks == list(
|
||||
map(lambda s: s.lower(), chunks[index : index + p.length_chunks])
|
||||
):
|
||||
return p
|
||||
return None
|
||||
|
||||
def patch_string(section: NavigableString) -> List:
|
||||
"""Replace Pokemon with link to Pokemon; requires splitting up the
|
||||
NavigableString into a list of NavigableStrings and Tags."""
|
||||
result = [[]]
|
||||
for word in r.split(str(section)):
|
||||
index, chunks = 0, special_chars_regex.split(str(section))
|
||||
while index < len(chunks):
|
||||
word = chunks[index]
|
||||
pokemon: Optional[Pokemon] = None
|
||||
increment: int = 1
|
||||
|
||||
if word.lower() in pokemon_lookup:
|
||||
pokemon_lookup[word.lower()].appears_in_book = True
|
||||
link = pokemon_name_to_link(word.lower(), word)
|
||||
pokemon = pokemon_lookup[word.lower()]
|
||||
elif annoying_pokemon := is_annoying_pokemon(index, chunks):
|
||||
pokemon = pokemon_lookup[annoying_pokemon.name_in_pokedex]
|
||||
increment = annoying_pokemon.length_chunks
|
||||
|
||||
if pokemon is not None and pokemon.name in pokemon_added_for_chapter:
|
||||
pokemon = None
|
||||
|
||||
if pokemon is not None:
|
||||
pokemon_added_for_chapter.add(pokemon.name)
|
||||
pokemon.appears_in_book = True
|
||||
name = "".join(chunks[index : index + increment])
|
||||
link = pokemon_to_link(pokemon, name)
|
||||
result.append(link)
|
||||
result.append([])
|
||||
index += increment
|
||||
else:
|
||||
result[-1].append(word)
|
||||
index += 1
|
||||
|
||||
# convert words back into strings
|
||||
for i in range(len(result)):
|
||||
|
@ -80,16 +126,32 @@ def patch_chapter(chapter: epub.EpubHtml, pokemon_lookup: Dict[str, Pokemon]):
|
|||
chapter.content = str(soup)
|
||||
|
||||
|
||||
def patch(epub_filepath: str, pokemon: List[Pokemon]):
|
||||
book = epub.read_epub(epub_filepath)
|
||||
|
||||
def get_pokemon_lookup(pokemon: List[Pokemon]) -> Dict[str, Pokemon]:
|
||||
pokemon_lookup = {p.name.lower(): p for p in pokemon}
|
||||
pokemon_lookup["nidoran"] = pokemon_lookup["nidoran♂"]
|
||||
pokemon_lookup["barrierd"] = pokemon_lookup["mr. mime"]
|
||||
return pokemon_lookup
|
||||
|
||||
|
||||
def patch(epub_filename: str, pokemon: List[Pokemon]):
|
||||
try:
|
||||
book = epub.read_epub(epub_filename)
|
||||
except Exception:
|
||||
logging.exception("Failed to open epub.")
|
||||
sys.exit(1)
|
||||
|
||||
pokemon_lookup = get_pokemon_lookup(pokemon)
|
||||
chapters = [
|
||||
b
|
||||
for b in book.get_items()
|
||||
if isinstance(b, epub.EpubHtml)
|
||||
if b.id.startswith("np_")
|
||||
]
|
||||
|
||||
if [c for c in chapters if c.id == POKEDEX_UID]:
|
||||
logging.warning(f"It looks like '{epub_filename}' already has a Pokedex.")
|
||||
sys.exit(1)
|
||||
|
||||
for c in track(chapters, description="Add Pokemon links to chapters"):
|
||||
patch_chapter(c, pokemon_lookup)
|
||||
|
||||
|
@ -103,17 +165,17 @@ def patch(epub_filepath: str, pokemon: List[Pokemon]):
|
|||
book.spine.append((chapter.id, "yes"))
|
||||
|
||||
for p in pokemon:
|
||||
image_content = open(p.img_filepath, "rb").read()
|
||||
image_content = open(p.img_filename, "rb").read()
|
||||
img = epub.EpubItem(
|
||||
uid=p.name,
|
||||
file_name=p.img_filepath,
|
||||
file_name=p.img_filename,
|
||||
media_type="image/png",
|
||||
content=image_content,
|
||||
)
|
||||
book.add_item(img)
|
||||
|
||||
console = Console()
|
||||
epub_out = epub_filepath.replace(".", "-with-links.")
|
||||
epub_out = epub_filename.replace(".", "-with-links.")
|
||||
with console.status(f"Writing {epub_out}"):
|
||||
epub.write_epub(epub_out, book, {})
|
||||
console.print(f"[green]✓[/green] [orange1]{epub_out}[/orange1] written")
|
||||
|
|
|
@ -11,11 +11,14 @@ def main():
|
|||
level=logging.INFO,
|
||||
format="%(message)s",
|
||||
datefmt="[%X]",
|
||||
handlers=[RichHandler()],
|
||||
handlers=[RichHandler(rich_tracebacks=True)],
|
||||
)
|
||||
try:
|
||||
ptoos_epub = sys.argv[1]
|
||||
except IndexError:
|
||||
ptoos_epub = "ptoos.epub"
|
||||
logging.warning(f"No epub file provided. Defaulting to '{ptoos_epub}'.")
|
||||
pokemon = src.pokemon.get_pokemon()
|
||||
# for p in pokemon:
|
||||
# p.img_filename = p.img_filename.replace(".png", "-fs8.png")
|
||||
src.epub.patch(ptoos_epub, pokemon)
|
||||
|
|
170
src/pokemon.py
170
src/pokemon.py
|
@ -2,6 +2,7 @@ import requests
|
|||
import sys
|
||||
import os
|
||||
import logging
|
||||
import re
|
||||
from rich.progress import track
|
||||
from pydantic import BaseModel
|
||||
from bs4 import BeautifulSoup
|
||||
|
@ -17,20 +18,21 @@ NATIONAL_INDEX_URL = (
|
|||
|
||||
class Pokemon(BaseModel):
|
||||
name: str
|
||||
link_id: str
|
||||
index: str
|
||||
html_url: str
|
||||
img_url: str
|
||||
html_filepath: str
|
||||
img_filepath: str
|
||||
json_filepath: str
|
||||
html_filename: str
|
||||
img_filename: str
|
||||
json_filename: str
|
||||
description: str = ""
|
||||
appears_in_book: bool = False
|
||||
|
||||
|
||||
def download_to_file(url: str, filepath: str, override=False):
|
||||
"""Downloads url into filepath."""
|
||||
if os.path.isfile(filepath) and override is False:
|
||||
logging.debug(f"'{filepath}' exists.")
|
||||
def download_to_file(url: str, filename: str, override=False):
|
||||
"""Downloads url into filename."""
|
||||
if os.path.isfile(filename) and override is False:
|
||||
logging.debug(f"'{filename}' exists.")
|
||||
return
|
||||
|
||||
headers = {
|
||||
|
@ -38,72 +40,91 @@ def download_to_file(url: str, filepath: str, override=False):
|
|||
}
|
||||
r = requests.get(url, headers=headers)
|
||||
if r.status_code != 200:
|
||||
logging.warning(f"Could not download '{filepath}'")
|
||||
return
|
||||
logging.critical(f"Could not download '{filename}'.")
|
||||
sys.exit(1)
|
||||
|
||||
# Works for text and images
|
||||
with open(filepath, "wb") as f:
|
||||
with open(filename, "wb") as f:
|
||||
for c in r:
|
||||
f.write(c)
|
||||
logging.debug(f"'{filepath}' downloaded.")
|
||||
logging.debug(f"'{filename}' downloaded.")
|
||||
|
||||
|
||||
def download_national_index_html(national_index_filename: str):
|
||||
download_to_file(NATIONAL_INDEX_URL, national_index_filename)
|
||||
|
||||
|
||||
def get_pokemon_table_row_soups(national_index_filename: str) -> List[BeautifulSoup]:
|
||||
with open(national_index_filename, "r") as r:
|
||||
soup = BeautifulSoup(r, "html.parser")
|
||||
pokemon_list_soup = soup.find(
|
||||
id="List_of_Pokémon_by_National_Pokédex_number"
|
||||
).parent
|
||||
generation_soups = pokemon_list_soup.find_next_siblings("h3")
|
||||
table_row_soups = []
|
||||
for generation_soup in generation_soups:
|
||||
table_soup = generation_soup.find_next_sibling("table")
|
||||
tbody_soup = generation_soup.find_next("tbody")
|
||||
# skip first row because it is the header
|
||||
table_row_soups += tbody_soup.find_all("tr", recursive=False)[1:]
|
||||
return table_row_soups
|
||||
|
||||
|
||||
def extract_pokemon_from_table_row(table_row_soup: BeautifulSoup) -> Pokemon:
|
||||
name = table_row_soup.find_next("th").next_element.attrs["title"]
|
||||
link_id = re.sub("[^a-z]", "", name.lower())
|
||||
|
||||
# load Pokemon from JSON if it already exists
|
||||
json_filename = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".json")
|
||||
if os.path.isfile(json_filename):
|
||||
p = Pokemon.parse_file(json_filename)
|
||||
logging.debug(f"Loaded '{p.json_filename}'.")
|
||||
return p
|
||||
|
||||
index = table_row_soup.find_next("td").next_sibling.next_sibling.text.strip()
|
||||
html_url = (
|
||||
BULBAPEDIA_BASE_URL + table_row_soup.find_next("th").next_element.attrs["href"]
|
||||
)
|
||||
img_url = table_row_soup.find("img").attrs["src"]
|
||||
html_filename = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".html")
|
||||
img_filename = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".png")
|
||||
return Pokemon(
|
||||
name=name,
|
||||
link_id=link_id,
|
||||
index=index,
|
||||
html_url=html_url,
|
||||
img_url=img_url,
|
||||
html_filename=html_filename,
|
||||
img_filename=img_filename,
|
||||
json_filename=json_filename,
|
||||
)
|
||||
|
||||
|
||||
def get_pokemon() -> List[Pokemon]:
|
||||
"""Scrape Pokemon from the Bulbapedia national dex"""
|
||||
NATIONAL_INDEX_FILEPATH = os.path.join(POKEMON_CACHE_DIRECTORY, "pokedex.html")
|
||||
download_to_file(NATIONAL_INDEX_URL, NATIONAL_INDEX_FILEPATH)
|
||||
with open(NATIONAL_INDEX_FILEPATH, "r") as r:
|
||||
soup = BeautifulSoup(r, "html.parser")
|
||||
pokemon_list_soup: BeautifulSoup = soup.find(
|
||||
id="List_of_Pokémon_by_National_Pokédex_number"
|
||||
).parent
|
||||
generation_soups: BeautifulSoup = pokemon_list_soup.find_next_siblings("h3")
|
||||
|
||||
table_row_soups = []
|
||||
for generation_soup in generation_soups:
|
||||
table_soup: BeautifulSoup = generation_soup.find_next_sibling("table")
|
||||
tbody_soup: BeautifulSoup = generation_soup.find_next("tbody")
|
||||
# skip first row because it is the header
|
||||
table_row_soups += tbody_soup.find_all("tr", recursive=False)[1:]
|
||||
if not os.path.isdir(POKEMON_CACHE_DIRECTORY):
|
||||
os.mkdir(POKEMON_CACHE_DIRECTORY)
|
||||
national_index_filename = os.path.join(POKEMON_CACHE_DIRECTORY, "pokedex.html")
|
||||
download_national_index_html(national_index_filename)
|
||||
table_row_soups = get_pokemon_table_row_soups(national_index_filename)
|
||||
|
||||
pokemon = []
|
||||
for table_row_soup in track(table_row_soups, description="Download Pokemon"):
|
||||
name = table_row_soup.find_next("th").next_element.attrs["title"]
|
||||
p = extract_pokemon_from_table_row(table_row_soup)
|
||||
|
||||
# ignore Galarian and Alolan Pokemon so
|
||||
if pokemon and pokemon[-1].name == name:
|
||||
# Ignore Galarian and Alolan Pokemon (Pokemon with the same name)
|
||||
if pokemon and pokemon[-1].name == p.name:
|
||||
continue
|
||||
|
||||
# load Pokemon from JSON if it already exists
|
||||
json_filepath = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".json")
|
||||
if os.path.isfile(json_filepath):
|
||||
p = Pokemon.parse_file(json_filepath)
|
||||
pokemon.append(p)
|
||||
logging.debug(f"Loaded {p.json_filepath}.")
|
||||
continue
|
||||
|
||||
index = table_row_soup.find_next("td").next_sibling.next_sibling.text.strip()
|
||||
html_url = (
|
||||
BULBAPEDIA_BASE_URL
|
||||
+ table_row_soup.find_next("th").next_element.attrs["href"]
|
||||
)
|
||||
img_url = table_row_soup.find("img").attrs["src"]
|
||||
html_filepath = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".html")
|
||||
img_filepath = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".png")
|
||||
p = Pokemon(
|
||||
name=name,
|
||||
index=index,
|
||||
html_url=html_url,
|
||||
img_url=img_url,
|
||||
html_filepath=html_filepath,
|
||||
img_filepath=img_filepath,
|
||||
json_filepath=json_filepath,
|
||||
)
|
||||
pokemon.append(p)
|
||||
|
||||
# Pokemon has already been downloaded
|
||||
if p.description and os.path.isfile(p.img_filename):
|
||||
continue
|
||||
|
||||
extend_pokemon(p)
|
||||
with open(p.json_filepath, "w") as f:
|
||||
with open(p.json_filename, "w") as f:
|
||||
f.write(p.json())
|
||||
logging.debug(f"Saved {p.json_filepath}.")
|
||||
logging.debug(f"Saved {p.json_filename}.")
|
||||
|
||||
# Filter out speculative Pokemon
|
||||
pokemon = [
|
||||
|
@ -117,23 +138,26 @@ def get_pokemon() -> List[Pokemon]:
|
|||
|
||||
def extend_pokemon(p: Pokemon):
|
||||
"""Add description and download Pokemon image"""
|
||||
download_to_file(p.html_url, p.html_filepath)
|
||||
with open(p.html_filepath, "r") as r:
|
||||
download_to_file(p.html_url, p.html_filename)
|
||||
with open(p.html_filename, "r") as r:
|
||||
soup = BeautifulSoup(r, "html.parser")
|
||||
content_soup: BeautifulSoup = soup.find(id="mw-content-text").contents[0]
|
||||
|
||||
# description
|
||||
p_soup = content_soup.find("p")
|
||||
description = []
|
||||
while p_soup.name == "p":
|
||||
description.append(p_soup.get_text())
|
||||
p_soup = p_soup.next_sibling
|
||||
p.description = "".join(description)
|
||||
if not p.description:
|
||||
p_soup = content_soup.find("p")
|
||||
description = []
|
||||
while p_soup.name == "p":
|
||||
description.append(p_soup.get_text())
|
||||
p_soup = p_soup.next_sibling
|
||||
p.description = "".join(description)
|
||||
|
||||
# image
|
||||
img_url = (
|
||||
content_soup.find("table").find_next_sibling("table").find("img").attrs["src"]
|
||||
)
|
||||
img_url = img_url.replace("//", "https://")
|
||||
p.img_url = img_url
|
||||
download_to_file(img_url, p.img_filepath)
|
||||
if not os.path.isfile(p.img_filename):
|
||||
img_url = (
|
||||
content_soup.find("table")
|
||||
.find_next_sibling("table")
|
||||
.find("img")
|
||||
.attrs["src"]
|
||||
)
|
||||
img_url = img_url.replace("//", "https://")
|
||||
p.img_url = img_url
|
||||
download_to_file(img_url, p.img_filename)
|
||||
|
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
@ -0,0 +1,44 @@
|
|||
import pokemon
|
||||
import os
|
||||
import filecmp
|
||||
|
||||
|
||||
def test_download_national_index_html(tmp_path):
|
||||
pokemon_html = tmp_path / "pokedex.html"
|
||||
pokemon.download_national_index_html(pokemon_html)
|
||||
assert os.path.getsize(pokemon_html) > 500000
|
||||
|
||||
|
||||
def test_get_pokemon_table_row_soups():
|
||||
national_index = "test/test_pokedex.html"
|
||||
row_soups = pokemon.get_pokemon_table_row_soups(national_index)
|
||||
assert len(row_soups) == 994
|
||||
|
||||
|
||||
def test_extract_pokemon_from_table_row(tmp_path):
|
||||
national_index = "test/test_pokedex.html"
|
||||
pokemon.POKEMON_CACHE_DIRECTORY = tmp_path
|
||||
row_soups = pokemon.get_pokemon_table_row_soups(national_index)
|
||||
p = pokemon.extract_pokemon_from_table_row(row_soups[42])
|
||||
assert p.name == "Vulpix"
|
||||
assert p.link_id == "vulpix"
|
||||
assert p.index == "#037"
|
||||
assert p.html_url == "https://bulbapedia.bulbagarden.net/wiki/Vulpix_(Pok%C3%A9mon)"
|
||||
assert (
|
||||
p.img_url
|
||||
== "//archives.bulbagarden.net/media/upload/thumb/3/35/037Vulpix-Alola.png/70px-037Vulpix-Alola.png"
|
||||
)
|
||||
assert p.img_filename.endswith("vulpix.png")
|
||||
assert p.json_filename.endswith("vulpix.json")
|
||||
assert p.description == ""
|
||||
assert p.appears_in_book == False
|
||||
|
||||
|
||||
def test_extend_pokemon(tmp_path):
|
||||
national_index = "test/test_pokedex.html"
|
||||
row_soups = pokemon.get_pokemon_table_row_soups(national_index)
|
||||
p = pokemon.extract_pokemon_from_table_row(row_soups[42])
|
||||
p.img_filename = tmp_path / "vulpix.png"
|
||||
pokemon.extend_pokemon(p)
|
||||
assert filecmp.cmp(p.img_filename, "test/test_vulpix.png")
|
||||
assert p.description.startswith("Vulpix (Japanese: \u30ed\u30b3\u30f3 Rokon)")
|
Binary file not shown.
After Width: | Height: | Size: 52 KiB |
Loading…
Reference in New Issue