.gitignore (vendored, 1 line changed)

@@ -7,6 +7,7 @@ __pycache__/
 pokemon
 tmp
 ptoos.epub
+ptoos-with-links.epub

 # C extensions
 *.so
README.md (16 lines changed)

@@ -5,8 +5,24 @@ to descriptions and pictures of the Pokemon within the e-book itself.

 It works with the epub that you can download from [Daystar Eld's Patreon](https://www.patreon.com/daystareld/).

+## Usage
+
+```shell
+pip install --user pipenv
+pipenv install
+pipenv shell
+python ptoos-xray.py "DaystarEld - Pokemon The Origin of Species.epub"
+```
+
+## Run tests
+
+```shell
+pipenv install --dev
+pipenv run pytest
+```
+
 ## Credits

 Full credit for the Pokemon names, images, and descriptions goes to
 [Bulbapedia](https://bulbapedia.bulbagarden.net) under
 [Attribution-NonCommercial-ShareAlike 2.5](https://creativecommons.org/licenses/by-nc-sa/2.5/).
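Worth noting what a successful run produces: judging from the new `.gitignore` entry and the `epub_out = epub_filename.replace(".", "-with-links.")` line in `src/epub.py`, the patched book lands next to the input. A hedged example with the default input name:

```shell
python ptoos-xray.py ptoos.epub
# writes ptoos-with-links.epub (each "." in the input name becomes "-with-links.")
```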
pytest.ini (new file, 2 lines)

@@ -0,0 +1,2 @@
+[pytest]
+pythonpath = src
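This two-line `pytest.ini` is what lets the new tests import the code under `src/` without packaging: pytest's built-in `pythonpath` option (pytest 7 and later) adds the listed directories to `sys.path` during collection. A minimal sketch of the effect:

```python
# With pythonpath = src in effect, modules under src/ resolve as
# top-level imports inside the test suite:
import pokemon  # resolves to src/pokemon.py
import epub     # resolves to src/epub.py
```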
src/epub.py (17 lines changed)

@@ -1,6 +1,7 @@
 import ebooklib
 import logging
 import re
+import sys
 from bs4 import BeautifulSoup, Tag
 from bs4.element import NavigableString
 from ebooklib import epub
@@ -24,7 +25,7 @@ def create_pokedex_chapter(pokemon: List[Pokemon]) -> epub.EpubHtml:
     for p in pokemon:
         content.append(f'<h2 id="{POKEMON_ID_PREFIX}{p.name.lower()}">{p.name}</h2>')
         content.append(
-            f' <p><img alt="[Pokemon {p.name}]" src="../{p.img_filepath}"/><br/></p>'
+            f' <p><img alt="[Pokemon {p.name}]" src="../{p.img_filename}"/><br/></p>'
         )
         for paragraph in p.description.split("\n"):
             content.append(f" <p>{paragraph}</p>")
@@ -80,8 +81,12 @@ def patch_chapter(chapter: epub.EpubHtml, pokemon_lookup: Dict[str, Pokemon]):
     chapter.content = str(soup)


-def patch(epub_filepath: str, pokemon: List[Pokemon]):
-    book = epub.read_epub(epub_filepath)
+def patch(epub_filename: str, pokemon: List[Pokemon]):
+    try:
+        book = epub.read_epub(epub_filename)
+    except Exception:
+        logging.exception("Failed to open epub.")
+        sys.exit(1)

     pokemon_lookup = {p.name.lower(): p for p in pokemon}
     chapters = [
@@ -103,17 +108,17 @@ def patch(epub_filepath: str, pokemon: List[Pokemon]):
     book.spine.append((chapter.id, "yes"))

     for p in pokemon:
-        image_content = open(p.img_filepath, "rb").read()
+        image_content = open(p.img_filename, "rb").read()
         img = epub.EpubItem(
             uid=p.name,
-            file_name=p.img_filepath,
+            file_name=p.img_filename,
             media_type="image/png",
             content=image_content,
         )
         book.add_item(img)

     console = Console()
-    epub_out = epub_filepath.replace(".", "-with-links.")
+    epub_out = epub_filename.replace(".", "-with-links.")
     with console.status(f"Writing {epub_out}"):
         epub.write_epub(epub_out, book, {})
     console.print(f"[green]✓[/green] [orange1]{epub_out}[/orange1] written")
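The new `try`/`except` around `epub.read_epub` turns a bad path into a logged traceback and a clean exit instead of an unhandled crash. A hedged reproduction of that failure path, assuming the repository root is the working directory and no `missing.epub` exists:

```python
# Hypothetical session; not part of the diff itself.
import src.epub

src.epub.patch("missing.epub", [])
# read_epub raises (e.g., FileNotFoundError); patch() logs
# "Failed to open epub." with a traceback, then exits with status 1.
```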
ptoos-xray.py (3 lines changed)

@@ -11,11 +11,12 @@ def main():
         level=logging.INFO,
         format="%(message)s",
         datefmt="[%X]",
-        handlers=[RichHandler()],
+        handlers=[RichHandler(rich_tracebacks=True)],
     )
     try:
         ptoos_epub = sys.argv[1]
     except IndexError:
         ptoos_epub = "ptoos.epub"
+        logging.warning(f"No epub file provided. Defaulting to '{ptoos_epub}'.")
     pokemon = src.pokemon.get_pokemon()
     src.epub.patch(ptoos_epub, pokemon)
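`rich_tracebacks=True` is the Rich logging handler's switch for rendering formatted, syntax-highlighted tracebacks, which pairs with the `logging.exception` call added in `src/epub.py`. A self-contained sketch of the combination, mirroring the config above:

```python
# Minimal demo of RichHandler with rich tracebacks enabled.
import logging
from rich.logging import RichHandler

logging.basicConfig(
    level=logging.INFO,
    format="%(message)s",
    datefmt="[%X]",
    handlers=[RichHandler(rich_tracebacks=True)],
)
try:
    raise ValueError("demo failure")
except ValueError:
    logging.exception("Something went wrong.")  # rendered as a Rich traceback
```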
src/pokemon.py (136 lines changed)
@@ -20,17 +20,17 @@ class Pokemon(BaseModel):
     index: str
     html_url: str
     img_url: str
-    html_filepath: str
-    img_filepath: str
-    json_filepath: str
+    html_filename: str
+    img_filename: str
+    json_filename: str
     description: str = ""
     appears_in_book: bool = False


-def download_to_file(url: str, filepath: str, override=False):
-    """Downloads url into filepath."""
-    if os.path.isfile(filepath) and override is False:
-        logging.debug(f"'{filepath}' exists.")
+def download_to_file(url: str, filename: str, override=False):
+    """Downloads url into filename."""
+    if os.path.isfile(filename) and override is False:
+        logging.debug(f"'{filename}' exists.")
         return

     headers = {
@@ -38,72 +38,88 @@
     }
     r = requests.get(url, headers=headers)
     if r.status_code != 200:
-        logging.warning(f"Could not download '{filepath}'")
-        return
+        logging.critical(f"Could not download '{filename}'.")
+        sys.exit(1)

     # Works for text and images
-    with open(filepath, "wb") as f:
+    with open(filename, "wb") as f:
         for c in r:
             f.write(c)
-    logging.debug(f"'{filepath}' downloaded.")
+    logging.debug(f"'{filename}' downloaded.")


+def download_national_index_html(national_index_filename: str):
+    download_to_file(NATIONAL_INDEX_URL, national_index_filename)
+
+
+def get_pokemon_table_row_soups(national_index_filename: str) -> List[BeautifulSoup]:
+    with open(national_index_filename, "r") as r:
+        soup = BeautifulSoup(r, "html.parser")
+    pokemon_list_soup = soup.find(id="List_of_Pokémon_by_National_Pokédex_number").parent
+    generation_soups = pokemon_list_soup.find_next_siblings("h3")
+    table_row_soups = []
+    for generation_soup in generation_soups:
+        table_soup = generation_soup.find_next_sibling("table")
+        tbody_soup = generation_soup.find_next("tbody")
+        # skip first row because it is the header
+        table_row_soups += tbody_soup.find_all("tr", recursive=False)[1:]
+    return table_row_soups
+
+
+def extract_pokemon_from_table_row(table_row_soup: BeautifulSoup) -> Pokemon:
+    name = table_row_soup.find_next("th").next_element.attrs["title"]
+
+    # load Pokemon from JSON if it already exists
+    json_filename = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".json")
+    if os.path.isfile(json_filename):
+        p = Pokemon.parse_file(json_filename)
+        logging.debug(f"Loaded '{p.json_filename}'.")
+        return p
+
+    index = table_row_soup.find_next("td").next_sibling.next_sibling.text.strip()
+    html_url = (
+        BULBAPEDIA_BASE_URL
+        + table_row_soup.find_next("th").next_element.attrs["href"]
+    )
+    img_url = table_row_soup.find("img").attrs["src"]
+    html_filename = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".html")
+    img_filename = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".png")
+    return Pokemon(
+        name=name,
+        index=index,
+        html_url=html_url,
+        img_url=img_url,
+        html_filename=html_filename,
+        img_filename=img_filename,
+        json_filename=json_filename,
+    )
+
+
 def get_pokemon() -> List[Pokemon]:
     """Scrape Pokemon from the Bulbapedia national dex"""
-    NATIONAL_INDEX_FILEPATH = os.path.join(POKEMON_CACHE_DIRECTORY, "pokedex.html")
-    download_to_file(NATIONAL_INDEX_URL, NATIONAL_INDEX_FILEPATH)
-    with open(NATIONAL_INDEX_FILEPATH, "r") as r:
-        soup = BeautifulSoup(r, "html.parser")
-    pokemon_list_soup: BeautifulSoup = soup.find(
-        id="List_of_Pokémon_by_National_Pokédex_number"
-    ).parent
-    generation_soups: BeautifulSoup = pokemon_list_soup.find_next_siblings("h3")
-
-    table_row_soups = []
-    for generation_soup in generation_soups:
-        table_soup: BeautifulSoup = generation_soup.find_next_sibling("table")
-        tbody_soup: BeautifulSoup = generation_soup.find_next("tbody")
-        # skip first row because it is the header
-        table_row_soups += tbody_soup.find_all("tr", recursive=False)[1:]
+    if not os.path.isdir(POKEMON_CACHE_DIRECTORY):
+        os.mkdir(POKEMON_CACHE_DIRECTORY)
+    national_index_filename = os.path.join(POKEMON_CACHE_DIRECTORY, "pokedex.html")
+    download_national_index_html(national_index_filename)
+    table_row_soups = get_pokemon_table_row_soups(national_index_filename)

     pokemon = []
     for table_row_soup in track(table_row_soups, description="Download Pokemon"):
-        name = table_row_soup.find_next("th").next_element.attrs["title"]
+        p = extract_pokemon_from_table_row(table_row_soup)

-        # ignore Galarian and Alolan Pokemon so
-        if pokemon and pokemon[-1].name == name:
+        # Ignore Galarian and Alolan Pokemon (Pokemon with the same name)
+        if pokemon and pokemon[-1].name == p.name:
             continue
-
-        # load Pokemon from JSON if it already exists
-        json_filepath = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".json")
-        if os.path.isfile(json_filepath):
-            p = Pokemon.parse_file(json_filepath)
-            pokemon.append(p)
-            logging.debug(f"Loaded {p.json_filepath}.")
-            continue
-
-        index = table_row_soup.find_next("td").next_sibling.next_sibling.text.strip()
-        html_url = (
-            BULBAPEDIA_BASE_URL
-            + table_row_soup.find_next("th").next_element.attrs["href"]
-        )
-        img_url = table_row_soup.find("img").attrs["src"]
-        html_filepath = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".html")
-        img_filepath = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".png")
-        p = Pokemon(
-            name=name,
-            index=index,
-            html_url=html_url,
-            img_url=img_url,
-            html_filepath=html_filepath,
-            img_filepath=img_filepath,
-            json_filepath=json_filepath,
-        )
         pokemon.append(p)

+        # Pokemon has already been downloaded
+        if p.description and os.path.isfile(p.img_filename):
+            continue
+
         extend_pokemon(p)
-        with open(p.json_filepath, "w") as f:
+        with open(p.json_filename, "w") as f:
             f.write(p.json())
-        logging.debug(f"Saved {p.json_filepath}.")
+        logging.debug(f"Saved {p.json_filename}.")

     # Filter out speculative Pokemon
     pokemon = [
@@ -117,8 +133,8 @@

 def extend_pokemon(p: Pokemon):
     """Add description and download Pokemon image"""
-    download_to_file(p.html_url, p.html_filepath)
-    with open(p.html_filepath, "r") as r:
+    download_to_file(p.html_url, p.html_filename)
+    with open(p.html_filename, "r") as r:
         soup = BeautifulSoup(r, "html.parser")
     content_soup: BeautifulSoup = soup.find(id="mw-content-text").contents[0]

@@ -136,4 +152,4 @@ def extend_pokemon(p: Pokemon):
     )
     img_url = img_url.replace("//", "https://")
     p.img_url = img_url
-    download_to_file(img_url, p.img_filepath)
+    download_to_file(img_url, p.img_filename)
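The net effect of the `src/pokemon.py` refactor is that parsing is reachable without the network: `get_pokemon_table_row_soups` takes a file path, and `extract_pokemon_from_table_row` only touches disk through `POKEMON_CACHE_DIRECTORY`, which a caller can repoint. A sketch of that seam, assuming the checked-in `test/test_pokedex.html` fixture and `src/` on `sys.path` (as the new `pytest.ini` arranges for tests):

```python
# Offline use of the new seam functions (mirrors what the tests below do).
import pokemon

rows = pokemon.get_pokemon_table_row_soups("test/test_pokedex.html")
vulpix = pokemon.extract_pokemon_from_table_row(rows[42])
print(vulpix.name, vulpix.index)  # Vulpix #037, per the fixture
```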
test/pokedex.html (new file, 12149 lines)
File diff suppressed because one or more lines are too long
test/test_epub.py (new file, empty)
test/test_pokedex.html (new file, 12149 lines)
File diff suppressed because one or more lines are too long
test/test_pokemon.py (new file, 37 lines)

@@ -0,0 +1,37 @@
+import pokemon
+import os
+import filecmp
+
+
+def test_download_national_index_html(tmp_path):
+    pokemon_html = tmp_path / "pokedex.html"
+    pokemon.download_national_index_html(pokemon_html)
+    assert os.path.getsize(pokemon_html) > 500000
+
+def test_get_pokemon_table_row_soups():
+    national_index = "test/test_pokedex.html"
+    row_soups = pokemon.get_pokemon_table_row_soups(national_index)
+    assert len(row_soups) == 994
+
+def test_extract_pokemon_from_table_row(tmp_path):
+    national_index = "test/test_pokedex.html"
+    pokemon.POKEMON_CACHE_DIRECTORY = tmp_path
+    row_soups = pokemon.get_pokemon_table_row_soups(national_index)
+    p = pokemon.extract_pokemon_from_table_row(row_soups[42])
+    assert p.name == 'Vulpix'
+    assert p.index == '#037'
+    assert p.html_url == 'https://bulbapedia.bulbagarden.net/wiki/Vulpix_(Pok%C3%A9mon)'
+    assert p.img_url == '//archives.bulbagarden.net/media/upload/thumb/3/35/037Vulpix-Alola.png/70px-037Vulpix-Alola.png'
+    assert p.img_filename.endswith('vulpix.png')
+    assert p.json_filename.endswith('vulpix.json')
+    assert p.description == ''
+    assert p.appears_in_book == False
+
+def test_extend_pokemon(tmp_path):
+    national_index = "test/test_pokedex.html"
+    row_soups = pokemon.get_pokemon_table_row_soups(national_index)
+    p = pokemon.extract_pokemon_from_table_row(row_soups[42])
+    p.img_filename = tmp_path / 'vulpix.png'
+    pokemon.extend_pokemon(p)
+    assert filecmp.cmp(p.img_filename, 'test/test_vulpix.png')
+    assert p.description.startswith("Vulpix (Japanese: \u30ed\u30b3\u30f3 Rokon)")
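Note that `test_download_national_index_html` (and `test_extend_pokemon`, which fetches the Vulpix page and image) hit Bulbapedia. If only the offline tests are wanted, pytest's `-k` selection works; a hypothetical invocation using the test names above:

```shell
pipenv run pytest -k "not download and not extend"
```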
test/test_vulpix.png (new binary file, 52 KiB)
Binary file not shown