Add tests and credit Bulbapedia in README

This should resolve #2.
This commit is contained in:
2022-10-24 20:31:29 -04:00
parent bc962bd419
commit d224776a9a
11 changed files with 24443 additions and 67 deletions

1
.gitignore vendored
View File

@@ -7,6 +7,7 @@ __pycache__/
pokemon pokemon
tmp tmp
ptoos.epub ptoos.epub
ptoos-with-links.epub
# C extensions # C extensions
*.so *.so

View File

@@ -5,8 +5,24 @@ to descriptions and pictures of the Pokemon within the e-book itself.
It works with the epub that you can download from [Daystar Eld's Patreon](https://www.patreon.com/daystareld/). It works with the epub that you can download from [Daystar Eld's Patreon](https://www.patreon.com/daystareld/).
## Usage
```shell ```shell
pip install --user pipenv
pipenv install pipenv install
pipenv shell pipenv shell
python ptoos-xray.py "DaystarEld - Pokemon The Origin of Species.epub" python ptoos-xray.py "DaystarEld - Pokemon The Origin of Species.epub"
``` ```
## Run tests
```shell
pipenv install --dev
pipenv run pytest
```
## Credits
Full credit for the Pokemon names, images, and descriptions goes to
[Bulbapedia](https://bulbapedia.bulbagarden.net) under
[Attribution-NonCommercial-ShareAlike 2.5](https://creativecommons.org/licenses/by-nc-sa/2.5/).

2
pytest.ini Normal file
View File

@@ -0,0 +1,2 @@
[pytest]
pythonpath = src

View File

@@ -1,6 +1,7 @@
import ebooklib import ebooklib
import logging import logging
import re import re
import sys
from bs4 import BeautifulSoup, Tag from bs4 import BeautifulSoup, Tag
from bs4.element import NavigableString from bs4.element import NavigableString
from ebooklib import epub from ebooklib import epub
@@ -24,7 +25,7 @@ def create_pokedex_chapter(pokemon: List[Pokemon]) -> epub.EpubHtml:
for p in pokemon: for p in pokemon:
content.append(f'<h2 id="{POKEMON_ID_PREFIX}{p.name.lower()}">{p.name}</h2>') content.append(f'<h2 id="{POKEMON_ID_PREFIX}{p.name.lower()}">{p.name}</h2>')
content.append( content.append(
f' <p><img alt="[Pokemon {p.name}]" src="../{p.img_filepath}"/><br/></p>' f' <p><img alt="[Pokemon {p.name}]" src="../{p.img_filename}"/><br/></p>'
) )
for paragraph in p.description.split("\n"): for paragraph in p.description.split("\n"):
content.append(f" <p>{paragraph}</p>") content.append(f" <p>{paragraph}</p>")
@@ -80,8 +81,12 @@ def patch_chapter(chapter: epub.EpubHtml, pokemon_lookup: Dict[str, Pokemon]):
chapter.content = str(soup) chapter.content = str(soup)
def patch(epub_filepath: str, pokemon: List[Pokemon]): def patch(epub_filename: str, pokemon: List[Pokemon]):
book = epub.read_epub(epub_filepath) try:
book = epub.read_epub(epub_filename)
except Exception:
logging.exception("Failed to open epub.")
sys.exit(1)
pokemon_lookup = {p.name.lower(): p for p in pokemon} pokemon_lookup = {p.name.lower(): p for p in pokemon}
chapters = [ chapters = [
@@ -103,17 +108,17 @@ def patch(epub_filepath: str, pokemon: List[Pokemon]):
book.spine.append((chapter.id, "yes")) book.spine.append((chapter.id, "yes"))
for p in pokemon: for p in pokemon:
image_content = open(p.img_filepath, "rb").read() image_content = open(p.img_filename, "rb").read()
img = epub.EpubItem( img = epub.EpubItem(
uid=p.name, uid=p.name,
file_name=p.img_filepath, file_name=p.img_filename,
media_type="image/png", media_type="image/png",
content=image_content, content=image_content,
) )
book.add_item(img) book.add_item(img)
console = Console() console = Console()
epub_out = epub_filepath.replace(".", "-with-links.") epub_out = epub_filename.replace(".", "-with-links.")
with console.status(f"Writing {epub_out}"): with console.status(f"Writing {epub_out}"):
epub.write_epub(epub_out, book, {}) epub.write_epub(epub_out, book, {})
console.print(f"[green]✓[/green] [orange1]{epub_out}[/orange1] written") console.print(f"[green]✓[/green] [orange1]{epub_out}[/orange1] written")

View File

@@ -11,11 +11,12 @@ def main():
level=logging.INFO, level=logging.INFO,
format="%(message)s", format="%(message)s",
datefmt="[%X]", datefmt="[%X]",
handlers=[RichHandler()], handlers=[RichHandler(rich_tracebacks=True)],
) )
try: try:
ptoos_epub = sys.argv[1] ptoos_epub = sys.argv[1]
except IndexError: except IndexError:
ptoos_epub = "ptoos.epub" ptoos_epub = "ptoos.epub"
logging.warning(f"No epub file provided. Defaulting to '{ptoos_epub}'.")
pokemon = src.pokemon.get_pokemon() pokemon = src.pokemon.get_pokemon()
src.epub.patch(ptoos_epub, pokemon) src.epub.patch(ptoos_epub, pokemon)

View File

@@ -20,17 +20,17 @@ class Pokemon(BaseModel):
index: str index: str
html_url: str html_url: str
img_url: str img_url: str
html_filepath: str html_filename: str
img_filepath: str img_filename: str
json_filepath: str json_filename: str
description: str = "" description: str = ""
appears_in_book: bool = False appears_in_book: bool = False
def download_to_file(url: str, filepath: str, override=False): def download_to_file(url: str, filename: str, override=False):
"""Downloads url into filepath.""" """Downloads url into filename."""
if os.path.isfile(filepath) and override is False: if os.path.isfile(filename) and override is False:
logging.debug(f"'{filepath}' exists.") logging.debug(f"'{filename}' exists.")
return return
headers = { headers = {
@@ -38,72 +38,88 @@ def download_to_file(url: str, filepath: str, override=False):
} }
r = requests.get(url, headers=headers) r = requests.get(url, headers=headers)
if r.status_code != 200: if r.status_code != 200:
logging.warning(f"Could not download '{filepath}'") logging.critical(f"Could not download '{filename}'.")
return sys.exit(1)
# Works for text and images # Works for text and images
with open(filepath, "wb") as f: with open(filename, "wb") as f:
for c in r: for c in r:
f.write(c) f.write(c)
logging.debug(f"'{filepath}' downloaded.") logging.debug(f"'{filename}' downloaded.")
def download_national_index_html(national_index_filename: str):
    """Download the Bulbapedia national Pokedex page into the given file.

    Delegates to download_to_file, which skips the download when the file
    already exists (unless overridden).
    """
    download_to_file(NATIONAL_INDEX_URL, national_index_filename)
def get_pokemon_table_row_soups(national_index_filename: str) -> List[BeautifulSoup]:
    """Parse the national index HTML and return one soup per Pokemon table row.

    The page lists one table per generation under an <h3> heading; the first
    row of each table is a header and is skipped.

    :param national_index_filename: path of the downloaded national dex HTML
    :return: all Pokemon <tr> soups, in page order
    """
    with open(national_index_filename, "r") as r:
        soup = BeautifulSoup(r, "html.parser")
    pokemon_list_soup = soup.find(id="List_of_Pokémon_by_National_Pokédex_number").parent
    generation_soups = pokemon_list_soup.find_next_siblings("h3")
    table_row_soups = []
    for generation_soup in generation_soups:
        # The generation's rows live in the <tbody> that follows the heading.
        tbody_soup = generation_soup.find_next("tbody")
        # skip first row because it is the header
        table_row_soups += tbody_soup.find_all("tr", recursive=False)[1:]
    return table_row_soups
def extract_pokemon_from_table_row(table_row_soup: BeautifulSoup) -> Pokemon:
    """Build a Pokemon from one national-dex table row.

    Returns the cached Pokemon parsed from its JSON file when one exists;
    otherwise scrapes name, index, page URL and image URL from the row and
    derives the cache filenames from the lower-cased name.

    :param table_row_soup: one <tr> soup from get_pokemon_table_row_soups
    :return: a Pokemon with description/appears_in_book still at defaults
    """
    # The first <th> holds the link to the Pokemon's own page; hoist it since
    # both the name and the page URL come from it.
    th_soup = table_row_soup.find_next("th")
    name = th_soup.next_element.attrs["title"]

    # load Pokemon from JSON if it already exists
    json_filename = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".json")
    if os.path.isfile(json_filename):
        p = Pokemon.parse_file(json_filename)
        logging.debug(f"Loaded '{p.json_filename}'.")
        return p

    index = table_row_soup.find_next("td").next_sibling.next_sibling.text.strip()
    html_url = BULBAPEDIA_BASE_URL + th_soup.next_element.attrs["href"]
    img_url = table_row_soup.find("img").attrs["src"]
    html_filename = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".html")
    img_filename = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".png")
    return Pokemon(
        name=name,
        index=index,
        html_url=html_url,
        img_url=img_url,
        html_filename=html_filename,
        img_filename=img_filename,
        json_filename=json_filename,
    )
def get_pokemon() -> List[Pokemon]: def get_pokemon() -> List[Pokemon]:
"""Scrape Pokemon from the Bulbapedia national dex""" """Scrape Pokemon from the Bulbapedia national dex"""
NATIONAL_INDEX_FILEPATH = os.path.join(POKEMON_CACHE_DIRECTORY, "pokedex.html") if not os.path.isdir(POKEMON_CACHE_DIRECTORY):
download_to_file(NATIONAL_INDEX_URL, NATIONAL_INDEX_FILEPATH) os.mkdir(POKEMON_CACHE_DIRECTORY)
with open(NATIONAL_INDEX_FILEPATH, "r") as r: national_index_filename = os.path.join(POKEMON_CACHE_DIRECTORY, "pokedex.html")
soup = BeautifulSoup(r, "html.parser") download_national_index_html(national_index_filename)
pokemon_list_soup: BeautifulSoup = soup.find( table_row_soups = get_pokemon_table_row_soups(national_index_filename)
id="List_of_Pokémon_by_National_Pokédex_number"
).parent
generation_soups: BeautifulSoup = pokemon_list_soup.find_next_siblings("h3")
table_row_soups = []
for generation_soup in generation_soups:
table_soup: BeautifulSoup = generation_soup.find_next_sibling("table")
tbody_soup: BeautifulSoup = generation_soup.find_next("tbody")
# skip first row because it is the header
table_row_soups += tbody_soup.find_all("tr", recursive=False)[1:]
pokemon = [] pokemon = []
for table_row_soup in track(table_row_soups, description="Download Pokemon"): for table_row_soup in track(table_row_soups, description="Download Pokemon"):
name = table_row_soup.find_next("th").next_element.attrs["title"] p = extract_pokemon_from_table_row(table_row_soup)
# ignore Galarian and Alolan Pokemon so # Ignore Galarian and Alolan Pokemon (Pokemon with the same name)
if pokemon and pokemon[-1].name == name: if pokemon and pokemon[-1].name == p.name:
continue continue
# load Pokemon from JSON if it already exists
json_filepath = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".json")
if os.path.isfile(json_filepath):
p = Pokemon.parse_file(json_filepath)
pokemon.append(p)
logging.debug(f"Loaded {p.json_filepath}.")
continue
index = table_row_soup.find_next("td").next_sibling.next_sibling.text.strip()
html_url = (
BULBAPEDIA_BASE_URL
+ table_row_soup.find_next("th").next_element.attrs["href"]
)
img_url = table_row_soup.find("img").attrs["src"]
html_filepath = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".html")
img_filepath = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".png")
p = Pokemon(
name=name,
index=index,
html_url=html_url,
img_url=img_url,
html_filepath=html_filepath,
img_filepath=img_filepath,
json_filepath=json_filepath,
)
pokemon.append(p) pokemon.append(p)
# Pokemon has already been downloaded
if p.description and os.path.isfile(p.img_filename):
continue
extend_pokemon(p) extend_pokemon(p)
with open(p.json_filepath, "w") as f: with open(p.json_filename, "w") as f:
f.write(p.json()) f.write(p.json())
logging.debug(f"Saved {p.json_filepath}.") logging.debug(f"Saved {p.json_filename}.")
# Filter out speculative Pokemon # Filter out speculative Pokemon
pokemon = [ pokemon = [
@@ -117,8 +133,8 @@ def get_pokemon() -> List[Pokemon]:
def extend_pokemon(p: Pokemon): def extend_pokemon(p: Pokemon):
"""Add description and download Pokemon image""" """Add description and download Pokemon image"""
download_to_file(p.html_url, p.html_filepath) download_to_file(p.html_url, p.html_filename)
with open(p.html_filepath, "r") as r: with open(p.html_filename, "r") as r:
soup = BeautifulSoup(r, "html.parser") soup = BeautifulSoup(r, "html.parser")
content_soup: BeautifulSoup = soup.find(id="mw-content-text").contents[0] content_soup: BeautifulSoup = soup.find(id="mw-content-text").contents[0]
@@ -136,4 +152,4 @@ def extend_pokemon(p: Pokemon):
) )
img_url = img_url.replace("//", "https://") img_url = img_url.replace("//", "https://")
p.img_url = img_url p.img_url = img_url
download_to_file(img_url, p.img_filepath) download_to_file(img_url, p.img_filename)

12149
test/pokedex.html Normal file

File diff suppressed because one or more lines are too long

0
test/test_epub.py Normal file
View File

12149
test/test_pokedex.html Normal file

File diff suppressed because one or more lines are too long

37
test/test_pokemon.py Normal file
View File

@@ -0,0 +1,37 @@
import pokemon
import os
import filecmp
def test_download_national_index_html(tmp_path):
    """Network test: downloading the national dex yields a non-trivial file."""
    pokemon_html = tmp_path / "pokedex.html"
    pokemon.download_national_index_html(pokemon_html)
    # 500 kB is a loose lower bound for a successfully downloaded page.
    assert os.path.getsize(pokemon_html) > 500000
def test_get_pokemon_table_row_soups():
    """The checked-in fixture page should parse into 994 Pokemon table rows."""
    rows = pokemon.get_pokemon_table_row_soups("test/test_pokedex.html")
    assert len(rows) == 994
def test_extract_pokemon_from_table_row(tmp_path):
    """Row 42 of the fixture page should extract as Vulpix with these fields."""
    national_index = "test/test_pokedex.html"
    # Redirect the cache so derived filenames (and any JSON lookup) use tmp_path.
    pokemon.POKEMON_CACHE_DIRECTORY = tmp_path
    row_soups = pokemon.get_pokemon_table_row_soups(national_index)
    p = pokemon.extract_pokemon_from_table_row(row_soups[42])
    assert p.name == 'Vulpix'
    assert p.index == '#037'
    assert p.html_url == 'https://bulbapedia.bulbagarden.net/wiki/Vulpix_(Pok%C3%A9mon)'
    assert p.img_url == '//archives.bulbagarden.net/media/upload/thumb/3/35/037Vulpix-Alola.png/70px-037Vulpix-Alola.png'
    assert p.img_filename.endswith('vulpix.png')
    assert p.json_filename.endswith('vulpix.json')
    # Description and the book flag are only filled in later by extend_pokemon.
    assert p.description == ''
    assert p.appears_in_book == False
def test_extend_pokemon(tmp_path):
    """Network test: extend_pokemon downloads Vulpix's image and description."""
    national_index = "test/test_pokedex.html"
    row_soups = pokemon.get_pokemon_table_row_soups(national_index)
    p = pokemon.extract_pokemon_from_table_row(row_soups[42])
    # Download the image into a temp dir instead of the real cache.
    p.img_filename = tmp_path / 'vulpix.png'
    pokemon.extend_pokemon(p)
    # The downloaded image must match the checked-in reference copy.
    assert filecmp.cmp(p.img_filename, 'test/test_vulpix.png')
    assert p.description.startswith("Vulpix (Japanese: \u30ed\u30b3\u30f3 Rokon)")

BIN
test/test_vulpix.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 52 KiB