Add tests and credit Bulbapedia in README

This should resolve #2.
2022-10-24 20:31:29 -04:00
parent bc962bd419
commit d224776a9a
11 changed files with 24443 additions and 67 deletions

.gitignore vendored (1 change)

@@ -7,6 +7,7 @@ __pycache__/
 pokemon
 tmp
 ptoos.epub
+ptoos-with-links.epub
 # C extensions
 *.so

README.md

@@ -5,8 +5,24 @@ to descriptions and pictures of the Pokemon within the e-book itself.
 It works with the epub that you can download from [Daystar Eld's Patreon](https://www.patreon.com/daystareld/).
 
 ## Usage
 ```shell
 pip install --user pipenv
 pipenv install
 pipenv shell
 python ptoos-xray.py "DaystarEld - Pokemon The Origin of Species.epub"
 ```
+
+## Run tests
+```shell
+pipenv install --dev
+pipenv run pytest
+```
+
+## Credits
+Full credit for the Pokemon names, images, and descriptions goes to
+[Bulbapedia](https://bulbapedia.bulbagarden.net) under
+[Attribution-NonCommercial-ShareAlike 2.5](https://creativecommons.org/licenses/by-nc-sa/2.5/).

pytest.ini Normal file (2 changes)

@@ -0,0 +1,2 @@
+[pytest]
+pythonpath = src
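
The `pythonpath = src` setting (a pytest ≥ 7 ini option) puts `src/` on `sys.path` during test collection, which is why the new tests can `import pokemon` by its plain module name. A minimal sketch of a test that relies on this, with a hypothetical file name:

```python
# test/test_import.py (hypothetical) -- the bare import works only
# because pytest.ini sets pythonpath = src.
import pokemon  # resolves to src/pokemon.py, no "src." prefix needed


def test_pokemon_module_is_importable():
    assert hasattr(pokemon, "get_pokemon")
```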

src/epub.py

@@ -1,6 +1,7 @@
 import ebooklib
 import logging
 import re
+import sys
 from bs4 import BeautifulSoup, Tag
 from bs4.element import NavigableString
 from ebooklib import epub
@@ -24,7 +25,7 @@ def create_pokedex_chapter(pokemon: List[Pokemon]) -> epub.EpubHtml:
     for p in pokemon:
         content.append(f'<h2 id="{POKEMON_ID_PREFIX}{p.name.lower()}">{p.name}</h2>')
         content.append(
-            f' <p><img alt="[Pokemon {p.name}]" src="../{p.img_filepath}"/><br/></p>'
+            f' <p><img alt="[Pokemon {p.name}]" src="../{p.img_filename}"/><br/></p>'
         )
         for paragraph in p.description.split("\n"):
             content.append(f" <p>{paragraph}</p>")
@@ -80,8 +81,12 @@ def patch_chapter(chapter: epub.EpubHtml, pokemon_lookup: Dict[str, Pokemon]):
     chapter.content = str(soup)
 
-def patch(epub_filepath: str, pokemon: List[Pokemon]):
-    book = epub.read_epub(epub_filepath)
+def patch(epub_filename: str, pokemon: List[Pokemon]):
+    try:
+        book = epub.read_epub(epub_filename)
+    except Exception:
+        logging.exception("Failed to open epub.")
+        sys.exit(1)
 
     pokemon_lookup = {p.name.lower(): p for p in pokemon}
     chapters = [
@@ -103,17 +108,17 @@ def patch(epub_filepath: str, pokemon: List[Pokemon]):
         book.spine.append((chapter.id, "yes"))
 
     for p in pokemon:
-        image_content = open(p.img_filepath, "rb").read()
+        image_content = open(p.img_filename, "rb").read()
         img = epub.EpubItem(
             uid=p.name,
-            file_name=p.img_filepath,
+            file_name=p.img_filename,
             media_type="image/png",
             content=image_content,
         )
         book.add_item(img)
 
     console = Console()
-    epub_out = epub_filepath.replace(".", "-with-links.")
+    epub_out = epub_filename.replace(".", "-with-links.")
     with console.status(f"Writing {epub_out}"):
        epub.write_epub(epub_out, book, {})
     console.print(f"[green]✓[/green] [orange1]{epub_out}[/orange1] written")
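
One thing worth noting about the output-name line: `str.replace` substitutes every dot in `epub_filename`, not just the extension separator. That is harmless for `ptoos.epub` but would mangle a hypothetical name containing extra dots:

```python
>>> "ptoos.epub".replace(".", "-with-links.")
'ptoos-with-links.epub'
>>> "ptoos.v2.epub".replace(".", "-with-links.")  # hypothetical input with an extra dot
'ptoos-with-links.v2-with-links.epub'
```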

ptoos-xray.py

@@ -11,11 +11,12 @@ def main():
         level=logging.INFO,
         format="%(message)s",
         datefmt="[%X]",
-        handlers=[RichHandler()],
+        handlers=[RichHandler(rich_tracebacks=True)],
     )
     try:
         ptoos_epub = sys.argv[1]
     except IndexError:
         ptoos_epub = "ptoos.epub"
+        logging.warning(f"No epub file provided. Defaulting to '{ptoos_epub}'.")
     pokemon = src.pokemon.get_pokemon()
     src.epub.patch(ptoos_epub, pokemon)
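
For context, `rich_tracebacks=True` makes `RichHandler` render exception tracebacks with Rich's formatting, which pairs with the `logging.exception` call added in src/epub.py above. A self-contained sketch of the effect (not from this repo):

```python
import logging

from rich.logging import RichHandler

logging.basicConfig(
    level=logging.INFO,
    format="%(message)s",
    handlers=[RichHandler(rich_tracebacks=True)],
)

try:
    raise ValueError("demo")  # stand-in for a failing epub.read_epub call
except ValueError:
    # The traceback printed below the message is syntax-highlighted by Rich.
    logging.exception("Failed to open epub.")
```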

src/pokemon.py

@@ -20,17 +20,17 @@ class Pokemon(BaseModel):
     index: str
     html_url: str
     img_url: str
-    html_filepath: str
-    img_filepath: str
-    json_filepath: str
+    html_filename: str
+    img_filename: str
+    json_filename: str
     description: str = ""
     appears_in_book: bool = False
 
 
-def download_to_file(url: str, filepath: str, override=False):
-    """Downloads url into filepath."""
-    if os.path.isfile(filepath) and override is False:
-        logging.debug(f"'{filepath}' exists.")
+def download_to_file(url: str, filename: str, override=False):
+    """Downloads url into filename."""
+    if os.path.isfile(filename) and override is False:
+        logging.debug(f"'{filename}' exists.")
         return
 
     headers = {
@@ -38,72 +38,88 @@ def download_to_file(url: str, filepath: str, override=False):
     }
     r = requests.get(url, headers=headers)
     if r.status_code != 200:
-        logging.warning(f"Could not download '{filepath}'")
-        return
+        logging.critical(f"Could not download '{filename}'.")
+        sys.exit(1)
 
     # Works for text and images
-    with open(filepath, "wb") as f:
+    with open(filename, "wb") as f:
         for c in r:
             f.write(c)
-    logging.debug(f"'{filepath}' downloaded.")
+    logging.debug(f"'{filename}' downloaded.")
+
+
+def download_national_index_html(national_index_filename: str):
+    download_to_file(NATIONAL_INDEX_URL, national_index_filename)
+
+
+def get_pokemon_table_row_soups(national_index_filename: str) -> List[BeautifulSoup]:
+    with open(national_index_filename, "r") as r:
+        soup = BeautifulSoup(r, "html.parser")
+    pokemon_list_soup = soup.find(id="List_of_Pokémon_by_National_Pokédex_number").parent
+    generation_soups = pokemon_list_soup.find_next_siblings("h3")
+    table_row_soups = []
+    for generation_soup in generation_soups:
+        table_soup = generation_soup.find_next_sibling("table")
+        tbody_soup = generation_soup.find_next("tbody")
+        # skip first row because it is the header
+        table_row_soups += tbody_soup.find_all("tr", recursive=False)[1:]
+    return table_row_soups
+
+
+def extract_pokemon_from_table_row(table_row_soup: BeautifulSoup) -> Pokemon:
+    name = table_row_soup.find_next("th").next_element.attrs["title"]
+
+    # load Pokemon from JSON if it already exists
+    json_filename = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".json")
+    if os.path.isfile(json_filename):
+        p = Pokemon.parse_file(json_filename)
+        logging.debug(f"Loaded '{p.json_filename}'.")
+        return p
+
+    index = table_row_soup.find_next("td").next_sibling.next_sibling.text.strip()
+    html_url = (
+        BULBAPEDIA_BASE_URL
+        + table_row_soup.find_next("th").next_element.attrs["href"]
+    )
+    img_url = table_row_soup.find("img").attrs["src"]
+    html_filename = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".html")
+    img_filename = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".png")
+    return Pokemon(
+        name=name,
+        index=index,
+        html_url=html_url,
+        img_url=img_url,
+        html_filename=html_filename,
+        img_filename=img_filename,
+        json_filename=json_filename,
+    )
+
+
 def get_pokemon() -> List[Pokemon]:
     """Scrape Pokemon from the Bulbapedia national dex"""
-    NATIONAL_INDEX_FILEPATH = os.path.join(POKEMON_CACHE_DIRECTORY, "pokedex.html")
-    download_to_file(NATIONAL_INDEX_URL, NATIONAL_INDEX_FILEPATH)
-    with open(NATIONAL_INDEX_FILEPATH, "r") as r:
-        soup = BeautifulSoup(r, "html.parser")
-    pokemon_list_soup: BeautifulSoup = soup.find(
-        id="List_of_Pokémon_by_National_Pokédex_number"
-    ).parent
-    generation_soups: BeautifulSoup = pokemon_list_soup.find_next_siblings("h3")
-    table_row_soups = []
-    for generation_soup in generation_soups:
-        table_soup: BeautifulSoup = generation_soup.find_next_sibling("table")
-        tbody_soup: BeautifulSoup = generation_soup.find_next("tbody")
-        # skip first row because it is the header
-        table_row_soups += tbody_soup.find_all("tr", recursive=False)[1:]
+    if not os.path.isdir(POKEMON_CACHE_DIRECTORY):
+        os.mkdir(POKEMON_CACHE_DIRECTORY)
+    national_index_filename = os.path.join(POKEMON_CACHE_DIRECTORY, "pokedex.html")
+    download_national_index_html(national_index_filename)
+    table_row_soups = get_pokemon_table_row_soups(national_index_filename)
 
     pokemon = []
     for table_row_soup in track(table_row_soups, description="Download Pokemon"):
-        name = table_row_soup.find_next("th").next_element.attrs["title"]
+        p = extract_pokemon_from_table_row(table_row_soup)
 
-        # ignore Galarian and Alolan Pokemon so
-        if pokemon and pokemon[-1].name == name:
+        # Ignore Galarian and Alolan Pokemon (Pokemon with the same name)
+        if pokemon and pokemon[-1].name == p.name:
             continue
 
-        # load Pokemon from JSON if it already exists
-        json_filepath = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".json")
-        if os.path.isfile(json_filepath):
-            p = Pokemon.parse_file(json_filepath)
-            pokemon.append(p)
-            logging.debug(f"Loaded {p.json_filepath}.")
-            continue
-
-        index = table_row_soup.find_next("td").next_sibling.next_sibling.text.strip()
-        html_url = (
-            BULBAPEDIA_BASE_URL
-            + table_row_soup.find_next("th").next_element.attrs["href"]
-        )
-        img_url = table_row_soup.find("img").attrs["src"]
-        html_filepath = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".html")
-        img_filepath = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".png")
-        p = Pokemon(
-            name=name,
-            index=index,
-            html_url=html_url,
-            img_url=img_url,
-            html_filepath=html_filepath,
-            img_filepath=img_filepath,
-            json_filepath=json_filepath,
-        )
         pokemon.append(p)
 
+        # Pokemon has already been downloaded
+        if p.description and os.path.isfile(p.img_filename):
+            continue
+
         extend_pokemon(p)
-        with open(p.json_filepath, "w") as f:
+        with open(p.json_filename, "w") as f:
             f.write(p.json())
-        logging.debug(f"Saved {p.json_filepath}.")
+        logging.debug(f"Saved {p.json_filename}.")
 
     # Filter out speculative Pokemon
     pokemon = [
@@ -117,8 +133,8 @@ def get_pokemon() -> List[Pokemon]:
 
 def extend_pokemon(p: Pokemon):
     """Add description and download Pokemon image"""
-    download_to_file(p.html_url, p.html_filepath)
-    with open(p.html_filepath, "r") as r:
+    download_to_file(p.html_url, p.html_filename)
+    with open(p.html_filename, "r") as r:
         soup = BeautifulSoup(r, "html.parser")
         content_soup: BeautifulSoup = soup.find(id="mw-content-text").contents[0]
@@ -136,4 +152,4 @@ def extend_pokemon(p: Pokemon):
     )
     img_url = img_url.replace("//", "https://")
     p.img_url = img_url
-    download_to_file(img_url, p.img_filepath)
+    download_to_file(img_url, p.img_filename)
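
The refactor splits the old monolithic `get_pokemon` into three independently testable steps: `download_national_index_html`, `get_pokemon_table_row_soups`, and `extract_pokemon_from_table_row`. A minimal sketch of how they compose, assuming the module's default cache directory and network access:

```python
import os

import pokemon  # src/pokemon.py, importable thanks to pythonpath = src

# Assumed cache layout, mirroring what get_pokemon() sets up itself.
os.makedirs(pokemon.POKEMON_CACHE_DIRECTORY, exist_ok=True)
index_html = os.path.join(pokemon.POKEMON_CACHE_DIRECTORY, "pokedex.html")

pokemon.download_national_index_html(index_html)  # no-op if already cached
rows = pokemon.get_pokemon_table_row_soups(index_html)
first = pokemon.extract_pokemon_from_table_row(rows[0])
print(first.name, first.index)  # e.g. "Bulbasaur #001"
```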

test/pokedex.html Normal file (12149 changes)

File diff suppressed because one or more lines are too long

test/test_epub.py Normal file (0 changes)

test/test_pokedex.html Normal file (12149 changes)

File diff suppressed because one or more lines are too long

test/test_pokemon.py Normal file (37 changes)

@@ -0,0 +1,37 @@
+import pokemon
+import os
+import filecmp
+
+
+def test_download_national_index_html(tmp_path):
+    pokemon_html = tmp_path / "pokedex.html"
+    pokemon.download_national_index_html(pokemon_html)
+    assert os.path.getsize(pokemon_html) > 500000
+
+def test_get_pokemon_table_row_soups():
+    national_index = "test/test_pokedex.html"
+    row_soups = pokemon.get_pokemon_table_row_soups(national_index)
+    assert len(row_soups) == 994
+
+def test_extract_pokemon_from_table_row(tmp_path):
+    national_index = "test/test_pokedex.html"
+    pokemon.POKEMON_CACHE_DIRECTORY = tmp_path
+    row_soups = pokemon.get_pokemon_table_row_soups(national_index)
+    p = pokemon.extract_pokemon_from_table_row(row_soups[42])
+    assert p.name == 'Vulpix'
+    assert p.index == '#037'
+    assert p.html_url == 'https://bulbapedia.bulbagarden.net/wiki/Vulpix_(Pok%C3%A9mon)'
+    assert p.img_url == '//archives.bulbagarden.net/media/upload/thumb/3/35/037Vulpix-Alola.png/70px-037Vulpix-Alola.png'
+    assert p.img_filename.endswith('vulpix.png')
+    assert p.json_filename.endswith('vulpix.json')
+    assert p.description == ''
+    assert p.appears_in_book == False
+
+def test_extend_pokemon(tmp_path):
+    national_index = "test/test_pokedex.html"
+    row_soups = pokemon.get_pokemon_table_row_soups(national_index)
+    p = pokemon.extract_pokemon_from_table_row(row_soups[42])
+    p.img_filename = tmp_path / 'vulpix.png'
+    pokemon.extend_pokemon(p)
+    assert filecmp.cmp(p.img_filename, 'test/test_vulpix.png')
+    assert p.description.startswith("Vulpix (Japanese: \u30ed\u30b3\u30f3 Rokon)")
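
Note that `test_download_national_index_html` and `test_extend_pokemon` download from Bulbapedia, so they need network access, while the other two run entirely against the checked-in `test/test_pokedex.html` fixture; something like `pipenv run pytest -k test_extract_pokemon_from_table_row` should run a single test in isolation.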

test/test_vulpix.png Normal file (BIN)

Binary file not shown. (Size: 52 KiB)