1
.gitignore
vendored
1
.gitignore
vendored
@@ -7,6 +7,7 @@ __pycache__/
|
|||||||
pokemon
|
pokemon
|
||||||
tmp
|
tmp
|
||||||
ptoos.epub
|
ptoos.epub
|
||||||
|
ptoos-with-links.epub
|
||||||
|
|
||||||
# C extensions
|
# C extensions
|
||||||
*.so
|
*.so
|
||||||
|
|||||||
16
README.md
16
README.md
@@ -5,8 +5,24 @@ to descriptions and pictures of the Pokemon within the e-book itself.
|
|||||||
|
|
||||||
It works with the epub that you can download from [Daystar Eld's Patreon](https://www.patreon.com/daystareld/).
|
It works with the epub that you can download from [Daystar Eld's Patreon](https://www.patreon.com/daystareld/).
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
|
pip install --user pipenv
|
||||||
pipenv install
|
pipenv install
|
||||||
pipenv shell
|
pipenv shell
|
||||||
python ptoos-xray.py "DaystarEld - Pokemon The Origin of Species.epub"
|
python ptoos-xray.py "DaystarEld - Pokemon The Origin of Species.epub"
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Run tests
|
||||||
|
|
||||||
|
```shell
|
||||||
|
pipenv install --dev
|
||||||
|
pipenv run pytest
|
||||||
|
```
|
||||||
|
|
||||||
|
## Credits
|
||||||
|
|
||||||
|
Full credit for the Pokemon names, images, and descriptions goes to
|
||||||
|
[Bulbapedia](https://bulbapedia.bulbagarden.net) under
|
||||||
|
the [Creative Commons Attribution-NonCommercial-ShareAlike 2.5](https://creativecommons.org/licenses/by-nc-sa/2.5/) license.
|
||||||
2
pytest.ini
Normal file
2
pytest.ini
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
[pytest]
|
||||||
|
pythonpath = src
|
||||||
17
src/epub.py
17
src/epub.py
@@ -1,6 +1,7 @@
|
|||||||
import ebooklib
|
import ebooklib
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
|
import sys
|
||||||
from bs4 import BeautifulSoup, Tag
|
from bs4 import BeautifulSoup, Tag
|
||||||
from bs4.element import NavigableString
|
from bs4.element import NavigableString
|
||||||
from ebooklib import epub
|
from ebooklib import epub
|
||||||
@@ -24,7 +25,7 @@ def create_pokedex_chapter(pokemon: List[Pokemon]) -> epub.EpubHtml:
|
|||||||
for p in pokemon:
|
for p in pokemon:
|
||||||
content.append(f'<h2 id="{POKEMON_ID_PREFIX}{p.name.lower()}">{p.name}</h2>')
|
content.append(f'<h2 id="{POKEMON_ID_PREFIX}{p.name.lower()}">{p.name}</h2>')
|
||||||
content.append(
|
content.append(
|
||||||
f' <p><img alt="[Pokemon {p.name}]" src="../{p.img_filepath}"/><br/></p>'
|
f' <p><img alt="[Pokemon {p.name}]" src="../{p.img_filename}"/><br/></p>'
|
||||||
)
|
)
|
||||||
for paragraph in p.description.split("\n"):
|
for paragraph in p.description.split("\n"):
|
||||||
content.append(f" <p>{paragraph}</p>")
|
content.append(f" <p>{paragraph}</p>")
|
||||||
@@ -80,8 +81,12 @@ def patch_chapter(chapter: epub.EpubHtml, pokemon_lookup: Dict[str, Pokemon]):
|
|||||||
chapter.content = str(soup)
|
chapter.content = str(soup)
|
||||||
|
|
||||||
|
|
||||||
def patch(epub_filepath: str, pokemon: List[Pokemon]):
|
def patch(epub_filename: str, pokemon: List[Pokemon]):
|
||||||
book = epub.read_epub(epub_filepath)
|
try:
|
||||||
|
book = epub.read_epub(epub_filename)
|
||||||
|
except Exception:
|
||||||
|
logging.exception("Failed to open epub.")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
pokemon_lookup = {p.name.lower(): p for p in pokemon}
|
pokemon_lookup = {p.name.lower(): p for p in pokemon}
|
||||||
chapters = [
|
chapters = [
|
||||||
@@ -103,17 +108,17 @@ def patch(epub_filepath: str, pokemon: List[Pokemon]):
|
|||||||
book.spine.append((chapter.id, "yes"))
|
book.spine.append((chapter.id, "yes"))
|
||||||
|
|
||||||
for p in pokemon:
|
for p in pokemon:
|
||||||
image_content = open(p.img_filepath, "rb").read()
|
image_content = open(p.img_filename, "rb").read()
|
||||||
img = epub.EpubItem(
|
img = epub.EpubItem(
|
||||||
uid=p.name,
|
uid=p.name,
|
||||||
file_name=p.img_filepath,
|
file_name=p.img_filename,
|
||||||
media_type="image/png",
|
media_type="image/png",
|
||||||
content=image_content,
|
content=image_content,
|
||||||
)
|
)
|
||||||
book.add_item(img)
|
book.add_item(img)
|
||||||
|
|
||||||
console = Console()
|
console = Console()
|
||||||
epub_out = epub_filepath.replace(".", "-with-links.")
|
epub_out = epub_filename.replace(".", "-with-links.")
|
||||||
with console.status(f"Writing {epub_out}"):
|
with console.status(f"Writing {epub_out}"):
|
||||||
epub.write_epub(epub_out, book, {})
|
epub.write_epub(epub_out, book, {})
|
||||||
console.print(f"[green]✓[/green] [orange1]{epub_out}[/orange1] written")
|
console.print(f"[green]✓[/green] [orange1]{epub_out}[/orange1] written")
|
||||||
|
|||||||
@@ -11,11 +11,12 @@ def main():
|
|||||||
level=logging.INFO,
|
level=logging.INFO,
|
||||||
format="%(message)s",
|
format="%(message)s",
|
||||||
datefmt="[%X]",
|
datefmt="[%X]",
|
||||||
handlers=[RichHandler()],
|
handlers=[RichHandler(rich_tracebacks=True)],
|
||||||
)
|
)
|
||||||
try:
|
try:
|
||||||
ptoos_epub = sys.argv[1]
|
ptoos_epub = sys.argv[1]
|
||||||
except IndexError:
|
except IndexError:
|
||||||
ptoos_epub = "ptoos.epub"
|
ptoos_epub = "ptoos.epub"
|
||||||
|
logging.warning(f"No epub file provided. Defaulting to '{ptoos_epub}'.")
|
||||||
pokemon = src.pokemon.get_pokemon()
|
pokemon = src.pokemon.get_pokemon()
|
||||||
src.epub.patch(ptoos_epub, pokemon)
|
src.epub.patch(ptoos_epub, pokemon)
|
||||||
|
|||||||
136
src/pokemon.py
136
src/pokemon.py
@@ -20,17 +20,17 @@ class Pokemon(BaseModel):
|
|||||||
index: str
|
index: str
|
||||||
html_url: str
|
html_url: str
|
||||||
img_url: str
|
img_url: str
|
||||||
html_filepath: str
|
html_filename: str
|
||||||
img_filepath: str
|
img_filename: str
|
||||||
json_filepath: str
|
json_filename: str
|
||||||
description: str = ""
|
description: str = ""
|
||||||
appears_in_book: bool = False
|
appears_in_book: bool = False
|
||||||
|
|
||||||
|
|
||||||
def download_to_file(url: str, filepath: str, override=False):
|
def download_to_file(url: str, filename: str, override=False):
|
||||||
"""Downloads url into filepath."""
|
"""Downloads url into filename."""
|
||||||
if os.path.isfile(filepath) and override is False:
|
if os.path.isfile(filename) and override is False:
|
||||||
logging.debug(f"'{filepath}' exists.")
|
logging.debug(f"'{filename}' exists.")
|
||||||
return
|
return
|
||||||
|
|
||||||
headers = {
|
headers = {
|
||||||
@@ -38,72 +38,88 @@ def download_to_file(url: str, filepath: str, override=False):
|
|||||||
}
|
}
|
||||||
r = requests.get(url, headers=headers)
|
r = requests.get(url, headers=headers)
|
||||||
if r.status_code != 200:
|
if r.status_code != 200:
|
||||||
logging.warning(f"Could not download '{filepath}'")
|
logging.critical(f"Could not download '{filename}'.")
|
||||||
return
|
sys.exit(1)
|
||||||
|
|
||||||
# Works for text and images
|
# Works for text and images
|
||||||
with open(filepath, "wb") as f:
|
with open(filename, "wb") as f:
|
||||||
for c in r:
|
for c in r:
|
||||||
f.write(c)
|
f.write(c)
|
||||||
logging.debug(f"'{filepath}' downloaded.")
|
logging.debug(f"'{filename}' downloaded.")
|
||||||
|
|
||||||
|
|
||||||
|
def download_national_index_html(national_index_filename: str):
    """Store the national index page at ``national_index_filename``.

    Delegates to ``download_to_file`` with ``NATIONAL_INDEX_URL`` as the
    source and the default ``override`` setting.
    """
    source_url = NATIONAL_INDEX_URL
    download_to_file(source_url, national_index_filename)
|
||||||
|
|
||||||
|
|
||||||
|
def get_pokemon_table_row_soups(national_index_filename: str) -> List[BeautifulSoup]:
    """Return the Pokemon table rows found in a saved national index page.

    The page is parsed with ``html.parser``. Starting from the parent of the
    element with id ``List_of_Pokémon_by_National_Pokédex_number``, every
    following ``h3`` sibling is treated as one generation; for each of them
    the next ``tbody`` is located and all of its direct ``tr`` children except
    the first (the header row) are collected.

    Args:
        national_index_filename: Path of the locally stored index HTML file.

    Returns:
        The collected ``tr`` elements, in document order.
    """
    # The page contains non-ASCII text (see the accented id below), so decode
    # it explicitly instead of relying on the platform's default encoding.
    with open(national_index_filename, "r", encoding="utf-8") as r:
        soup = BeautifulSoup(r, "html.parser")
    pokemon_list_soup = soup.find(id="List_of_Pokémon_by_National_Pokédex_number").parent
    generation_soups = pokemon_list_soup.find_next_siblings("h3")

    table_row_soups = []
    for generation_soup in generation_soups:
        tbody_soup = generation_soup.find_next("tbody")
        # skip first row because it is the header
        table_row_soups += tbody_soup.find_all("tr", recursive=False)[1:]
    return table_row_soups
|
||||||
|
|
||||||
|
|
||||||
|
def extract_pokemon_from_table_row(table_row_soup: BeautifulSoup) -> Pokemon:
    """Build a ``Pokemon`` from one row of the national index table.

    When ``<name>.json`` already exists in ``POKEMON_CACHE_DIRECTORY`` the
    cached entry is parsed and returned as is. Otherwise the index, page URL
    and image URL are read from the row, and the html/png/json file names are
    derived from the lower-cased name inside the cache directory.
    """

    def cache_file(extension: str) -> str:
        # All cache files of a Pokemon share the lower-cased name as stem.
        return os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + extension)

    # The element following the row's first header cell carries both the
    # Pokemon name ("title") and the relative page link ("href").
    link = table_row_soup.find_next("th").next_element
    name = link.attrs["title"]

    # load Pokemon from JSON if it already exists
    json_filename = cache_file(".json")
    if os.path.isfile(json_filename):
        cached = Pokemon.parse_file(json_filename)
        logging.debug(f"Loaded '{cached.json_filename}'.")
        return cached

    index_cell = table_row_soup.find_next("td").next_sibling.next_sibling
    index = index_cell.text.strip()
    html_url = BULBAPEDIA_BASE_URL + link.attrs["href"]
    img_url = table_row_soup.find("img").attrs["src"]

    return Pokemon(
        name=name,
        index=index,
        html_url=html_url,
        img_url=img_url,
        html_filename=cache_file(".html"),
        img_filename=cache_file(".png"),
        json_filename=json_filename,
    )
|
||||||
|
|
||||||
|
|
||||||
def get_pokemon() -> List[Pokemon]:
|
def get_pokemon() -> List[Pokemon]:
|
||||||
"""Scrape Pokemon from the Bulbapedia national dex"""
|
"""Scrape Pokemon from the Bulbapedia national dex"""
|
||||||
NATIONAL_INDEX_FILEPATH = os.path.join(POKEMON_CACHE_DIRECTORY, "pokedex.html")
|
if not os.path.isdir(POKEMON_CACHE_DIRECTORY):
|
||||||
download_to_file(NATIONAL_INDEX_URL, NATIONAL_INDEX_FILEPATH)
|
os.mkdir(POKEMON_CACHE_DIRECTORY)
|
||||||
with open(NATIONAL_INDEX_FILEPATH, "r") as r:
|
national_index_filename = os.path.join(POKEMON_CACHE_DIRECTORY, "pokedex.html")
|
||||||
soup = BeautifulSoup(r, "html.parser")
|
download_national_index_html(national_index_filename)
|
||||||
pokemon_list_soup: BeautifulSoup = soup.find(
|
table_row_soups = get_pokemon_table_row_soups(national_index_filename)
|
||||||
id="List_of_Pokémon_by_National_Pokédex_number"
|
|
||||||
).parent
|
|
||||||
generation_soups: BeautifulSoup = pokemon_list_soup.find_next_siblings("h3")
|
|
||||||
|
|
||||||
table_row_soups = []
|
|
||||||
for generation_soup in generation_soups:
|
|
||||||
table_soup: BeautifulSoup = generation_soup.find_next_sibling("table")
|
|
||||||
tbody_soup: BeautifulSoup = generation_soup.find_next("tbody")
|
|
||||||
# skip first row because it is the header
|
|
||||||
table_row_soups += tbody_soup.find_all("tr", recursive=False)[1:]
|
|
||||||
|
|
||||||
pokemon = []
|
pokemon = []
|
||||||
for table_row_soup in track(table_row_soups, description="Download Pokemon"):
|
for table_row_soup in track(table_row_soups, description="Download Pokemon"):
|
||||||
name = table_row_soup.find_next("th").next_element.attrs["title"]
|
p = extract_pokemon_from_table_row(table_row_soup)
|
||||||
|
|
||||||
# ignore Galarian and Alolan Pokemon so
|
# Ignore Galarian and Alolan Pokemon (Pokemon with the same name)
|
||||||
if pokemon and pokemon[-1].name == name:
|
if pokemon and pokemon[-1].name == p.name:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# load Pokemon from JSON if it already exists
|
|
||||||
json_filepath = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".json")
|
|
||||||
if os.path.isfile(json_filepath):
|
|
||||||
p = Pokemon.parse_file(json_filepath)
|
|
||||||
pokemon.append(p)
|
|
||||||
logging.debug(f"Loaded {p.json_filepath}.")
|
|
||||||
continue
|
|
||||||
|
|
||||||
index = table_row_soup.find_next("td").next_sibling.next_sibling.text.strip()
|
|
||||||
html_url = (
|
|
||||||
BULBAPEDIA_BASE_URL
|
|
||||||
+ table_row_soup.find_next("th").next_element.attrs["href"]
|
|
||||||
)
|
|
||||||
img_url = table_row_soup.find("img").attrs["src"]
|
|
||||||
html_filepath = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".html")
|
|
||||||
img_filepath = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".png")
|
|
||||||
p = Pokemon(
|
|
||||||
name=name,
|
|
||||||
index=index,
|
|
||||||
html_url=html_url,
|
|
||||||
img_url=img_url,
|
|
||||||
html_filepath=html_filepath,
|
|
||||||
img_filepath=img_filepath,
|
|
||||||
json_filepath=json_filepath,
|
|
||||||
)
|
|
||||||
pokemon.append(p)
|
pokemon.append(p)
|
||||||
|
|
||||||
|
# Pokemon has already been downloaded
|
||||||
|
if p.description and os.path.isfile(p.img_filename):
|
||||||
|
continue
|
||||||
|
|
||||||
extend_pokemon(p)
|
extend_pokemon(p)
|
||||||
with open(p.json_filepath, "w") as f:
|
with open(p.json_filename, "w") as f:
|
||||||
f.write(p.json())
|
f.write(p.json())
|
||||||
logging.debug(f"Saved {p.json_filepath}.")
|
logging.debug(f"Saved {p.json_filename}.")
|
||||||
|
|
||||||
# Filter out speculative Pokemon
|
# Filter out speculative Pokemon
|
||||||
pokemon = [
|
pokemon = [
|
||||||
@@ -117,8 +133,8 @@ def get_pokemon() -> List[Pokemon]:
|
|||||||
|
|
||||||
def extend_pokemon(p: Pokemon):
|
def extend_pokemon(p: Pokemon):
|
||||||
"""Add description and download Pokemon image"""
|
"""Add description and download Pokemon image"""
|
||||||
download_to_file(p.html_url, p.html_filepath)
|
download_to_file(p.html_url, p.html_filename)
|
||||||
with open(p.html_filepath, "r") as r:
|
with open(p.html_filename, "r") as r:
|
||||||
soup = BeautifulSoup(r, "html.parser")
|
soup = BeautifulSoup(r, "html.parser")
|
||||||
content_soup: BeautifulSoup = soup.find(id="mw-content-text").contents[0]
|
content_soup: BeautifulSoup = soup.find(id="mw-content-text").contents[0]
|
||||||
|
|
||||||
@@ -136,4 +152,4 @@ def extend_pokemon(p: Pokemon):
|
|||||||
)
|
)
|
||||||
img_url = img_url.replace("//", "https://")
|
img_url = img_url.replace("//", "https://")
|
||||||
p.img_url = img_url
|
p.img_url = img_url
|
||||||
download_to_file(img_url, p.img_filepath)
|
download_to_file(img_url, p.img_filename)
|
||||||
|
|||||||
12149
test/pokedex.html
Normal file
12149
test/pokedex.html
Normal file
File diff suppressed because one or more lines are too long
0
test/test_epub.py
Normal file
0
test/test_epub.py
Normal file
12149
test/test_pokedex.html
Normal file
12149
test/test_pokedex.html
Normal file
File diff suppressed because one or more lines are too long
37
test/test_pokemon.py
Normal file
37
test/test_pokemon.py
Normal file
@@ -0,0 +1,37 @@
|
|||||||
|
import pokemon
|
||||||
|
import os
|
||||||
|
import filecmp
|
||||||
|
|
||||||
|
|
||||||
|
def test_download_national_index_html(tmp_path):
    """Download the national index into a temp directory and check its size."""
    target = tmp_path / "pokedex.html"
    pokemon.download_national_index_html(target)
    size_in_bytes = os.path.getsize(target)
    assert size_in_bytes > 500000
|
||||||
|
|
||||||
|
def test_get_pokemon_table_row_soups():
    """The stored index fixture must yield 994 table rows."""
    rows = pokemon.get_pokemon_table_row_soups("test/test_pokedex.html")
    assert len(rows) == 994
|
||||||
|
|
||||||
|
def test_extract_pokemon_from_table_row(tmp_path):
    """Row 42 of the index fixture must be extracted as a fresh Vulpix entry."""
    national_index = "test/test_pokedex.html"
    # Use an empty cache directory so no cached JSON is loaded and the values
    # asserted below come from the table row itself.
    pokemon.POKEMON_CACHE_DIRECTORY = tmp_path
    row_soups = pokemon.get_pokemon_table_row_soups(national_index)
    p = pokemon.extract_pokemon_from_table_row(row_soups[42])
    assert p.name == 'Vulpix'
    assert p.index == '#037'
    assert p.html_url == 'https://bulbapedia.bulbagarden.net/wiki/Vulpix_(Pok%C3%A9mon)'
    assert p.img_url == '//archives.bulbagarden.net/media/upload/thumb/3/35/037Vulpix-Alola.png/70px-037Vulpix-Alola.png'
    assert p.img_filename.endswith('vulpix.png')
    assert p.json_filename.endswith('vulpix.json')
    assert p.description == ''
    # Identity check instead of `== False` (PEP 8 / E712).
    assert p.appears_in_book is False
|
||||||
|
|
||||||
|
def test_extend_pokemon(tmp_path, monkeypatch):
    """Extending Vulpix must download its image and fill in the description."""
    national_index = "test/test_pokedex.html"
    # Point the cache at a fresh directory so the test neither relies on state
    # left behind by another test nor needs the real cache directory to exist.
    # monkeypatch restores the original value when the test finishes.
    monkeypatch.setattr(pokemon, "POKEMON_CACHE_DIRECTORY", tmp_path)
    row_soups = pokemon.get_pokemon_table_row_soups(national_index)
    p = pokemon.extract_pokemon_from_table_row(row_soups[42])
    # Pokemon.img_filename is declared as str, so store a str, not a Path.
    p.img_filename = str(tmp_path / 'vulpix.png')
    pokemon.extend_pokemon(p)
    assert filecmp.cmp(p.img_filename, 'test/test_vulpix.png')
    assert p.description.startswith("Vulpix (Japanese: \u30ed\u30b3\u30f3 Rokon)")
|
||||||
BIN
test/test_vulpix.png
Normal file
BIN
test/test_vulpix.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 52 KiB |
Reference in New Issue
Block a user