Update readme and format scripts
parent
1248d9d750
commit
8200225780
|
@ -1,9 +1,12 @@
|
|||
# poos-xray
|
||||
|
||||
Script that annotates Pokemon: the Origin of the Species epub with links to
|
||||
descriptions of the Pokemon.
|
||||
Script that annotates the Pokemon: The Origin of Species e-book with links
|
||||
to descriptions and pictures of the Pokemon within the e-book itself.
|
||||
|
||||
It works with the epub that you can download from [Daystar Eld's Patreon](https://www.patreon.com/daystareld/).
|
||||
|
||||
```shell
|
||||
pipenv install
|
||||
pipenv shell
|
||||
python poos-xray "DaystarEld - Pokemon The Origin of Species.epub"
|
||||
```
|
||||
|
|
49
src/epub.py
49
src/epub.py
|
@ -9,26 +9,29 @@ from typing import List, Dict
|
|||
|
||||
POKEMON_ID_PREFIX = "pokemon-id-"
|
||||
|
||||
|
||||
def create_pokedex_chapter(pokemon: List[Pokemon]) -> epub.EpubHtml:
|
||||
POKEDEX_TITLE = "Pokedex"
|
||||
POKEDEX_FILE = "content/np_pokedex.xhtml"
|
||||
POKEDEX_UID = "np_pokedex"
|
||||
chapter = epub.EpubHtml(title=POKEDEX_TITLE, file_name=POKEDEX_FILE, uid=POKEDEX_UID)
|
||||
content = ['<h1>Pokedex</h1>']
|
||||
chapter = epub.EpubHtml(
|
||||
title=POKEDEX_TITLE, file_name=POKEDEX_FILE, uid=POKEDEX_UID
|
||||
)
|
||||
content = ["<h1>Pokedex</h1>"]
|
||||
|
||||
for p in pokemon:
|
||||
content.append(f'<h2 id="{POKEMON_ID_PREFIX}{p.name.lower()}">{p.name}</h2>')
|
||||
content.append(f' <p><img alt="[Pokemon {p.name}]" src="../{p.img_filepath}"/><br/></p>')
|
||||
content.append(
|
||||
f' <p><img alt="[Pokemon {p.name}]" src="../{p.img_filepath}"/><br/></p>'
|
||||
)
|
||||
for paragraph in p.description.split("\n"):
|
||||
content.append(f' <p>{paragraph}</p>')
|
||||
content.append('')
|
||||
content.append(f" <p>{paragraph}</p>")
|
||||
content.append("")
|
||||
|
||||
chapter.content = "\n".join(content)
|
||||
return chapter
|
||||
|
||||
|
||||
|
||||
|
||||
def patch_chapter(chapter: epub.EpubHtml, pokemon_lookup: Dict[str, Pokemon]):
|
||||
r = re.compile("([:,.!?“”‘’…])")
|
||||
soup: BeautifulSoup = BeautifulSoup(chapter.content, "html.parser")
|
||||
|
@ -41,8 +44,8 @@ def patch_chapter(chapter: epub.EpubHtml, pokemon_lookup: Dict[str, Pokemon]):
|
|||
return tag
|
||||
|
||||
def patch_string(section: NavigableString) -> List:
|
||||
""" Replace Pokemon with link to Pokemon; requires splitting up the
|
||||
NavigableString into a list of NavigableStrings and Tags. """
|
||||
"""Replace Pokemon with link to Pokemon; requires splitting up the
|
||||
NavigableString into a list of NavigableStrings and Tags."""
|
||||
result = [[]]
|
||||
for word in str(section).split(" "):
|
||||
word_stripped = r.sub("", word)
|
||||
|
@ -55,7 +58,9 @@ def patch_chapter(chapter: epub.EpubHtml, pokemon_lookup: Dict[str, Pokemon]):
|
|||
else:
|
||||
# add other chars before pokemon if there are any
|
||||
result[-1].append("".join(word_split[:i]))
|
||||
pokemon_link = pokemon_name_to_link(word_stripped.lower(), word_stripped)
|
||||
pokemon_link = pokemon_name_to_link(
|
||||
word_stripped.lower(), word_stripped
|
||||
)
|
||||
result.append(pokemon_link)
|
||||
result.append([])
|
||||
if i + 1 == len(word_split):
|
||||
|
@ -63,7 +68,7 @@ def patch_chapter(chapter: epub.EpubHtml, pokemon_lookup: Dict[str, Pokemon]):
|
|||
result[-1].append(" ")
|
||||
else:
|
||||
# add other chars after pokemon if there are any
|
||||
result[-1].append("".join(word_split[i + 1:]))
|
||||
result[-1].append("".join(word_split[i + 1 :]))
|
||||
else:
|
||||
result[-1].append(word)
|
||||
|
||||
|
@ -96,20 +101,28 @@ def patch(epub_filepath: str, pokemon: List[Pokemon]):
|
|||
book.add_item(chapter)
|
||||
link = epub.Link(chapter.file_name, chapter.title, chapter.id)
|
||||
book.toc.append(link)
|
||||
book.spine.append((chapter.id, 'yes'))
|
||||
book.spine.append((chapter.id, "yes"))
|
||||
|
||||
for p in pokemon:
|
||||
image_content = open(p.img_filepath, 'rb').read()
|
||||
img = epub.EpubItem(uid=p.name, file_name=p.img_filepath, media_type='image/png', content=image_content)
|
||||
image_content = open(p.img_filepath, "rb").read()
|
||||
img = epub.EpubItem(
|
||||
uid=p.name,
|
||||
file_name=p.img_filepath,
|
||||
media_type="image/png",
|
||||
content=image_content,
|
||||
)
|
||||
book.add_item(img)
|
||||
|
||||
pokemon_lookup = {p.name.lower(): p for p in pokemon}
|
||||
chapters = [b for b in book.get_items()
|
||||
if isinstance(b, epub.EpubHtml)
|
||||
if b.id.startswith("np_")]
|
||||
chapters = [
|
||||
b
|
||||
for b in book.get_items()
|
||||
if isinstance(b, epub.EpubHtml)
|
||||
if b.id.startswith("np_")
|
||||
]
|
||||
for c in chapters:
|
||||
patch_chapter(c, pokemon_lookup)
|
||||
|
||||
epub_out = epub_filepath.replace(".", "-with-links.")
|
||||
epub.write_epub(epub_out, book, {})
|
||||
logging.info(f"{epub_out} written.")
|
||||
logging.info(f"Write '{epub_out}'.")
|
||||
|
|
14
src/main.py
14
src/main.py
|
@ -1,13 +1,15 @@
|
|||
import sys
|
||||
import logging
|
||||
import src.pokemon
|
||||
import src.epub
|
||||
|
||||
|
||||
def init_logging():
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
|
||||
def main():
|
||||
init_logging()
|
||||
logging.basicConfig(format="%(message)s", level=logging.INFO)
|
||||
try:
|
||||
ptoos_epub = sys.argv[1]
|
||||
except IndexError:
|
||||
ptoos_epub = "poos.epub"
|
||||
logging.info(f"Patching '{ptoos_epub}'.")
|
||||
pokemon = src.pokemon.get_pokemon()
|
||||
src.epub.patch("poos.epub", pokemon)
|
||||
src.epub.patch(ptoos_epub, pokemon)
|
||||
|
|
|
@ -9,7 +9,9 @@ from typing import List
|
|||
|
||||
POKEMON_CACHE_DIRECTORY = "pokemon"
|
||||
BULBAPEDIA_BASE_URL = "https://bulbapedia.bulbagarden.net"
|
||||
NATIONAL_INDEX_URL = BULBAPEDIA_BASE_URL + "/wiki/List_of_Pok%C3%A9mon_by_National_Pok%C3%A9dex_number"
|
||||
NATIONAL_INDEX_URL = (
|
||||
BULBAPEDIA_BASE_URL + "/wiki/List_of_Pok%C3%A9mon_by_National_Pok%C3%A9dex_number"
|
||||
)
|
||||
|
||||
|
||||
class Pokemon(BaseModel):
|
||||
|
@ -24,13 +26,13 @@ class Pokemon(BaseModel):
|
|||
|
||||
|
||||
def download_to_file(url: str, filepath: str, override=False):
|
||||
""" Downloads url into filepath. """
|
||||
"""Downloads url into filepath."""
|
||||
if os.path.isfile(filepath) and override is False:
|
||||
logging.debug(f"'{filepath}' exists.")
|
||||
return
|
||||
|
||||
headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:91.0) Gecko/20100101 Firefox/91.0'
|
||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; rv:91.0) Gecko/20100101 Firefox/91.0"
|
||||
}
|
||||
r = requests.get(url, headers=headers)
|
||||
if r.status_code != 200:
|
||||
|
@ -45,12 +47,14 @@ def download_to_file(url: str, filepath: str, override=False):
|
|||
|
||||
|
||||
def get_pokemon() -> List[Pokemon]:
|
||||
""" Scrape Pokemon from the Bulbapedia national dex """
|
||||
"""Scrape Pokemon from the Bulbapedia national dex"""
|
||||
NATIONAL_INDEX_FILEPATH = os.path.join(POKEMON_CACHE_DIRECTORY, "pokedex.html")
|
||||
download_to_file(NATIONAL_INDEX_URL, NATIONAL_INDEX_FILEPATH)
|
||||
with open(NATIONAL_INDEX_FILEPATH, "r") as r:
|
||||
soup = BeautifulSoup(r, "html.parser")
|
||||
pokemon_list_soup: BeautifulSoup = soup.find(id="List_of_Pokémon_by_National_Pokédex_number").parent
|
||||
pokemon_list_soup: BeautifulSoup = soup.find(
|
||||
id="List_of_Pokémon_by_National_Pokédex_number"
|
||||
).parent
|
||||
generation_soups: BeautifulSoup = pokemon_list_soup.find_next_siblings("h3")
|
||||
|
||||
table_row_soups = []
|
||||
|
@ -77,48 +81,58 @@ def get_pokemon() -> List[Pokemon]:
|
|||
continue
|
||||
|
||||
index = table_row_soup.find_next("td").next_sibling.next_sibling.text.strip()
|
||||
html_url = BULBAPEDIA_BASE_URL + table_row_soup.find_next("th").next_element.attrs["href"]
|
||||
html_url = (
|
||||
BULBAPEDIA_BASE_URL
|
||||
+ table_row_soup.find_next("th").next_element.attrs["href"]
|
||||
)
|
||||
img_url = table_row_soup.find("img").attrs["src"]
|
||||
html_filepath = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".html")
|
||||
img_filepath = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".png")
|
||||
p = Pokemon(name=name,
|
||||
index=index,
|
||||
html_url=html_url,
|
||||
img_url=img_url,
|
||||
html_filepath=html_filepath,
|
||||
img_filepath=img_filepath,
|
||||
json_filepath=json_filepath)
|
||||
p = Pokemon(
|
||||
name=name,
|
||||
index=index,
|
||||
html_url=html_url,
|
||||
img_url=img_url,
|
||||
html_filepath=html_filepath,
|
||||
img_filepath=img_filepath,
|
||||
json_filepath=json_filepath,
|
||||
)
|
||||
pokemon.append(p)
|
||||
extend_pokemon(p)
|
||||
with open(p.json_filepath, 'w') as f:
|
||||
with open(p.json_filepath, "w") as f:
|
||||
f.write(p.json())
|
||||
logging.info(f"Saved {p.json_filepath}.")
|
||||
|
||||
# Filter out speculative Pokemon
|
||||
pokemon = [p for p in pokemon if not p.description.startswith("This article's contents will change")]
|
||||
pokemon = [
|
||||
p
|
||||
for p in pokemon
|
||||
if not p.description.startswith("This article's contents will change")
|
||||
]
|
||||
|
||||
logging.info("Pokemon loaded.")
|
||||
return pokemon
|
||||
|
||||
|
||||
def extend_pokemon(p: Pokemon):
|
||||
""" Add description and download Pokemon image """
|
||||
"""Add description and download Pokemon image"""
|
||||
download_to_file(p.html_url, p.html_filepath)
|
||||
with open(p.html_filepath, "r") as r:
|
||||
soup = BeautifulSoup(r, "html.parser")
|
||||
content_soup: BeautifulSoup = soup.find(id='mw-content-text').contents[0]
|
||||
content_soup: BeautifulSoup = soup.find(id="mw-content-text").contents[0]
|
||||
|
||||
# description
|
||||
p_soup = content_soup.find("p")
|
||||
description = []
|
||||
while p_soup.name == 'p':
|
||||
while p_soup.name == "p":
|
||||
description.append(p_soup.get_text())
|
||||
p_soup = p_soup.next_sibling
|
||||
p.description = "".join(description)
|
||||
|
||||
# image
|
||||
img_url = content_soup.find("table").find_next_sibling("table").find("img").attrs["src"]
|
||||
img_url = (
|
||||
content_soup.find("table").find_next_sibling("table").find("img").attrs["src"]
|
||||
)
|
||||
img_url = img_url.replace("//", "https://")
|
||||
p.img_url = img_url
|
||||
download_to_file(img_url, p.img_filepath)
|
||||
|
||||
|
|
Loading…
Reference in New Issue