Update readme and format scripts
This commit is contained in:
@@ -1,9 +1,12 @@
|
|||||||
# poos-xray
|
# poos-xray
|
||||||
|
|
||||||
Script that annotates Pokemon: the Origin of the Species epub with links to
|
Script that annotates the Pokemon: The Origin of Species e-book with links
|
||||||
descriptions of the Pokemon.
|
to descriptions and pictures of the Pokemon within the e-book itself.
|
||||||
|
|
||||||
|
It works with the epub that you can download from [Daystar Eld's Patreon](https://www.patreon.com/daystareld/).
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
pipenv install
|
pipenv install
|
||||||
pipenv shell
|
pipenv shell
|
||||||
|
python poos-xray "DaystarEld - Pokemon The Origin of Species.epub"
|
||||||
```
|
```
|
||||||
|
|||||||
49
src/epub.py
49
src/epub.py
@@ -9,26 +9,29 @@ from typing import List, Dict
|
|||||||
|
|
||||||
POKEMON_ID_PREFIX = "pokemon-id-"
|
POKEMON_ID_PREFIX = "pokemon-id-"
|
||||||
|
|
||||||
|
|
||||||
def create_pokedex_chapter(pokemon: List[Pokemon]) -> epub.EpubHtml:
|
def create_pokedex_chapter(pokemon: List[Pokemon]) -> epub.EpubHtml:
|
||||||
POKEDEX_TITLE = "Pokedex"
|
POKEDEX_TITLE = "Pokedex"
|
||||||
POKEDEX_FILE = "content/np_pokedex.xhtml"
|
POKEDEX_FILE = "content/np_pokedex.xhtml"
|
||||||
POKEDEX_UID = "np_pokedex"
|
POKEDEX_UID = "np_pokedex"
|
||||||
chapter = epub.EpubHtml(title=POKEDEX_TITLE, file_name=POKEDEX_FILE, uid=POKEDEX_UID)
|
chapter = epub.EpubHtml(
|
||||||
content = ['<h1>Pokedex</h1>']
|
title=POKEDEX_TITLE, file_name=POKEDEX_FILE, uid=POKEDEX_UID
|
||||||
|
)
|
||||||
|
content = ["<h1>Pokedex</h1>"]
|
||||||
|
|
||||||
for p in pokemon:
|
for p in pokemon:
|
||||||
content.append(f'<h2 id="{POKEMON_ID_PREFIX}{p.name.lower()}">{p.name}</h2>')
|
content.append(f'<h2 id="{POKEMON_ID_PREFIX}{p.name.lower()}">{p.name}</h2>')
|
||||||
content.append(f' <p><img alt="[Pokemon {p.name}]" src="../{p.img_filepath}"/><br/></p>')
|
content.append(
|
||||||
|
f' <p><img alt="[Pokemon {p.name}]" src="../{p.img_filepath}"/><br/></p>'
|
||||||
|
)
|
||||||
for paragraph in p.description.split("\n"):
|
for paragraph in p.description.split("\n"):
|
||||||
content.append(f' <p>{paragraph}</p>')
|
content.append(f" <p>{paragraph}</p>")
|
||||||
content.append('')
|
content.append("")
|
||||||
|
|
||||||
chapter.content = "\n".join(content)
|
chapter.content = "\n".join(content)
|
||||||
return chapter
|
return chapter
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def patch_chapter(chapter: epub.EpubHtml, pokemon_lookup: Dict[str, Pokemon]):
|
def patch_chapter(chapter: epub.EpubHtml, pokemon_lookup: Dict[str, Pokemon]):
|
||||||
r = re.compile("([:,.!?“”‘’…])")
|
r = re.compile("([:,.!?“”‘’…])")
|
||||||
soup: BeautifulSoup = BeautifulSoup(chapter.content, "html.parser")
|
soup: BeautifulSoup = BeautifulSoup(chapter.content, "html.parser")
|
||||||
@@ -41,8 +44,8 @@ def patch_chapter(chapter: epub.EpubHtml, pokemon_lookup: Dict[str, Pokemon]):
|
|||||||
return tag
|
return tag
|
||||||
|
|
||||||
def patch_string(section: NavigableString) -> List:
|
def patch_string(section: NavigableString) -> List:
|
||||||
""" Replace Pokemon with link to Pokemon; requires splitting up the
|
"""Replace Pokemon with link to Pokemon; requires splitting up the
|
||||||
NavigableString into a list of NavigableStrings and Tags. """
|
NavigableString into a list of NavigableStrings and Tags."""
|
||||||
result = [[]]
|
result = [[]]
|
||||||
for word in str(section).split(" "):
|
for word in str(section).split(" "):
|
||||||
word_stripped = r.sub("", word)
|
word_stripped = r.sub("", word)
|
||||||
@@ -55,7 +58,9 @@ def patch_chapter(chapter: epub.EpubHtml, pokemon_lookup: Dict[str, Pokemon]):
|
|||||||
else:
|
else:
|
||||||
# add other chars before pokemon if there are any
|
# add other chars before pokemon if there are any
|
||||||
result[-1].append("".join(word_split[:i]))
|
result[-1].append("".join(word_split[:i]))
|
||||||
pokemon_link = pokemon_name_to_link(word_stripped.lower(), word_stripped)
|
pokemon_link = pokemon_name_to_link(
|
||||||
|
word_stripped.lower(), word_stripped
|
||||||
|
)
|
||||||
result.append(pokemon_link)
|
result.append(pokemon_link)
|
||||||
result.append([])
|
result.append([])
|
||||||
if i + 1 == len(word_split):
|
if i + 1 == len(word_split):
|
||||||
@@ -63,7 +68,7 @@ def patch_chapter(chapter: epub.EpubHtml, pokemon_lookup: Dict[str, Pokemon]):
|
|||||||
result[-1].append(" ")
|
result[-1].append(" ")
|
||||||
else:
|
else:
|
||||||
# add other chars after pokemon if there are any
|
# add other chars after pokemon if there are any
|
||||||
result[-1].append("".join(word_split[i + 1:]))
|
result[-1].append("".join(word_split[i + 1 :]))
|
||||||
else:
|
else:
|
||||||
result[-1].append(word)
|
result[-1].append(word)
|
||||||
|
|
||||||
@@ -96,20 +101,28 @@ def patch(epub_filepath: str, pokemon: List[Pokemon]):
|
|||||||
book.add_item(chapter)
|
book.add_item(chapter)
|
||||||
link = epub.Link(chapter.file_name, chapter.title, chapter.id)
|
link = epub.Link(chapter.file_name, chapter.title, chapter.id)
|
||||||
book.toc.append(link)
|
book.toc.append(link)
|
||||||
book.spine.append((chapter.id, 'yes'))
|
book.spine.append((chapter.id, "yes"))
|
||||||
|
|
||||||
for p in pokemon:
|
for p in pokemon:
|
||||||
image_content = open(p.img_filepath, 'rb').read()
|
image_content = open(p.img_filepath, "rb").read()
|
||||||
img = epub.EpubItem(uid=p.name, file_name=p.img_filepath, media_type='image/png', content=image_content)
|
img = epub.EpubItem(
|
||||||
|
uid=p.name,
|
||||||
|
file_name=p.img_filepath,
|
||||||
|
media_type="image/png",
|
||||||
|
content=image_content,
|
||||||
|
)
|
||||||
book.add_item(img)
|
book.add_item(img)
|
||||||
|
|
||||||
pokemon_lookup = {p.name.lower(): p for p in pokemon}
|
pokemon_lookup = {p.name.lower(): p for p in pokemon}
|
||||||
chapters = [b for b in book.get_items()
|
chapters = [
|
||||||
if isinstance(b, epub.EpubHtml)
|
b
|
||||||
if b.id.startswith("np_")]
|
for b in book.get_items()
|
||||||
|
if isinstance(b, epub.EpubHtml)
|
||||||
|
if b.id.startswith("np_")
|
||||||
|
]
|
||||||
for c in chapters:
|
for c in chapters:
|
||||||
patch_chapter(c, pokemon_lookup)
|
patch_chapter(c, pokemon_lookup)
|
||||||
|
|
||||||
epub_out = epub_filepath.replace(".", "-with-links.")
|
epub_out = epub_filepath.replace(".", "-with-links.")
|
||||||
epub.write_epub(epub_out, book, {})
|
epub.write_epub(epub_out, book, {})
|
||||||
logging.info(f"{epub_out} written.")
|
logging.info(f"Write '{epub_out}'.")
|
||||||
|
|||||||
14
src/main.py
14
src/main.py
@@ -1,13 +1,15 @@
|
|||||||
|
import sys
|
||||||
import logging
|
import logging
|
||||||
import src.pokemon
|
import src.pokemon
|
||||||
import src.epub
|
import src.epub
|
||||||
|
|
||||||
|
|
||||||
def init_logging():
|
|
||||||
logging.basicConfig(level=logging.INFO)
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
init_logging()
|
logging.basicConfig(format="%(message)s", level=logging.INFO)
|
||||||
|
try:
|
||||||
|
ptoos_epub = sys.argv[1]
|
||||||
|
except IndexError:
|
||||||
|
ptoos_epub = "poos.epub"
|
||||||
|
logging.info(f"Patching '{ptoos_epub}'.")
|
||||||
pokemon = src.pokemon.get_pokemon()
|
pokemon = src.pokemon.get_pokemon()
|
||||||
src.epub.patch("poos.epub", pokemon)
|
src.epub.patch(ptoos_epub, pokemon)
|
||||||
|
|||||||
@@ -9,7 +9,9 @@ from typing import List
|
|||||||
|
|
||||||
POKEMON_CACHE_DIRECTORY = "pokemon"
|
POKEMON_CACHE_DIRECTORY = "pokemon"
|
||||||
BULBAPEDIA_BASE_URL = "https://bulbapedia.bulbagarden.net"
|
BULBAPEDIA_BASE_URL = "https://bulbapedia.bulbagarden.net"
|
||||||
NATIONAL_INDEX_URL = BULBAPEDIA_BASE_URL + "/wiki/List_of_Pok%C3%A9mon_by_National_Pok%C3%A9dex_number"
|
NATIONAL_INDEX_URL = (
|
||||||
|
BULBAPEDIA_BASE_URL + "/wiki/List_of_Pok%C3%A9mon_by_National_Pok%C3%A9dex_number"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class Pokemon(BaseModel):
|
class Pokemon(BaseModel):
|
||||||
@@ -24,13 +26,13 @@ class Pokemon(BaseModel):
|
|||||||
|
|
||||||
|
|
||||||
def download_to_file(url: str, filepath: str, override=False):
|
def download_to_file(url: str, filepath: str, override=False):
|
||||||
""" Downloads url into filepath. """
|
"""Downloads url into filepath."""
|
||||||
if os.path.isfile(filepath) and override is False:
|
if os.path.isfile(filepath) and override is False:
|
||||||
logging.debug(f"'{filepath}' exists.")
|
logging.debug(f"'{filepath}' exists.")
|
||||||
return
|
return
|
||||||
|
|
||||||
headers = {
|
headers = {
|
||||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:91.0) Gecko/20100101 Firefox/91.0'
|
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; rv:91.0) Gecko/20100101 Firefox/91.0"
|
||||||
}
|
}
|
||||||
r = requests.get(url, headers=headers)
|
r = requests.get(url, headers=headers)
|
||||||
if r.status_code != 200:
|
if r.status_code != 200:
|
||||||
@@ -45,12 +47,14 @@ def download_to_file(url: str, filepath: str, override=False):
|
|||||||
|
|
||||||
|
|
||||||
def get_pokemon() -> List[Pokemon]:
|
def get_pokemon() -> List[Pokemon]:
|
||||||
""" Scrape Pokemon from the Bulbapedia national dex """
|
"""Scrape Pokemon from the Bulbapedia national dex"""
|
||||||
NATIONAL_INDEX_FILEPATH = os.path.join(POKEMON_CACHE_DIRECTORY, "pokedex.html")
|
NATIONAL_INDEX_FILEPATH = os.path.join(POKEMON_CACHE_DIRECTORY, "pokedex.html")
|
||||||
download_to_file(NATIONAL_INDEX_URL, NATIONAL_INDEX_FILEPATH)
|
download_to_file(NATIONAL_INDEX_URL, NATIONAL_INDEX_FILEPATH)
|
||||||
with open(NATIONAL_INDEX_FILEPATH, "r") as r:
|
with open(NATIONAL_INDEX_FILEPATH, "r") as r:
|
||||||
soup = BeautifulSoup(r, "html.parser")
|
soup = BeautifulSoup(r, "html.parser")
|
||||||
pokemon_list_soup: BeautifulSoup = soup.find(id="List_of_Pokémon_by_National_Pokédex_number").parent
|
pokemon_list_soup: BeautifulSoup = soup.find(
|
||||||
|
id="List_of_Pokémon_by_National_Pokédex_number"
|
||||||
|
).parent
|
||||||
generation_soups: BeautifulSoup = pokemon_list_soup.find_next_siblings("h3")
|
generation_soups: BeautifulSoup = pokemon_list_soup.find_next_siblings("h3")
|
||||||
|
|
||||||
table_row_soups = []
|
table_row_soups = []
|
||||||
@@ -77,48 +81,58 @@ def get_pokemon() -> List[Pokemon]:
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
index = table_row_soup.find_next("td").next_sibling.next_sibling.text.strip()
|
index = table_row_soup.find_next("td").next_sibling.next_sibling.text.strip()
|
||||||
html_url = BULBAPEDIA_BASE_URL + table_row_soup.find_next("th").next_element.attrs["href"]
|
html_url = (
|
||||||
|
BULBAPEDIA_BASE_URL
|
||||||
|
+ table_row_soup.find_next("th").next_element.attrs["href"]
|
||||||
|
)
|
||||||
img_url = table_row_soup.find("img").attrs["src"]
|
img_url = table_row_soup.find("img").attrs["src"]
|
||||||
html_filepath = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".html")
|
html_filepath = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".html")
|
||||||
img_filepath = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".png")
|
img_filepath = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".png")
|
||||||
p = Pokemon(name=name,
|
p = Pokemon(
|
||||||
index=index,
|
name=name,
|
||||||
html_url=html_url,
|
index=index,
|
||||||
img_url=img_url,
|
html_url=html_url,
|
||||||
html_filepath=html_filepath,
|
img_url=img_url,
|
||||||
img_filepath=img_filepath,
|
html_filepath=html_filepath,
|
||||||
json_filepath=json_filepath)
|
img_filepath=img_filepath,
|
||||||
|
json_filepath=json_filepath,
|
||||||
|
)
|
||||||
pokemon.append(p)
|
pokemon.append(p)
|
||||||
extend_pokemon(p)
|
extend_pokemon(p)
|
||||||
with open(p.json_filepath, 'w') as f:
|
with open(p.json_filepath, "w") as f:
|
||||||
f.write(p.json())
|
f.write(p.json())
|
||||||
logging.info(f"Saved {p.json_filepath}.")
|
logging.info(f"Saved {p.json_filepath}.")
|
||||||
|
|
||||||
# Filter out speculative Pokemon
|
# Filter out speculative Pokemon
|
||||||
pokemon = [p for p in pokemon if not p.description.startswith("This article's contents will change")]
|
pokemon = [
|
||||||
|
p
|
||||||
|
for p in pokemon
|
||||||
|
if not p.description.startswith("This article's contents will change")
|
||||||
|
]
|
||||||
|
|
||||||
logging.info("Pokemon loaded.")
|
logging.info("Pokemon loaded.")
|
||||||
return pokemon
|
return pokemon
|
||||||
|
|
||||||
|
|
||||||
def extend_pokemon(p: Pokemon):
|
def extend_pokemon(p: Pokemon):
|
||||||
""" Add description and download Pokemon image """
|
"""Add description and download Pokemon image"""
|
||||||
download_to_file(p.html_url, p.html_filepath)
|
download_to_file(p.html_url, p.html_filepath)
|
||||||
with open(p.html_filepath, "r") as r:
|
with open(p.html_filepath, "r") as r:
|
||||||
soup = BeautifulSoup(r, "html.parser")
|
soup = BeautifulSoup(r, "html.parser")
|
||||||
content_soup: BeautifulSoup = soup.find(id='mw-content-text').contents[0]
|
content_soup: BeautifulSoup = soup.find(id="mw-content-text").contents[0]
|
||||||
|
|
||||||
# description
|
# description
|
||||||
p_soup = content_soup.find("p")
|
p_soup = content_soup.find("p")
|
||||||
description = []
|
description = []
|
||||||
while p_soup.name == 'p':
|
while p_soup.name == "p":
|
||||||
description.append(p_soup.get_text())
|
description.append(p_soup.get_text())
|
||||||
p_soup = p_soup.next_sibling
|
p_soup = p_soup.next_sibling
|
||||||
p.description = "".join(description)
|
p.description = "".join(description)
|
||||||
|
|
||||||
# image
|
# image
|
||||||
img_url = content_soup.find("table").find_next_sibling("table").find("img").attrs["src"]
|
img_url = (
|
||||||
|
content_soup.find("table").find_next_sibling("table").find("img").attrs["src"]
|
||||||
|
)
|
||||||
img_url = img_url.replace("//", "https://")
|
img_url = img_url.replace("//", "https://")
|
||||||
p.img_url = img_url
|
p.img_url = img_url
|
||||||
download_to_file(img_url, p.img_filepath)
|
download_to_file(img_url, p.img_filepath)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user