Compare commits
5 Commits
bc962bd419
...
3634f10e81
| Author | SHA1 | Date | |
|---|---|---|---|
| 3634f10e81 | |||
| 808ab57ea9 | |||
| 7d9209d52e | |||
| 9c200a1246 | |||
| d224776a9a |
1
.gitignore
vendored
1
.gitignore
vendored
@@ -7,6 +7,7 @@ __pycache__/
|
|||||||
pokemon
|
pokemon
|
||||||
tmp
|
tmp
|
||||||
ptoos.epub
|
ptoos.epub
|
||||||
|
ptoos-with-links.epub
|
||||||
|
|
||||||
# C extensions
|
# C extensions
|
||||||
*.so
|
*.so
|
||||||
|
|||||||
20
README.md
20
README.md
@@ -5,8 +5,28 @@ to descriptions and pictures of the Pokemon within the e-book itself.
|
|||||||
|
|
||||||
It works with the epub that you can download from [Daystar Eld's Patreon](https://www.patreon.com/daystareld/).
|
It works with the epub that you can download from [Daystar Eld's Patreon](https://www.patreon.com/daystareld/).
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
|
pip install --user pipenv
|
||||||
pipenv install
|
pipenv install
|
||||||
pipenv shell
|
pipenv shell
|
||||||
python ptoos-xray.py "DaystarEld - Pokemon The Origin of Species.epub"
|
python ptoos-xray.py "DaystarEld - Pokemon The Origin of Species.epub"
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Run tests
|
||||||
|
|
||||||
|
```shell
|
||||||
|
pipenv install --dev
|
||||||
|
pipenv run pytest
|
||||||
|
```
|
||||||
|
|
||||||
|
## Compress Pokemon PNGs
|
||||||
|
|
||||||
|
Use `pngquant` to compress the PNGs and get a smaller epub file.
|
||||||
|
|
||||||
|
## Credits
|
||||||
|
|
||||||
|
Full credit for the Pokemon names, images, and descriptions goes to
|
||||||
|
[Bulbapedia](https://bulbapedia.bulbagarden.net) under
|
||||||
|
[Attribution-NonCommercial-ShareAlike 2.5](https://creativecommons.org/licenses/by-nc-sa/2.5/).
|
||||||
|
|||||||
2
pytest.ini
Normal file
2
pytest.ini
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
[pytest]
|
||||||
|
pythonpath = src
|
||||||
98
src/epub.py
98
src/epub.py
@@ -1,30 +1,46 @@
|
|||||||
import ebooklib
|
import ebooklib
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
|
import sys
|
||||||
|
from dataclasses import dataclass
|
||||||
from bs4 import BeautifulSoup, Tag
|
from bs4 import BeautifulSoup, Tag
|
||||||
from bs4.element import NavigableString
|
from bs4.element import NavigableString
|
||||||
from ebooklib import epub
|
from ebooklib import epub
|
||||||
from src.pokemon import Pokemon
|
from src.pokemon import Pokemon
|
||||||
from typing import List, Dict
|
from typing import List, Dict, Optional
|
||||||
from rich.progress import track
|
from rich.progress import track
|
||||||
from rich.console import Console
|
from rich.console import Console
|
||||||
|
|
||||||
POKEMON_ID_PREFIX = "pokemon-id-"
|
POKEMON_ID_PREFIX = "pokemon-id-"
|
||||||
|
POKEDEX_UID = "np_pokedex"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class AnnoyingPokemon:
|
||||||
|
name_chunks: List[str]
|
||||||
|
length_chunks: int
|
||||||
|
name_in_pokedex: str
|
||||||
|
|
||||||
|
|
||||||
|
ANNOYING_POKEMON = [
|
||||||
|
AnnoyingPokemon(["Mr", ".", "Mime"], 3, "mr. mime"),
|
||||||
|
AnnoyingPokemon(["farfetch", "’", "d"], 3, "farfetch'd"),
|
||||||
|
AnnoyingPokemon(["sirfetch", "’", "d"], 3, "sirfetch'd"),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
def create_pokedex_chapter(pokemon: List[Pokemon]) -> epub.EpubHtml:
|
def create_pokedex_chapter(pokemon: List[Pokemon]) -> epub.EpubHtml:
|
||||||
POKEDEX_TITLE = "Pokedex"
|
POKEDEX_TITLE = "Pokedex"
|
||||||
POKEDEX_FILE = "content/np_pokedex.xhtml"
|
POKEDEX_FILE = "content/np_pokedex.xhtml"
|
||||||
POKEDEX_UID = "np_pokedex"
|
|
||||||
chapter = epub.EpubHtml(
|
chapter = epub.EpubHtml(
|
||||||
title=POKEDEX_TITLE, file_name=POKEDEX_FILE, uid=POKEDEX_UID
|
title=POKEDEX_TITLE, file_name=POKEDEX_FILE, uid=POKEDEX_UID
|
||||||
)
|
)
|
||||||
content = ["<h1>Pokedex</h1>"]
|
content = ["<h1>Pokedex</h1>"]
|
||||||
|
|
||||||
for p in pokemon:
|
for p in pokemon:
|
||||||
content.append(f'<h2 id="{POKEMON_ID_PREFIX}{p.name.lower()}">{p.name}</h2>')
|
content.append(f'<h2 id="{POKEMON_ID_PREFIX}{p.link_id}">{p.name}</h2>')
|
||||||
content.append(
|
content.append(
|
||||||
f' <p><img alt="[Pokemon {p.name}]" src="../{p.img_filepath}"/><br/></p>'
|
f' <p><img alt="[Pokemon {p.name}]" src="../{p.img_filename}"/><br/></p>'
|
||||||
)
|
)
|
||||||
for paragraph in p.description.split("\n"):
|
for paragraph in p.description.split("\n"):
|
||||||
content.append(f" <p>{paragraph}</p>")
|
content.append(f" <p>{paragraph}</p>")
|
||||||
@@ -35,28 +51,58 @@ def create_pokedex_chapter(pokemon: List[Pokemon]) -> epub.EpubHtml:
|
|||||||
|
|
||||||
|
|
||||||
def patch_chapter(chapter: epub.EpubHtml, pokemon_lookup: Dict[str, Pokemon]):
|
def patch_chapter(chapter: epub.EpubHtml, pokemon_lookup: Dict[str, Pokemon]):
|
||||||
r = re.compile("([:,.!?“”‘’… ]+)")
|
special_chars_regex = re.compile("([:,.!?“”‘’… ]+)")
|
||||||
soup: BeautifulSoup = BeautifulSoup(chapter.content, "html.parser")
|
soup: BeautifulSoup = BeautifulSoup(chapter.content, "html.parser")
|
||||||
|
|
||||||
def pokemon_name_to_link(key: str, word: str) -> Tag:
|
# Set to remember which Pokemon have already gotten a link for that
|
||||||
|
# chapter.
|
||||||
|
pokemon_added_for_chapter = set()
|
||||||
|
|
||||||
|
def pokemon_to_link(p: Pokemon, name_as_in_book: str) -> Tag:
|
||||||
tag = soup.new_tag("a")
|
tag = soup.new_tag("a")
|
||||||
tag.string = word
|
tag.string = name_as_in_book
|
||||||
tag.attrs["href"] = f"np_pokedex.xhtml#{POKEMON_ID_PREFIX}{key}"
|
tag.attrs["href"] = f"np_pokedex.xhtml#{POKEMON_ID_PREFIX}{p.link_id}"
|
||||||
tag.attrs["style"] = "color:black;text-decoration:none"
|
# tag.attrs["style"] = "color:black;text-decoration:none"
|
||||||
return tag
|
return tag
|
||||||
|
|
||||||
|
def is_annoying_pokemon(index: int, chunks: List[str]) -> Optional[AnnoyingPokemon]:
|
||||||
|
for p in ANNOYING_POKEMON:
|
||||||
|
if p.name_chunks == list(
|
||||||
|
map(lambda s: s.lower(), chunks[index : index + p.length_chunks])
|
||||||
|
):
|
||||||
|
return p
|
||||||
|
return None
|
||||||
|
|
||||||
def patch_string(section: NavigableString) -> List:
|
def patch_string(section: NavigableString) -> List:
|
||||||
"""Replace Pokemon with link to Pokemon; requires splitting up the
|
"""Replace Pokemon with link to Pokemon; requires splitting up the
|
||||||
NavigableString into a list of NavigableStrings and Tags."""
|
NavigableString into a list of NavigableStrings and Tags."""
|
||||||
result = [[]]
|
result = [[]]
|
||||||
for word in r.split(str(section)):
|
index, chunks = 0, special_chars_regex.split(str(section))
|
||||||
|
while index < len(chunks):
|
||||||
|
word = chunks[index]
|
||||||
|
pokemon: Optional[Pokemon] = None
|
||||||
|
increment: int = 1
|
||||||
|
|
||||||
if word.lower() in pokemon_lookup:
|
if word.lower() in pokemon_lookup:
|
||||||
pokemon_lookup[word.lower()].appears_in_book = True
|
pokemon = pokemon_lookup[word.lower()]
|
||||||
link = pokemon_name_to_link(word.lower(), word)
|
elif annoying_pokemon := is_annoying_pokemon(index, chunks):
|
||||||
|
pokemon = pokemon_lookup[annoying_pokemon.name_in_pokedex]
|
||||||
|
increment = annoying_pokemon.length_chunks
|
||||||
|
|
||||||
|
if pokemon is not None and pokemon.name in pokemon_added_for_chapter:
|
||||||
|
pokemon = None
|
||||||
|
|
||||||
|
if pokemon is not None:
|
||||||
|
pokemon_added_for_chapter.add(pokemon.name)
|
||||||
|
pokemon.appears_in_book = True
|
||||||
|
name = "".join(chunks[index : index + increment])
|
||||||
|
link = pokemon_to_link(pokemon, name)
|
||||||
result.append(link)
|
result.append(link)
|
||||||
result.append([])
|
result.append([])
|
||||||
|
index += increment
|
||||||
else:
|
else:
|
||||||
result[-1].append(word)
|
result[-1].append(word)
|
||||||
|
index += 1
|
||||||
|
|
||||||
# convert words back into strings
|
# convert words back into strings
|
||||||
for i in range(len(result)):
|
for i in range(len(result)):
|
||||||
@@ -80,16 +126,32 @@ def patch_chapter(chapter: epub.EpubHtml, pokemon_lookup: Dict[str, Pokemon]):
|
|||||||
chapter.content = str(soup)
|
chapter.content = str(soup)
|
||||||
|
|
||||||
|
|
||||||
def patch(epub_filepath: str, pokemon: List[Pokemon]):
|
def get_pokemon_lookup(pokemon: List[Pokemon]) -> Dict[str, Pokemon]:
|
||||||
book = epub.read_epub(epub_filepath)
|
|
||||||
|
|
||||||
pokemon_lookup = {p.name.lower(): p for p in pokemon}
|
pokemon_lookup = {p.name.lower(): p for p in pokemon}
|
||||||
|
pokemon_lookup["nidoran"] = pokemon_lookup["nidoran♂"]
|
||||||
|
pokemon_lookup["barrierd"] = pokemon_lookup["mr. mime"]
|
||||||
|
return pokemon_lookup
|
||||||
|
|
||||||
|
|
||||||
|
def patch(epub_filename: str, pokemon: List[Pokemon]):
|
||||||
|
try:
|
||||||
|
book = epub.read_epub(epub_filename)
|
||||||
|
except Exception:
|
||||||
|
logging.exception("Failed to open epub.")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
pokemon_lookup = get_pokemon_lookup(pokemon)
|
||||||
chapters = [
|
chapters = [
|
||||||
b
|
b
|
||||||
for b in book.get_items()
|
for b in book.get_items()
|
||||||
if isinstance(b, epub.EpubHtml)
|
if isinstance(b, epub.EpubHtml)
|
||||||
if b.id.startswith("np_")
|
if b.id.startswith("np_")
|
||||||
]
|
]
|
||||||
|
|
||||||
|
if [c for c in chapters if c.id == POKEDEX_UID]:
|
||||||
|
logging.warning(f"It looks like '{epub_filename}' already has a Pokedex.")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
for c in track(chapters, description="Add Pokemon links to chapters"):
|
for c in track(chapters, description="Add Pokemon links to chapters"):
|
||||||
patch_chapter(c, pokemon_lookup)
|
patch_chapter(c, pokemon_lookup)
|
||||||
|
|
||||||
@@ -103,17 +165,17 @@ def patch(epub_filepath: str, pokemon: List[Pokemon]):
|
|||||||
book.spine.append((chapter.id, "yes"))
|
book.spine.append((chapter.id, "yes"))
|
||||||
|
|
||||||
for p in pokemon:
|
for p in pokemon:
|
||||||
image_content = open(p.img_filepath, "rb").read()
|
image_content = open(p.img_filename, "rb").read()
|
||||||
img = epub.EpubItem(
|
img = epub.EpubItem(
|
||||||
uid=p.name,
|
uid=p.name,
|
||||||
file_name=p.img_filepath,
|
file_name=p.img_filename,
|
||||||
media_type="image/png",
|
media_type="image/png",
|
||||||
content=image_content,
|
content=image_content,
|
||||||
)
|
)
|
||||||
book.add_item(img)
|
book.add_item(img)
|
||||||
|
|
||||||
console = Console()
|
console = Console()
|
||||||
epub_out = epub_filepath.replace(".", "-with-links.")
|
epub_out = epub_filename.replace(".", "-with-links.")
|
||||||
with console.status(f"Writing {epub_out}"):
|
with console.status(f"Writing {epub_out}"):
|
||||||
epub.write_epub(epub_out, book, {})
|
epub.write_epub(epub_out, book, {})
|
||||||
console.print(f"[green]✓[/green] [orange1]{epub_out}[/orange1] written")
|
console.print(f"[green]✓[/green] [orange1]{epub_out}[/orange1] written")
|
||||||
|
|||||||
@@ -11,11 +11,14 @@ def main():
|
|||||||
level=logging.INFO,
|
level=logging.INFO,
|
||||||
format="%(message)s",
|
format="%(message)s",
|
||||||
datefmt="[%X]",
|
datefmt="[%X]",
|
||||||
handlers=[RichHandler()],
|
handlers=[RichHandler(rich_tracebacks=True)],
|
||||||
)
|
)
|
||||||
try:
|
try:
|
||||||
ptoos_epub = sys.argv[1]
|
ptoos_epub = sys.argv[1]
|
||||||
except IndexError:
|
except IndexError:
|
||||||
ptoos_epub = "ptoos.epub"
|
ptoos_epub = "ptoos.epub"
|
||||||
|
logging.warning(f"No epub file provided. Defaulting to '{ptoos_epub}'.")
|
||||||
pokemon = src.pokemon.get_pokemon()
|
pokemon = src.pokemon.get_pokemon()
|
||||||
|
# for p in pokemon:
|
||||||
|
# p.img_filename = p.img_filename.replace(".png", "-fs8.png")
|
||||||
src.epub.patch(ptoos_epub, pokemon)
|
src.epub.patch(ptoos_epub, pokemon)
|
||||||
|
|||||||
170
src/pokemon.py
170
src/pokemon.py
@@ -2,6 +2,7 @@ import requests
|
|||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
import logging
|
import logging
|
||||||
|
import re
|
||||||
from rich.progress import track
|
from rich.progress import track
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
@@ -17,20 +18,21 @@ NATIONAL_INDEX_URL = (
|
|||||||
|
|
||||||
class Pokemon(BaseModel):
|
class Pokemon(BaseModel):
|
||||||
name: str
|
name: str
|
||||||
|
link_id: str
|
||||||
index: str
|
index: str
|
||||||
html_url: str
|
html_url: str
|
||||||
img_url: str
|
img_url: str
|
||||||
html_filepath: str
|
html_filename: str
|
||||||
img_filepath: str
|
img_filename: str
|
||||||
json_filepath: str
|
json_filename: str
|
||||||
description: str = ""
|
description: str = ""
|
||||||
appears_in_book: bool = False
|
appears_in_book: bool = False
|
||||||
|
|
||||||
|
|
||||||
def download_to_file(url: str, filepath: str, override=False):
|
def download_to_file(url: str, filename: str, override=False):
|
||||||
"""Downloads url into filepath."""
|
"""Downloads url into filename."""
|
||||||
if os.path.isfile(filepath) and override is False:
|
if os.path.isfile(filename) and override is False:
|
||||||
logging.debug(f"'{filepath}' exists.")
|
logging.debug(f"'{filename}' exists.")
|
||||||
return
|
return
|
||||||
|
|
||||||
headers = {
|
headers = {
|
||||||
@@ -38,72 +40,91 @@ def download_to_file(url: str, filepath: str, override=False):
|
|||||||
}
|
}
|
||||||
r = requests.get(url, headers=headers)
|
r = requests.get(url, headers=headers)
|
||||||
if r.status_code != 200:
|
if r.status_code != 200:
|
||||||
logging.warning(f"Could not download '{filepath}'")
|
logging.critical(f"Could not download '{filename}'.")
|
||||||
return
|
sys.exit(1)
|
||||||
|
|
||||||
# Works for text and images
|
# Works for text and images
|
||||||
with open(filepath, "wb") as f:
|
with open(filename, "wb") as f:
|
||||||
for c in r:
|
for c in r:
|
||||||
f.write(c)
|
f.write(c)
|
||||||
logging.debug(f"'{filepath}' downloaded.")
|
logging.debug(f"'{filename}' downloaded.")
|
||||||
|
|
||||||
|
|
||||||
|
def download_national_index_html(national_index_filename: str):
|
||||||
|
download_to_file(NATIONAL_INDEX_URL, national_index_filename)
|
||||||
|
|
||||||
|
|
||||||
|
def get_pokemon_table_row_soups(national_index_filename: str) -> List[BeautifulSoup]:
|
||||||
|
with open(national_index_filename, "r") as r:
|
||||||
|
soup = BeautifulSoup(r, "html.parser")
|
||||||
|
pokemon_list_soup = soup.find(
|
||||||
|
id="List_of_Pokémon_by_National_Pokédex_number"
|
||||||
|
).parent
|
||||||
|
generation_soups = pokemon_list_soup.find_next_siblings("h3")
|
||||||
|
table_row_soups = []
|
||||||
|
for generation_soup in generation_soups:
|
||||||
|
table_soup = generation_soup.find_next_sibling("table")
|
||||||
|
tbody_soup = generation_soup.find_next("tbody")
|
||||||
|
# skip first row because it is the header
|
||||||
|
table_row_soups += tbody_soup.find_all("tr", recursive=False)[1:]
|
||||||
|
return table_row_soups
|
||||||
|
|
||||||
|
|
||||||
|
def extract_pokemon_from_table_row(table_row_soup: BeautifulSoup) -> Pokemon:
|
||||||
|
name = table_row_soup.find_next("th").next_element.attrs["title"]
|
||||||
|
link_id = re.sub("[^a-z]", "", name.lower())
|
||||||
|
|
||||||
|
# load Pokemon from JSON if it already exists
|
||||||
|
json_filename = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".json")
|
||||||
|
if os.path.isfile(json_filename):
|
||||||
|
p = Pokemon.parse_file(json_filename)
|
||||||
|
logging.debug(f"Loaded '{p.json_filename}'.")
|
||||||
|
return p
|
||||||
|
|
||||||
|
index = table_row_soup.find_next("td").next_sibling.next_sibling.text.strip()
|
||||||
|
html_url = (
|
||||||
|
BULBAPEDIA_BASE_URL + table_row_soup.find_next("th").next_element.attrs["href"]
|
||||||
|
)
|
||||||
|
img_url = table_row_soup.find("img").attrs["src"]
|
||||||
|
html_filename = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".html")
|
||||||
|
img_filename = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".png")
|
||||||
|
return Pokemon(
|
||||||
|
name=name,
|
||||||
|
link_id=link_id,
|
||||||
|
index=index,
|
||||||
|
html_url=html_url,
|
||||||
|
img_url=img_url,
|
||||||
|
html_filename=html_filename,
|
||||||
|
img_filename=img_filename,
|
||||||
|
json_filename=json_filename,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def get_pokemon() -> List[Pokemon]:
|
def get_pokemon() -> List[Pokemon]:
|
||||||
"""Scrape Pokemon from the Bulbapedia national dex"""
|
"""Scrape Pokemon from the Bulbapedia national dex"""
|
||||||
NATIONAL_INDEX_FILEPATH = os.path.join(POKEMON_CACHE_DIRECTORY, "pokedex.html")
|
if not os.path.isdir(POKEMON_CACHE_DIRECTORY):
|
||||||
download_to_file(NATIONAL_INDEX_URL, NATIONAL_INDEX_FILEPATH)
|
os.mkdir(POKEMON_CACHE_DIRECTORY)
|
||||||
with open(NATIONAL_INDEX_FILEPATH, "r") as r:
|
national_index_filename = os.path.join(POKEMON_CACHE_DIRECTORY, "pokedex.html")
|
||||||
soup = BeautifulSoup(r, "html.parser")
|
download_national_index_html(national_index_filename)
|
||||||
pokemon_list_soup: BeautifulSoup = soup.find(
|
table_row_soups = get_pokemon_table_row_soups(national_index_filename)
|
||||||
id="List_of_Pokémon_by_National_Pokédex_number"
|
|
||||||
).parent
|
|
||||||
generation_soups: BeautifulSoup = pokemon_list_soup.find_next_siblings("h3")
|
|
||||||
|
|
||||||
table_row_soups = []
|
|
||||||
for generation_soup in generation_soups:
|
|
||||||
table_soup: BeautifulSoup = generation_soup.find_next_sibling("table")
|
|
||||||
tbody_soup: BeautifulSoup = generation_soup.find_next("tbody")
|
|
||||||
# skip first row because it is the header
|
|
||||||
table_row_soups += tbody_soup.find_all("tr", recursive=False)[1:]
|
|
||||||
|
|
||||||
pokemon = []
|
pokemon = []
|
||||||
for table_row_soup in track(table_row_soups, description="Download Pokemon"):
|
for table_row_soup in track(table_row_soups, description="Download Pokemon"):
|
||||||
name = table_row_soup.find_next("th").next_element.attrs["title"]
|
p = extract_pokemon_from_table_row(table_row_soup)
|
||||||
|
|
||||||
# ignore Galarian and Alolan Pokemon so
|
# Ignore Galarian and Alolan Pokemon (Pokemon with the same name)
|
||||||
if pokemon and pokemon[-1].name == name:
|
if pokemon and pokemon[-1].name == p.name:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# load Pokemon from JSON if it already exists
|
|
||||||
json_filepath = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".json")
|
|
||||||
if os.path.isfile(json_filepath):
|
|
||||||
p = Pokemon.parse_file(json_filepath)
|
|
||||||
pokemon.append(p)
|
|
||||||
logging.debug(f"Loaded {p.json_filepath}.")
|
|
||||||
continue
|
|
||||||
|
|
||||||
index = table_row_soup.find_next("td").next_sibling.next_sibling.text.strip()
|
|
||||||
html_url = (
|
|
||||||
BULBAPEDIA_BASE_URL
|
|
||||||
+ table_row_soup.find_next("th").next_element.attrs["href"]
|
|
||||||
)
|
|
||||||
img_url = table_row_soup.find("img").attrs["src"]
|
|
||||||
html_filepath = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".html")
|
|
||||||
img_filepath = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".png")
|
|
||||||
p = Pokemon(
|
|
||||||
name=name,
|
|
||||||
index=index,
|
|
||||||
html_url=html_url,
|
|
||||||
img_url=img_url,
|
|
||||||
html_filepath=html_filepath,
|
|
||||||
img_filepath=img_filepath,
|
|
||||||
json_filepath=json_filepath,
|
|
||||||
)
|
|
||||||
pokemon.append(p)
|
pokemon.append(p)
|
||||||
|
|
||||||
|
# Pokemon has already been downloaded
|
||||||
|
if p.description and os.path.isfile(p.img_filename):
|
||||||
|
continue
|
||||||
|
|
||||||
extend_pokemon(p)
|
extend_pokemon(p)
|
||||||
with open(p.json_filepath, "w") as f:
|
with open(p.json_filename, "w") as f:
|
||||||
f.write(p.json())
|
f.write(p.json())
|
||||||
logging.debug(f"Saved {p.json_filepath}.")
|
logging.debug(f"Saved {p.json_filename}.")
|
||||||
|
|
||||||
# Filter out speculative Pokemon
|
# Filter out speculative Pokemon
|
||||||
pokemon = [
|
pokemon = [
|
||||||
@@ -117,23 +138,26 @@ def get_pokemon() -> List[Pokemon]:
|
|||||||
|
|
||||||
def extend_pokemon(p: Pokemon):
|
def extend_pokemon(p: Pokemon):
|
||||||
"""Add description and download Pokemon image"""
|
"""Add description and download Pokemon image"""
|
||||||
download_to_file(p.html_url, p.html_filepath)
|
download_to_file(p.html_url, p.html_filename)
|
||||||
with open(p.html_filepath, "r") as r:
|
with open(p.html_filename, "r") as r:
|
||||||
soup = BeautifulSoup(r, "html.parser")
|
soup = BeautifulSoup(r, "html.parser")
|
||||||
content_soup: BeautifulSoup = soup.find(id="mw-content-text").contents[0]
|
content_soup: BeautifulSoup = soup.find(id="mw-content-text").contents[0]
|
||||||
|
|
||||||
# description
|
if not p.description:
|
||||||
p_soup = content_soup.find("p")
|
p_soup = content_soup.find("p")
|
||||||
description = []
|
description = []
|
||||||
while p_soup.name == "p":
|
while p_soup.name == "p":
|
||||||
description.append(p_soup.get_text())
|
description.append(p_soup.get_text())
|
||||||
p_soup = p_soup.next_sibling
|
p_soup = p_soup.next_sibling
|
||||||
p.description = "".join(description)
|
p.description = "".join(description)
|
||||||
|
|
||||||
# image
|
if not os.path.isfile(p.img_filename):
|
||||||
img_url = (
|
img_url = (
|
||||||
content_soup.find("table").find_next_sibling("table").find("img").attrs["src"]
|
content_soup.find("table")
|
||||||
)
|
.find_next_sibling("table")
|
||||||
img_url = img_url.replace("//", "https://")
|
.find("img")
|
||||||
p.img_url = img_url
|
.attrs["src"]
|
||||||
download_to_file(img_url, p.img_filepath)
|
)
|
||||||
|
img_url = img_url.replace("//", "https://")
|
||||||
|
p.img_url = img_url
|
||||||
|
download_to_file(img_url, p.img_filename)
|
||||||
|
|||||||
12149
test/pokedex.html
Normal file
12149
test/pokedex.html
Normal file
File diff suppressed because one or more lines are too long
0
test/test_epub.py
Normal file
0
test/test_epub.py
Normal file
12149
test/test_pokedex.html
Normal file
12149
test/test_pokedex.html
Normal file
File diff suppressed because one or more lines are too long
44
test/test_pokemon.py
Normal file
44
test/test_pokemon.py
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
import pokemon
|
||||||
|
import os
|
||||||
|
import filecmp
|
||||||
|
|
||||||
|
|
||||||
|
def test_download_national_index_html(tmp_path):
|
||||||
|
pokemon_html = tmp_path / "pokedex.html"
|
||||||
|
pokemon.download_national_index_html(pokemon_html)
|
||||||
|
assert os.path.getsize(pokemon_html) > 500000
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_pokemon_table_row_soups():
|
||||||
|
national_index = "test/test_pokedex.html"
|
||||||
|
row_soups = pokemon.get_pokemon_table_row_soups(national_index)
|
||||||
|
assert len(row_soups) == 994
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_pokemon_from_table_row(tmp_path):
|
||||||
|
national_index = "test/test_pokedex.html"
|
||||||
|
pokemon.POKEMON_CACHE_DIRECTORY = tmp_path
|
||||||
|
row_soups = pokemon.get_pokemon_table_row_soups(national_index)
|
||||||
|
p = pokemon.extract_pokemon_from_table_row(row_soups[42])
|
||||||
|
assert p.name == "Vulpix"
|
||||||
|
assert p.link_id == "vulpix"
|
||||||
|
assert p.index == "#037"
|
||||||
|
assert p.html_url == "https://bulbapedia.bulbagarden.net/wiki/Vulpix_(Pok%C3%A9mon)"
|
||||||
|
assert (
|
||||||
|
p.img_url
|
||||||
|
== "//archives.bulbagarden.net/media/upload/thumb/3/35/037Vulpix-Alola.png/70px-037Vulpix-Alola.png"
|
||||||
|
)
|
||||||
|
assert p.img_filename.endswith("vulpix.png")
|
||||||
|
assert p.json_filename.endswith("vulpix.json")
|
||||||
|
assert p.description == ""
|
||||||
|
assert p.appears_in_book == False
|
||||||
|
|
||||||
|
|
||||||
|
def test_extend_pokemon(tmp_path):
|
||||||
|
national_index = "test/test_pokedex.html"
|
||||||
|
row_soups = pokemon.get_pokemon_table_row_soups(national_index)
|
||||||
|
p = pokemon.extract_pokemon_from_table_row(row_soups[42])
|
||||||
|
p.img_filename = tmp_path / "vulpix.png"
|
||||||
|
pokemon.extend_pokemon(p)
|
||||||
|
assert filecmp.cmp(p.img_filename, "test/test_vulpix.png")
|
||||||
|
assert p.description.startswith("Vulpix (Japanese: \u30ed\u30b3\u30f3 Rokon)")
|
||||||
BIN
test/test_vulpix.png
Normal file
BIN
test/test_vulpix.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 52 KiB |
Reference in New Issue
Block a user