Make patch string algorithm suck less and add progress bars

main
Felix Martin 2022-10-23 10:39:00 -04:00
parent 8200225780
commit 697cb22e09
5 changed files with 66 additions and 45 deletions

View File

@ -9,9 +9,9 @@ python_version = "3.10"
[packages] [packages]
bs4 = "*" bs4 = "*"
ebooklib = "*" ebooklib = "*"
lxml = "*"
pydantic = "*" pydantic = "*"
requests = "*" requests = "*"
rich = "*"
[dev-packages] [dev-packages]
black = "*" black = "*"

27
Pipfile.lock generated
View File

@ -1,7 +1,7 @@
{ {
"_meta": { "_meta": {
"hash": { "hash": {
"sha256": "5e5d63b2697bac028104473e63e0cfee2967b7aa93c011800ea85523c22c3f99" "sha256": "8de9c46e0028fc5384e51e2622ff20004653dca7138c702a57f12769c35240bf"
}, },
"pipfile-spec": 6, "pipfile-spec": 6,
"requires": { "requires": {
@ -47,6 +47,13 @@
"markers": "python_full_version >= '3.6.0'", "markers": "python_full_version >= '3.6.0'",
"version": "==2.1.1" "version": "==2.1.1"
}, },
"commonmark": {
"hashes": [
"sha256:452f9dc859be7f06631ddcb328b6919c67984aca654e5fefb3914d54691aed60",
"sha256:da2f38c92590f83de410ba1a3cbceafbc74fee9def35f9251ba9a971d6d66fd9"
],
"version": "==0.9.1"
},
"ebooklib": { "ebooklib": {
"hashes": [ "hashes": [
"sha256:fe23e22c28050196c68db3e7b13b257bf39426d927cb395c6f2cc13ac11327f1" "sha256:fe23e22c28050196c68db3e7b13b257bf39426d927cb395c6f2cc13ac11327f1"
@ -135,7 +142,7 @@
"sha256:fe17d10b97fdf58155f858606bddb4e037b805a60ae023c009f760d8361a4eb8", "sha256:fe17d10b97fdf58155f858606bddb4e037b805a60ae023c009f760d8361a4eb8",
"sha256:fe749b052bb7233fe5d072fcb549221a8cb1a16725c47c37e42b0b9cb3ff2c3f" "sha256:fe749b052bb7233fe5d072fcb549221a8cb1a16725c47c37e42b0b9cb3ff2c3f"
], ],
"index": "pypi", "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==4.9.1" "version": "==4.9.1"
}, },
"pydantic": { "pydantic": {
@ -180,6 +187,14 @@
"index": "pypi", "index": "pypi",
"version": "==1.10.2" "version": "==1.10.2"
}, },
"pygments": {
"hashes": [
"sha256:56a8508ae95f98e2b9bdf93a6be5ae3f7d8af858b43e02c5a2ff083726be40c1",
"sha256:f643f331ab57ba3c9d89212ee4a2dabc6e94f117cf4eefde99a0574720d14c42"
],
"markers": "python_version >= '3.6'",
"version": "==2.13.0"
},
"requests": { "requests": {
"hashes": [ "hashes": [
"sha256:7c5599b102feddaa661c826c56ab4fee28bfd17f5abca1ebbe3e7f19d7c97983", "sha256:7c5599b102feddaa661c826c56ab4fee28bfd17f5abca1ebbe3e7f19d7c97983",
@ -188,6 +203,14 @@
"index": "pypi", "index": "pypi",
"version": "==2.28.1" "version": "==2.28.1"
}, },
"rich": {
"hashes": [
"sha256:a4eb26484f2c82589bd9a17c73d32a010b1e29d89f1604cd9bf3a2097b81bb5e",
"sha256:ba3a3775974105c221d31141f2c116f4fd65c5ceb0698657a11e9f295ec93fd0"
],
"index": "pypi",
"version": "==12.6.0"
},
"six": { "six": {
"hashes": [ "hashes": [
"sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926", "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",

View File

@ -6,6 +6,8 @@ from bs4.element import NavigableString
from ebooklib import epub from ebooklib import epub
from src.pokemon import Pokemon from src.pokemon import Pokemon
from typing import List, Dict from typing import List, Dict
from rich.progress import track
from rich.console import Console
POKEMON_ID_PREFIX = "pokemon-id-" POKEMON_ID_PREFIX = "pokemon-id-"
@ -33,49 +35,33 @@ def create_pokedex_chapter(pokemon: List[Pokemon]) -> epub.EpubHtml:
def patch_chapter(chapter: epub.EpubHtml, pokemon_lookup: Dict[str, Pokemon]): def patch_chapter(chapter: epub.EpubHtml, pokemon_lookup: Dict[str, Pokemon]):
r = re.compile("([:,.!?“”‘’…])") r = re.compile("([:,.!?“”‘’… ]+)")
soup: BeautifulSoup = BeautifulSoup(chapter.content, "html.parser") soup: BeautifulSoup = BeautifulSoup(chapter.content, "html.parser")
def pokemon_name_to_link(key: str, word: str) -> Tag: def pokemon_name_to_link(key: str, word: str) -> Tag:
tag = soup.new_tag("a") tag = soup.new_tag("a")
tag.string = word tag.string = word
tag.attrs["href"] = f"np_pokedex.xhtml#{POKEMON_ID_PREFIX}{key}" tag.attrs["href"] = f"np_pokedex.xhtml#{POKEMON_ID_PREFIX}{key}"
tag.attrs["style"] = "color:black;text-decoration:none" # tag.attrs["style"] = "color:black;text-decoration:none"
return tag return tag
def patch_string(section: NavigableString) -> List: def patch_string(section: NavigableString) -> List:
"""Replace Pokemon with link to Pokemon; requires splitting up the """Replace Pokemon with link to Pokemon; requires splitting up the
NavigableString into a list of NavigableStrings and Tags.""" NavigableString into a list of NavigableStrings and Tags."""
result = [[]] result = [[]]
for word in str(section).split(" "): for word in r.split(str(section)):
word_stripped = r.sub("", word) if word.lower() in pokemon_lookup:
if word_stripped.lower() in pokemon_lookup: pokemon_lookup[word.lower()].appears_in_book = True
word_split = r.split(word) link = pokemon_name_to_link(word.lower(), word)
i = word_split.index(word_stripped) result.append(link)
if i == 0:
# add space if there are no other chars before pokemon
result[-1].append(" ")
else:
# add other chars before pokemon if there are any
result[-1].append("".join(word_split[:i]))
pokemon_link = pokemon_name_to_link(
word_stripped.lower(), word_stripped
)
result.append(pokemon_link)
result.append([]) result.append([])
if i + 1 == len(word_split):
# add space after pokemon if there are no other chars
result[-1].append(" ")
else:
# add other chars after pokemon if there are any
result[-1].append("".join(word_split[i + 1 :]))
else: else:
result[-1].append(word) result[-1].append(word)
# convert words back into strings. # convert words back into strings
for i in range(len(result)): for i in range(len(result)):
if isinstance(result[i], list): if isinstance(result[i], list):
result[i] = NavigableString(" ".join(result[i])) result[i] = NavigableString("".join(result[i]))
return result return result
def patch_paragraph(paragraph: Tag): def patch_paragraph(paragraph: Tag):
@ -97,6 +83,19 @@ def patch_chapter(chapter: epub.EpubHtml, pokemon_lookup: Dict[str, Pokemon]):
def patch(epub_filepath: str, pokemon: List[Pokemon]): def patch(epub_filepath: str, pokemon: List[Pokemon]):
book = epub.read_epub(epub_filepath) book = epub.read_epub(epub_filepath)
pokemon_lookup = {p.name.lower(): p for p in pokemon}
chapters = [
b
for b in book.get_items()
if isinstance(b, epub.EpubHtml)
if b.id.startswith("np_")
]
for c in track(chapters, description="Add Pokemon links to chapters"):
patch_chapter(c, pokemon_lookup)
# only add Pokemon to Pokedex chapter that appear (in the book)
pokemon = [p for p in pokemon if p.appears_in_book]
chapter = create_pokedex_chapter(pokemon) chapter = create_pokedex_chapter(pokemon)
book.add_item(chapter) book.add_item(chapter)
link = epub.Link(chapter.file_name, chapter.title, chapter.id) link = epub.Link(chapter.file_name, chapter.title, chapter.id)
@ -113,16 +112,8 @@ def patch(epub_filepath: str, pokemon: List[Pokemon]):
) )
book.add_item(img) book.add_item(img)
pokemon_lookup = {p.name.lower(): p for p in pokemon} console = Console()
chapters = [
b
for b in book.get_items()
if isinstance(b, epub.EpubHtml)
if b.id.startswith("np_")
]
for c in chapters:
patch_chapter(c, pokemon_lookup)
epub_out = epub_filepath.replace(".", "-with-links.") epub_out = epub_filepath.replace(".", "-with-links.")
epub.write_epub(epub_out, book, {}) with console.status(f"Writing {epub_out}"):
logging.info(f"Write '{epub_out}'.") epub.write_epub(epub_out, book, {})
console.print(f"[green]✓[/green] [orange1]{epub_out}[/orange1] written")

View File

@ -3,13 +3,19 @@ import logging
import src.pokemon import src.pokemon
import src.epub import src.epub
from rich.logging import RichHandler
def main(): def main():
logging.basicConfig(format="%(message)s", level=logging.INFO) logging.basicConfig(
level=logging.INFO,
format="%(message)s",
datefmt="[%X]",
handlers=[RichHandler()],
)
try: try:
ptoos_epub = sys.argv[1] ptoos_epub = sys.argv[1]
except IndexError: except IndexError:
ptoos_epub = "poos.epub" ptoos_epub = "poos.epub"
logging.info(f"Patching '{ptoos_epub}'.")
pokemon = src.pokemon.get_pokemon() pokemon = src.pokemon.get_pokemon()
src.epub.patch(ptoos_epub, pokemon) src.epub.patch(ptoos_epub, pokemon)

View File

@ -2,6 +2,7 @@ import requests
import sys import sys
import os import os
import logging import logging
from rich.progress import track
from pydantic import BaseModel from pydantic import BaseModel
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from typing import List from typing import List
@ -23,6 +24,7 @@ class Pokemon(BaseModel):
img_filepath: str img_filepath: str
json_filepath: str json_filepath: str
description: str = "" description: str = ""
appears_in_book: bool = False
def download_to_file(url: str, filepath: str, override=False): def download_to_file(url: str, filepath: str, override=False):
@ -65,7 +67,7 @@ def get_pokemon() -> List[Pokemon]:
table_row_soups += tbody_soup.find_all("tr", recursive=False)[1:] table_row_soups += tbody_soup.find_all("tr", recursive=False)[1:]
pokemon = [] pokemon = []
for table_row_soup in table_row_soups: for table_row_soup in track(table_row_soups, description="Download Pokemon"):
name = table_row_soup.find_next("th").next_element.attrs["title"] name = table_row_soup.find_next("th").next_element.attrs["title"]
# ignore Galarian and Alolan Pokemon so # ignore Galarian and Alolan Pokemon so
@ -101,7 +103,7 @@ def get_pokemon() -> List[Pokemon]:
extend_pokemon(p) extend_pokemon(p)
with open(p.json_filepath, "w") as f: with open(p.json_filepath, "w") as f:
f.write(p.json()) f.write(p.json())
logging.info(f"Saved {p.json_filepath}.") logging.debug(f"Saved {p.json_filepath}.")
# Filter out speculative Pokemon # Filter out speculative Pokemon
pokemon = [ pokemon = [
@ -110,7 +112,6 @@ def get_pokemon() -> List[Pokemon]:
if not p.description.startswith("This article's contents will change") if not p.description.startswith("This article's contents will change")
] ]
logging.info("Pokemon loaded.")
return pokemon return pokemon