Make patch string algorithm suck less and add progress bars

main
Felix Martin 2022-10-23 10:39:00 -04:00
parent 8200225780
commit 697cb22e09
5 changed files with 66 additions and 45 deletions

View File

@ -9,9 +9,9 @@ python_version = "3.10"
[packages]
bs4 = "*"
ebooklib = "*"
lxml = "*"
pydantic = "*"
requests = "*"
rich = "*"
[dev-packages]
black = "*"

27
Pipfile.lock generated
View File

@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "5e5d63b2697bac028104473e63e0cfee2967b7aa93c011800ea85523c22c3f99"
"sha256": "8de9c46e0028fc5384e51e2622ff20004653dca7138c702a57f12769c35240bf"
},
"pipfile-spec": 6,
"requires": {
@ -47,6 +47,13 @@
"markers": "python_full_version >= '3.6.0'",
"version": "==2.1.1"
},
"commonmark": {
"hashes": [
"sha256:452f9dc859be7f06631ddcb328b6919c67984aca654e5fefb3914d54691aed60",
"sha256:da2f38c92590f83de410ba1a3cbceafbc74fee9def35f9251ba9a971d6d66fd9"
],
"version": "==0.9.1"
},
"ebooklib": {
"hashes": [
"sha256:fe23e22c28050196c68db3e7b13b257bf39426d927cb395c6f2cc13ac11327f1"
@ -135,7 +142,7 @@
"sha256:fe17d10b97fdf58155f858606bddb4e037b805a60ae023c009f760d8361a4eb8",
"sha256:fe749b052bb7233fe5d072fcb549221a8cb1a16725c47c37e42b0b9cb3ff2c3f"
],
"index": "pypi",
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==4.9.1"
},
"pydantic": {
@ -180,6 +187,14 @@
"index": "pypi",
"version": "==1.10.2"
},
"pygments": {
"hashes": [
"sha256:56a8508ae95f98e2b9bdf93a6be5ae3f7d8af858b43e02c5a2ff083726be40c1",
"sha256:f643f331ab57ba3c9d89212ee4a2dabc6e94f117cf4eefde99a0574720d14c42"
],
"markers": "python_version >= '3.6'",
"version": "==2.13.0"
},
"requests": {
"hashes": [
"sha256:7c5599b102feddaa661c826c56ab4fee28bfd17f5abca1ebbe3e7f19d7c97983",
@ -188,6 +203,14 @@
"index": "pypi",
"version": "==2.28.1"
},
"rich": {
"hashes": [
"sha256:a4eb26484f2c82589bd9a17c73d32a010b1e29d89f1604cd9bf3a2097b81bb5e",
"sha256:ba3a3775974105c221d31141f2c116f4fd65c5ceb0698657a11e9f295ec93fd0"
],
"index": "pypi",
"version": "==12.6.0"
},
"six": {
"hashes": [
"sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",

View File

@ -6,6 +6,8 @@ from bs4.element import NavigableString
from ebooklib import epub
from src.pokemon import Pokemon
from typing import List, Dict
from rich.progress import track
from rich.console import Console
POKEMON_ID_PREFIX = "pokemon-id-"
@ -33,49 +35,33 @@ def create_pokedex_chapter(pokemon: List[Pokemon]) -> epub.EpubHtml:
def patch_chapter(chapter: epub.EpubHtml, pokemon_lookup: Dict[str, Pokemon]):
r = re.compile("([:,.!?“”‘’…])")
r = re.compile("([:,.!?“”‘’… ]+)")
soup: BeautifulSoup = BeautifulSoup(chapter.content, "html.parser")
def pokemon_name_to_link(key: str, word: str) -> Tag:
tag = soup.new_tag("a")
tag.string = word
tag.attrs["href"] = f"np_pokedex.xhtml#{POKEMON_ID_PREFIX}{key}"
tag.attrs["style"] = "color:black;text-decoration:none"
# tag.attrs["style"] = "color:black;text-decoration:none"
return tag
def patch_string(section: NavigableString) -> List:
"""Replace Pokemon with link to Pokemon; requires splitting up the
NavigableString into a list of NavigableStrings and Tags."""
result = [[]]
for word in str(section).split(" "):
word_stripped = r.sub("", word)
if word_stripped.lower() in pokemon_lookup:
word_split = r.split(word)
i = word_split.index(word_stripped)
if i == 0:
# add space if there are no other chars before pokemon
result[-1].append(" ")
else:
# add other chars before pokemon if there are any
result[-1].append("".join(word_split[:i]))
pokemon_link = pokemon_name_to_link(
word_stripped.lower(), word_stripped
)
result.append(pokemon_link)
for word in r.split(str(section)):
if word.lower() in pokemon_lookup:
pokemon_lookup[word.lower()].appears_in_book = True
link = pokemon_name_to_link(word.lower(), word)
result.append(link)
result.append([])
if i + 1 == len(word_split):
# add space after pokemon if there are no other chars
result[-1].append(" ")
else:
# add other chars after pokemon if there are any
result[-1].append("".join(word_split[i + 1 :]))
else:
result[-1].append(word)
# convert words back into strings.
# convert words back into strings
for i in range(len(result)):
if isinstance(result[i], list):
result[i] = NavigableString(" ".join(result[i]))
result[i] = NavigableString("".join(result[i]))
return result
def patch_paragraph(paragraph: Tag):
@ -97,6 +83,19 @@ def patch_chapter(chapter: epub.EpubHtml, pokemon_lookup: Dict[str, Pokemon]):
def patch(epub_filepath: str, pokemon: List[Pokemon]):
book = epub.read_epub(epub_filepath)
pokemon_lookup = {p.name.lower(): p for p in pokemon}
chapters = [
b
for b in book.get_items()
if isinstance(b, epub.EpubHtml)
if b.id.startswith("np_")
]
for c in track(chapters, description="Add Pokemon links to chapters"):
patch_chapter(c, pokemon_lookup)
# only add Pokemon to Pokedex chapter that appear (in the book)
pokemon = [p for p in pokemon if p.appears_in_book]
chapter = create_pokedex_chapter(pokemon)
book.add_item(chapter)
link = epub.Link(chapter.file_name, chapter.title, chapter.id)
@ -113,16 +112,8 @@ def patch(epub_filepath: str, pokemon: List[Pokemon]):
)
book.add_item(img)
pokemon_lookup = {p.name.lower(): p for p in pokemon}
chapters = [
b
for b in book.get_items()
if isinstance(b, epub.EpubHtml)
if b.id.startswith("np_")
]
for c in chapters:
patch_chapter(c, pokemon_lookup)
console = Console()
epub_out = epub_filepath.replace(".", "-with-links.")
epub.write_epub(epub_out, book, {})
logging.info(f"Write '{epub_out}'.")
with console.status(f"Writing {epub_out}"):
epub.write_epub(epub_out, book, {})
console.print(f"[green]✓[/green] [orange1]{epub_out}[/orange1] written")

View File

@ -3,13 +3,19 @@ import logging
import src.pokemon
import src.epub
from rich.logging import RichHandler
def main():
logging.basicConfig(format="%(message)s", level=logging.INFO)
logging.basicConfig(
level=logging.INFO,
format="%(message)s",
datefmt="[%X]",
handlers=[RichHandler()],
)
try:
ptoos_epub = sys.argv[1]
except IndexError:
ptoos_epub = "poos.epub"
logging.info(f"Patching '{ptoos_epub}'.")
pokemon = src.pokemon.get_pokemon()
src.epub.patch(ptoos_epub, pokemon)

View File

@ -2,6 +2,7 @@ import requests
import sys
import os
import logging
from rich.progress import track
from pydantic import BaseModel
from bs4 import BeautifulSoup
from typing import List
@ -23,6 +24,7 @@ class Pokemon(BaseModel):
img_filepath: str
json_filepath: str
description: str = ""
appears_in_book: bool = False
def download_to_file(url: str, filepath: str, override=False):
@ -65,7 +67,7 @@ def get_pokemon() -> List[Pokemon]:
table_row_soups += tbody_soup.find_all("tr", recursive=False)[1:]
pokemon = []
for table_row_soup in table_row_soups:
for table_row_soup in track(table_row_soups, description="Download Pokemon"):
name = table_row_soup.find_next("th").next_element.attrs["title"]
# ignore Galarian and Alolan Pokemon so
@ -101,7 +103,7 @@ def get_pokemon() -> List[Pokemon]:
extend_pokemon(p)
with open(p.json_filepath, "w") as f:
f.write(p.json())
logging.info(f"Saved {p.json_filepath}.")
logging.debug(f"Saved {p.json_filepath}.")
# Filter out speculative Pokemon
pokemon = [
@ -110,7 +112,6 @@ def get_pokemon() -> List[Pokemon]:
if not p.description.startswith("This article's contents will change")
]
logging.info("Pokemon loaded.")
return pokemon