Make patch string algorithm suck less and add progress bars
parent
8200225780
commit
697cb22e09
2
Pipfile
2
Pipfile
|
@ -9,9 +9,9 @@ python_version = "3.10"
|
|||
[packages]
|
||||
bs4 = "*"
|
||||
ebooklib = "*"
|
||||
lxml = "*"
|
||||
pydantic = "*"
|
||||
requests = "*"
|
||||
rich = "*"
|
||||
|
||||
[dev-packages]
|
||||
black = "*"
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
{
|
||||
"_meta": {
|
||||
"hash": {
|
||||
"sha256": "5e5d63b2697bac028104473e63e0cfee2967b7aa93c011800ea85523c22c3f99"
|
||||
"sha256": "8de9c46e0028fc5384e51e2622ff20004653dca7138c702a57f12769c35240bf"
|
||||
},
|
||||
"pipfile-spec": 6,
|
||||
"requires": {
|
||||
|
@ -47,6 +47,13 @@
|
|||
"markers": "python_full_version >= '3.6.0'",
|
||||
"version": "==2.1.1"
|
||||
},
|
||||
"commonmark": {
|
||||
"hashes": [
|
||||
"sha256:452f9dc859be7f06631ddcb328b6919c67984aca654e5fefb3914d54691aed60",
|
||||
"sha256:da2f38c92590f83de410ba1a3cbceafbc74fee9def35f9251ba9a971d6d66fd9"
|
||||
],
|
||||
"version": "==0.9.1"
|
||||
},
|
||||
"ebooklib": {
|
||||
"hashes": [
|
||||
"sha256:fe23e22c28050196c68db3e7b13b257bf39426d927cb395c6f2cc13ac11327f1"
|
||||
|
@ -135,7 +142,7 @@
|
|||
"sha256:fe17d10b97fdf58155f858606bddb4e037b805a60ae023c009f760d8361a4eb8",
|
||||
"sha256:fe749b052bb7233fe5d072fcb549221a8cb1a16725c47c37e42b0b9cb3ff2c3f"
|
||||
],
|
||||
"index": "pypi",
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"version": "==4.9.1"
|
||||
},
|
||||
"pydantic": {
|
||||
|
@ -180,6 +187,14 @@
|
|||
"index": "pypi",
|
||||
"version": "==1.10.2"
|
||||
},
|
||||
"pygments": {
|
||||
"hashes": [
|
||||
"sha256:56a8508ae95f98e2b9bdf93a6be5ae3f7d8af858b43e02c5a2ff083726be40c1",
|
||||
"sha256:f643f331ab57ba3c9d89212ee4a2dabc6e94f117cf4eefde99a0574720d14c42"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==2.13.0"
|
||||
},
|
||||
"requests": {
|
||||
"hashes": [
|
||||
"sha256:7c5599b102feddaa661c826c56ab4fee28bfd17f5abca1ebbe3e7f19d7c97983",
|
||||
|
@ -188,6 +203,14 @@
|
|||
"index": "pypi",
|
||||
"version": "==2.28.1"
|
||||
},
|
||||
"rich": {
|
||||
"hashes": [
|
||||
"sha256:a4eb26484f2c82589bd9a17c73d32a010b1e29d89f1604cd9bf3a2097b81bb5e",
|
||||
"sha256:ba3a3775974105c221d31141f2c116f4fd65c5ceb0698657a11e9f295ec93fd0"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==12.6.0"
|
||||
},
|
||||
"six": {
|
||||
"hashes": [
|
||||
"sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",
|
||||
|
|
65
src/epub.py
65
src/epub.py
|
@ -6,6 +6,8 @@ from bs4.element import NavigableString
|
|||
from ebooklib import epub
|
||||
from src.pokemon import Pokemon
|
||||
from typing import List, Dict
|
||||
from rich.progress import track
|
||||
from rich.console import Console
|
||||
|
||||
POKEMON_ID_PREFIX = "pokemon-id-"
|
||||
|
||||
|
@ -33,49 +35,33 @@ def create_pokedex_chapter(pokemon: List[Pokemon]) -> epub.EpubHtml:
|
|||
|
||||
|
||||
def patch_chapter(chapter: epub.EpubHtml, pokemon_lookup: Dict[str, Pokemon]):
|
||||
r = re.compile("([:,.!?“”‘’…])")
|
||||
r = re.compile("([:,.!?“”‘’… ]+)")
|
||||
soup: BeautifulSoup = BeautifulSoup(chapter.content, "html.parser")
|
||||
|
||||
def pokemon_name_to_link(key: str, word: str) -> Tag:
|
||||
tag = soup.new_tag("a")
|
||||
tag.string = word
|
||||
tag.attrs["href"] = f"np_pokedex.xhtml#{POKEMON_ID_PREFIX}{key}"
|
||||
tag.attrs["style"] = "color:black;text-decoration:none"
|
||||
# tag.attrs["style"] = "color:black;text-decoration:none"
|
||||
return tag
|
||||
|
||||
def patch_string(section: NavigableString) -> List:
|
||||
"""Replace Pokemon with link to Pokemon; requires splitting up the
|
||||
NavigableString into a list of NavigableStrings and Tags."""
|
||||
result = [[]]
|
||||
for word in str(section).split(" "):
|
||||
word_stripped = r.sub("", word)
|
||||
if word_stripped.lower() in pokemon_lookup:
|
||||
word_split = r.split(word)
|
||||
i = word_split.index(word_stripped)
|
||||
if i == 0:
|
||||
# add space if there are no other chars before pokemon
|
||||
result[-1].append(" ")
|
||||
else:
|
||||
# add other chars before pokemon if there are any
|
||||
result[-1].append("".join(word_split[:i]))
|
||||
pokemon_link = pokemon_name_to_link(
|
||||
word_stripped.lower(), word_stripped
|
||||
)
|
||||
result.append(pokemon_link)
|
||||
for word in r.split(str(section)):
|
||||
if word.lower() in pokemon_lookup:
|
||||
pokemon_lookup[word.lower()].appears_in_book = True
|
||||
link = pokemon_name_to_link(word.lower(), word)
|
||||
result.append(link)
|
||||
result.append([])
|
||||
if i + 1 == len(word_split):
|
||||
# add space after pokemon if there are no other chars
|
||||
result[-1].append(" ")
|
||||
else:
|
||||
# add other chars after pokemon if there are any
|
||||
result[-1].append("".join(word_split[i + 1 :]))
|
||||
else:
|
||||
result[-1].append(word)
|
||||
|
||||
# convert words back into strings.
|
||||
# convert words back into strings
|
||||
for i in range(len(result)):
|
||||
if isinstance(result[i], list):
|
||||
result[i] = NavigableString(" ".join(result[i]))
|
||||
result[i] = NavigableString("".join(result[i]))
|
||||
return result
|
||||
|
||||
def patch_paragraph(paragraph: Tag):
|
||||
|
@ -97,6 +83,19 @@ def patch_chapter(chapter: epub.EpubHtml, pokemon_lookup: Dict[str, Pokemon]):
|
|||
def patch(epub_filepath: str, pokemon: List[Pokemon]):
|
||||
book = epub.read_epub(epub_filepath)
|
||||
|
||||
pokemon_lookup = {p.name.lower(): p for p in pokemon}
|
||||
chapters = [
|
||||
b
|
||||
for b in book.get_items()
|
||||
if isinstance(b, epub.EpubHtml)
|
||||
if b.id.startswith("np_")
|
||||
]
|
||||
for c in track(chapters, description="Add Pokemon links to chapters"):
|
||||
patch_chapter(c, pokemon_lookup)
|
||||
|
||||
# only add Pokemon to Pokedex chapter that appear (in the book)
|
||||
pokemon = [p for p in pokemon if p.appears_in_book]
|
||||
|
||||
chapter = create_pokedex_chapter(pokemon)
|
||||
book.add_item(chapter)
|
||||
link = epub.Link(chapter.file_name, chapter.title, chapter.id)
|
||||
|
@ -113,16 +112,8 @@ def patch(epub_filepath: str, pokemon: List[Pokemon]):
|
|||
)
|
||||
book.add_item(img)
|
||||
|
||||
pokemon_lookup = {p.name.lower(): p for p in pokemon}
|
||||
chapters = [
|
||||
b
|
||||
for b in book.get_items()
|
||||
if isinstance(b, epub.EpubHtml)
|
||||
if b.id.startswith("np_")
|
||||
]
|
||||
for c in chapters:
|
||||
patch_chapter(c, pokemon_lookup)
|
||||
|
||||
console = Console()
|
||||
epub_out = epub_filepath.replace(".", "-with-links.")
|
||||
epub.write_epub(epub_out, book, {})
|
||||
logging.info(f"Write '{epub_out}'.")
|
||||
with console.status(f"Writing {epub_out}"):
|
||||
epub.write_epub(epub_out, book, {})
|
||||
console.print(f"[green]✓[/green] [orange1]{epub_out}[/orange1] written")
|
||||
|
|
10
src/main.py
10
src/main.py
|
@ -3,13 +3,19 @@ import logging
|
|||
import src.pokemon
|
||||
import src.epub
|
||||
|
||||
from rich.logging import RichHandler
|
||||
|
||||
|
||||
def main():
|
||||
logging.basicConfig(format="%(message)s", level=logging.INFO)
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(message)s",
|
||||
datefmt="[%X]",
|
||||
handlers=[RichHandler()],
|
||||
)
|
||||
try:
|
||||
ptoos_epub = sys.argv[1]
|
||||
except IndexError:
|
||||
ptoos_epub = "poos.epub"
|
||||
logging.info(f"Patching '{ptoos_epub}'.")
|
||||
pokemon = src.pokemon.get_pokemon()
|
||||
src.epub.patch(ptoos_epub, pokemon)
|
||||
|
|
|
@ -2,6 +2,7 @@ import requests
|
|||
import sys
|
||||
import os
|
||||
import logging
|
||||
from rich.progress import track
|
||||
from pydantic import BaseModel
|
||||
from bs4 import BeautifulSoup
|
||||
from typing import List
|
||||
|
@ -23,6 +24,7 @@ class Pokemon(BaseModel):
|
|||
img_filepath: str
|
||||
json_filepath: str
|
||||
description: str = ""
|
||||
appears_in_book: bool = False
|
||||
|
||||
|
||||
def download_to_file(url: str, filepath: str, override=False):
|
||||
|
@ -65,7 +67,7 @@ def get_pokemon() -> List[Pokemon]:
|
|||
table_row_soups += tbody_soup.find_all("tr", recursive=False)[1:]
|
||||
|
||||
pokemon = []
|
||||
for table_row_soup in table_row_soups:
|
||||
for table_row_soup in track(table_row_soups, description="Download Pokemon"):
|
||||
name = table_row_soup.find_next("th").next_element.attrs["title"]
|
||||
|
||||
# ignore Galarian and Alolan Pokemon so
|
||||
|
@ -101,7 +103,7 @@ def get_pokemon() -> List[Pokemon]:
|
|||
extend_pokemon(p)
|
||||
with open(p.json_filepath, "w") as f:
|
||||
f.write(p.json())
|
||||
logging.info(f"Saved {p.json_filepath}.")
|
||||
logging.debug(f"Saved {p.json_filepath}.")
|
||||
|
||||
# Filter out speculative Pokemon
|
||||
pokemon = [
|
||||
|
@ -110,7 +112,6 @@ def get_pokemon() -> List[Pokemon]:
|
|||
if not p.description.startswith("This article's contents will change")
|
||||
]
|
||||
|
||||
logging.info("Pokemon loaded.")
|
||||
return pokemon
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue