Make patch string algorithm suck less and add progress bars
parent
8200225780
commit
697cb22e09
2
Pipfile
2
Pipfile
|
@ -9,9 +9,9 @@ python_version = "3.10"
|
||||||
[packages]
|
[packages]
|
||||||
bs4 = "*"
|
bs4 = "*"
|
||||||
ebooklib = "*"
|
ebooklib = "*"
|
||||||
lxml = "*"
|
|
||||||
pydantic = "*"
|
pydantic = "*"
|
||||||
requests = "*"
|
requests = "*"
|
||||||
|
rich = "*"
|
||||||
|
|
||||||
[dev-packages]
|
[dev-packages]
|
||||||
black = "*"
|
black = "*"
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
{
|
{
|
||||||
"_meta": {
|
"_meta": {
|
||||||
"hash": {
|
"hash": {
|
||||||
"sha256": "5e5d63b2697bac028104473e63e0cfee2967b7aa93c011800ea85523c22c3f99"
|
"sha256": "8de9c46e0028fc5384e51e2622ff20004653dca7138c702a57f12769c35240bf"
|
||||||
},
|
},
|
||||||
"pipfile-spec": 6,
|
"pipfile-spec": 6,
|
||||||
"requires": {
|
"requires": {
|
||||||
|
@ -47,6 +47,13 @@
|
||||||
"markers": "python_full_version >= '3.6.0'",
|
"markers": "python_full_version >= '3.6.0'",
|
||||||
"version": "==2.1.1"
|
"version": "==2.1.1"
|
||||||
},
|
},
|
||||||
|
"commonmark": {
|
||||||
|
"hashes": [
|
||||||
|
"sha256:452f9dc859be7f06631ddcb328b6919c67984aca654e5fefb3914d54691aed60",
|
||||||
|
"sha256:da2f38c92590f83de410ba1a3cbceafbc74fee9def35f9251ba9a971d6d66fd9"
|
||||||
|
],
|
||||||
|
"version": "==0.9.1"
|
||||||
|
},
|
||||||
"ebooklib": {
|
"ebooklib": {
|
||||||
"hashes": [
|
"hashes": [
|
||||||
"sha256:fe23e22c28050196c68db3e7b13b257bf39426d927cb395c6f2cc13ac11327f1"
|
"sha256:fe23e22c28050196c68db3e7b13b257bf39426d927cb395c6f2cc13ac11327f1"
|
||||||
|
@ -135,7 +142,7 @@
|
||||||
"sha256:fe17d10b97fdf58155f858606bddb4e037b805a60ae023c009f760d8361a4eb8",
|
"sha256:fe17d10b97fdf58155f858606bddb4e037b805a60ae023c009f760d8361a4eb8",
|
||||||
"sha256:fe749b052bb7233fe5d072fcb549221a8cb1a16725c47c37e42b0b9cb3ff2c3f"
|
"sha256:fe749b052bb7233fe5d072fcb549221a8cb1a16725c47c37e42b0b9cb3ff2c3f"
|
||||||
],
|
],
|
||||||
"index": "pypi",
|
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||||
"version": "==4.9.1"
|
"version": "==4.9.1"
|
||||||
},
|
},
|
||||||
"pydantic": {
|
"pydantic": {
|
||||||
|
@ -180,6 +187,14 @@
|
||||||
"index": "pypi",
|
"index": "pypi",
|
||||||
"version": "==1.10.2"
|
"version": "==1.10.2"
|
||||||
},
|
},
|
||||||
|
"pygments": {
|
||||||
|
"hashes": [
|
||||||
|
"sha256:56a8508ae95f98e2b9bdf93a6be5ae3f7d8af858b43e02c5a2ff083726be40c1",
|
||||||
|
"sha256:f643f331ab57ba3c9d89212ee4a2dabc6e94f117cf4eefde99a0574720d14c42"
|
||||||
|
],
|
||||||
|
"markers": "python_version >= '3.6'",
|
||||||
|
"version": "==2.13.0"
|
||||||
|
},
|
||||||
"requests": {
|
"requests": {
|
||||||
"hashes": [
|
"hashes": [
|
||||||
"sha256:7c5599b102feddaa661c826c56ab4fee28bfd17f5abca1ebbe3e7f19d7c97983",
|
"sha256:7c5599b102feddaa661c826c56ab4fee28bfd17f5abca1ebbe3e7f19d7c97983",
|
||||||
|
@ -188,6 +203,14 @@
|
||||||
"index": "pypi",
|
"index": "pypi",
|
||||||
"version": "==2.28.1"
|
"version": "==2.28.1"
|
||||||
},
|
},
|
||||||
|
"rich": {
|
||||||
|
"hashes": [
|
||||||
|
"sha256:a4eb26484f2c82589bd9a17c73d32a010b1e29d89f1604cd9bf3a2097b81bb5e",
|
||||||
|
"sha256:ba3a3775974105c221d31141f2c116f4fd65c5ceb0698657a11e9f295ec93fd0"
|
||||||
|
],
|
||||||
|
"index": "pypi",
|
||||||
|
"version": "==12.6.0"
|
||||||
|
},
|
||||||
"six": {
|
"six": {
|
||||||
"hashes": [
|
"hashes": [
|
||||||
"sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",
|
"sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",
|
||||||
|
|
65
src/epub.py
65
src/epub.py
|
@ -6,6 +6,8 @@ from bs4.element import NavigableString
|
||||||
from ebooklib import epub
|
from ebooklib import epub
|
||||||
from src.pokemon import Pokemon
|
from src.pokemon import Pokemon
|
||||||
from typing import List, Dict
|
from typing import List, Dict
|
||||||
|
from rich.progress import track
|
||||||
|
from rich.console import Console
|
||||||
|
|
||||||
POKEMON_ID_PREFIX = "pokemon-id-"
|
POKEMON_ID_PREFIX = "pokemon-id-"
|
||||||
|
|
||||||
|
@ -33,49 +35,33 @@ def create_pokedex_chapter(pokemon: List[Pokemon]) -> epub.EpubHtml:
|
||||||
|
|
||||||
|
|
||||||
def patch_chapter(chapter: epub.EpubHtml, pokemon_lookup: Dict[str, Pokemon]):
|
def patch_chapter(chapter: epub.EpubHtml, pokemon_lookup: Dict[str, Pokemon]):
|
||||||
r = re.compile("([:,.!?“”‘’…])")
|
r = re.compile("([:,.!?“”‘’… ]+)")
|
||||||
soup: BeautifulSoup = BeautifulSoup(chapter.content, "html.parser")
|
soup: BeautifulSoup = BeautifulSoup(chapter.content, "html.parser")
|
||||||
|
|
||||||
def pokemon_name_to_link(key: str, word: str) -> Tag:
|
def pokemon_name_to_link(key: str, word: str) -> Tag:
|
||||||
tag = soup.new_tag("a")
|
tag = soup.new_tag("a")
|
||||||
tag.string = word
|
tag.string = word
|
||||||
tag.attrs["href"] = f"np_pokedex.xhtml#{POKEMON_ID_PREFIX}{key}"
|
tag.attrs["href"] = f"np_pokedex.xhtml#{POKEMON_ID_PREFIX}{key}"
|
||||||
tag.attrs["style"] = "color:black;text-decoration:none"
|
# tag.attrs["style"] = "color:black;text-decoration:none"
|
||||||
return tag
|
return tag
|
||||||
|
|
||||||
def patch_string(section: NavigableString) -> List:
|
def patch_string(section: NavigableString) -> List:
|
||||||
"""Replace Pokemon with link to Pokemon; requires splitting up the
|
"""Replace Pokemon with link to Pokemon; requires splitting up the
|
||||||
NavigableString into a list of NavigableStrings and Tags."""
|
NavigableString into a list of NavigableStrings and Tags."""
|
||||||
result = [[]]
|
result = [[]]
|
||||||
for word in str(section).split(" "):
|
for word in r.split(str(section)):
|
||||||
word_stripped = r.sub("", word)
|
if word.lower() in pokemon_lookup:
|
||||||
if word_stripped.lower() in pokemon_lookup:
|
pokemon_lookup[word.lower()].appears_in_book = True
|
||||||
word_split = r.split(word)
|
link = pokemon_name_to_link(word.lower(), word)
|
||||||
i = word_split.index(word_stripped)
|
result.append(link)
|
||||||
if i == 0:
|
|
||||||
# add space if there are no other chars before pokemon
|
|
||||||
result[-1].append(" ")
|
|
||||||
else:
|
|
||||||
# add other chars before pokemon if there are any
|
|
||||||
result[-1].append("".join(word_split[:i]))
|
|
||||||
pokemon_link = pokemon_name_to_link(
|
|
||||||
word_stripped.lower(), word_stripped
|
|
||||||
)
|
|
||||||
result.append(pokemon_link)
|
|
||||||
result.append([])
|
result.append([])
|
||||||
if i + 1 == len(word_split):
|
|
||||||
# add space after pokemon if there are no other chars
|
|
||||||
result[-1].append(" ")
|
|
||||||
else:
|
|
||||||
# add other chars after pokemon if there are any
|
|
||||||
result[-1].append("".join(word_split[i + 1 :]))
|
|
||||||
else:
|
else:
|
||||||
result[-1].append(word)
|
result[-1].append(word)
|
||||||
|
|
||||||
# convert words back into strings.
|
# convert words back into strings
|
||||||
for i in range(len(result)):
|
for i in range(len(result)):
|
||||||
if isinstance(result[i], list):
|
if isinstance(result[i], list):
|
||||||
result[i] = NavigableString(" ".join(result[i]))
|
result[i] = NavigableString("".join(result[i]))
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def patch_paragraph(paragraph: Tag):
|
def patch_paragraph(paragraph: Tag):
|
||||||
|
@ -97,6 +83,19 @@ def patch_chapter(chapter: epub.EpubHtml, pokemon_lookup: Dict[str, Pokemon]):
|
||||||
def patch(epub_filepath: str, pokemon: List[Pokemon]):
|
def patch(epub_filepath: str, pokemon: List[Pokemon]):
|
||||||
book = epub.read_epub(epub_filepath)
|
book = epub.read_epub(epub_filepath)
|
||||||
|
|
||||||
|
pokemon_lookup = {p.name.lower(): p for p in pokemon}
|
||||||
|
chapters = [
|
||||||
|
b
|
||||||
|
for b in book.get_items()
|
||||||
|
if isinstance(b, epub.EpubHtml)
|
||||||
|
if b.id.startswith("np_")
|
||||||
|
]
|
||||||
|
for c in track(chapters, description="Add Pokemon links to chapters"):
|
||||||
|
patch_chapter(c, pokemon_lookup)
|
||||||
|
|
||||||
|
# only add Pokemon to Pokedex chapter that appear (in the book)
|
||||||
|
pokemon = [p for p in pokemon if p.appears_in_book]
|
||||||
|
|
||||||
chapter = create_pokedex_chapter(pokemon)
|
chapter = create_pokedex_chapter(pokemon)
|
||||||
book.add_item(chapter)
|
book.add_item(chapter)
|
||||||
link = epub.Link(chapter.file_name, chapter.title, chapter.id)
|
link = epub.Link(chapter.file_name, chapter.title, chapter.id)
|
||||||
|
@ -113,16 +112,8 @@ def patch(epub_filepath: str, pokemon: List[Pokemon]):
|
||||||
)
|
)
|
||||||
book.add_item(img)
|
book.add_item(img)
|
||||||
|
|
||||||
pokemon_lookup = {p.name.lower(): p for p in pokemon}
|
console = Console()
|
||||||
chapters = [
|
|
||||||
b
|
|
||||||
for b in book.get_items()
|
|
||||||
if isinstance(b, epub.EpubHtml)
|
|
||||||
if b.id.startswith("np_")
|
|
||||||
]
|
|
||||||
for c in chapters:
|
|
||||||
patch_chapter(c, pokemon_lookup)
|
|
||||||
|
|
||||||
epub_out = epub_filepath.replace(".", "-with-links.")
|
epub_out = epub_filepath.replace(".", "-with-links.")
|
||||||
epub.write_epub(epub_out, book, {})
|
with console.status(f"Writing {epub_out}"):
|
||||||
logging.info(f"Write '{epub_out}'.")
|
epub.write_epub(epub_out, book, {})
|
||||||
|
console.print(f"[green]✓[/green] [orange1]{epub_out}[/orange1] written")
|
||||||
|
|
10
src/main.py
10
src/main.py
|
@ -3,13 +3,19 @@ import logging
|
||||||
import src.pokemon
|
import src.pokemon
|
||||||
import src.epub
|
import src.epub
|
||||||
|
|
||||||
|
from rich.logging import RichHandler
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
logging.basicConfig(format="%(message)s", level=logging.INFO)
|
logging.basicConfig(
|
||||||
|
level=logging.INFO,
|
||||||
|
format="%(message)s",
|
||||||
|
datefmt="[%X]",
|
||||||
|
handlers=[RichHandler()],
|
||||||
|
)
|
||||||
try:
|
try:
|
||||||
ptoos_epub = sys.argv[1]
|
ptoos_epub = sys.argv[1]
|
||||||
except IndexError:
|
except IndexError:
|
||||||
ptoos_epub = "poos.epub"
|
ptoos_epub = "poos.epub"
|
||||||
logging.info(f"Patching '{ptoos_epub}'.")
|
|
||||||
pokemon = src.pokemon.get_pokemon()
|
pokemon = src.pokemon.get_pokemon()
|
||||||
src.epub.patch(ptoos_epub, pokemon)
|
src.epub.patch(ptoos_epub, pokemon)
|
||||||
|
|
|
@ -2,6 +2,7 @@ import requests
|
||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
import logging
|
import logging
|
||||||
|
from rich.progress import track
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from typing import List
|
from typing import List
|
||||||
|
@ -23,6 +24,7 @@ class Pokemon(BaseModel):
|
||||||
img_filepath: str
|
img_filepath: str
|
||||||
json_filepath: str
|
json_filepath: str
|
||||||
description: str = ""
|
description: str = ""
|
||||||
|
appears_in_book: bool = False
|
||||||
|
|
||||||
|
|
||||||
def download_to_file(url: str, filepath: str, override=False):
|
def download_to_file(url: str, filepath: str, override=False):
|
||||||
|
@ -65,7 +67,7 @@ def get_pokemon() -> List[Pokemon]:
|
||||||
table_row_soups += tbody_soup.find_all("tr", recursive=False)[1:]
|
table_row_soups += tbody_soup.find_all("tr", recursive=False)[1:]
|
||||||
|
|
||||||
pokemon = []
|
pokemon = []
|
||||||
for table_row_soup in table_row_soups:
|
for table_row_soup in track(table_row_soups, description="Download Pokemon"):
|
||||||
name = table_row_soup.find_next("th").next_element.attrs["title"]
|
name = table_row_soup.find_next("th").next_element.attrs["title"]
|
||||||
|
|
||||||
# ignore Galarian and Alolan Pokemon so
|
# ignore Galarian and Alolan Pokemon so
|
||||||
|
@ -101,7 +103,7 @@ def get_pokemon() -> List[Pokemon]:
|
||||||
extend_pokemon(p)
|
extend_pokemon(p)
|
||||||
with open(p.json_filepath, "w") as f:
|
with open(p.json_filepath, "w") as f:
|
||||||
f.write(p.json())
|
f.write(p.json())
|
||||||
logging.info(f"Saved {p.json_filepath}.")
|
logging.debug(f"Saved {p.json_filepath}.")
|
||||||
|
|
||||||
# Filter out speculative Pokemon
|
# Filter out speculative Pokemon
|
||||||
pokemon = [
|
pokemon = [
|
||||||
|
@ -110,7 +112,6 @@ def get_pokemon() -> List[Pokemon]:
|
||||||
if not p.description.startswith("This article's contents will change")
|
if not p.description.startswith("This article's contents will change")
|
||||||
]
|
]
|
||||||
|
|
||||||
logging.info("Pokemon loaded.")
|
|
||||||
return pokemon
|
return pokemon
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue