feat: Add book viewer at /book with shared design system
- Add html/index.html: book viewer with auto-discovering sidebar, prev/next navigation, keyboard shortcuts, and URL hash persistence
- Add html/book-page.css: shared stylesheet for all book pages derived from fabula-ultima-sheet.css (dark theme, CSS variables, Cinzel/Crimson Text fonts, common class styles)
- Add book.js entry point so webpack injects the shared CSS into the book viewer; update webpack.config.js for two entry points, split CSS chunk, CopyWebpackPlugin for book pages, and /book dev server rewrite rule
- Add scripts/strip_watermark.py: removes "Guest Customer (Order #52072168)" watermark artifacts from all 210 book pages
- Add scripts/restyle_book.py: strips per-page <style> blocks and injects <link rel="stylesheet" href="book-page.css"> into all pages
- Update Justfile deploy to scp -r dist/* for the new /book subtree

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
63
scripts/pdftohtml.py
Normal file
63
scripts/pdftohtml.py
Normal file
@@ -0,0 +1,63 @@
|
||||
# /// script
|
||||
# requires-python = ">=3.12"
|
||||
# dependencies = [
|
||||
# "cryptography",
|
||||
# "ollama",
|
||||
# "pypdf",
|
||||
# ]
|
||||
# ///
|
||||
|
||||
import ollama
|
||||
from pypdf import PdfReader
|
||||
|
||||
|
||||
def extract_text_from_pdf(pdf_path):
    """Yield the extracted text of each page of a PDF, one page at a time.

    Pages for which ``extract_text()`` returns a falsy value (empty string
    or ``None``) are skipped, so fewer items may be yielded than the document
    has pages. Because this is a generator, ``PdfReader`` is not constructed
    until the first item is requested.

    Args:
        pdf_path: Location of the PDF; passed unchanged to ``PdfReader``.

    Yields:
        The non-empty text of each page, in document order.
    """
    reader = PdfReader(pdf_path)
    for page in reader.pages:
        text = page.extract_text()
        if text:
            yield text
|
||||
|
||||
|
||||
def convert_text_to_html(raw_text, model_name="gemma4"):
    """Ask an Ollama model to rewrite raw PDF text as a styled HTML document.

    The text is embedded in a fixed instruction prompt and submitted through
    ``ollama.generate``; the ``"response"`` entry of the reply is returned
    as-is, with no post-processing.
    """
    prompt = f"""
You are an expert web developer. Convert the following raw text extracted from a PDF document into a beautifully styled, clean, and semantic HTML document.

Requirements:
1. Use appropriate HTML tags (<h1>, <p>, <ul>, <li>, <strong>, etc.) to recreate the structural hierarchy.
2. Add inline CSS or a <style> block in the <head> to make it look professional (modern sans-serif typography, clean margins, and clear layout).
3. Return ONLY valid HTML code. Do not include markdown code block backticks (```html) or extra conversational commentary.

Raw Document Text:
\"\"\"
{raw_text}
\"\"\"
"""
    reply = ollama.generate(prompt=prompt, model=model_name)
    return reply["response"]
|
||||
|
||||
|
||||
def main():
    """Convert the hard-coded input PDF into one HTML file per extracted page.

    Creates ``./html`` if needed, then for each page text yielded by
    ``extract_text_from_pdf`` asks ``convert_text_to_html`` for markup and
    writes it to ``html/<n>.html``. ``<n>`` is the zero-based index of the
    yielded text, not necessarily the PDF page number, because pages without
    text are not yielded. A ``FileNotFoundError`` raised anywhere in the run is
    reported as a missing input file.
    """
    from pathlib import Path

    input_pdf = "Fabula_Ultima_-_Natural_Fantasy_Atlas_ENG_v1_1.pdf"
    out_dir = Path("./html")

    try:
        out_dir.mkdir(exist_ok=True)
        for page_num, text in enumerate(extract_text_from_pdf(input_pdf)):
            html_output = convert_text_to_html(text)
            print(f'Writing html/{page_num}.html ({len(text)} bytes)')
            # Explicit UTF-8: generated markup may contain non-ASCII
            # characters, and the platform default encoding may not be able
            # to represent them.
            with open(out_dir / f"{page_num}.html", "w", encoding="utf-8") as fh:
                fh.write(html_output)
    except FileNotFoundError:
        print(f"Error: The file '{input_pdf}' was not found. Please check your path.")


if __name__ == "__main__":
    main()
|
||||
85
scripts/restyle_book.py
Normal file
85
scripts/restyle_book.py
Normal file
@@ -0,0 +1,85 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Remove per-page <style> blocks from book pages and replace them with a
|
||||
shared <link rel="stylesheet" href="book-page.css"> derived from the main
|
||||
fabula-ultima-sheet.css design system.
|
||||
"""
|
||||
|
||||
import glob
|
||||
import os
|
||||
import re
|
||||
|
||||
# Folder holding the book pages: the "html" directory next to this script's
# parent directory.
HTML_DIR = os.path.join(os.path.dirname(__file__), "..", "html")

# One inline <style> element plus any whitespace directly in front of it.
# re.S lets the element body span several lines; re.I also accepts <STYLE>.
STYLE_BLOCK_RE = re.compile(r"\s*<style[^>]*>.*?</style>", re.S | re.I)

# A <link> tag that points at fonts.googleapis.com, plus leading whitespace.
GFONTS_LINK_RE = re.compile(r'\s*<link[^>]+fonts\.googleapis\.com[^>]*>', re.I)

# Tag inserted into each page in place of the removed per-page styling.
CSS_LINK = ' <link rel="stylesheet" href="book-page.css">'
|
||||
|
||||
|
||||
def process_file(filepath: str) -> bool:
    """Swap a page's inline styling for the shared stylesheet link.

    Reads the file as UTF-8, removes every match of STYLE_BLOCK_RE and
    GFONTS_LINK_RE, and, unless CSS_LINK is already present, inserts it:

    * right after the first ``</title>`` if there is one;
    * otherwise right before the first ``</head>``;
    * otherwise at the very top of the file.

    The file is rewritten only when its content actually changed.

    Returns:
        True if the file was rewritten, False if it was left untouched.
    """
    with open(filepath, encoding="utf-8") as src:
        original = src.read()

    # Inline <style> blocks go first, then any Google Fonts <link> tags.
    content = GFONTS_LINK_RE.sub("", STYLE_BLOCK_RE.sub("", original))

    if CSS_LINK not in content:
        title_end = re.search(r"</title>", content, re.IGNORECASE)
        head_end = re.search(r"</head>", content, re.IGNORECASE)
        if title_end:
            cut = title_end.end()
            content = content[:cut] + "\n" + CSS_LINK + content[cut:]
        elif head_end:
            cut = head_end.start()
            content = content[:cut] + CSS_LINK + "\n" + content[cut:]
        else:
            # No closing title or head tag: put the link on the first line.
            content = CSS_LINK + "\n" + content

    if content != original:
        with open(filepath, "w", encoding="utf-8") as dst:
            dst.write(content)
        return True
    return False
|
||||
|
||||
|
||||
def main() -> None:
    """Run process_file over every numbered page in HTML_DIR and report the count.

    Pages are the files matching ``[0-9]*.html``; they are visited in order
    of the first integer found in their file name.
    """
    def page_number(path):
        found = re.search(r"(\d+)", os.path.basename(path))
        return -1 if found is None else int(found.group(1))

    html_files = glob.glob(os.path.join(HTML_DIR, "[0-9]*.html"))
    html_files.sort(key=page_number)

    # process_file returns True for each page it rewrote.
    changed = sum(1 for page in html_files if process_file(page))

    print(f"Done. {changed}/{len(html_files)} pages updated.")


if __name__ == "__main__":
    main()
|
||||
121
scripts/strip_watermark.py
Normal file
121
scripts/strip_watermark.py
Normal file
@@ -0,0 +1,121 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Remove "Guest Customer (Order #52072168)" watermark artifacts from all book pages.
|
||||
|
||||
Strategy:
|
||||
- If a line's text content (with HTML tags stripped) consists entirely of
|
||||
watermark text, remove the whole line.
|
||||
- If watermark text is embedded within a line that has other content, strip
|
||||
just the watermark portion and tidy the surrounding punctuation.
|
||||
"""
|
||||
|
||||
import glob
|
||||
import os
|
||||
import re
|
||||
|
||||
# Folder holding the book pages: the "html" directory next to this script's
# parent directory.
HTML_DIR = os.path.join(os.path.dirname(__file__), "..", "html")

# Every shape the watermark takes, tried left to right, case-insensitively.
WATERMARK_RE = re.compile(
    "(?:"
    + "|".join(
        (
            # "Guest Customer"; the lazy tail is the last thing in the
            # pattern, so it never consumes any of the text that follows.
            r"Guest\s+Customer\b[^\n<]*?",
            # "Order <label>: 52072168" (shortest run up to the number)
            r"Order\s+\w[^\n<]*?52072168",
            # "Order #52072168", "Order #: 52072168"
            r"Order\s*[#:]\s*:?\s*52072168",
            # "#52072168" on its own
            r"#\s*52072168",
            # the bare order number
            r"\b52072168\b",
        )
    )
    + ")",
    re.I,
)

# Any single HTML tag.
TAG_RE = re.compile(r"<[^>]+>")

# A string made up solely of whitespace, punctuation, digits and similar
# leftovers, i.e. what may remain once the watermark text has been removed.
DECORATION_RE = re.compile(r"^[\s\-–—|,.:;!?()\[\]*&#\d/\\]+$")
|
||||
|
||||
|
||||
def text_content(line: str) -> str:
    """Strip every TAG_RE match from *line*, leaving only the text between tags."""
    visible = TAG_RE.sub("", line)
    return visible
|
||||
|
||||
|
||||
def is_watermark_only(line: str) -> bool:
    """Report whether a line shows nothing except watermark text.

    Tags are stripped, all WATERMARK_RE matches are deleted, and the line
    counts as watermark-only when what is left is either empty or consists
    purely of characters accepted by DECORATION_RE.
    """
    visible = text_content(line).strip()
    leftover = WATERMARK_RE.sub("", visible).strip()
    if not leftover:
        return True
    return DECORATION_RE.match(leftover) is not None
|
||||
|
||||
|
||||
def strip_watermark_inline(line: str) -> str:
    """Remove watermark text from a line that also carries real content.

    After deleting every WATERMARK_RE match, tidies what the removal tends to
    leave behind: up to three dashes or a pipe at the start or end of the
    line, and runs of two or more whitespace characters (collapsed to one
    space).

    The line terminator is split off before tidying and re-attached
    afterwards. The tidy patterns use ``\\s``, which also matches newlines, so
    applying them to the full line could swallow the terminator and fuse the
    line with the one that follows it.

    Args:
        line: One line of HTML, with or without its trailing newline.

    Returns:
        The cleaned line, ending with the same terminator it came in with.
    """
    body = line.rstrip("\r\n")
    ending = line[len(body):]

    result = WATERMARK_RE.sub("", body)
    # Tidy decoration left behind after removal (e.g. "— ", " —", " | ", "---")
    result = re.sub(r"\s*[—–\-]{1,3}\s*$", "", result, flags=re.MULTILINE)
    result = re.sub(r"^\s*[—–\-]{1,3}\s*", "", result, flags=re.MULTILINE)
    result = re.sub(r"\|\s*$", "", result, flags=re.MULTILINE)
    result = re.sub(r"^\s*\|\s*", "", result, flags=re.MULTILINE)
    result = re.sub(r"\s{2,}", " ", result)
    return result + ending
|
||||
|
||||
|
||||
def process_file(filepath: str) -> tuple[int, int]:
    """Remove watermark text from one page and report what was done.

    Each line containing a WATERMARK_RE match is either dropped entirely
    (when ``is_watermark_only`` says nothing else is visible on it) or cleaned
    in place with ``strip_watermark_inline``. Lines without a match are kept
    verbatim.

    The file is written back only if at least one line was dropped or
    cleaned, so pages without a watermark are left untouched on disk.

    Args:
        filepath: Path of the HTML file, read and written as UTF-8.

    Returns:
        ``(removed, stripped)``: the number of lines dropped and the number
        of lines cleaned in place.
    """
    with open(filepath, encoding="utf-8") as f:
        lines = f.readlines()

    new_lines: list[str] = []
    removed = 0
    stripped = 0

    for line in lines:
        if not WATERMARK_RE.search(line):
            new_lines.append(line)
        elif is_watermark_only(line.strip()):
            # Whole line is watermark: drop it without leaving a blank line.
            removed += 1
        else:
            new_lines.append(strip_watermark_inline(line))
            stripped += 1

    # Skip the write when nothing matched, so clean files are not rewritten.
    if removed or stripped:
        with open(filepath, "w", encoding="utf-8") as f:
            f.writelines(new_lines)

    return removed, stripped
|
||||
|
||||
|
||||
def main() -> None:
    """Run process_file over every ``*.html`` file in HTML_DIR and print a report.

    Files are visited in order of the first integer in their name (names
    without one sort first). One line is printed per affected file, followed
    by overall totals.
    """
    def page_number(path):
        found = re.search(r"(\d+)", os.path.basename(path))
        return -1 if found is None else int(found.group(1))

    html_files = glob.glob(os.path.join(HTML_DIR, "*.html"))
    html_files.sort(key=page_number)

    affected = total_removed = total_stripped = 0

    for filepath in html_files:
        removed, stripped = process_file(filepath)
        total_removed += removed
        total_stripped += stripped
        if not (removed or stripped):
            continue

        affected += 1
        summary = []
        if removed:
            plural = "" if removed == 1 else "s"
            summary.append(f"{removed} line{plural} removed")
        if stripped:
            summary.append(f"{stripped} inline")
        print(f" {os.path.basename(filepath)}: {', '.join(summary)}")

    print(
        f"\nDone. {affected} files changed — "
        f"{total_removed} lines removed, {total_stripped} inline occurrences stripped."
    )


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user