feat: Add book viewer at /book with shared design system

- Add html/index.html: book viewer with auto-discovering sidebar, prev/next navigation, keyboard shortcuts, and URL hash persistence
- Add html/book-page.css: shared stylesheet for all book pages derived from fabula-ultima-sheet.css (dark theme, CSS variables, Cinzel/Crimson Text fonts, common class styles)
- Add book.js entry point so webpack injects the shared CSS into the book viewer; update webpack.config.js for two entry points, split CSS chunk, CopyWebpackPlugin for book pages, and /book dev server rewrite rule
- Add scripts/strip_watermark.py: removes "Guest Customer (Order #52072168)" watermark artifacts from all 210 book pages
- Add scripts/restyle_book.py: strips per-page <style> blocks and injects <link rel="stylesheet" href="book-page.css"> into all pages
- Update Justfile deploy to scp -r dist/* for the new /book subtree

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-06 03:36:35 +00:00
parent 58552b536f
commit c75cd188c1
220 changed files with 12685 additions and 10 deletions
--- a/scripts/pdftohtml.py
+++ b/scripts/pdftohtml.py
@@ -0,0 +1,63 @@
+# /// script
+# requires-python = ">=3.12"
+# dependencies = [
+#     "cryptography",
+#     "ollama",
+#     "pypdf",
+# ]
+# ///
+
+import ollama
+from pypdf import PdfReader
+
+
+def extract_text_from_pdf(pdf_path):
+    """
+    Lazily yield the extracted text of each page of a PDF, in page order.
+
+    Pages for which ``extract_text()`` returns an empty (falsy) result are
+    skipped, so fewer items than pages may be produced and a consumer's
+    running count does not necessarily match the PDF page numbers.
+
+    This is a generator function: the ``PdfReader`` is only constructed once
+    iteration starts, so any error it raises for ``pdf_path`` surfaces at the
+    first ``next()``, not at call time.
+    """
+    reader = PdfReader(pdf_path)
+    for page in reader.pages:
+        text = page.extract_text()
+        if text:
+            yield text
+
+
+def convert_text_to_html(raw_text, model_name="gemma4"):
+    """
+    Ask an Ollama model to turn raw page text into a standalone HTML document.
+
+    The prompt embeds ``raw_text`` verbatim and instructs the model to use
+    semantic tags, to add inline CSS or a <style> block, and to return only
+    HTML with no markdown fences or commentary. Nothing here checks that the
+    model actually followed those instructions.
+
+    Args:
+        raw_text: Text to convert; interpolated into the prompt unchanged.
+        model_name: Name passed as ``model`` to ``ollama.generate``.
+
+    Returns:
+        The value stored under the ``"response"`` key of the generate result.
+    """
+    prompt = f"""
+    You are an expert web developer. Convert the following raw text extracted from a PDF document into a beautifully styled, clean, and semantic HTML document. 
+    
+    Requirements:
+    1. Use appropriate HTML tags (<h1>, <p>, <ul>, <li>, <strong>, etc.) to recreate the structural hierarchy.
+    2. Add inline CSS or a <style> block in the <head> to make it look professional (modern sans-serif typography, clean margins, and clear layout).
+    3. Return ONLY valid HTML code. Do not include markdown code block backticks (```html) or extra conversational commentary.
+
+    Raw Document Text:
+    \"\"\"
+    {raw_text}
+    \"\"\"
+    """
+    response = ollama.generate(model=model_name, prompt=prompt)
+    # The generated text is returned as-is, without any post-processing.
+    return response["response"]
+
+
+def main():
+    """
+    Convert each text-bearing page of the input PDF into ``html/<n>.html``.
+
+    ``<n>`` is the zero-based running index of the pages yielded by
+    ``extract_text_from_pdf``; pages without extractable text are skipped by
+    that generator, so ``<n>`` is not necessarily the PDF page number.
+
+    A missing input file is reported on stdout and the function returns
+    without raising. The ``html`` directory is created in the current working
+    directory if it does not exist.
+    """
+    from pathlib import Path
+
+    input_pdf = "Fabula_Ultima_-_Natural_Fantasy_Atlas_ENG_v1_1.pdf"
+
+    # Check the input up front rather than catching FileNotFoundError around
+    # the whole loop: that handler would also swallow a FileNotFoundError
+    # raised while writing an output page and blame the input PDF for it.
+    if not Path(input_pdf).is_file():
+        print(f"Error: The file '{input_pdf}' was not found. Please check your path.")
+        return
+
+    Path('./html').mkdir(exist_ok=True)
+    for page_num, text in enumerate(extract_text_from_pdf(input_pdf)):
+        html_output = convert_text_to_html(text)
+        # len() of a str counts characters, not bytes.
+        print(f'Writing html/{page_num}.html ({len(text)} chars)')
+        # Explicit UTF-8 so the output does not depend on the platform's
+        # default encoding.
+        with open(f'html/{page_num}.html', 'w', encoding='utf-8') as fh:
+            fh.write(html_output)
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/restyle_book.py
+++ b/scripts/restyle_book.py
@@ -0,0 +1,85 @@
+#!/usr/bin/env python3
+"""
+Remove per-page <style> blocks from book pages and replace them with a
+shared <link rel="stylesheet" href="book-page.css"> derived from the main
+fabula-ultima-sheet.css design system.
+"""
+
+import glob
+import os
+import re
+
+# Directory scanned for pages: the "html" folder one level above this script.
+HTML_DIR = os.path.join(os.path.dirname(__file__), "..", "html")
+
+# Matches one whole <style>...</style> element together with any whitespace
+# in front of it. DOTALL lets "." cross newlines; the lazy ".*?" stops at the
+# first closing tag, so separate style blocks are matched one at a time.
+STYLE_BLOCK_RE = re.compile(r"\s*<style[^>]*>.*?</style>", re.DOTALL | re.IGNORECASE)
+
+# Matches any <link> tag whose attributes mention fonts.googleapis.com,
+# together with any whitespace in front of it.
+GFONTS_LINK_RE = re.compile(
+    r'\s*<link[^>]+fonts\.googleapis\.com[^>]*>',
+    re.IGNORECASE,
+)
+
+# Stylesheet link inserted into each page; the four-space indent is part of
+# the literal.
+CSS_LINK = '    <link rel="stylesheet" href="book-page.css">'
+
+
+def process_file(filepath: str) -> bool:
+    """
+    Replace a page's own styling with a link to the shared stylesheet.
+
+    All <style> blocks and Google Fonts <link> tags are removed, then the
+    shared stylesheet link is inserted unless the page already contains one.
+
+    Args:
+        filepath: Path of the HTML file to rewrite in place (read and written
+            as UTF-8).
+
+    Returns:
+        True if the file was modified and written back, False if it was
+        already in the desired state and left untouched.
+    """
+    with open(filepath, encoding="utf-8") as f:
+        original = f.read()
+
+    # Strip per-page <style> blocks, then Google Fonts links.
+    content = STYLE_BLOCK_RE.sub("", original)
+    content = GFONTS_LINK_RE.sub("", content)
+
+    # Compare without CSS_LINK's leading indent: a page that already links the
+    # stylesheet at a different indentation must not receive a second link.
+    if CSS_LINK.strip() not in content:
+        # Insertion point, in order of preference:
+        #  1. right after the first </title>
+        #  2. right before the first </head> when there is no </title>
+        #  3. at the very top for bare fragments with neither tag
+        title_close = re.search(r"</title>", content, re.IGNORECASE)
+        if title_close:
+            pos = title_close.end()
+            content = content[:pos] + "\n" + CSS_LINK + content[pos:]
+        else:
+            head_close = re.search(r"</head>", content, re.IGNORECASE)
+            if head_close:
+                pos = head_close.start()
+                content = content[:pos] + CSS_LINK + "\n" + content[pos:]
+            else:
+                content = CSS_LINK + "\n" + content
+
+    if content == original:
+        return False
+
+    with open(filepath, "w", encoding="utf-8") as f:
+        f.write(content)
+    return True
+
+
+def main() -> None:
+    """Restyle every numbered page in HTML_DIR and print how many changed."""
+
+    def page_number(path):
+        # Order pages by the first run of digits in the file name; names
+        # without digits sort first.
+        match = re.search(r"(\d+)", os.path.basename(path))
+        if match is None:
+            return -1
+        return int(match.group(1))
+
+    pages = glob.glob(os.path.join(HTML_DIR, "[0-9]*.html"))
+    pages.sort(key=page_number)
+
+    # process_file returns True only for pages it actually rewrote.
+    updated = sum(1 for page in pages if process_file(page))
+
+    print(f"Done. {updated}/{len(pages)} pages updated.")
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/strip_watermark.py
+++ b/scripts/strip_watermark.py
@@ -0,0 +1,121 @@
+#!/usr/bin/env python3
+"""
+Remove "Guest Customer (Order #52072168)" watermark artifacts from all book pages.
+
+Strategy:
+- If a line's text content (with HTML tags stripped) consists entirely of
+  watermark text, remove the whole line.
+- If watermark text is embedded within a line that has other content, strip
+  just the watermark portion and tidy the surrounding punctuation.
+"""
+
+import glob
+import os
+import re
+
+# Directory scanned for pages: the "html" folder one level above this script.
+HTML_DIR = os.path.join(os.path.dirname(__file__), "..", "html")
+
+# Matches the watermark text in all the forms it appears. Alternatives are
+# tried left to right, so the complete parenthesised stamp comes first and is
+# removed as one unit; otherwise its "(" and ")" would be left behind.
+# (A trailing lazy "[^\n<]*?" at the end of a pattern never consumes anything,
+# so the bare "Guest Customer" alternative is spelled without it.)
+WATERMARK_RE = re.compile(
+    r"(?:"
+    r"Guest\s+Customer\s*\(\s*Order\s*[#:]?\s*:?\s*52072168\s*\)"  # full stamp
+    r"|Guest\s+Customer\b"                   # "Guest Customer" on its own
+    r"|Order\s+\w[^\n<]*?52072168"          # "Order <label>: 52072168"
+    r"|Order\s*[#:]\s*:?\s*52072168"        # "Order #52072168", "Order #: 52072168"
+    r"|#\s*52072168"                         # "#52072168" standalone
+    r"|\b52072168\b"                         # bare order number
+    r")",
+    re.IGNORECASE,
+)
+
+# Matches a single HTML tag, used to reduce a line to its visible text.
+TAG_RE = re.compile(r"<[^>]+>")
+
+# Punctuation and decoration that can be left behind after stripping.
+# Digits are included, so leftover text made only of numbers and punctuation
+# also counts as decoration.
+DECORATION_RE = re.compile(r"^[\s\-–—|,.:;!?()\[\]*&#\d/\\]+$")
+
+
+def text_content(line: str) -> str:
+    """Return *line* with every HTML tag dropped, leaving only the text between tags."""
+    # Splitting on the tag pattern yields the text pieces around each tag;
+    # gluing them back together is the tag-free line.
+    pieces = TAG_RE.split(line)
+    return "".join(pieces)
+
+
+def is_watermark_only(line: str) -> bool:
+    """Report whether nothing but watermark text (and decoration) is visible on the line."""
+    visible = text_content(line).strip()
+    leftover = WATERMARK_RE.sub("", visible).strip()
+    if leftover == "":
+        # Nothing at all remains once the watermark is gone.
+        return True
+    # Otherwise the line still qualifies if only punctuation/decoration is left.
+    return DECORATION_RE.match(leftover) is not None
+
+
+def strip_watermark_inline(line: str) -> str:
+    """
+    Remove watermark text from a line that has other real content.
+
+    After the watermark is removed, dash and pipe separators left dangling at
+    either edge of the line are dropped and runs of spaces inside the line
+    are collapsed.
+
+    Only horizontal whitespace (spaces and tabs) is touched by the tidy-up:
+    matching ``\\s`` there would also consume the line's trailing newline,
+    gluing the line onto the next one, and would flatten leading indentation.
+
+    Args:
+        line: One line of HTML, normally including its trailing newline.
+
+    Returns:
+        The cleaned line with its indentation and line terminator intact.
+    """
+    result = WATERMARK_RE.sub("", line)
+    # Tidy decoration left behind after removal (e.g. "— ", " —", " | ", "---")
+    result = re.sub(r"[ \t]*[—–\-]{1,3}[ \t]*$", "", result, flags=re.MULTILINE)
+    result = re.sub(r"^([ \t]*)[—–\-]{1,3}[ \t]*", r"\1", result, flags=re.MULTILINE)
+    result = re.sub(r"\|[ \t]*$", "", result, flags=re.MULTILINE)
+    result = re.sub(r"^([ \t]*)\|[ \t]*", r"\1", result, flags=re.MULTILINE)
+    # Collapse interior runs only; the lookbehind keeps leading indentation.
+    result = re.sub(r"(?<=\S)[ \t]{2,}", " ", result)
+    # Drop the space that removing a watermark at the end of the line leaves.
+    result = re.sub(r"[ \t]+$", "", result, flags=re.MULTILINE)
+    return result
+
+
+def process_file(filepath: str) -> tuple[int, int]:
+    """
+    Strip watermark text from one HTML file, rewriting it in place.
+
+    Lines whose visible text is only watermark are dropped entirely; lines
+    that mix watermark with real content keep the content. The file is
+    written back only when at least one line was affected, so clean files
+    are left untouched.
+
+    Args:
+        filepath: Path of the HTML file (read and written as UTF-8).
+
+    Returns:
+        ``(removed, stripped)``: the number of lines deleted outright and the
+        number of lines cleaned inline.
+    """
+    with open(filepath, encoding="utf-8") as f:
+        lines = f.readlines()
+
+    new_lines: list[str] = []
+    removed = 0
+    stripped = 0
+
+    for line in lines:
+        if not WATERMARK_RE.search(line):
+            new_lines.append(line)
+        elif is_watermark_only(line.strip()):
+            # Whole line is watermark: drop it, newline included.
+            removed += 1
+        else:
+            new_lines.append(strip_watermark_inline(line))
+            stripped += 1
+
+    # Every matching line bumps one of the counters, so both being zero means
+    # the content is unchanged and there is nothing to write.
+    if removed or stripped:
+        with open(filepath, "w", encoding="utf-8") as f:
+            f.writelines(new_lines)
+
+    return removed, stripped
+
+
+def main() -> None:
+    """Clean every HTML file in HTML_DIR and print a per-file and overall summary."""
+
+    def page_number(path):
+        # Order files by the first run of digits in the file name; names
+        # without digits sort first.
+        match = re.search(r"(\d+)", os.path.basename(path))
+        return -1 if match is None else int(match.group(1))
+
+    pages = glob.glob(os.path.join(HTML_DIR, "*.html"))
+    pages.sort(key=page_number)
+
+    total_removed = total_stripped = affected = 0
+
+    for page in pages:
+        removed, stripped = process_file(page)
+        total_removed += removed
+        total_stripped += stripped
+        if not (removed or stripped):
+            continue
+
+        # Only files with at least one hit get a report line.
+        affected += 1
+        name = os.path.basename(page)
+        parts = []
+        if removed:
+            parts.append(f"{removed} line{'s' if removed != 1 else ''} removed")
+        if stripped:
+            parts.append(f"{stripped} inline")
+        print(f"  {name}: {', '.join(parts)}")
+
+    print(
+        f"\nDone. {affected} files changed — "
+        f"{total_removed} lines removed, {total_stripped} inline occurrences stripped."
+    )
+
+
+if __name__ == "__main__":
+    main()