feat: Add core rules

2026-06-06 13:03:00 +00:00
parent c75cd188c1
commit 75c6ab9975
583 changed files with 13580 additions and 50 deletions
--- a/scripts/add_css.py
+++ b/scripts/add_css.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env python3
+"""Add a book-page.css stylesheet link to every HTML file in html2/.
+
+The link goes right after the opening <head> tag when a page has one, so
+nothing precedes the doctype; pages without a <head> get it prepended.
+"""
+
+import re
+from pathlib import Path
+
+CSS_LINK = '<link rel="stylesheet" href="book-page.css">\n'
+HTML2_DIR = Path(__file__).parent.parent / "html2"
+# Opening <head> tag, with or without attributes; \b keeps <header> out.
+HEAD_OPEN_RE = re.compile(r"<head\b[^>]*>", re.IGNORECASE)
+
+
+def _page_order(path):
+    """Sort key: numeric stems in numeric order, all other names first."""
+    # isdecimal() only accepts what int() can parse (isdigit() also
+    # accepts characters such as superscripts, which int() rejects).
+    return int(path.stem) if path.stem.isdecimal() else -1
+
+
+def add_css_link(content):
+    """Return content with CSS_LINK added, or unchanged if already linked."""
+    if CSS_LINK.strip() in content:
+        return content
+    head = HEAD_OPEN_RE.search(content)
+    if head is None:
+        # No <head> (e.g. an HTML fragment): prepend the link.
+        return CSS_LINK + content
+    cut = head.end()
+    return content[:cut] + "\n" + CSS_LINK.strip() + content[cut:]
+
+
+def main():
+    """Update every html2/*.html page in page order, reporting each one."""
+    for path in sorted(HTML2_DIR.glob("*.html"), key=_page_order):
+        content = path.read_text(encoding="utf-8")
+        updated = add_css_link(content)
+        if updated == content:
+            print(f"skipped {path.name} (already has link)")
+        else:
+            path.write_text(updated, encoding="utf-8")
+            print(f"updated {path.name}")
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/pdftohtml.py
+++ b/scripts/pdftohtml.py
@@ -30,7 +30,7 @@ def convert_text_to_html(raw_text, model_name="gemma4"):
    
    Requirements:
    1. Use appropriate HTML tags (<h1>, <p>, <ul>, <li>, <strong>, etc.) to recreate the structural hierarchy.
-    2. Add inline CSS or a <style> block in the <head> to make it look professional (modern sans-serif typography, clean margins, and clear layout).
+    2. Do not add CSS, styling will be added later.
    3. Return ONLY valid HTML code. Do not include markdown code block backticks (```html) or extra conversational commentary.

    Raw Document Text:
@@ -45,15 +45,17 @@ def convert_text_to_html(raw_text, model_name="gemma4"):

 def main():
    input_pdf = (
-        "Fabula_Ultima_-_Natural_Fantasy_Atlas_ENG_v1_1.pdf"
+        # "Fabula_Ultima_-_Natural_Fantasy_Atlas_ENG_v1_1.pdf"
+        "Fabula_Ultima_TTJRPG.pdf"
    )
    try:
        from pathlib import Path
-        Path('./html').mkdir(exist_ok=True)
+        Path('./html2').mkdir(exist_ok=True)
        for page_num, text in enumerate(extract_text_from_pdf(input_pdf)):
            html_output = convert_text_to_html(text)
-            print(f'Writing html/{page_num}.html ({len(text)} bytes)')
-            with open(f'html/{page_num}.html', 'w') as fh:
+            print(f'Writing html2/{page_num}.html ({len(text)} bytes)')
+            # Write UTF-8 explicitly: scripts/add_css.py reads these pages back as UTF-8.
+            with open(f'html2/{page_num}.html', 'w', encoding='utf-8') as fh:
                fh.write(html_output)
    except FileNotFoundError:
        print(f"Error: The file '{input_pdf}' was not found. Please check your path.")
--- a/scripts/restyle_book.py
+++ b/scripts/restyle_book.py
@@ -9,7 +9,7 @@ import glob
 import os
 import re

-HTML_DIR = os.path.join(os.path.dirname(__file__), "..", "html")
+HTML_DIR = os.path.join(os.path.dirname(__file__), "..", "books", "core")  # NOTE(review): scoped to books/core only; confirm other books need no restyling

# Remove <style>...</style> blocks (including surrounding blank lines)
STYLE_BLOCK_RE = re.compile(r"\s*<style[^>]*>.*?</style>", re.DOTALL | re.IGNORECASE)
--- a/scripts/strip_watermark.py
+++ b/scripts/strip_watermark.py
@@ -13,7 +13,7 @@ import glob
 import os
 import re

-HTML_DIR = os.path.join(os.path.dirname(__file__), "..", "html")
+HTML_DIR = os.path.join(os.path.dirname(__file__), "..", "books")  # root directory; main() walks it recursively for *.html

# Matches the watermark text in all the forms it appears
WATERMARK_RE = re.compile(
@@ -23,6 +23,7 @@ WATERMARK_RE = re.compile(
    r"|Order\s*[#:]\s*:?\s*52072168"        # "Order #52072168", "Order #: 52072168"
    r"|#\s*52072168"                         # "#52072168" standalone
    r"|\b52072168\b"                         # bare order number
+    r"|\b38246845\b"                         # presumably a second bare order number; NOTE(review): no "Order #..." variants added for it, confirm
    r")",
    re.IGNORECASE,
 )
@@ -88,10 +89,18 @@ def main() -> None:
        m = re.search(r"(\d+)", os.path.basename(p))
        return int(m.group(1)) if m else -1

-    html_files = sorted(
-        glob.glob(os.path.join(HTML_DIR, "*.html")),
-        key=sort_key,
-    )
+    # Books live in subdirectories of HTML_DIR, so collect pages recursively.
+    # os.walk (rather than Path.walk, which needs Python 3.12+) also yields
+    # plain string paths, like the glob-based listing this replaces.
+    html_files = sorted(
+        (
+            os.path.join(root, fn)
+            for root, _dirs, files in os.walk(HTML_DIR)
+            for fn in files
+            if fn.endswith(".html")
+        ),
+        key=sort_key,
+    )

    total_removed = 0
    total_stripped = 0