feat: Add core rules

This commit is contained in:
2026-06-06 13:03:00 +00:00
parent c75cd188c1
commit 75c6ab9975
583 changed files with 13580 additions and 50 deletions

22
scripts/add_css.py Normal file
View File

@@ -0,0 +1,22 @@
#!/usr/bin/env python3
"""Prepend a book-page.css link to every HTML file in html2/."""
from pathlib import Path
CSS_LINK = '<link rel="stylesheet" href="book-page.css">\n'
HTML2_DIR = Path(__file__).parent.parent / "html2"
def main():
    """Prepend the stylesheet link to each HTML page that lacks it.

    Every ``*.html`` file directly inside ``HTML2_DIR`` is visited in numeric
    order of its file stem; files whose stem is not purely digits get the
    sort key -1. A file is rewritten with ``CSS_LINK`` placed in front of its
    existing content unless the stripped link tag already occurs somewhere in
    the file. One status line is printed per file.
    """

    def page_order(html_path):
        # Numeric stems sort by value; anything else shares the key -1.
        stem = html_path.stem
        return int(stem) if stem.isdigit() else -1

    # The membership test ignores the trailing newline carried by CSS_LINK.
    link_tag = CSS_LINK.strip()

    for html_path in sorted(HTML2_DIR.glob("*.html"), key=page_order):
        markup = html_path.read_text(encoding="utf-8")
        if link_tag in markup:
            print(f"skipped {html_path.name} (already has link)")
            continue
        html_path.write_text(CSS_LINK + markup, encoding="utf-8")
        print(f"updated {html_path.name}")


# Run the update only when this file is executed as a script.
if __name__ == "__main__":
    main()

View File

@@ -30,7 +30,7 @@ def convert_text_to_html(raw_text, model_name="gemma4"):
Requirements:
1. Use appropriate HTML tags (<h1>, <p>, <ul>, <li>, <strong>, etc.) to recreate the structural hierarchy.
2. Add inline CSS or a <style> block in the <head> to make it look professional (modern sans-serif typography, clean margins, and clear layout).
2. Do not add CSS, styling will be added later.
3. Return ONLY valid HTML code. Do not include markdown code block backticks (```html) or extra conversational commentary.
Raw Document Text:
@@ -45,15 +45,16 @@ def convert_text_to_html(raw_text, model_name="gemma4"):
def main():
input_pdf = (
"Fabula_Ultima_-_Natural_Fantasy_Atlas_ENG_v1_1.pdf"
# "Fabula_Ultima_-_Natural_Fantasy_Atlas_ENG_v1_1.pdf"
"Fabula_Ultima_TTJRPG.pdf"
)
try:
from pathlib import Path
Path('./html').mkdir(exist_ok=True)
Path('./html2').mkdir(exist_ok=True)
for page_num, text in enumerate(extract_text_from_pdf(input_pdf)):
html_output = convert_text_to_html(text)
print(f'Writing html/{page_num}.html ({len(text)} bytes)')
with open(f'html/{page_num}.html', 'w') as fh:
print(f'Writing html2/{page_num}.html ({len(text)} bytes)')
with open(f'html2/{page_num}.html', 'w') as fh:
fh.write(html_output)
except FileNotFoundError:
print(f"Error: The file '{input_pdf}' was not found. Please check your path.")

View File

@@ -9,7 +9,7 @@ import glob
import os
import re
HTML_DIR = os.path.join(os.path.dirname(__file__), "..", "html")
HTML_DIR = os.path.join(os.path.dirname(__file__), "..", "books", "core")
# Remove <style>...</style> blocks (including surrounding blank lines)
STYLE_BLOCK_RE = re.compile(r"\s*<style[^>]*>.*?</style>", re.DOTALL | re.IGNORECASE)

View File

@@ -13,7 +13,7 @@ import glob
import os
import re
HTML_DIR = os.path.join(os.path.dirname(__file__), "..", "html")
HTML_DIR = os.path.join(os.path.dirname(__file__), "..", "books")
# Matches the watermark text in every form in which it appears
WATERMARK_RE = re.compile(
@@ -23,6 +23,7 @@ WATERMARK_RE = re.compile(
r"|Order\s*[#:]\s*:?\s*52072168" # "Order #52072168", "Order #: 52072168"
r"|#\s*52072168" # "#52072168" standalone
r"|\b52072168\b" # bare order number
r"|\b38246845\b"
r")",
re.IGNORECASE,
)
@@ -88,10 +89,18 @@ def main() -> None:
m = re.search(r"(\d+)", os.path.basename(p))
return int(m.group(1)) if m else -1
html_files = sorted(
glob.glob(os.path.join(HTML_DIR, "*.html")),
key=sort_key,
)
from pathlib import Path
html_files = []
for root, dirs, files in Path(HTML_DIR).walk():
hf = filter(lambda fn: fn.endswith(".html"), files)
hf = [ root / fn for fn in hf ]
html_files.extend(hf)
# html_files = sorted(
# glob.glob(os.path.join(HTML_DIR, "*.html")),
# key=sort_key,
# )
breakpoint()
html_files = sorted(html_files, key=sort_key)
total_removed = 0
total_stripped = 0