feat: Add core rules
This commit is contained in:
22
scripts/add_css.py
Normal file
22
scripts/add_css.py
Normal file
@@ -0,0 +1,22 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Prepend a book-page.css link to every HTML file in html2/."""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
CSS_LINK = '<link rel="stylesheet" href="book-page.css">\n'
|
||||
HTML2_DIR = Path(__file__).parent.parent / "html2"
|
||||
|
||||
|
||||
def main():
    """Insert the stylesheet link at the top of each HTML page that lacks it.

    Pages are the ``*.html`` files directly inside ``HTML2_DIR``, visited in
    numeric order of their file stem. A page whose text already contains the
    link tag is left untouched. One status line is printed per page.
    """

    def page_order(page):
        # Stems that are not all digits share the key -1, so they are
        # processed before any numbered page.
        stem = page.stem
        return int(stem) if stem.isdigit() else -1

    # The presence check ignores the trailing newline carried by CSS_LINK.
    link_tag = CSS_LINK.strip()

    for page in sorted(HTML2_DIR.glob("*.html"), key=page_order):
        html = page.read_text(encoding="utf-8")
        if link_tag in html:
            print(f"skipped {page.name} (already has link)")
            continue
        page.write_text(CSS_LINK + html, encoding="utf-8")
        print(f"updated {page.name}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -30,7 +30,7 @@ def convert_text_to_html(raw_text, model_name="gemma4"):
|
||||
|
||||
Requirements:
|
||||
1. Use appropriate HTML tags (<h1>, <p>, <ul>, <li>, <strong>, etc.) to recreate the structural hierarchy.
|
||||
2. Add inline CSS or a <style> block in the <head> to make it look professional (modern sans-serif typography, clean margins, and clear layout).
|
||||
2. Do not add CSS, styling will be added later.
|
||||
3. Return ONLY valid HTML code. Do not include markdown code block backticks (```html) or extra conversational commentary.
|
||||
|
||||
Raw Document Text:
|
||||
@@ -45,15 +45,16 @@ def convert_text_to_html(raw_text, model_name="gemma4"):
|
||||
|
||||
def main():
|
||||
input_pdf = (
|
||||
"Fabula_Ultima_-_Natural_Fantasy_Atlas_ENG_v1_1.pdf"
|
||||
# "Fabula_Ultima_-_Natural_Fantasy_Atlas_ENG_v1_1.pdf"
|
||||
"Fabula_Ultima_TTJRPG.pdf"
|
||||
)
|
||||
try:
|
||||
from pathlib import Path
|
||||
Path('./html').mkdir(exist_ok=True)
|
||||
Path('./html2').mkdir(exist_ok=True)
|
||||
for page_num, text in enumerate(extract_text_from_pdf(input_pdf)):
|
||||
html_output = convert_text_to_html(text)
|
||||
print(f'Writing html/{page_num}.html ({len(text)} bytes)')
|
||||
with open(f'html/{page_num}.html', 'w') as fh:
|
||||
print(f'Writing html2/{page_num}.html ({len(text)} bytes)')
|
||||
with open(f'html2/{page_num}.html', 'w') as fh:
|
||||
fh.write(html_output)
|
||||
except FileNotFoundError:
|
||||
print(f"Error: The file '{input_pdf}' was not found. Please check your path.")
|
||||
|
||||
@@ -9,7 +9,7 @@ import glob
|
||||
import os
|
||||
import re
|
||||
|
||||
HTML_DIR = os.path.join(os.path.dirname(__file__), "..", "html")
|
||||
HTML_DIR = os.path.join(os.path.dirname(__file__), "..", "books", "core")
|
||||
|
||||
# Remove <style>...</style> blocks (including surrounding blank lines)
|
||||
STYLE_BLOCK_RE = re.compile(r"\s*<style[^>]*>.*?</style>", re.DOTALL | re.IGNORECASE)
|
||||
|
||||
@@ -13,7 +13,7 @@ import glob
|
||||
import os
|
||||
import re
|
||||
|
||||
HTML_DIR = os.path.join(os.path.dirname(__file__), "..", "html")
|
||||
HTML_DIR = os.path.join(os.path.dirname(__file__), "..", "books")
|
||||
|
||||
# Matches the watermark text in all the forms it appears
|
||||
WATERMARK_RE = re.compile(
|
||||
@@ -23,6 +23,7 @@ WATERMARK_RE = re.compile(
|
||||
r"|Order\s*[#:]\s*:?\s*52072168" # "Order #52072168", "Order #: 52072168"
|
||||
r"|#\s*52072168" # "#52072168" standalone
|
||||
r"|\b52072168\b" # bare order number
|
||||
r"|\b38246845\b"
|
||||
r")",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
@@ -88,10 +89,18 @@ def main() -> None:
|
||||
m = re.search(r"(\d+)", os.path.basename(p))
|
||||
return int(m.group(1)) if m else -1
|
||||
|
||||
html_files = sorted(
|
||||
glob.glob(os.path.join(HTML_DIR, "*.html")),
|
||||
key=sort_key,
|
||||
)
|
||||
from pathlib import Path
|
||||
html_files = []
|
||||
for root, dirs, files in Path(HTML_DIR).walk():
|
||||
hf = filter(lambda fn: fn.endswith(".html"), files)
|
||||
hf = [ root / fn for fn in hf ]
|
||||
html_files.extend(hf)
|
||||
# html_files = sorted(
|
||||
# glob.glob(os.path.join(HTML_DIR, "*.html")),
|
||||
# key=sort_key,
|
||||
# )
|
||||
breakpoint()
|
||||
html_files = sorted(html_files, key=sort_key)
|
||||
|
||||
total_removed = 0
|
||||
total_stripped = 0
|
||||
|
||||
Reference in New Issue
Block a user