feat: Add core rules

This commit is contained in:
2026-06-06 13:03:00 +00:00
parent c75cd188c1
commit 75c6ab9975
583 changed files with 13580 additions and 50 deletions

View File

@@ -13,7 +13,7 @@ import glob
import os
import re
HTML_DIR = os.path.join(os.path.dirname(__file__), "..", "html")
HTML_DIR = os.path.join(os.path.dirname(__file__), "..", "books")
# Matches the watermark text in every form in which it appears
WATERMARK_RE = re.compile(
@@ -23,6 +23,7 @@ WATERMARK_RE = re.compile(
r"|Order\s*[#:]\s*:?\s*52072168" # "Order #52072168", "Order #: 52072168"
r"|#\s*52072168" # "#52072168" standalone
r"|\b52072168\b" # bare order number
r"|\b38246845\b"
r")",
re.IGNORECASE,
)
@@ -88,10 +89,18 @@ def main() -> None:
m = re.search(r"(\d+)", os.path.basename(p))
return int(m.group(1)) if m else -1
html_files = sorted(
glob.glob(os.path.join(HTML_DIR, "*.html")),
key=sort_key,
)
from pathlib import Path

# Gather every ".html" file below HTML_DIR, including nested
# sub-directories. Entries are Path objects (directory / file name).
# NOTE: Path.walk() is only available on Python 3.12 and newer.
html_files = []
for root, _dirs, files in Path(HTML_DIR).walk():
    html_files.extend(root / fn for fn in files if fn.endswith(".html"))

# Order by the first integer found in each file's base name (files without
# any digits sort first, since sort_key yields -1 for them). The sort is
# stable, so files with equal keys keep their directory-walk order.
html_files.sort(key=sort_key)
total_removed = 0
total_stripped = 0