fabula-ultima-html/scripts/strip_watermark.py

#!/usr/bin/env python3
"""
Remove "Guest Customer (Order #52072168)" watermark artifacts from all book pages.

Strategy:
- If a line's text content (with HTML tags stripped) consists entirely of
  watermark text, remove the whole line.
- If watermark text is embedded within a line that has other content, strip
  just the watermark portion and tidy the surrounding punctuation.
"""

import glob
import os
import re

# Root directory that is scanned for pages: "<directory of this script>/../books".
HTML_DIR = os.path.join(os.path.dirname(__file__), "..", "books")

# Matches the watermark text in all the forms it appears (case-insensitive).
# The alternatives are tried left to right at every position.
# NOTE(review): the lazy "[^\n<]*?" that ends the first alternative has nothing
# after it in the pattern, so it always matches the empty string. Text that
# follows "Guest Customer" is therefore only removed when one of the later
# alternatives matches it separately; brackets such as "(" / ")" around the
# order number are left for the callers to tidy up.
WATERMARK_RE = re.compile(
    r"(?:"
    r"Guest\s+Customer\b[^\n<]*?"           # "Guest Customer" (lazy tail matches nothing, see note above)
    r"|Order\s+\w[^\n<]*?52072168"          # "Order <label>: 52072168" (stays on one line, stops at "<")
    r"|Order\s*[#:]\s*:?\s*52072168"        # "Order #52072168", "Order #: 52072168"
    r"|#\s*52072168"                         # "#52072168" standalone
    r"|\b52072168\b"                         # bare order number
    r"|\b38246845\b"                         # second bare number; presumably another order/customer id, TODO confirm
    r")",
    re.IGNORECASE,
)

# An HTML tag: "<", one or more characters that are not ">", then ">".
TAG_RE = re.compile(r"<[^>]+>")

# Punctuation and decoration that can be left behind after stripping.
# Matches a string made up ONLY of whitespace, dashes, "|", common punctuation,
# brackets, "*", "&", "#", digits and slashes. The "+" means at least one
# character is required, so an empty string does NOT match this pattern.
# Note that digits count as decoration, so a purely numeric leftover matches.
DECORATION_RE = re.compile(r"^[\s\-–—|,.:;!?()\[\]*&#\d/\\]+$")


def text_content(line: str) -> str:
    """Strip every HTML tag from *line* and return the text that is left.

    Only the spans matched by ``TAG_RE`` are dropped; the remaining text is
    returned as-is (no trimming, no entity decoding).
    """
    # TAG_RE has no capturing groups, so split() yields just the text pieces
    # found between tags; gluing them back together removes the tags.
    pieces_between_tags = TAG_RE.split(line)
    return "".join(pieces_between_tags)


def is_watermark_only(line: str) -> bool:
    """Report whether *line* shows nothing but watermark text.

    The HTML tags are dropped first, then every watermark match is removed
    from the visible text. The line counts as watermark-only when the
    leftover is empty or consists solely of decoration characters
    (as defined by ``DECORATION_RE``).
    """
    visible = text_content(line).strip()
    leftover = WATERMARK_RE.sub("", visible).strip()

    # DECORATION_RE needs at least one character, so handle "nothing left"
    # explicitly before consulting it.
    if not leftover:
        return True
    return bool(DECORATION_RE.match(leftover))


def strip_watermark_inline(line: str) -> str:
    """Remove watermark text from a line that has other real content.

    The line terminator (any trailing ``"\\n"`` / ``"\\r"`` characters) is set
    aside before cleaning and re-attached afterwards. Without this, the
    whitespace-collapsing and trailing-dash patterns below would swallow the
    newline (``\\s`` matches it) and the cleaned line would be glued onto the
    following line when the file is written back.

    Args:
        line: One line of text, with or without its line terminator.

    Returns:
        The line with every ``WATERMARK_RE`` match removed, dangling dashes
        and pipes at either end dropped, runs of two or more whitespace
        characters collapsed to one space, and the original line terminator
        preserved.
    """
    # Split "content" from "line ending" so the tidy-up can never eat the ending.
    body = line.rstrip("\r\n")
    ending = line[len(body):]

    body = WATERMARK_RE.sub("", body)
    # Tidy decoration left behind after removal (e.g. "— ", " —", " | ", "---")
    body = re.sub(r"\s*[—–\-]{1,3}\s*$", "", body, flags=re.MULTILINE)
    body = re.sub(r"^\s*[—–\-]{1,3}\s*", "", body, flags=re.MULTILINE)
    body = re.sub(r"\|\s*$", "", body, flags=re.MULTILINE)
    body = re.sub(r"^\s*\|\s*", "", body, flags=re.MULTILINE)
    # Removing text from the middle of a line can leave doubled spaces behind.
    body = re.sub(r"\s{2,}", " ", body)
    return body + ending


def process_file(filepath: str) -> tuple[int, int]:
    """Strip watermark text from one file, rewriting it only if it changed.

    Each line is handled in one of three ways:
    - no watermark match: kept untouched;
    - the visible text is nothing but watermark/decoration: the whole line
      is dropped;
    - watermark mixed with other content: only the watermark part is removed.

    Args:
        filepath: Path of the UTF-8 text file to clean (anything ``open()``
            accepts works).

    Returns:
        A ``(removed, stripped)`` pair: the number of lines dropped entirely
        and the number of lines cleaned in place.
    """
    with open(filepath, encoding="utf-8") as f:
        lines = f.readlines()

    new_lines: list[str] = []
    removed = 0
    stripped = 0

    for line in lines:
        if not WATERMARK_RE.search(line):
            new_lines.append(line)
            continue

        if is_watermark_only(line.strip()):
            # Drop the line completely; nothing is appended in its place.
            removed += 1
        else:
            new_lines.append(strip_watermark_inline(line))
            stripped += 1

    # Every matching line bumps one of the two counters, so both being zero
    # means the content is unchanged. Skip the write in that case to avoid
    # touching (and risking truncation of) files that need no cleanup.
    if removed or stripped:
        with open(filepath, "w", encoding="utf-8") as f:
            f.writelines(new_lines)

    return removed, stripped


def main() -> None:
    """Clean every ``.html`` file under ``HTML_DIR`` and print a summary.

    The directory tree is walked recursively. Files are processed in order of
    the first number found in their file name (files without a number come
    first), with the full path as a tie-breaker so the order is deterministic.
    One line is printed per file that was changed, followed by overall totals.
    """
    def sort_key(p):
        # First run of digits in the file name, or -1 when there is none.
        m = re.search(r"(\d+)", os.path.basename(p))
        return int(m.group(1)) if m else -1

    # os.walk does not follow directory symlinks by default and yields plain
    # string paths, which is what process_file is annotated to take.
    html_files: list[str] = []
    for root, _dirs, files in os.walk(HTML_DIR):
        html_files.extend(
            os.path.join(root, fn) for fn in files if fn.endswith(".html")
        )
    html_files.sort(key=lambda p: (sort_key(p), p))

    total_removed = 0
    total_stripped = 0
    affected = 0

    for filepath in html_files:
        removed, stripped = process_file(filepath)
        if removed or stripped:
            affected += 1
            name = os.path.basename(filepath)
            parts = []
            if removed:
                parts.append(f"{removed} line{'s' if removed != 1 else ''} removed")
            if stripped:
                parts.append(f"{stripped} inline")
            print(f"  {name}: {', '.join(parts)}")
        total_removed += removed
        total_stripped += stripped

    print(
        f"\nDone. {affected} files changed — "
        f"{total_removed} lines removed, {total_stripped} inline occurrences stripped."
    )


# Run the cleanup only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()