#!/usr/bin/env python3 """ Remove "Guest Customer (Order #52072168)" watermark artifacts from all book pages. Strategy: - If a line's text content (with HTML tags stripped) consists entirely of watermark text, remove the whole line. - If watermark text is embedded within a line that has other content, strip just the watermark portion and tidy the surrounding punctuation. """ import glob import os import re HTML_DIR = os.path.join(os.path.dirname(__file__), "..", "books") # Matches the watermark text in all the forms it appears WATERMARK_RE = re.compile( r"(?:" r"Guest\s+Customer\b[^\n<]*?" # "Guest Customer" + trailing text r"|Order\s+\w[^\n<]*?52072168" # "Order