feat: Add book viewer at /book with shared design system
- Add html/index.html: book viewer with auto-discovering sidebar, prev/next navigation, keyboard shortcuts, and URL hash persistence
- Add html/book-page.css: shared stylesheet for all book pages derived from fabula-ultima-sheet.css (dark theme, CSS variables, Cinzel/Crimson Text fonts, common class styles)
- Add book.js entry point so webpack injects the shared CSS into the book viewer; update webpack.config.js for two entry points, split CSS chunk, CopyWebpackPlugin for book pages, and /book dev server rewrite rule
- Add scripts/strip_watermark.py: removes "Guest Customer (Order #52072168)" watermark artifacts from all 210 book pages
- Add scripts/restyle_book.py: strips per-page <style> blocks and injects <link rel="stylesheet" href="book-page.css"> into all pages
- Update Justfile deploy to scp -r dist/* for the new /book subtree

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
63
scripts/pdftohtml.py
Normal file
63
scripts/pdftohtml.py
Normal file
@@ -0,0 +1,63 @@
|
||||
# /// script
|
||||
# requires-python = ">=3.12"
|
||||
# dependencies = [
|
||||
# "cryptography",
|
||||
# "ollama",
|
||||
# "pypdf",
|
||||
# ]
|
||||
# ///
|
||||
|
||||
import ollama
|
||||
from pypdf import PdfReader
|
||||
|
||||
|
||||
def extract_text_from_pdf(pdf_path):
    """Yield the extracted text of each page of a PDF, one page at a time.

    This is a generator: the PDF is not opened until the first item is
    requested, so an error opening ``pdf_path`` surfaces on the first
    iteration rather than when the function is called.

    Pages for which no text is extracted are skipped, so the position of an
    item in the output does not necessarily equal its page number in the PDF.

    Args:
        pdf_path: Location of the PDF, handed straight to ``PdfReader``.

    Yields:
        The non-empty extracted text of each page, in document order.
    """
    reader = PdfReader(pdf_path)
    for page in reader.pages:
        text = page.extract_text()
        if text:
            yield text
|
||||
|
||||
|
||||
def _strip_code_fence(text):
    """Return *text* without a Markdown code fence wrapped around it.

    Text that does not start with a fence is returned unchanged, so replies
    that already follow the prompt's instructions are not touched.
    """
    stripped = text.strip()
    if not stripped.startswith("```"):
        return text
    # The opening fence occupies the whole first line (e.g. "```html").
    first_newline = stripped.find("\n")
    if first_newline == -1:
        return text
    body = stripped[first_newline + 1:].rstrip()
    if body.endswith("```"):
        body = body[:-3]
    return body.strip()


def convert_text_to_html(raw_text, model_name="gemma4"):
    """Ask an Ollama model to turn raw PDF text into a styled HTML document.

    The prompt tells the model to return only HTML, but that instruction is
    not always followed, so a Markdown code fence wrapped around the reply
    is removed before returning.

    Args:
        raw_text: Text extracted from the PDF; embedded verbatim in the prompt.
        model_name: Name of the Ollama model to query.

    Returns:
        The model's reply as a string, without a surrounding code fence.
    """
    prompt = f"""
You are an expert web developer. Convert the following raw text extracted from a PDF document into a beautifully styled, clean, and semantic HTML document.

Requirements:
1. Use appropriate HTML tags (<h1>, <p>, <ul>, <li>, <strong>, etc.) to recreate the structural hierarchy.
2. Add inline CSS or a <style> block in the <head> to make it look professional (modern sans-serif typography, clean margins, and clear layout).
3. Return ONLY valid HTML code. Do not include markdown code block backticks (```html) or extra conversational commentary.

Raw Document Text:
\"\"\"
{raw_text}
\"\"\"
"""
    response = ollama.generate(model=model_name, prompt=prompt)
    return _strip_code_fence(response["response"])
|
||||
|
||||
|
||||
def main():
    """Convert each text-bearing page of the input PDF into an HTML file.

    Each text yielded by ``extract_text_from_pdf`` is sent through
    ``convert_text_to_html`` and the result is written to ``html/<n>.html``,
    where ``n`` is the zero-based position of that text in the extraction
    output. The ``html`` directory is created if it does not exist.

    A ``FileNotFoundError`` raised while processing is reported on standard
    output instead of propagating.
    """
    from pathlib import Path

    input_pdf = "Fabula_Ultima_-_Natural_Fantasy_Atlas_ENG_v1_1.pdf"
    try:
        Path("./html").mkdir(exist_ok=True)
        # extract_text_from_pdf is a generator, so the PDF is only opened
        # once iteration starts; the loop therefore has to stay inside the
        # try for a missing input file to be caught.
        for page_num, text in enumerate(extract_text_from_pdf(input_pdf)):
            html_output = convert_text_to_html(text)
            # len(text) counts characters of the extracted source text, not
            # the size of the file being written.
            print(f"Writing html/{page_num}.html ({len(text)} source characters)")
            # Write UTF-8 explicitly: the platform default encoding may be
            # unable to represent non-ASCII characters from the PDF.
            with open(f"html/{page_num}.html", "w", encoding="utf-8") as fh:
                fh.write(html_output)
    except FileNotFoundError:
        print(f"Error: The file '{input_pdf}' was not found. Please check your path.")
|
||||
|
||||
|
||||
# Run the conversion only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user