# fabula-ultima-html/scripts/pdftohtml.py

# /// script
# requires-python = ">=3.12"
# dependencies = [
#     "cryptography",
#     "ollama",
#     "pypdf",
# ]
# ///

import ollama
from pypdf import PdfReader


def extract_text_from_pdf(pdf_path):
    """
    Yield the extracted text of each page in a PDF, one page at a time.

    Pages are visited in the order given by ``PdfReader(pdf_path).pages``.
    A page whose ``extract_text()`` result is empty (or otherwise falsy) is
    skipped, so the number of yielded items can be smaller than the number
    of pages in the document.

    This is a generator: nothing in the body runs (including opening the
    PDF) until the first item is requested, so errors raised while opening
    the file surface when the caller starts iterating.
    """
    reader = PdfReader(pdf_path)
    for page in reader.pages:
        text = page.extract_text()
        # Skip pages with no extractable text rather than yielding "".
        if text:
            yield text

def convert_text_to_html(raw_text, model_name="gemma4"):
    """
    Ask an Ollama model to turn raw PDF text into semantic HTML.

    The raw text is embedded in a fixed instruction prompt, sent through
    ``ollama.generate`` using ``model_name``, and the ``"response"`` field
    of the reply is returned unchanged.
    """
    # The prompt is assembled line by line; joined with "\n" it reproduces
    # the exact text (leading newline, four-space indents, trailing indent)
    # that the model is expected to receive.
    prompt_lines = (
        "",
        "    You are an expert web developer. Convert the following raw text extracted from a PDF document into a beautifully styled, clean, and semantic HTML document.",
        "",
        "    Requirements:",
        "    1. Use appropriate HTML tags (<h1>, <p>, <ul>, <li>, <strong>, etc.) to recreate the structural hierarchy.",
        "    2. Do not add CSS, styling will be added later.",
        "    3. Return ONLY valid HTML code. Do not include markdown code block backticks (```html) or extra conversational commentary.",
        "",
        "    Raw Document Text:",
        '    """',
        f"    {raw_text}",
        '    """',
        "    ",
    )
    request_text = "\n".join(prompt_lines)
    reply = ollama.generate(model=model_name, prompt=request_text)
    return reply["response"]


def main():
    """
    Convert each text-bearing page of the input PDF into an HTML file.

    For every item yielded by ``extract_text_from_pdf`` the text is sent to
    ``convert_text_to_html`` and the result is written to
    ``html2/<index>.html``. The index is the zero-based position in the
    sequence of yielded texts; because pages without extractable text are
    not yielded, it can differ from the page's position in the PDF.

    If the input PDF does not exist, an error message is printed and the
    function returns without creating any output.
    """
    from pathlib import Path

    input_pdf = (
        # "Fabula_Ultima_-_Natural_Fantasy_Atlas_ENG_v1_1.pdf"
        "Fabula_Ultima_TTJRPG.pdf"
    )

    # Check the input up front so that only a missing PDF produces the
    # "not found" message; a failure while writing output is not
    # misreported as a missing input file.
    if not Path(input_pdf).is_file():
        print(f"Error: The file '{input_pdf}' was not found. Please check your path.")
        return

    output_dir = Path("html2")
    output_dir.mkdir(exist_ok=True)

    for page_num, text in enumerate(extract_text_from_pdf(input_pdf)):
        html_output = convert_text_to_html(text)
        out_path = output_dir / f"{page_num}.html"
        # len(text) counts characters of the extracted text, not bytes.
        print(f"Writing {out_path.as_posix()} ({len(text)} characters)")
        # Explicit UTF-8 so non-ASCII characters in the generated HTML do
        # not depend on the platform's default encoding.
        out_path.write_text(html_output, encoding="utf-8")

if __name__ == "__main__":
    main()