65 lines
2.0 KiB
Python
65 lines
2.0 KiB
Python
# /// script
# requires-python = ">=3.12"
# dependencies = [
#     "cryptography",
#     "ollama",
#     "pypdf",
# ]
# ///
|
|
|
|
import ollama
|
|
from pypdf import PdfReader
|
|
|
|
|
|
def extract_text_from_pdf(pdf_path):
    """Yield the extracted text of a PDF file, one page at a time.

    Pages for which ``extract_text()`` returns an empty (falsy) result are
    skipped, so fewer strings may be yielded than the document has pages.

    This is a generator: the PDF is not opened until the first item is
    requested, so errors from ``PdfReader`` surface at iteration time.

    Args:
        pdf_path: Location of the PDF; passed unchanged to ``PdfReader``.

    Yields:
        The non-empty text of each page, in document order.
    """
    reader = PdfReader(pdf_path)
    for page in reader.pages:
        text = page.extract_text()
        if text:
            yield text
|
|
|
|
|
|
def convert_text_to_html(raw_text, model_name="gemma4"):
    """Ask an Ollama model to rewrite raw PDF text as semantic HTML.

    Args:
        raw_text: Text to convert; embedded verbatim in the prompt between
            triple-double-quote delimiter lines.
        model_name: Name of the Ollama model passed to ``ollama.generate``.

    Returns:
        The value stored under the ``"response"`` key of the generate reply.
    """
    # Fixed instruction header that precedes the document text.
    instructions = (
        "\n"
        "You are an expert web developer. Convert the following raw text extracted from a PDF document into a beautifully styled, clean, and semantic HTML document.\n"
        "\n"
        "Requirements:\n"
        "1. Use appropriate HTML tags (<h1>, <p>, <ul>, <li>, <strong>, etc.) to recreate the structural hierarchy.\n"
        "2. Do not add CSS, styling will be added later.\n"
        "3. Return ONLY valid HTML code. Do not include markdown code block backticks (```html) or extra conversational commentary.\n"
        "\n"
        "Raw Document Text:\n"
    )
    # Delimiter line placed before and after the document text.
    delimiter = '"""\n'
    prompt = f"{instructions}{delimiter}{raw_text}\n{delimiter}"

    reply = ollama.generate(model=model_name, prompt=prompt)
    return reply["response"]
|
|
|
|
|
|
def main():
    """Convert each text chunk of the input PDF into its own HTML file.

    Every string yielded by ``extract_text_from_pdf`` is sent through
    ``convert_text_to_html`` and the result is written to
    ``html2/<index>.html``, where ``<index>`` is the zero-based position of
    the string in the yielded sequence. If a ``FileNotFoundError`` is raised
    along the way, a message naming the input PDF is printed instead of
    letting the exception propagate.
    """
    from pathlib import Path

    input_pdf = (
        # Alternative input:
        # "Fabula_Ultima_-_Natural_Fantasy_Atlas_ENG_v1_1.pdf"
        "Fabula_Ultima_TTJRPG.pdf"
    )
    output_dir = Path("html2")

    try:
        output_dir.mkdir(exist_ok=True)
        # The extractor is a generator, so the PDF is only opened once this
        # loop starts; the try block therefore has to enclose the loop.
        for page_num, text in enumerate(extract_text_from_pdf(input_pdf)):
            html_output = convert_text_to_html(text)
            # len(text) counts characters of the extracted source text, not
            # bytes of the file being written.
            print(f"Writing html2/{page_num}.html ({len(text)} chars of source text)")
            # Write UTF-8 explicitly: the platform default encoding may be
            # unable to represent non-ASCII characters in the generated HTML.
            (output_dir / f"{page_num}.html").write_text(
                html_output, encoding="utf-8"
            )
    except FileNotFoundError:
        print(f"Error: The file '{input_pdf}' was not found. Please check your path.")
|
|
|
|
|
|
# Run the conversion only when this file is executed as a script, so that
# importing the module does not start processing the PDF.
if __name__ == "__main__":
    main()
|