|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from datetime import datetime
|
|
import logging
|
|
import subprocess
|
|
import os
|
|
import shutil
|
|
import tempfile
|
|
|
|
|
|
|
|
from App_Function_Libraries.SQLite_DB import add_media_with_keywords
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Largest PDF size, in megabytes, that convert_pdf_to_markdown accepts; bigger files raise ValueError.
MAX_FILE_SIZE_MB = 50
|
|
# Seconds the converter subprocess in convert_pdf_to_markdown may run before it is treated as timed out.
CONVERSION_TIMEOUT_SECONDS = 300
|
|
|
|
|
|
def convert_pdf_to_markdown(pdf_path):
    """
    Convert a PDF file to Markdown by calling a script in another virtual environment.

    The conversion runs ``Helper_Scripts/PDF_Converter.py`` with the Python
    interpreter of the ``Helper_Scripts/marker_venv`` virtual environment and
    returns whatever that script prints to standard output. Both paths are
    relative, so they are resolved against the current working directory.

    Args:
        pdf_path: Path of the PDF file to convert.

    Returns:
        The converter script's captured standard output, as text.

    Raises:
        OSError: If ``pdf_path`` cannot be inspected or the interpreter
            cannot be started.
        ValueError: If the file is larger than ``MAX_FILE_SIZE_MB`` megabytes.
        RuntimeError: If the converter exits with a non-zero status or does
            not finish within ``CONVERSION_TIMEOUT_SECONDS`` seconds.
    """
    logging.debug(f"Marker: Converting PDF file to Markdown: {pdf_path}")

    # Reject oversized inputs before paying for a subprocess launch.
    file_size_mb = os.path.getsize(pdf_path) / (1024 * 1024)
    if file_size_mb > MAX_FILE_SIZE_MB:
        raise ValueError(f"File size ({file_size_mb:.2f} MB) exceeds the maximum allowed size of {MAX_FILE_SIZE_MB} MB")

    logging.debug("Marker: Converting PDF file to Markdown using Marker virtual environment")

    # A venv keeps its interpreter in "Scripts\python.exe" on Windows and in
    # "bin/python" everywhere else; pick the layout that matches this platform.
    venv_dir = os.path.join("Helper_Scripts", "marker_venv")
    if os.name == "nt":
        other_venv_python = os.path.join(venv_dir, "Scripts", "python.exe")
    else:
        other_venv_python = os.path.join(venv_dir, "bin", "python")

    converter_script = "Helper_Scripts/PDF_Converter.py"

    logging.debug("Marker: Attempting to convert PDF file to Markdown...")
    try:
        # Argument list (no shell) so pdf_path is passed through verbatim.
        result = subprocess.run(
            [other_venv_python, converter_script, pdf_path],
            capture_output=True,
            text=True,
            timeout=CONVERSION_TIMEOUT_SECONDS,
        )
    except subprocess.TimeoutExpired as timeout_error:
        raise RuntimeError(
            f"PDF conversion timed out after {CONVERSION_TIMEOUT_SECONDS} seconds"
        ) from timeout_error

    if result.returncode != 0:
        # Surface the script's stderr so the caller can report why it failed.
        raise RuntimeError(f"Conversion failed: {result.stderr}")
    return result.stdout
|
|
|
|
|
|
def process_and_ingest_pdf(file, title, author, keywords):
    """
    Stage an uploaded PDF in a temporary directory and ingest it.

    Args:
        file: Uploaded file object; only its ``name`` attribute (a filesystem
            path) is read. ``None`` means nothing was uploaded.
        title: Title forwarded to ``ingest_pdf_file``.
        author: Author forwarded to ``ingest_pdf_file``.
        keywords: Keywords forwarded to ``ingest_pdf_file``.

    Returns:
        Whatever ``ingest_pdf_file`` returns for the staged copy, or a plain
        message string when no file was given or an exception was raised.
        The temporary directory is already removed by the time this returns.
    """
    if file is None:
        return "Please select a PDF file to upload."

    try:
        # The context manager deletes the scratch directory on exit.
        with tempfile.TemporaryDirectory() as scratch_dir:
            staged_pdf = os.path.join(scratch_dir, "temp.pdf")
            shutil.copy(file.name, staged_pdf)
            outcome = ingest_pdf_file(staged_pdf, title, author, keywords)
    except Exception as e:
        return f"Error processing PDF: {str(e)}"
    else:
        return outcome
|
|
|
|
|
|
def ingest_pdf_file(file_path, title=None, author=None, keywords=None):
    """
    Convert a PDF to Markdown and store it through ``add_media_with_keywords``.

    Args:
        file_path: Path of the PDF to convert; also stored as the media URL.
        title: Title to store; when empty, the file name without its
            extension is used.
        author: Author to store; when empty, ``'Unknown'`` is used.
        keywords: Extra comma-separated keywords appended after the fixed
            ``pdf_file,markdown_converted`` tags.

    Returns:
        A ``(message, file_path)`` tuple. The message reports success or
        describes the error; exceptions are logged and not re-raised.
    """
    try:
        markdown_content = convert_pdf_to_markdown(file_path)

        # Substitute defaults for any metadata the caller left empty.
        title = title or os.path.splitext(os.path.basename(file_path))[0]
        author = author or 'Unknown'
        keywords = (
            f'pdf_file,markdown_converted,{keywords}'
            if keywords
            else 'pdf_file,markdown_converted'
        )

        ingested_on = datetime.now().strftime('%Y-%m-%d')
        add_media_with_keywords(
            url=file_path,
            title=title,
            media_type='document',
            content=markdown_content,
            keywords=keywords,
            prompt='No prompt for PDF files',
            summary='No summary for PDF files',
            transcription_model='None',
            author=author,
            ingestion_date=ingested_on,
        )
    except ValueError as e:
        detail = str(e)
        logging.error(f"File size error: {detail}")
        return f"Error: {detail}", file_path
    except Exception as e:
        detail = str(e)
        logging.error(f"Error ingesting PDF file: {detail}")
        return f"Error ingesting PDF file: {detail}", file_path
    else:
        return f"PDF file '{title}' converted to Markdown and ingested successfully.", file_path
|
|
|
|
|
|
def process_and_cleanup_pdf(file, title, author, keywords):
    """
    Ingest an uploaded PDF via a temporary copy and always remove that copy.

    Args:
        file: Uploaded file object; only its ``name`` attribute (a filesystem
            path) is read. ``None`` means nothing was uploaded.
        title: Title forwarded to ``ingest_pdf_file``.
        author: Author forwarded to ``ingest_pdf_file``.
        keywords: Keywords forwarded to ``ingest_pdf_file``.

    Returns:
        A status message string: the message from ``ingest_pdf_file`` on
        success, or ``"Error: ..."`` if copying or ingesting raised. If the
        temporary directory could not be removed, a warning line is appended.
    """
    if file is None:
        return "No file uploaded. Please upload a PDF file."

    temp_dir = tempfile.mkdtemp()
    temp_file_path = os.path.join(temp_dir, "temp.pdf")

    # Collected separately so the warning reaches the caller on every path
    # without touching a possibly unassigned message inside ``finally``.
    cleanup_warning = ""
    try:
        shutil.copy2(file.name, temp_file_path)
        # ingest_pdf_file returns (message, path); only the message is reported.
        result, _ = ingest_pdf_file(temp_file_path, title, author, keywords)
    except Exception as e:
        logging.error(f"Error in processing and cleanup: {str(e)}")
        result = f"Error: {str(e)}"
    finally:
        # Best-effort cleanup: a failure here is reported, never raised.
        try:
            shutil.rmtree(temp_dir)
            logging.info(f"Removed temporary directory: {temp_dir}")
        except Exception as cleanup_error:
            logging.error(f"Error during cleanup: {str(cleanup_error)}")
            cleanup_warning = f"\nWarning: Could not remove temporary files: {str(cleanup_error)}"

    return result + cleanup_warning
|
|
|
|
|
|
|
|
|
|
|