cre-chatbot-rag / pdf_processor.py
tony-42069's picture
Add detailed debug logging for PDF processing
afb405a
raw
history blame
3.17 kB
from typing import List, Dict
import os
import pypdf
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
class PDFProcessor:
    """Turn a PDF file into overlapping text chunks tagged with their page.

    Text is extracted with langchain's ``PyPDFLoader`` first; if that fails
    for any reason, the file is re-read with ``pypdf`` directly. Both paths
    produce the same chunk structure and the same page numbering.
    """

    def __init__(self):
        # Chunks of at most 1000 characters with 200 characters of overlap,
        # splitting on the coarsest separator that fits (paragraph, line,
        # word, then individual characters).
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len,
            separators=["\n\n", "\n", " ", ""],
        )

    def _chunk_page(self, text, page) -> List[Dict]:
        """Split one page's text into chunk dicts that carry ``page`` as metadata."""
        # A page without extractable text may yield None/""; treat both as
        # "no chunks" instead of letting the splitter fail on None.
        return [
            {'text': piece, 'metadata': {'page': page}}
            for piece in self.text_splitter.split_text(text or "")
        ]

    def process_pdf(self, pdf_path: str) -> List[Dict]:
        """
        Process a PDF file and return chunks of text with metadata.

        Args:
            pdf_path (str): Path to the PDF file

        Returns:
            List[Dict]: One ``{'text': ..., 'metadata': {'page': ...}}`` dict
            per chunk, in page order. ``page`` is whatever PyPDFLoader stores
            under ``metadata['page']`` (a 0-based index); the pypdf fallback
            emits the same 0-based index so both paths agree.

        Raises:
            FileNotFoundError: If ``pdf_path`` does not exist.
            RuntimeError: If both extraction methods fail. The message holds
                both error texts and the pypdf error is chained as the cause.
        """
        print(f"Processing PDF at: {os.path.abspath(pdf_path)}")
        if not os.path.exists(pdf_path):
            raise FileNotFoundError(f"PDF file not found at {pdf_path}")
        print(f"PDF file exists, size: {os.path.getsize(pdf_path)} bytes")

        # Primary method: PyPDFLoader from langchain.
        try:
            print("Attempting to use PyPDFLoader...")
            loader = PyPDFLoader(pdf_path)
            pages = loader.load()
            print(f"Successfully loaded {len(pages)} pages with PyPDFLoader")

            chunks = []
            for page in pages:
                chunks.extend(
                    self._chunk_page(page.page_content, page.metadata['page'])
                )
            print(f"Created {len(chunks)} chunks from PyPDFLoader method")
            return chunks
        except Exception as e:
            # Keep a reference: the name bound by ``except ... as`` is
            # deleted when this block ends, and the fallback reports it.
            loader_error = e
            print(f"Error with PyPDFLoader: {str(e)}")
            print("Trying alternative PDF processing method...")

        # Fallback method: direct pypdf usage.
        try:
            print("Attempting to use pypdf directly...")
            with open(pdf_path, 'rb') as file:
                pdf = pypdf.PdfReader(file)
                print(f"Successfully opened PDF with {len(pdf.pages)} pages")

                chunks = []
                # Extraction must happen while the file handle is still open.
                for page_index, page in enumerate(pdf.pages):
                    chunks.extend(
                        self._chunk_page(page.extract_text(), page_index)
                    )
            print(f"Created {len(chunks)} chunks from direct pypdf method")
            return chunks
        except Exception as e2:
            error_msg = f"Failed to process PDF with both methods.\nPyPDFLoader error: {str(loader_error)}\npypdf error: {str(e2)}"
            print(error_msg)
            # RuntimeError is still caught by ``except Exception``; chaining
            # preserves the underlying traceback for debugging.
            raise RuntimeError(error_msg) from e2