cre-chatbot-rag / pdf_processor.py
tony-42069's picture
Simplified PDF processing and dependencies
f3dfbd4
from typing import List, Dict
import os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
class PDFProcessor:
    """Load a PDF and break its text into overlapping chunks with page metadata.

    The text splitter is built once in the constructor and reused by every
    call to ``process_pdf``.
    """

    def __init__(self, *, chunk_size: int = 1000, chunk_overlap: int = 200):
        """
        Configure the text splitter used for all PDFs.
        Args:
            chunk_size (int): Maximum chunk length, measured with ``len``.
                Defaults to 1000.
            chunk_overlap (int): Overlap passed to the splitter so that
                neighbouring chunks share context. Defaults to 200.
        """
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            # Coarsest separator first: paragraph break, line break, space,
            # and finally the empty string as a last resort.
            separators=["\n\n", "\n", " ", ""],
        )

    def process_pdf(self, pdf_path: str) -> List[Dict]:
        """
        Process a PDF file and return chunks of text with metadata.
        Args:
            pdf_path (str): Path to the PDF file
        Returns:
            List[Dict]: One dict per chunk, shaped as
                ``{'text': <chunk text>, 'metadata': {'page': <int>}}``.
        """
        # Load the PDF; the loader yields the documents that get split below.
        pages = PyPDFLoader(pdf_path).load()

        # Split the loaded documents into chunks.
        chunks = self.text_splitter.split_documents(pages)

        # The loader's 'page' value is bumped by one (presumably it is
        # zero-based at the source -- confirm against the loader docs);
        # a chunk with no 'page' entry is therefore reported as page 1.
        return [
            {
                'text': chunk.page_content,
                'metadata': {
                    'page': chunk.metadata.get('page', 0) + 1,
                },
            }
            for chunk in chunks
        ]