Spaces:
Sleeping
Sleeping
from typing import List, Dict | |
import os | |
import pypdf | |
from langchain.document_loaders import PyPDFLoader | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
class PDFProcessor: | |
def __init__(self): | |
self.text_splitter = RecursiveCharacterTextSplitter( | |
chunk_size=1000, | |
chunk_overlap=200, | |
length_function=len, | |
separators=["\n\n", "\n", " ", ""] | |
) | |
def process_pdf(self, pdf_path: str) -> List[Dict]: | |
""" | |
Process a PDF file and return chunks of text with metadata. | |
Args: | |
pdf_path (str): Path to the PDF file | |
Returns: | |
List[Dict]: List of text chunks with metadata | |
""" | |
print(f"Processing PDF at: {os.path.abspath(pdf_path)}") | |
if not os.path.exists(pdf_path): | |
raise FileNotFoundError(f"PDF file not found at {pdf_path}") | |
print(f"PDF file exists, size: {os.path.getsize(pdf_path)} bytes") | |
try: | |
print("Attempting to use PyPDFLoader...") | |
# Try using PyPDFLoader from langchain | |
loader = PyPDFLoader(pdf_path) | |
pages = loader.load() | |
print(f"Successfully loaded {len(pages)} pages with PyPDFLoader") | |
# Split the text into chunks | |
chunks = [] | |
for page in pages: | |
page_chunks = self.text_splitter.split_text(page.page_content) | |
for chunk in page_chunks: | |
chunks.append({ | |
'text': chunk, | |
'metadata': {'page': page.metadata['page']} | |
}) | |
print(f"Created {len(chunks)} chunks from PyPDFLoader method") | |
return chunks | |
except Exception as e: | |
print(f"Error with PyPDFLoader: {str(e)}") | |
print("Trying alternative PDF processing method...") | |
# Fallback to direct pypdf usage | |
try: | |
print("Attempting to use pypdf directly...") | |
with open(pdf_path, 'rb') as file: | |
pdf = pypdf.PdfReader(file) | |
print(f"Successfully opened PDF with {len(pdf.pages)} pages") | |
chunks = [] | |
for page_num in range(len(pdf.pages)): | |
text = pdf.pages[page_num].extract_text() | |
page_chunks = self.text_splitter.split_text(text) | |
for chunk in page_chunks: | |
chunks.append({ | |
'text': chunk, | |
'metadata': {'page': page_num + 1} | |
}) | |
print(f"Created {len(chunks)} chunks from direct pypdf method") | |
return chunks | |
except Exception as e2: | |
error_msg = f"Failed to process PDF with both methods.\nPyPDFLoader error: {str(e)}\npypdf error: {str(e2)}" | |
print(error_msg) | |
raise Exception(error_msg) | |