Spaces:
Sleeping
Sleeping
from typing import List, Dict | |
import os | |
from langchain.document_loaders import PyPDFLoader | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
class PDFProcessor:
    """Load a PDF and split its text into overlapping chunks with page metadata."""

    def __init__(self, *, chunk_size: int = 1000, chunk_overlap: int = 200):
        """
        Build the text splitter that is reused for every processed PDF.

        Args:
            chunk_size (int): Upper bound on chunk length, measured with
                ``len`` (i.e. in characters). Defaults to 1000.
            chunk_overlap (int): Amount of text, measured the same way,
                that neighbouring chunks may share. Defaults to 200.
        """
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            # Separator candidates, listed from coarsest to finest.
            separators=["\n\n", "\n", " ", ""],
        )

    def process_pdf(self, pdf_path: str) -> List[Dict]:
        """
        Process a PDF file and return chunks of text with metadata.

        Args:
            pdf_path (str): Path to the PDF file

        Returns:
            List[Dict]: One dict per chunk, shaped as
                ``{'text': <chunk text>, 'metadata': {'page': <int>}}``.
        """
        # Load the PDF into documents (one per page, per the loader's contract).
        pages = PyPDFLoader(pdf_path).load()

        # Split the loaded documents into chunks; each chunk keeps the
        # metadata of the document it was cut from.
        chunks = self.text_splitter.split_documents(pages)

        # Reduce each chunk to a plain dict. The loader's 'page' value is
        # incremented by one (a missing value is treated as 0) so the reported
        # page is presumably 1-based -- assumes the loader counts from zero.
        return [
            {
                'text': chunk.page_content,
                'metadata': {
                    'page': chunk.metadata.get('page', 0) + 1,
                },
            }
            for chunk in chunks
        ]