import logging
from typing import List, Optional, Tuple, Union

import pypdf
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader

logger = logging.getLogger(__name__)


class GridCodeLoader:
    """Load a PDF and split the selected pages into text chunks.

    Attributes:
        file_path: Path of the PDF, passed as-is to ``pypdf.PdfReader`` and
            ``PyPDFLoader``.
        pages: Page selection (see ``__init__``).
        text_splitter: ``RecursiveCharacterTextSplitter`` configured with
            2000-character chunks and a 50-character overlap.
    """

    def __init__(
        self,
        file_path,
        pages: Optional[Union[int, List[int], Tuple[int, ...]]] = None,
    ):
        """Store the source path and page selection and build the splitter.

        Args:
            file_path: Location of the PDF to load.
            pages: ``int`` n selects the first n pages; a ``list``/``tuple``
                selects those zero-based page indices; any other value
                (including ``None``) selects every page.
        """
        self.file_path = file_path
        self.pages = pages
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=2000,
            chunk_overlap=50,
            separators=["\n\n", "\n", ".", " ", ""],
        )

    def _select_page_indices(self, total_pages: int) -> List[int]:
        """Return the zero-based page indices to keep for a PDF of *total_pages*."""
        if isinstance(self.pages, int):
            # First-n selection, capped at the real page count.
            pages_to_load = list(range(min(self.pages, total_pages)))
            logger.info("Loaded first %d pages from PDF", len(pages_to_load))
            return pages_to_load

        if isinstance(self.pages, (list, tuple)):
            # Keep only indices that can match a page. Negative values are
            # rejected too: they would never equal an enumerate() index, so
            # accepting them only made the log claim pages that were dropped.
            pages_to_load = [p for p in self.pages if 0 <= p < total_pages]
            ignored = [p for p in self.pages if not 0 <= p < total_pages]
            if ignored:
                logger.warning(
                    "Ignoring out-of-range pages %s (PDF has %d pages)",
                    ignored,
                    total_pages,
                )
            logger.info("Loaded pages %s from PDF", pages_to_load)
            return pages_to_load

        pages_to_load = list(range(total_pages))
        logger.info("Loaded all %d pages from PDF", len(pages_to_load))
        return pages_to_load

    def load_and_split(self) -> list:
        """Load the selected pages of the PDF and split them into chunks.

        Returns:
            The chunks produced by ``self.text_splitter.split_documents`` for
            the selected pages, in document order. Requested indices outside
            the PDF are skipped; duplicates select a page only once.
        """
        logger.info("Loading PDF from %s", self.file_path)

        # Open the PDF directly first to learn the total page count.
        reader = pypdf.PdfReader(self.file_path)
        total_pages = len(reader.pages)

        # A set gives O(1) membership for the per-document filter below.
        selected = set(self._select_page_indices(total_pages))

        # NOTE(review): the positional filter assumes loader.load() returns
        # one document per page, in page order — confirm for the installed
        # langchain_community version.
        loader = PyPDFLoader(self.file_path)
        documents = [
            doc for index, doc in enumerate(loader.load()) if index in selected
        ]

        logger.info("Splitting documents into chunks...")
        chunks = self.text_splitter.split_documents(documents)
        logger.info("Created %d chunks", len(chunks))
        return chunks