Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
from langchain_community.document_loaders import PyPDFLoader | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
class GridCodePDFLoader: | |
def __init__(self, pdf_path): | |
self.pdf_path = pdf_path | |
self.text_splitter = RecursiveCharacterTextSplitter( | |
chunk_size=500, | |
chunk_overlap=50, | |
separators=["\n\n", "\n", ".", " ", ""] | |
) | |
def load_and_split(self): | |
"""Load PDF and split into chunks""" | |
loader = PyPDFLoader(self.pdf_path) | |
pages = loader.load() | |
return self.text_splitter.split_documents(pages) | |
def extract_metadata(self): | |
"""Extract metadata from PDF like sections, tables etc.""" | |
# TODO: Implement metadata extraction | |
pass |