Spaces:

geekyrakshit
/

medrag

Sleeping

App Files Community

mratanusarkar committed on Oct 17, 2024

Commit

419f968

1 Parent(s): 391b2f3

add: docs & docstrings for pypdf2 text loader

Browse files

Files changed (4) hide show

docs/document_loader/text_loader/pymupdf4llm_text_loader.md +1 -1
docs/document_loader/text_loader/pypdf2_text_loader.md +3 -0
medrag_multi_modal/document_loader/text_loader/pypdf2_text_loader.py +59 -0
mkdocs.yml +1 -0

docs/document_loader/text_loader/pymupdf4llm_text_loader.md CHANGED Viewed

@@ -1,3 +1,3 @@
-## Load text from PDF files
 ::: medrag_multi_modal.document_loader.text_loader.pymupdf4llm_text_loader


1	+ ## Load text from PDF files (using PyMuPDF4LLM)
2
3	::: medrag_multi_modal.document_loader.text_loader.pymupdf4llm_text_loader

docs/document_loader/text_loader/pypdf2_text_loader.md ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ ## Load text from PDF files (using PyPDF2)
2	+
3	+ ::: medrag_multi_modal.document_loader.text_loader.pypdf2_text_loader

medrag_multi_modal/document_loader/text_loader/pypdf2_text_loader.py CHANGED Viewed

@@ -6,7 +6,66 @@ from .base_text_loader import BaseTextLoader
 class PyPDF2TextLoader(BaseTextLoader):
     async def _process_page(self, page_idx: int) -> Dict[str, str]:
         with open(self.document_file_path, "rb") as file:
             pdf_reader = PyPDF2.PdfReader(file)
             page = pdf_reader.pages[page_idx]

 class PyPDF2TextLoader(BaseTextLoader):
+    """
+    A concrete implementation of the BaseTextLoader for loading text from a PDF file
+    using `PyPDF2`, processing it into a simple text format, and optionally publishing
+    it to a Weave dataset.
+    This class extends the BaseTextLoader and implements the abstract methods to
+    load and process pages from a PDF file.
+    This class will handle the downloading of a PDF file from a given URL if it does not already exist locally.
+    It uses PyPDF2 to read the PDF and extract text from each page. The processed pages are stored in a list
+    of Page objects, which can be optionally published to a Weave dataset.
+    !!! example "Example Usage"
+        ```python
+        import asyncio
+        import weave
+        from medrag_multi_modal.document_loader.text_loader import PyPDF2TextLoader
+        weave.init(project_name="ml-colabs/medrag-multi-modal")
+        url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
+        loader = PyPDF2TextLoader(
+            url=url,
+            document_name="Gray's Anatomy",
+            document_file_path="grays_anatomy.pdf",
+        )
+        asyncio.run(
+            loader.load_data(
+                start_page=31,
+                end_page=36,
+                weave_dataset_name="grays-anatomy-text",
+            )
+        )
+        ```
+    Args:
+        url (str): The URL of the PDF file to download if not present locally.
+        document_name (str): The name of the document for metadata purposes.
+        document_file_path (str): The local file path where the PDF is stored or will be downloaded.
+    """
     async def _process_page(self, page_idx: int) -> Dict[str, str]:
+        """
+        Process a single page of the PDF and extract its text using PyPDF2.
+        Returns a dictionary with the processed page data.
+        The dictionary will have the following keys and values:
+            - "text": (str) the extracted text from the page.
+            - "page_idx": (int) the index of the page.
+            - "document_name": (str) the name of the document.
+            - "file_path": (str) the local file path where the PDF is stored.
+            - "file_url": (str) the URL of the PDF file.
+        Args:
+            page_idx (int): The index of the page to process.
+        Returns:
+            Dict[str, str]: A dictionary containing the processed page data.
+        """
         with open(self.document_file_path, "rb") as file:
             pdf_reader = PyPDF2.PdfReader(file)
             page = pdf_reader.pages[page_idx]

mkdocs.yml CHANGED Viewed

@@ -66,6 +66,7 @@ nav:
     - Text Loader:
       - Base: 'document_loader/text_loader/base_text_loader.md'
       - PyMuPDF4LLM: 'document_loader/text_loader/pymupdf4llm_text_loader.md'
     - Text and Image Loader: 'document_loader/load_text_image.md'
     - Image Loader: 'document_loader/load_image.md'
   - Retrieval:

     - Text Loader:
       - Base: 'document_loader/text_loader/base_text_loader.md'
       - PyMuPDF4LLM: 'document_loader/text_loader/pymupdf4llm_text_loader.md'
+      - PyPDF2: 'document_loader/text_loader/pypdf2_text_loader.md'
     - Text and Image Loader: 'document_loader/load_text_image.md'
     - Image Loader: 'document_loader/load_image.md'
   - Retrieval: