Spaces:

geekyrakshit
/

medrag

Running

App Files Files Community

mratanusarkar committed on Oct 17, 2024

Commit

fc27062

1 Parent(s): fb5095f

add: docs & docstrings for marker text loader

Browse files

Files changed (3) hide show

docs/document_loader/text_loader/marker_text_loader.md +3 -0
medrag_multi_modal/document_loader/text_loader/marker_text_loader.py +61 -1
mkdocs.yml +1 -0

docs/document_loader/text_loader/marker_text_loader.md ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ ## Load text from PDF files (using Marker)
2	+
3	+ ::: medrag_multi_modal.document_loader.text_loader.marker_text_loader

medrag_multi_modal/document_loader/text_loader/marker_text_loader.py CHANGED Viewed

@@ -7,7 +7,67 @@ from .base_text_loader import BaseTextLoader
 class MarkerTextLoader(BaseTextLoader):
     async def _process_page(self, page_idx: int) -> Dict[str, str]:
         model_lst = load_all_models()
         text, _, out_meta = convert_single_pdf(
@@ -21,9 +81,9 @@ class MarkerTextLoader(BaseTextLoader):
         return {
             "text": text,
-            "meta": out_meta,
             "page_idx": page_idx,
             "document_name": self.document_name,
             "file_path": self.document_file_path,
             "file_url": self.url,
         }

 class MarkerTextLoader(BaseTextLoader):
+    """
+    A concrete implementation of the BaseTextLoader for loading text from a PDF file
+    using `marker-pdf`, processing it into a structured text format, and optionally publishing
+    it to a Weave dataset.
+    This class extends the BaseTextLoader and implements the abstract methods to
+    load and process pages from a PDF file using marker-pdf, which is a pipeline of deep learning models.
+    This class will handle the downloading of a PDF file from a given URL if it does not already exist locally.
+    It uses marker-pdf to read the PDF and extract structured text from each page. The processed pages are stored
+    in a list of Page objects, which can be optionally published to a Weave dataset.
+    !!! example "Example Usage"
+        ```python
+        import asyncio
+        import weave
+        from medrag_multi_modal.document_loader.text_loader import MarkerTextLoader
+        weave.init(project_name="ml-colabs/medrag-multi-modal")
+        url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
+        loader = MarkerTextLoader(
+            url=url,
+            document_name="Gray's Anatomy",
+            document_file_path="grays_anatomy.pdf",
+        )
+        asyncio.run(
+            loader.load_data(
+                start_page=31,
+                end_page=36,
+                weave_dataset_name="grays-anatomy-text",
+            )
+        )
+        ```
+    Args:
+        url (str): The URL of the PDF file to download if not present locally.
+        document_name (str): The name of the document for metadata purposes.
+        document_file_path (str): The local file path where the PDF is stored or will be downloaded.
+    """
     async def _process_page(self, page_idx: int) -> Dict[str, str]:
+        """
+        Process a single page of the PDF and extract its structured text using marker-pdf.
+        Returns a dictionary with the processed page data.
+        The dictionary will have the following keys and values:
+            - "text": (str) the extracted structured text from the page.
+            - "page_idx": (int) the index of the page.
+            - "document_name": (str) the name of the document.
+            - "file_path": (str) the local file path where the PDF is stored.
+            - "file_url": (str) the URL of the PDF file.
+            - "meta": (dict) the metadata extracted from the page by marker-pdf.
+        Args:
+            page_idx (int): The index of the page to process.
+        Returns:
+            Dict[str, str]: A dictionary containing the processed page data.
+        """
         model_lst = load_all_models()
         text, _, out_meta = convert_single_pdf(
         return {
             "text": text,
             "page_idx": page_idx,
             "document_name": self.document_name,
             "file_path": self.document_file_path,
             "file_url": self.url,
+            "meta": out_meta,
         }

mkdocs.yml CHANGED Viewed

@@ -68,6 +68,7 @@ nav:
       - PyMuPDF4LLM: 'document_loader/text_loader/pymupdf4llm_text_loader.md'
       - PyPDF2: 'document_loader/text_loader/pypdf2_text_loader.md'
       - PDFPlumber: 'document_loader/text_loader/pdfplumber_text_loader.md'
     - Text and Image Loader: 'document_loader/load_text_image.md'
     - Image Loader: 'document_loader/load_image.md'
   - Retrieval:

       - PyMuPDF4LLM: 'document_loader/text_loader/pymupdf4llm_text_loader.md'
       - PyPDF2: 'document_loader/text_loader/pypdf2_text_loader.md'
       - PDFPlumber: 'document_loader/text_loader/pdfplumber_text_loader.md'
+      - Marker: 'document_loader/text_loader/marker_text_loader.md'
     - Text and Image Loader: 'document_loader/load_text_image.md'
     - Image Loader: 'document_loader/load_image.md'
   - Retrieval: