mratanusarkar committed on
Commit
fc27062
·
1 Parent(s): fb5095f

add: docs & docstrings for marker text loader

Browse files
docs/document_loader/text_loader/marker_text_loader.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ ## Load text from PDF files (using Marker)
2
+
3
+ ::: medrag_multi_modal.document_loader.text_loader.marker_text_loader
medrag_multi_modal/document_loader/text_loader/marker_text_loader.py CHANGED
@@ -7,7 +7,67 @@ from .base_text_loader import BaseTextLoader
7
 
8
 
9
  class MarkerTextLoader(BaseTextLoader):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  async def _process_page(self, page_idx: int) -> Dict[str, str]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  model_lst = load_all_models()
12
 
13
  text, _, out_meta = convert_single_pdf(
@@ -21,9 +81,9 @@ class MarkerTextLoader(BaseTextLoader):
21
 
22
  return {
23
  "text": text,
24
- "meta": out_meta,
25
  "page_idx": page_idx,
26
  "document_name": self.document_name,
27
  "file_path": self.document_file_path,
28
  "file_url": self.url,
 
29
  }
 
7
 
8
 
9
  class MarkerTextLoader(BaseTextLoader):
10
+ """
11
+ A concrete implementation of the BaseTextLoader for loading text from a PDF file
12
+ using `marker-pdf`, processing it into a structured text format, and optionally publishing
13
+ it to a Weave dataset.
14
+
15
+ This class extends the BaseTextLoader and implements the abstract methods to
16
+ load and process pages from a PDF file using marker-pdf, which is a pipeline of deep learning models.
17
+
18
+ This class downloads the PDF file from the given URL if it does not already exist locally.
19
+ It uses marker-pdf to read the PDF and extract structured text from each page. The processed pages are stored
20
+ in a list of Page objects, which can be optionally published to a Weave dataset.
21
+
22
+ !!! example "Example Usage"
23
+ ```python
24
+ import asyncio
25
+
26
+ import weave
27
+
28
+ from medrag_multi_modal.document_loader.text_loader import MarkerTextLoader
29
+
30
+ weave.init(project_name="ml-colabs/medrag-multi-modal")
31
+ url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
32
+ loader = MarkerTextLoader(
33
+ url=url,
34
+ document_name="Gray's Anatomy",
35
+ document_file_path="grays_anatomy.pdf",
36
+ )
37
+ asyncio.run(
38
+ loader.load_data(
39
+ start_page=31,
40
+ end_page=36,
41
+ weave_dataset_name="grays-anatomy-text",
42
+ )
43
+ )
44
+ ```
45
+
46
+ Args:
47
+ url (str): The URL of the PDF file to download if not present locally.
48
+ document_name (str): The name of the document for metadata purposes.
49
+ document_file_path (str): The local file path where the PDF is stored or will be downloaded.
50
+ """
51
+
52
  async def _process_page(self, page_idx: int) -> Dict[str, str]:
53
+ """
54
+ Process a single page of the PDF and extract its structured text using marker-pdf.
55
+
56
+ Returns a dictionary with the processed page data.
57
+ The dictionary will have the following keys and values:
58
+ - "text": (str) the extracted structured text from the page.
59
+ - "page_idx": (int) the index of the page.
60
+ - "document_name": (str) the name of the document.
61
+ - "file_path": (str) the local file path where the PDF is stored.
62
+ - "file_url": (str) the URL of the PDF file.
63
+ - "meta": (dict) the metadata extracted from the page by marker-pdf.
64
+
65
+ Args:
66
+ page_idx (int): The index of the page to process.
67
+
68
+ Returns:
69
+ Dict[str, str]: A dictionary containing the processed page data (note that the "page_idx" value is an int and the "meta" value is a dict, not strings).
70
+ """
71
  model_lst = load_all_models()
72
 
73
  text, _, out_meta = convert_single_pdf(
 
81
 
82
  return {
83
  "text": text,
 
84
  "page_idx": page_idx,
85
  "document_name": self.document_name,
86
  "file_path": self.document_file_path,
87
  "file_url": self.url,
88
+ "meta": out_meta,
89
  }
mkdocs.yml CHANGED
@@ -68,6 +68,7 @@ nav:
68
  - PyMuPDF4LLM: 'document_loader/text_loader/pymupdf4llm_text_loader.md'
69
  - PyPDF2: 'document_loader/text_loader/pypdf2_text_loader.md'
70
  - PDFPlumber: 'document_loader/text_loader/pdfplumber_text_loader.md'
 
71
  - Text and Image Loader: 'document_loader/load_text_image.md'
72
  - Image Loader: 'document_loader/load_image.md'
73
  - Retrieval:
 
68
  - PyMuPDF4LLM: 'document_loader/text_loader/pymupdf4llm_text_loader.md'
69
  - PyPDF2: 'document_loader/text_loader/pypdf2_text_loader.md'
70
  - PDFPlumber: 'document_loader/text_loader/pdfplumber_text_loader.md'
71
+ - Marker: 'document_loader/text_loader/marker_text_loader.md'
72
  - Text and Image Loader: 'document_loader/load_text_image.md'
73
  - Image Loader: 'document_loader/load_image.md'
74
  - Retrieval: