mratanusarkar committed on
Commit
fb5095f
·
1 Parent(s): ba60fc7

add: marker pdf text loader

Browse files
medrag_multi_modal/__init__.py CHANGED
@@ -1,5 +1,6 @@
1
  from .document_loader import (
2
  ImageLoader,
 
3
  PDFPlumberTextLoader,
4
  PyMuPDF4LLMTextLoader,
5
  PyPDF2TextLoader,
@@ -11,6 +12,7 @@ __all__ = [
11
  "PyMuPDF4LLMTextLoader",
12
  "PyPDF2TextLoader",
13
  "PDFPlumberTextLoader",
 
14
  "ImageLoader",
15
  "TextImageLoader",
16
  "MultiModalRetriever",
 
1
  from .document_loader import (
2
  ImageLoader,
3
+ MarkerTextLoader,
4
  PDFPlumberTextLoader,
5
  PyMuPDF4LLMTextLoader,
6
  PyPDF2TextLoader,
 
12
  "PyMuPDF4LLMTextLoader",
13
  "PyPDF2TextLoader",
14
  "PDFPlumberTextLoader",
15
+ "MarkerTextLoader",
16
  "ImageLoader",
17
  "TextImageLoader",
18
  "MultiModalRetriever",
medrag_multi_modal/document_loader/__init__.py CHANGED
@@ -1,11 +1,17 @@
1
  from .load_image import ImageLoader
2
  from .load_text_image import TextImageLoader
3
- from .text_loader import PDFPlumberTextLoader, PyMuPDF4LLMTextLoader, PyPDF2TextLoader
 
 
 
 
 
4
 
5
  __all__ = [
6
  "PyMuPDF4LLMTextLoader",
7
  "PyPDF2TextLoader",
8
  "PDFPlumberTextLoader",
 
9
  "ImageLoader",
10
  "TextImageLoader",
11
  ]
 
1
  from .load_image import ImageLoader
2
  from .load_text_image import TextImageLoader
3
+ from .text_loader import (
4
+ MarkerTextLoader,
5
+ PDFPlumberTextLoader,
6
+ PyMuPDF4LLMTextLoader,
7
+ PyPDF2TextLoader,
8
+ )
9
 
10
  __all__ = [
11
  "PyMuPDF4LLMTextLoader",
12
  "PyPDF2TextLoader",
13
  "PDFPlumberTextLoader",
14
+ "MarkerTextLoader",
15
  "ImageLoader",
16
  "TextImageLoader",
17
  ]
medrag_multi_modal/document_loader/text_loader/__init__.py CHANGED
@@ -1,3 +1,4 @@
 
1
  from .pdfplumber_text_loader import PDFPlumberTextLoader
2
  from .pymupdf4llm_text_loader import PyMuPDF4LLMTextLoader
3
  from .pypdf2_text_loader import PyPDF2TextLoader
@@ -6,4 +7,5 @@ __all__ = [
6
  "PyMuPDF4LLMTextLoader",
7
  "PyPDF2TextLoader",
8
  "PDFPlumberTextLoader",
 
9
  ]
 
1
+ from .marker_text_loader import MarkerTextLoader
2
  from .pdfplumber_text_loader import PDFPlumberTextLoader
3
  from .pymupdf4llm_text_loader import PyMuPDF4LLMTextLoader
4
  from .pypdf2_text_loader import PyPDF2TextLoader
 
7
  "PyMuPDF4LLMTextLoader",
8
  "PyPDF2TextLoader",
9
  "PDFPlumberTextLoader",
10
+ "MarkerTextLoader",
11
  ]
medrag_multi_modal/document_loader/text_loader/marker_text_loader.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict
2
+
3
+ from marker.convert import convert_single_pdf
4
+ from marker.models import load_all_models
5
+
6
+ from .base_text_loader import BaseTextLoader
7
+
8
+
9
# Process-wide cache for the model list returned by marker's
# `load_all_models()`. It is filled on first use so that the models are
# loaded once instead of once per processed page.
_MARKER_MODELS = None


def _get_marker_models():
    """Return marker's model list, calling `load_all_models()` only once."""
    global _MARKER_MODELS
    if _MARKER_MODELS is None:
        _MARKER_MODELS = load_all_models()
    return _MARKER_MODELS


class MarkerTextLoader(BaseTextLoader):
    """Text loader that extracts a PDF page's text with the `marker` library.

    Each page is converted by calling `marker.convert.convert_single_pdf` on
    `self.document_file_path`.
    """

    async def _process_page(self, page_idx: int) -> Dict[str, str]:
        """Convert one page of the document and return its text and metadata.

        Args:
            page_idx: Page index forwarded to `convert_single_pdf` as
                `start_page`.

        Returns:
            A dict with the keys `"text"` and `"meta"` (first and third
            values returned by `convert_single_pdf`), `"page_idx"`,
            `"document_name"`, `"file_path"` and `"file_url"`.
            NOTE(review): not every value is a `str` (`page_idx` is an
            `int`), so the `Dict[str, str]` annotation is looser than it
            reads; it is kept unchanged here -- confirm against the base
            class before tightening it.
        """
        # Reuse the cached models: the model list does not depend on the
        # page, so reloading it per page is wasted work.
        model_lst = _get_marker_models()

        # NOTE: this call is synchronous (it is not awaited), so it runs to
        # completion inside the coroutine.
        # `max_pages=1` together with `start_page=page_idx` presumably limits
        # the conversion to the single requested page -- confirm against the
        # marker documentation.
        text, _, out_meta = convert_single_pdf(
            self.document_file_path,
            model_lst,
            max_pages=1,
            batch_multiplier=1,
            start_page=page_idx,
            ocr_all_pages=True,
        )

        return {
            "text": text,
            "meta": out_meta,
            "page_idx": page_idx,
            "document_name": self.document_name,
            "file_path": self.document_file_path,
            "file_url": self.url,
        }