Spaces:
Running
Running
Commit
·
6526b2f
1
Parent(s):
d191c1b
add: kwargs to interact with underlying library
Browse files
- medrag_multi_modal/document_loader/text_loader/base_text_loader.py +5 -2
- medrag_multi_modal/document_loader/text_loader/marker_text_loader.py +3 -1
- medrag_multi_modal/document_loader/text_loader/pdfplumber_text_loader.py +3 -2
- medrag_multi_modal/document_loader/text_loader/pymupdf4llm_text_loader.py +3 -2
- medrag_multi_modal/document_loader/text_loader/pypdf2_text_loader.py +3 -2
medrag_multi_modal/document_loader/text_loader/base_text_loader.py
CHANGED
@@ -65,7 +65,7 @@ class BaseTextLoader(ABC):
|
|
65 |
return start_page, end_page
|
66 |
|
67 |
@abstractmethod
|
68 |
-
async def extract_page_data(self, page_idx: int) -> Dict[str, str]:
|
69 |
"""
|
70 |
Abstract method to process a single page of the PDF and extract the text data.
|
71 |
|
@@ -74,6 +74,7 @@ class BaseTextLoader(ABC):
|
|
74 |
|
75 |
Args:
|
76 |
page_idx (int): The index of the page to process.
|
|
|
77 |
|
78 |
Returns:
|
79 |
Dict[str, str]: A dictionary containing the processed page data.
|
@@ -85,6 +86,7 @@ class BaseTextLoader(ABC):
|
|
85 |
start_page: Optional[int] = None,
|
86 |
end_page: Optional[int] = None,
|
87 |
weave_dataset_name: Optional[str] = None,
|
|
|
88 |
) -> List[Dict[str, str]]:
|
89 |
"""
|
90 |
Asynchronously loads text from a PDF file specified by a URL or local file path.
|
@@ -106,6 +108,7 @@ class BaseTextLoader(ABC):
|
|
106 |
start_page (Optional[int]): The starting page index (0-based) to process. Defaults to the first page.
|
107 |
end_page (Optional[int]): The ending page index (0-based) to process. Defaults to the last page.
|
108 |
weave_dataset_name (Optional[str]): The name of the Weave dataset to publish the pages to, if provided.
|
|
|
109 |
|
110 |
Returns:
|
111 |
List[Dict[str, str]]: A list of dictionaries, each containing the text and metadata for a processed page.
|
@@ -127,7 +130,7 @@ class BaseTextLoader(ABC):
|
|
127 |
|
128 |
async def process_page(page_idx):
|
129 |
nonlocal processed_pages_counter
|
130 |
-
page_data = await self.extract_page_data(page_idx)
|
131 |
pages.append(page_data)
|
132 |
rich.print(
|
133 |
f"Processed page idx: {page_idx}, progress: {processed_pages_counter}/{total_pages}"
|
|
|
65 |
return start_page, end_page
|
66 |
|
67 |
@abstractmethod
|
68 |
+
async def extract_page_data(self, page_idx: int, **kwargs) -> Dict[str, str]:
|
69 |
"""
|
70 |
Abstract method to process a single page of the PDF and extract the text data.
|
71 |
|
|
|
74 |
|
75 |
Args:
|
76 |
page_idx (int): The index of the page to process.
|
77 |
+
**kwargs: Additional keyword arguments that may be used by underlying libraries.
|
78 |
|
79 |
Returns:
|
80 |
Dict[str, str]: A dictionary containing the processed page data.
|
|
|
86 |
start_page: Optional[int] = None,
|
87 |
end_page: Optional[int] = None,
|
88 |
weave_dataset_name: Optional[str] = None,
|
89 |
+
**kwargs,
|
90 |
) -> List[Dict[str, str]]:
|
91 |
"""
|
92 |
Asynchronously loads text from a PDF file specified by a URL or local file path.
|
|
|
108 |
start_page (Optional[int]): The starting page index (0-based) to process. Defaults to the first page.
|
109 |
end_page (Optional[int]): The ending page index (0-based) to process. Defaults to the last page.
|
110 |
weave_dataset_name (Optional[str]): The name of the Weave dataset to publish the pages to, if provided.
|
111 |
+
**kwargs: Additional keyword arguments that will be passed to extract_page_data method and the underlying library.
|
112 |
|
113 |
Returns:
|
114 |
List[Dict[str, str]]: A list of dictionaries, each containing the text and metadata for a processed page.
|
|
|
130 |
|
131 |
async def process_page(page_idx):
|
132 |
nonlocal processed_pages_counter
|
133 |
+
page_data = await self.extract_page_data(page_idx, **kwargs)
|
134 |
pages.append(page_data)
|
135 |
rich.print(
|
136 |
f"Processed page idx: {page_idx}, progress: {processed_pages_counter}/{total_pages}"
|
medrag_multi_modal/document_loader/text_loader/marker_text_loader.py
CHANGED
@@ -49,7 +49,7 @@ class MarkerTextLoader(BaseTextLoader):
|
|
49 |
document_file_path (str): The local file path where the PDF is stored or will be downloaded.
|
50 |
"""
|
51 |
|
52 |
-
async def extract_page_data(self, page_idx: int) -> Dict[str, str]:
|
53 |
"""
|
54 |
Process a single page of the PDF and extract its structured text using marker-pdf.
|
55 |
|
@@ -65,6 +65,7 @@ class MarkerTextLoader(BaseTextLoader):
|
|
65 |
|
66 |
Args:
|
67 |
page_idx (int): The index of the page to process.
|
|
|
68 |
|
69 |
Returns:
|
70 |
Dict[str, str]: A dictionary containing the processed page data.
|
@@ -78,6 +79,7 @@ class MarkerTextLoader(BaseTextLoader):
|
|
78 |
batch_multiplier=1,
|
79 |
start_page=page_idx,
|
80 |
ocr_all_pages=True,
|
|
|
81 |
)
|
82 |
|
83 |
return {
|
|
|
49 |
document_file_path (str): The local file path where the PDF is stored or will be downloaded.
|
50 |
"""
|
51 |
|
52 |
+
async def extract_page_data(self, page_idx: int, **kwargs) -> Dict[str, str]:
|
53 |
"""
|
54 |
Process a single page of the PDF and extract its structured text using marker-pdf.
|
55 |
|
|
|
65 |
|
66 |
Args:
|
67 |
page_idx (int): The index of the page to process.
|
68 |
+
**kwargs: Additional keyword arguments to be passed to `marker.convert.convert_single_pdf`.
|
69 |
|
70 |
Returns:
|
71 |
Dict[str, str]: A dictionary containing the processed page data.
|
|
|
79 |
batch_multiplier=1,
|
80 |
start_page=page_idx,
|
81 |
ocr_all_pages=True,
|
82 |
+
**kwargs,
|
83 |
)
|
84 |
|
85 |
return {
|
medrag_multi_modal/document_loader/text_loader/pdfplumber_text_loader.py
CHANGED
@@ -48,7 +48,7 @@ class PDFPlumberTextLoader(BaseTextLoader):
|
|
48 |
document_file_path (str): The local file path where the PDF is stored or will be downloaded.
|
49 |
"""
|
50 |
|
51 |
-
async def extract_page_data(self, page_idx: int) -> Dict[str, str]:
|
52 |
"""
|
53 |
Process a single page of the PDF and extract its text using pdfplumber.
|
54 |
|
@@ -63,13 +63,14 @@ class PDFPlumberTextLoader(BaseTextLoader):
|
|
63 |
|
64 |
Args:
|
65 |
page_idx (int): The index of the page to process.
|
|
|
66 |
|
67 |
Returns:
|
68 |
Dict[str, str]: A dictionary containing the processed page data.
|
69 |
"""
|
70 |
with pdfplumber.open(self.document_file_path) as pdf:
|
71 |
page = pdf.pages[page_idx]
|
72 |
-
text = page.extract_text()
|
73 |
|
74 |
return {
|
75 |
"text": text,
|
|
|
48 |
document_file_path (str): The local file path where the PDF is stored or will be downloaded.
|
49 |
"""
|
50 |
|
51 |
+
async def extract_page_data(self, page_idx: int, **kwargs) -> Dict[str, str]:
|
52 |
"""
|
53 |
Process a single page of the PDF and extract its text using pdfplumber.
|
54 |
|
|
|
63 |
|
64 |
Args:
|
65 |
page_idx (int): The index of the page to process.
|
66 |
+
**kwargs: Additional keyword arguments to be passed to `pdfplumber.Page.extract_text`.
|
67 |
|
68 |
Returns:
|
69 |
Dict[str, str]: A dictionary containing the processed page data.
|
70 |
"""
|
71 |
with pdfplumber.open(self.document_file_path) as pdf:
|
72 |
page = pdf.pages[page_idx]
|
73 |
+
text = page.extract_text(**kwargs)
|
74 |
|
75 |
return {
|
76 |
"text": text,
|
medrag_multi_modal/document_loader/text_loader/pymupdf4llm_text_loader.py
CHANGED
@@ -48,7 +48,7 @@ class PyMuPDF4LLMTextLoader(BaseTextLoader):
|
|
48 |
document_file_path (str): The local file path where the PDF is stored or will be downloaded.
|
49 |
"""
|
50 |
|
51 |
-
async def extract_page_data(self, page_idx: int) -> Dict[str, str]:
|
52 |
"""
|
53 |
Process a single page of the PDF and convert it to markdown using `pymupdf4llm`.
|
54 |
|
@@ -63,12 +63,13 @@ class PyMuPDF4LLMTextLoader(BaseTextLoader):
|
|
63 |
|
64 |
Args:
|
65 |
page_idx (int): The index of the page to process.
|
|
|
66 |
|
67 |
Returns:
|
68 |
Dict[str, str]: A dictionary containing the processed page data.
|
69 |
"""
|
70 |
text = pymupdf4llm.to_markdown(
|
71 |
-
doc=self.document_file_path, pages=[page_idx], show_progress=False
|
72 |
)
|
73 |
return {
|
74 |
"text": text,
|
|
|
48 |
document_file_path (str): The local file path where the PDF is stored or will be downloaded.
|
49 |
"""
|
50 |
|
51 |
+
async def extract_page_data(self, page_idx: int, **kwargs) -> Dict[str, str]:
|
52 |
"""
|
53 |
Process a single page of the PDF and convert it to markdown using `pymupdf4llm`.
|
54 |
|
|
|
63 |
|
64 |
Args:
|
65 |
page_idx (int): The index of the page to process.
|
66 |
+
**kwargs: Additional keyword arguments to be passed to `pymupdf4llm.to_markdown`.
|
67 |
|
68 |
Returns:
|
69 |
Dict[str, str]: A dictionary containing the processed page data.
|
70 |
"""
|
71 |
text = pymupdf4llm.to_markdown(
|
72 |
+
doc=self.document_file_path, pages=[page_idx], show_progress=False, **kwargs
|
73 |
)
|
74 |
return {
|
75 |
"text": text,
|
medrag_multi_modal/document_loader/text_loader/pypdf2_text_loader.py
CHANGED
@@ -48,7 +48,7 @@ class PyPDF2TextLoader(BaseTextLoader):
|
|
48 |
document_file_path (str): The local file path where the PDF is stored or will be downloaded.
|
49 |
"""
|
50 |
|
51 |
-
async def extract_page_data(self, page_idx: int) -> Dict[str, str]:
|
52 |
"""
|
53 |
Process a single page of the PDF and extract its text using PyPDF2.
|
54 |
|
@@ -63,6 +63,7 @@ class PyPDF2TextLoader(BaseTextLoader):
|
|
63 |
|
64 |
Args:
|
65 |
page_idx (int): The index of the page to process.
|
|
|
66 |
|
67 |
Returns:
|
68 |
Dict[str, str]: A dictionary containing the processed page data.
|
@@ -70,7 +71,7 @@ class PyPDF2TextLoader(BaseTextLoader):
|
|
70 |
with open(self.document_file_path, "rb") as file:
|
71 |
pdf_reader = PyPDF2.PdfReader(file)
|
72 |
page = pdf_reader.pages[page_idx]
|
73 |
-
text = page.extract_text()
|
74 |
|
75 |
return {
|
76 |
"text": text,
|
|
|
48 |
document_file_path (str): The local file path where the PDF is stored or will be downloaded.
|
49 |
"""
|
50 |
|
51 |
+
async def extract_page_data(self, page_idx: int, **kwargs) -> Dict[str, str]:
|
52 |
"""
|
53 |
Process a single page of the PDF and extract its text using PyPDF2.
|
54 |
|
|
|
63 |
|
64 |
Args:
|
65 |
page_idx (int): The index of the page to process.
|
66 |
+
**kwargs: Additional keyword arguments to be passed to `PyPDF2.PdfReader.pages[0].extract_text`.
|
67 |
|
68 |
Returns:
|
69 |
Dict[str, str]: A dictionary containing the processed page data.
|
|
|
71 |
with open(self.document_file_path, "rb") as file:
|
72 |
pdf_reader = PyPDF2.PdfReader(file)
|
73 |
page = pdf_reader.pages[page_idx]
|
74 |
+
text = page.extract_text(**kwargs)
|
75 |
|
76 |
return {
|
77 |
"text": text,
|