Spaces:

geekyrakshit
/

medrag

Running

App Files Files Community

mratanusarkar committed on Oct 17, 2024

Commit

6526b2f

1 Parent(s): d191c1b

add: kwargs to interact with underlying library

Browse files

Files changed (5) hide show

medrag_multi_modal/document_loader/text_loader/base_text_loader.py +5 -2
medrag_multi_modal/document_loader/text_loader/marker_text_loader.py +3 -1
medrag_multi_modal/document_loader/text_loader/pdfplumber_text_loader.py +3 -2
medrag_multi_modal/document_loader/text_loader/pymupdf4llm_text_loader.py +3 -2
medrag_multi_modal/document_loader/text_loader/pypdf2_text_loader.py +3 -2

medrag_multi_modal/document_loader/text_loader/base_text_loader.py CHANGED Viewed

@@ -65,7 +65,7 @@ class BaseTextLoader(ABC):
         return start_page, end_page
     @abstractmethod
-    async def extract_page_data(self, page_idx: int) -> Dict[str, str]:
         """
         Abstract method to process a single page of the PDF and extract the text data.
@@ -74,6 +74,7 @@ class BaseTextLoader(ABC):
         Args:
             page_idx (int): The index of the page to process.
         Returns:
             Dict[str, str]: A dictionary containing the processed page data.
@@ -85,6 +86,7 @@ class BaseTextLoader(ABC):
         start_page: Optional[int] = None,
         end_page: Optional[int] = None,
         weave_dataset_name: Optional[str] = None,
     ) -> List[Dict[str, str]]:
         """
         Asynchronously loads text from a PDF file specified by a URL or local file path.
@@ -106,6 +108,7 @@ class BaseTextLoader(ABC):
             start_page (Optional[int]): The starting page index (0-based) to process. Defaults to the first page.
             end_page (Optional[int]): The ending page index (0-based) to process. Defaults to the last page.
             weave_dataset_name (Optional[str]): The name of the Weave dataset to publish the pages to, if provided.
         Returns:
             List[Dict[str, str]]: A list of dictionaries, each containing the text and metadata for a processed page.
@@ -127,7 +130,7 @@ class BaseTextLoader(ABC):
         async def process_page(page_idx):
             nonlocal processed_pages_counter
-            page_data = await self.extract_page_data(page_idx)
             pages.append(page_data)
             rich.print(
                 f"Processed page idx: {page_idx}, progress: {processed_pages_counter}/{total_pages}"

         return start_page, end_page
     @abstractmethod
+    async def extract_page_data(self, page_idx: int, **kwargs) -> Dict[str, str]:
         """
         Abstract method to process a single page of the PDF and extract the text data.
         Args:
             page_idx (int): The index of the page to process.
+            **kwargs: Additional keyword arguments that may be used by underlying libraries.
         Returns:
             Dict[str, str]: A dictionary containing the processed page data.
         start_page: Optional[int] = None,
         end_page: Optional[int] = None,
         weave_dataset_name: Optional[str] = None,
+        **kwargs,
     ) -> List[Dict[str, str]]:
         """
         Asynchronously loads text from a PDF file specified by a URL or local file path.
             start_page (Optional[int]): The starting page index (0-based) to process. Defaults to the first page.
             end_page (Optional[int]): The ending page index (0-based) to process. Defaults to the last page.
             weave_dataset_name (Optional[str]): The name of the Weave dataset to publish the pages to, if provided.
+            **kwargs: Additional keyword arguments that will be passed to extract_page_data method and the underlying library.
         Returns:
             List[Dict[str, str]]: A list of dictionaries, each containing the text and metadata for a processed page.
         async def process_page(page_idx):
             nonlocal processed_pages_counter
+            page_data = await self.extract_page_data(page_idx, **kwargs)
             pages.append(page_data)
             rich.print(
                 f"Processed page idx: {page_idx}, progress: {processed_pages_counter}/{total_pages}"

medrag_multi_modal/document_loader/text_loader/marker_text_loader.py CHANGED Viewed

@@ -49,7 +49,7 @@ class MarkerTextLoader(BaseTextLoader):
         document_file_path (str): The local file path where the PDF is stored or will be downloaded.
     """
-    async def extract_page_data(self, page_idx: int) -> Dict[str, str]:
         """
         Process a single page of the PDF and extract its structured text using marker-pdf.
@@ -65,6 +65,7 @@ class MarkerTextLoader(BaseTextLoader):
         Args:
             page_idx (int): The index of the page to process.
         Returns:
             Dict[str, str]: A dictionary containing the processed page data.
@@ -78,6 +79,7 @@ class MarkerTextLoader(BaseTextLoader):
             batch_multiplier=1,
             start_page=page_idx,
             ocr_all_pages=True,
         )
         return {

         document_file_path (str): The local file path where the PDF is stored or will be downloaded.
     """
+    async def extract_page_data(self, page_idx: int, **kwargs) -> Dict[str, str]:
         """
         Process a single page of the PDF and extract its structured text using marker-pdf.
         Args:
             page_idx (int): The index of the page to process.
+            **kwargs: Additional keyword arguments to be passed to `marker.convert.convert_single_pdf`.
         Returns:
             Dict[str, str]: A dictionary containing the processed page data.
             batch_multiplier=1,
             start_page=page_idx,
             ocr_all_pages=True,
+            **kwargs,
         )
         return {

medrag_multi_modal/document_loader/text_loader/pdfplumber_text_loader.py CHANGED Viewed

@@ -48,7 +48,7 @@ class PDFPlumberTextLoader(BaseTextLoader):
         document_file_path (str): The local file path where the PDF is stored or will be downloaded.
     """
-    async def extract_page_data(self, page_idx: int) -> Dict[str, str]:
         """
         Process a single page of the PDF and extract its text using pdfplumber.
@@ -63,13 +63,14 @@ class PDFPlumberTextLoader(BaseTextLoader):
         Args:
             page_idx (int): The index of the page to process.
         Returns:
             Dict[str, str]: A dictionary containing the processed page data.
         """
         with pdfplumber.open(self.document_file_path) as pdf:
             page = pdf.pages[page_idx]
-            text = page.extract_text()
         return {
             "text": text,

         document_file_path (str): The local file path where the PDF is stored or will be downloaded.
     """
+    async def extract_page_data(self, page_idx: int, **kwargs) -> Dict[str, str]:
         """
         Process a single page of the PDF and extract its text using pdfplumber.
         Args:
             page_idx (int): The index of the page to process.
+            **kwargs: Additional keyword arguments to be passed to `pdfplumber.Page.extract_text`.
         Returns:
             Dict[str, str]: A dictionary containing the processed page data.
         """
         with pdfplumber.open(self.document_file_path) as pdf:
             page = pdf.pages[page_idx]
+            text = page.extract_text(**kwargs)
         return {
             "text": text,

medrag_multi_modal/document_loader/text_loader/pymupdf4llm_text_loader.py CHANGED Viewed

@@ -48,7 +48,7 @@ class PyMuPDF4LLMTextLoader(BaseTextLoader):
         document_file_path (str): The local file path where the PDF is stored or will be downloaded.
     """
-    async def extract_page_data(self, page_idx: int) -> Dict[str, str]:
         """
         Process a single page of the PDF and convert it to markdown using `pymupdf4llm`.
@@ -63,12 +63,13 @@ class PyMuPDF4LLMTextLoader(BaseTextLoader):
         Args:
             page_idx (int): The index of the page to process.
         Returns:
             Dict[str, str]: A dictionary containing the processed page data.
         """
         text = pymupdf4llm.to_markdown(
-            doc=self.document_file_path, pages=[page_idx], show_progress=False
         )
         return {
             "text": text,

         document_file_path (str): The local file path where the PDF is stored or will be downloaded.
     """
+    async def extract_page_data(self, page_idx: int, **kwargs) -> Dict[str, str]:
         """
         Process a single page of the PDF and convert it to markdown using `pymupdf4llm`.
         Args:
             page_idx (int): The index of the page to process.
+            **kwargs: Additional keyword arguments to be passed to `pymupdf4llm.to_markdown`.
         Returns:
             Dict[str, str]: A dictionary containing the processed page data.
         """
         text = pymupdf4llm.to_markdown(
+            doc=self.document_file_path, pages=[page_idx], show_progress=False, **kwargs
         )
         return {
             "text": text,

medrag_multi_modal/document_loader/text_loader/pypdf2_text_loader.py CHANGED Viewed

@@ -48,7 +48,7 @@ class PyPDF2TextLoader(BaseTextLoader):
         document_file_path (str): The local file path where the PDF is stored or will be downloaded.
     """
-    async def extract_page_data(self, page_idx: int) -> Dict[str, str]:
         """
         Process a single page of the PDF and extract its text using PyPDF2.
@@ -63,6 +63,7 @@ class PyPDF2TextLoader(BaseTextLoader):
         Args:
             page_idx (int): The index of the page to process.
         Returns:
             Dict[str, str]: A dictionary containing the processed page data.
@@ -70,7 +71,7 @@ class PyPDF2TextLoader(BaseTextLoader):
         with open(self.document_file_path, "rb") as file:
             pdf_reader = PyPDF2.PdfReader(file)
             page = pdf_reader.pages[page_idx]
-            text = page.extract_text()
         return {
             "text": text,

         document_file_path (str): The local file path where the PDF is stored or will be downloaded.
     """
+    async def extract_page_data(self, page_idx: int, **kwargs) -> Dict[str, str]:
         """
         Process a single page of the PDF and extract its text using PyPDF2.
         Args:
             page_idx (int): The index of the page to process.
+            **kwargs: Additional keyword arguments to be passed to `PyPDF2.PdfReader.pages[0].extract_text`.
         Returns:
             Dict[str, str]: A dictionary containing the processed page data.
         with open(self.document_file_path, "rb") as file:
             pdf_reader = PyPDF2.PdfReader(file)
             page = pdf_reader.pages[page_idx]
+            text = page.extract_text(**kwargs)
         return {
             "text": text,