mratanusarkar committed on
Commit
6526b2f
·
1 Parent(s): d191c1b

add: kwargs to interact with underlying library

Browse files
medrag_multi_modal/document_loader/text_loader/base_text_loader.py CHANGED
@@ -65,7 +65,7 @@ class BaseTextLoader(ABC):
65
  return start_page, end_page
66
 
67
  @abstractmethod
68
- async def extract_page_data(self, page_idx: int) -> Dict[str, str]:
69
  """
70
  Abstract method to process a single page of the PDF and extract the text data.
71
 
@@ -74,6 +74,7 @@ class BaseTextLoader(ABC):
74
 
75
  Args:
76
  page_idx (int): The index of the page to process.
 
77
 
78
  Returns:
79
  Dict[str, str]: A dictionary containing the processed page data.
@@ -85,6 +86,7 @@ class BaseTextLoader(ABC):
85
  start_page: Optional[int] = None,
86
  end_page: Optional[int] = None,
87
  weave_dataset_name: Optional[str] = None,
 
88
  ) -> List[Dict[str, str]]:
89
  """
90
  Asynchronously loads text from a PDF file specified by a URL or local file path.
@@ -106,6 +108,7 @@ class BaseTextLoader(ABC):
106
  start_page (Optional[int]): The starting page index (0-based) to process. Defaults to the first page.
107
  end_page (Optional[int]): The ending page index (0-based) to process. Defaults to the last page.
108
  weave_dataset_name (Optional[str]): The name of the Weave dataset to publish the pages to, if provided.
 
109
 
110
  Returns:
111
  List[Dict[str, str]]: A list of dictionaries, each containing the text and metadata for a processed page.
@@ -127,7 +130,7 @@ class BaseTextLoader(ABC):
127
 
128
  async def process_page(page_idx):
129
  nonlocal processed_pages_counter
130
- page_data = await self.extract_page_data(page_idx)
131
  pages.append(page_data)
132
  rich.print(
133
  f"Processed page idx: {page_idx}, progress: {processed_pages_counter}/{total_pages}"
 
65
  return start_page, end_page
66
 
67
  @abstractmethod
68
+ async def extract_page_data(self, page_idx: int, **kwargs) -> Dict[str, str]:
69
  """
70
  Abstract method to process a single page of the PDF and extract the text data.
71
 
 
74
 
75
  Args:
76
  page_idx (int): The index of the page to process.
77
+ **kwargs: Additional keyword arguments that may be used by underlying libraries.
78
 
79
  Returns:
80
  Dict[str, str]: A dictionary containing the processed page data.
 
86
  start_page: Optional[int] = None,
87
  end_page: Optional[int] = None,
88
  weave_dataset_name: Optional[str] = None,
89
+ **kwargs,
90
  ) -> List[Dict[str, str]]:
91
  """
92
  Asynchronously loads text from a PDF file specified by a URL or local file path.
 
108
  start_page (Optional[int]): The starting page index (0-based) to process. Defaults to the first page.
109
  end_page (Optional[int]): The ending page index (0-based) to process. Defaults to the last page.
110
  weave_dataset_name (Optional[str]): The name of the Weave dataset to publish the pages to, if provided.
111
+ **kwargs: Additional keyword arguments forwarded to the `extract_page_data` method, which passes them on to the underlying library.
112
 
113
  Returns:
114
  List[Dict[str, str]]: A list of dictionaries, each containing the text and metadata for a processed page.
 
130
 
131
  async def process_page(page_idx):
132
  nonlocal processed_pages_counter
133
+ page_data = await self.extract_page_data(page_idx, **kwargs)
134
  pages.append(page_data)
135
  rich.print(
136
  f"Processed page idx: {page_idx}, progress: {processed_pages_counter}/{total_pages}"
medrag_multi_modal/document_loader/text_loader/marker_text_loader.py CHANGED
@@ -49,7 +49,7 @@ class MarkerTextLoader(BaseTextLoader):
49
  document_file_path (str): The local file path where the PDF is stored or will be downloaded.
50
  """
51
 
52
- async def extract_page_data(self, page_idx: int) -> Dict[str, str]:
53
  """
54
  Process a single page of the PDF and extract its structured text using marker-pdf.
55
 
@@ -65,6 +65,7 @@ class MarkerTextLoader(BaseTextLoader):
65
 
66
  Args:
67
  page_idx (int): The index of the page to process.
 
68
 
69
  Returns:
70
  Dict[str, str]: A dictionary containing the processed page data.
@@ -78,6 +79,7 @@ class MarkerTextLoader(BaseTextLoader):
78
  batch_multiplier=1,
79
  start_page=page_idx,
80
  ocr_all_pages=True,
 
81
  )
82
 
83
  return {
 
49
  document_file_path (str): The local file path where the PDF is stored or will be downloaded.
50
  """
51
 
52
+ async def extract_page_data(self, page_idx: int, **kwargs) -> Dict[str, str]:
53
  """
54
  Process a single page of the PDF and extract its structured text using marker-pdf.
55
 
 
65
 
66
  Args:
67
  page_idx (int): The index of the page to process.
68
+ **kwargs: Additional keyword arguments to be passed to `marker.convert.convert_single_pdf`.
69
 
70
  Returns:
71
  Dict[str, str]: A dictionary containing the processed page data.
 
79
  batch_multiplier=1,
80
  start_page=page_idx,
81
  ocr_all_pages=True,
82
+ **kwargs,
83
  )
84
 
85
  return {
medrag_multi_modal/document_loader/text_loader/pdfplumber_text_loader.py CHANGED
@@ -48,7 +48,7 @@ class PDFPlumberTextLoader(BaseTextLoader):
48
  document_file_path (str): The local file path where the PDF is stored or will be downloaded.
49
  """
50
 
51
- async def extract_page_data(self, page_idx: int) -> Dict[str, str]:
52
  """
53
  Process a single page of the PDF and extract its text using pdfplumber.
54
 
@@ -63,13 +63,14 @@ class PDFPlumberTextLoader(BaseTextLoader):
63
 
64
  Args:
65
  page_idx (int): The index of the page to process.
 
66
 
67
  Returns:
68
  Dict[str, str]: A dictionary containing the processed page data.
69
  """
70
  with pdfplumber.open(self.document_file_path) as pdf:
71
  page = pdf.pages[page_idx]
72
- text = page.extract_text()
73
 
74
  return {
75
  "text": text,
 
48
  document_file_path (str): The local file path where the PDF is stored or will be downloaded.
49
  """
50
 
51
+ async def extract_page_data(self, page_idx: int, **kwargs) -> Dict[str, str]:
52
  """
53
  Process a single page of the PDF and extract its text using pdfplumber.
54
 
 
63
 
64
  Args:
65
  page_idx (int): The index of the page to process.
66
+ **kwargs: Additional keyword arguments to be passed to `pdfplumber.Page.extract_text`.
67
 
68
  Returns:
69
  Dict[str, str]: A dictionary containing the processed page data.
70
  """
71
  with pdfplumber.open(self.document_file_path) as pdf:
72
  page = pdf.pages[page_idx]
73
+ text = page.extract_text(**kwargs)
74
 
75
  return {
76
  "text": text,
medrag_multi_modal/document_loader/text_loader/pymupdf4llm_text_loader.py CHANGED
@@ -48,7 +48,7 @@ class PyMuPDF4LLMTextLoader(BaseTextLoader):
48
  document_file_path (str): The local file path where the PDF is stored or will be downloaded.
49
  """
50
 
51
- async def extract_page_data(self, page_idx: int) -> Dict[str, str]:
52
  """
53
  Process a single page of the PDF and convert it to markdown using `pymupdf4llm`.
54
 
@@ -63,12 +63,13 @@ class PyMuPDF4LLMTextLoader(BaseTextLoader):
63
 
64
  Args:
65
  page_idx (int): The index of the page to process.
 
66
 
67
  Returns:
68
  Dict[str, str]: A dictionary containing the processed page data.
69
  """
70
  text = pymupdf4llm.to_markdown(
71
- doc=self.document_file_path, pages=[page_idx], show_progress=False
72
  )
73
  return {
74
  "text": text,
 
48
  document_file_path (str): The local file path where the PDF is stored or will be downloaded.
49
  """
50
 
51
+ async def extract_page_data(self, page_idx: int, **kwargs) -> Dict[str, str]:
52
  """
53
  Process a single page of the PDF and convert it to markdown using `pymupdf4llm`.
54
 
 
63
 
64
  Args:
65
  page_idx (int): The index of the page to process.
66
+ **kwargs: Additional keyword arguments to be passed to `pymupdf4llm.to_markdown`.
67
 
68
  Returns:
69
  Dict[str, str]: A dictionary containing the processed page data.
70
  """
71
  text = pymupdf4llm.to_markdown(
72
+ doc=self.document_file_path, pages=[page_idx], show_progress=False, **kwargs
73
  )
74
  return {
75
  "text": text,
medrag_multi_modal/document_loader/text_loader/pypdf2_text_loader.py CHANGED
@@ -48,7 +48,7 @@ class PyPDF2TextLoader(BaseTextLoader):
48
  document_file_path (str): The local file path where the PDF is stored or will be downloaded.
49
  """
50
 
51
- async def extract_page_data(self, page_idx: int) -> Dict[str, str]:
52
  """
53
  Process a single page of the PDF and extract its text using PyPDF2.
54
 
@@ -63,6 +63,7 @@ class PyPDF2TextLoader(BaseTextLoader):
63
 
64
  Args:
65
  page_idx (int): The index of the page to process.
 
66
 
67
  Returns:
68
  Dict[str, str]: A dictionary containing the processed page data.
@@ -70,7 +71,7 @@ class PyPDF2TextLoader(BaseTextLoader):
70
  with open(self.document_file_path, "rb") as file:
71
  pdf_reader = PyPDF2.PdfReader(file)
72
  page = pdf_reader.pages[page_idx]
73
- text = page.extract_text()
74
 
75
  return {
76
  "text": text,
 
48
  document_file_path (str): The local file path where the PDF is stored or will be downloaded.
49
  """
50
 
51
+ async def extract_page_data(self, page_idx: int, **kwargs) -> Dict[str, str]:
52
  """
53
  Process a single page of the PDF and extract its text using PyPDF2.
54
 
 
63
 
64
  Args:
65
  page_idx (int): The index of the page to process.
66
+ **kwargs: Additional keyword arguments to be passed to `PyPDF2.PageObject.extract_text` (the method called on `pdf_reader.pages[page_idx]`).
67
 
68
  Returns:
69
  Dict[str, str]: A dictionary containing the processed page data.
 
71
  with open(self.document_file_path, "rb") as file:
72
  pdf_reader = PyPDF2.PdfReader(file)
73
  page = pdf_reader.pages[page_idx]
74
+ text = page.extract_text(**kwargs)
75
 
76
  return {
77
  "text": text,