mratanusarkar committed on
Commit
419f968
·
1 Parent(s): 391b2f3

add: docs & docstrings for pypdf2 text loader

Browse files
docs/document_loader/text_loader/pymupdf4llm_text_loader.md CHANGED
@@ -1,3 +1,3 @@
1
- ## Load text from PDF files
2
 
3
  ::: medrag_multi_modal.document_loader.text_loader.pymupdf4llm_text_loader
 
1
+ ## Load text from PDF files (using PyMuPDF4LLM)
2
 
3
  ::: medrag_multi_modal.document_loader.text_loader.pymupdf4llm_text_loader
docs/document_loader/text_loader/pypdf2_text_loader.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ ## Load text from PDF files (using PyPDF2)
2
+
3
+ ::: medrag_multi_modal.document_loader.text_loader.pypdf2_text_loader
medrag_multi_modal/document_loader/text_loader/pypdf2_text_loader.py CHANGED
@@ -6,7 +6,66 @@ from .base_text_loader import BaseTextLoader
6
 
7
 
8
  class PyPDF2TextLoader(BaseTextLoader):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  async def _process_page(self, page_idx: int) -> Dict[str, str]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  with open(self.document_file_path, "rb") as file:
11
  pdf_reader = PyPDF2.PdfReader(file)
12
  page = pdf_reader.pages[page_idx]
 
6
 
7
 
8
  class PyPDF2TextLoader(BaseTextLoader):
9
+ """
10
+ A concrete implementation of the BaseTextLoader for loading text from a PDF file
11
+ using `PyPDF2`, processing it into a simple text format, and optionally publishing
12
+ it to a Weave dataset.
13
+
14
+ This class extends the BaseTextLoader and implements the abstract methods to
15
+ load and process pages from a PDF file.
16
+
17
+ This class downloads the PDF file from the given URL if it does not already exist locally.
18
+ It uses PyPDF2 to read the PDF and extract text from each page. The processed pages are stored in a list
19
+ of Page objects, which can be optionally published to a Weave dataset.
20
+
21
+ !!! example "Example Usage"
22
+ ```python
23
+ import asyncio
24
+
25
+ import weave
26
+
27
+ from medrag_multi_modal.document_loader.text_loader import PyPDF2TextLoader
28
+
29
+ weave.init(project_name="ml-colabs/medrag-multi-modal")
30
+ url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
31
+ loader = PyPDF2TextLoader(
32
+ url=url,
33
+ document_name="Gray's Anatomy",
34
+ document_file_path="grays_anatomy.pdf",
35
+ )
36
+ asyncio.run(
37
+ loader.load_data(
38
+ start_page=31,
39
+ end_page=36,
40
+ weave_dataset_name="grays-anatomy-text",
41
+ )
42
+ )
43
+ ```
44
+
45
+ Args:
46
+ url (str): The URL of the PDF file to download if not present locally.
47
+ document_name (str): The name of the document for metadata purposes.
48
+ document_file_path (str): The local file path where the PDF is stored or will be downloaded.
49
+ """
50
+
51
  async def _process_page(self, page_idx: int) -> Dict[str, str]:
52
+ """
53
+ Process a single page of the PDF and extract its text using PyPDF2.
54
+
55
+ Returns a dictionary with the processed page data.
56
+ The dictionary will have the following keys and values:
57
+ - "text": (str) the extracted text from the page.
58
+ - "page_idx": (int) the index of the page.
59
+ - "document_name": (str) the name of the document.
60
+ - "file_path": (str) the local file path where the PDF is stored.
61
+ - "file_url": (str) the URL of the PDF file.
62
+
63
+ Args:
64
+ page_idx (int): The index of the page to process.
65
+
66
+ Returns:
67
+ Dict[str, str]: A dictionary containing the processed page data, with the keys listed above (note that "page_idx" is documented as an int, not a str).
68
+ """
69
  with open(self.document_file_path, "rb") as file:
70
  pdf_reader = PyPDF2.PdfReader(file)
71
  page = pdf_reader.pages[page_idx]
mkdocs.yml CHANGED
@@ -66,6 +66,7 @@ nav:
66
  - Text Loader:
67
  - Base: 'document_loader/text_loader/base_text_loader.md'
68
  - PyMuPDF4LLM: 'document_loader/text_loader/pymupdf4llm_text_loader.md'
 
69
  - Text and Image Loader: 'document_loader/load_text_image.md'
70
  - Image Loader: 'document_loader/load_image.md'
71
  - Retrieval:
 
66
  - Text Loader:
67
  - Base: 'document_loader/text_loader/base_text_loader.md'
68
  - PyMuPDF4LLM: 'document_loader/text_loader/pymupdf4llm_text_loader.md'
69
+ - PyPDF2: 'document_loader/text_loader/pypdf2_text_loader.md'
70
  - Text and Image Loader: 'document_loader/load_text_image.md'
71
  - Image Loader: 'document_loader/load_image.md'
72
  - Retrieval: