XThomasBU committed
Commit 4f620b4 · Parents: 6f6768d, 351c4c7

Merge pull request #45 from DL4DS/text_extraction


PyMuPDF and HTML to Markdown fix + GPT-4o mini PDF reader

code/modules/dataloader/data_loader.py CHANGED
@@ -27,6 +27,7 @@ import tempfile
 import PyPDF2
 from modules.dataloader.pdf_readers.base import PDFReader
 from modules.dataloader.pdf_readers.llama import LlamaParser
+from modules.dataloader.pdf_readers.gpt import GPTParser

 try:
     from modules.dataloader.helpers import get_metadata, download_pdf_from_url

@@ -89,9 +90,12 @@ class FileReader:
         self.kind = kind
         if kind == "llama":
             self.pdf_reader = LlamaParser()
+        elif kind == "gpt":
+            self.pdf_reader = GPTParser()
         else:
             self.pdf_reader = PDFReader()
         self.web_reader = HTMLReader()
+        self.logger.info(f"Initialized FileReader with {kind} PDF reader and HTML reader")


     def extract_text_from_pdf(self, pdf_path):

@@ -105,11 +109,7 @@ class FileReader:
         return text

     def read_pdf(self, temp_file_path: str):
-        if self.kind == "llama":
-            documents = self.pdf_reader.parse(temp_file_path)  # asyncio.run(self.pdf_reader.parse(temp_file_path)) if using async
-        else:
-            loader = self.pdf_reader.get_loader(temp_file_path)
-            documents = self.pdf_reader.get_documents(loader)
+        documents = self.pdf_reader.parse(temp_file_path)
         return documents

     def read_txt(self, temp_file_path: str):

@@ -134,8 +134,7 @@ class FileReader:
         return loader.load()

     def read_html(self, url: str):
-        loader = WebBaseLoader(url)
-        return loader.load()
+        return [Document(page_content=self.web_reader.read_html(url))]

     def read_tex_from_url(self, tex_url):
         response = requests.get(tex_url)

@@ -289,7 +288,6 @@ class ChunkProcessor:
         )
         self.document_chunks_full.extend(document_chunks)

-        print(f"Processed {file_path}. File_data: {file_data}")
         self.document_data[file_path] = file_data
         self.document_metadata[file_path] = file_metadata
 
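A minimal usage sketch of the new dispatch (not part of the diff; the hunks above only show that kind selects the parser, so any FileReader constructor arguments beyond kind are assumptions):

    # Hypothetical caller: kind="gpt" now routes PDFs through GPTParser,
    # kind="llama" through LlamaParser, and anything else through PDFReader.
    from modules.dataloader.data_loader import FileReader

    reader = FileReader(kind="gpt")  # constructor args other than kind are assumed
    documents = reader.read_pdf("/tmp/lecture01.pdf")  # list of Document objects
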
code/modules/dataloader/pdf_readers/gpt.py ADDED
@@ -0,0 +1,81 @@
+import base64
+import os
+import requests
+
+from io import BytesIO
+from openai import OpenAI
+from pdf2image import convert_from_path
+from langchain.schema import Document
+
+
+class GPTParser:
+    """
+    This class uses OpenAI's GPT-4o mini model to parse PDFs and extract text, images and equations.
+    It is the most advanced parser in the system and is able to handle complex formats and layouts
+    """
+
+    def __init__(self):
+        self.client = OpenAI()
+        self.api_key = os.getenv("OPENAI_API_KEY")
+        self.prompt = """
+        The provided documents are images of PDFs of lecture slides of deep learning material.
+        They contain LaTeX equations, images, and text.
+        The goal is to extract the text, images and equations from the slides and convert everything to markdown format. Some of the equations may be complicated.
+        The markdown should be clean and easy to read, and any math equation should be converted to LaTeX, between $$.
+        For images, give a description and if you can, a source. Separate each page with '---'.
+        Just respond with the markdown.
+        """
+
+    def parse(self, pdf_path):
+        images = convert_from_path(pdf_path)
+
+        encoded_images = [self.encode_image(image) for image in images]
+
+        chunks = [encoded_images[i:i + 5] for i in range(0, len(encoded_images), 5)]
+
+        headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {self.api_key}"
+        }
+
+        output = ""
+        for chunk_num, chunk in enumerate(chunks):
+            content = [{"type": "image_url", "image_url": {
+                "url": f"data:image/jpeg;base64,{image}"}} for image in chunk]
+
+            content.insert(0, {"type": "text", "text": self.prompt})
+
+            payload = {
+                "model": "gpt-4o-mini",
+                "messages": [
+                    {
+                        "role": "user",
+                        "content": content
+                    }
+                ],
+            }
+
+            response = requests.post(
+                "https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
+
+            resp = response.json()
+
+            chunk_output = resp['choices'][0]['message']['content'].replace("```", "").replace("markdown", "").replace("````", "")
+
+            output += chunk_output + "\n---\n"
+
+        output = output.split("\n---\n")
+        output = [doc for doc in output if doc.strip() != ""]
+
+        documents = [
+            Document(
+                page_content=page,
+                metadata={"source": pdf_path, "page": i}
+            ) for i, page in enumerate(output)
+        ]
+        return documents
+
+    def encode_image(self, image):
+        buffered = BytesIO()
+        image.save(buffered, format="JPEG")
+        return base64.b64encode(buffered.getvalue()).decode('utf-8')
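
A usage sketch for the new parser (not part of the diff; the file path is a placeholder):

    # GPTParser.parse renders each PDF page to a JPEG via pdf2image (which
    # requires poppler), sends the pages to gpt-4o-mini in batches of five,
    # and returns one langchain Document per page of the returned markdown.
    # OPENAI_API_KEY must be set in the environment.
    from modules.dataloader.pdf_readers.gpt import GPTParser

    parser = GPTParser()
    documents = parser.parse("lecture_slides.pdf")  # placeholder path
    for doc in documents:
        print(doc.metadata["page"], doc.page_content[:80])

Note the design choice: the class constructs an OpenAI client but issues the request with requests.post directly, and the base64-encoded data URL matches the format the chat completions API accepts for image_url content.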