Farid Karimli committed on
Commit 3ff5066 · 1 Parent(s): 8d6adc4

Adding improved PDF and HTML parsing to new dataloader

code/modules/config/config.yml CHANGED
@@ -32,6 +32,7 @@ llm_params:
   local_llm_params:
     model: 'tiny-llama'
    temperature: 0.7
+  pdf_reader: 'llama' # str [llama, pymupdf]
 
 chat_logging:
   log_chat: False # bool
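The new pdf_reader key is what DataLoader reads to choose the PDF parsing backend (see data_loader.py below). A minimal sketch of how the option is consumed, assuming the config layout above:

import yaml

# Load the project config and pick the PDF parsing backend.
with open("code/modules/config/config.yml", "r") as f:
    config = yaml.safe_load(f)

kind = config["llm_params"]["pdf_reader"]  # 'llama' -> LlamaParse API, 'pymupdf' -> local loader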
code/modules/dataloader/data_loader.py CHANGED
@@ -20,10 +20,24 @@ from langchain_community.llms import OpenAI
 from langchain import PromptTemplate
 import json
 from concurrent.futures import ThreadPoolExecutor
+from urllib.parse import urljoin
+import html2text
+import bs4
+import tempfile
+import PyPDF2
 
-from modules.dataloader.helpers import get_metadata
+try:
+    from modules.dataloader.helpers import get_metadata
+    from modules.config.constants import OPENAI_API_KEY, LLAMA_CLOUD_API_KEY
+except ImportError:
+    # fallback import path when running from inside code/modules
+    from dataloader.helpers import get_metadata
+    from config.constants import OPENAI_API_KEY, LLAMA_CLOUD_API_KEY
+
+logger = logging.getLogger(__name__)
+BASE_DIR = os.getcwd()
+
 class PDFReader:
     def __init__(self):
         pass
@@ -35,11 +49,134 @@ class PDFReader:
     def get_documents(self, loader):
         return loader.load()
 
+class LlamaParser:
+    def __init__(self):
+        self.GPT_API_KEY = OPENAI_API_KEY
+        self.LLAMA_CLOUD_API_KEY = LLAMA_CLOUD_API_KEY
+        self.parse_url = "https://api.cloud.llamaindex.ai/api/parsing/upload"
+        self.headers = {
+            'Accept': 'application/json',
+            'Authorization': f'Bearer {LLAMA_CLOUD_API_KEY}'  # never hardcode the key
+        }
+        self.parser = LlamaParse(
+            api_key=LLAMA_CLOUD_API_KEY,
+            result_type="markdown",
+            verbose=True,
+            language="en",
+            gpt4o_mode=False,
+            # gpt4o_api_key=OPENAI_API_KEY,
+            parsing_instruction="The provided documents are PDFs of lecture slides of deep learning material. They contain LaTeX equations, images, and text. The goal is to extract the text, images and equations from the slides and convert them to markdown format. The markdown should be clean and easy to read, and any math equation should be converted to LaTeX, between $$. For images, give a description and if you can, a source."
+        )
+
+    def parse(self, pdf_path):
+        documents = self.parser.load_data(pdf_path)
+        documents = [document.to_langchain_format() for document in documents]
+
+        os.remove(pdf_path)  # cleanup, just in case
+        return documents
+
+    def make_request(self, pdf_url):
+        payload = {
+            "gpt4o_mode": "false",
+            "parsing_instruction": "The provided document is a PDF of lecture slides of deep learning material. They contain LaTeX equations, images, and text. The goal is to extract the text, images and equations from the slides and convert them to markdown format. The markdown should be clean and easy to read, and any math equation should be converted to LaTeX, between $$. For images, give a description and if you can, a source.",
+        }
+
+        files = [
+            ('file', ('file', requests.get(pdf_url).content, 'application/octet-stream'))
+        ]
+
+        response = requests.request(
+            "POST", self.parse_url, headers=self.headers, data=payload, files=files)
+
+        return response.json()['id'], response.json()['status']
+
+    async def get_result(self, job_id):
+        url = f"https://api.cloud.llamaindex.ai/api/parsing/job/{job_id}/result/markdown"
+
+        response = requests.request("GET", url, headers=self.headers, data={})
+
+        return response.json()['markdown']
+
+    async def _parse(self, pdf_url):
+        job_id, status = self.make_request(pdf_url)
+
+        # NOTE: tight polling loop; a short sleep between requests would be kinder to the API
+        while status != "SUCCESS":
+            url = f"https://api.cloud.llamaindex.ai/api/parsing/job/{job_id}"
+            response = requests.request("GET", url, headers=self.headers, data={})
+            status = response.json()["status"]
+
+        result = await self.get_result(job_id)
+
+        documents = [
+            Document(
+                page_content=result,
+                metadata={"source": pdf_url}
+            )
+        ]
+
+        return documents
+
+class HTMLReader:
+    def __init__(self):
+        pass
+
+    def read_url(self, url):
+        response = requests.get(url)
+        if response.status_code == 200:
+            return response.text
+        else:
+            logger.warning(f"Failed to download HTML from URL: {url}")
+            return None
+
+    def check_links(self, base_url, html_content):
+        soup = bs4.BeautifulSoup(html_content, "html.parser")
+        for link in soup.find_all("a"):
+            href = link.get("href")
+
+            if not href or href.startswith("#"):
+                continue
+            elif not href.startswith("https"):
+                href = href.replace("http", "https", 1)
+
+            absolute_url = urljoin(base_url, href)
+            link['href'] = absolute_url
+
+            resp = requests.head(absolute_url)
+            if resp.status_code != 200:
+                logger.warning(f"Link {absolute_url} is broken")
+                logger.warning(f"Status code: {resp.status_code}")
+
+        return str(soup)
+
+    def html_to_md(self, url, html_content):
+        html_processed = self.check_links(url, html_content)
+        markdown_content = html2text.html2text(html_processed)
+        return markdown_content
+
+    def read_html(self, url):
+        html_content = self.read_url(url)
+        if html_content:
+            return self.html_to_md(url, html_content)
+        else:
+            return None
 
 class FileReader:
-    def __init__(self, logger):
-        self.pdf_reader = PDFReader()
+    def __init__(self, logger, kind):
         self.logger = logger
+        self.kind = kind
+        if kind == "llama":
+            self.pdf_reader = LlamaParser()
+        else:
+            self.pdf_reader = PDFReader()
+        self.web_reader = HTMLReader()
 
     def extract_text_from_pdf(self, pdf_path):
         text = ""
@@ -51,7 +188,9 @@ class FileReader:
             text += page.extract_text()
         return text
 
-    def download_pdf_from_url(self, pdf_url):
+    @staticmethod
+    def download_pdf_from_url(pdf_url):
+        logger.info(f"Downloading PDF from URL: {pdf_url}")
         response = requests.get(pdf_url)
         if response.status_code == 200:
             with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
@@ -63,8 +202,11 @@ class FileReader:
         return None
 
     def read_pdf(self, temp_file_path: str):
-        loader = self.pdf_reader.get_loader(temp_file_path)
-        documents = self.pdf_reader.get_documents(loader)
+        if self.kind == "llama":
+            documents = self.pdf_reader.parse(temp_file_path)  # or asyncio.run(self.pdf_reader._parse(url)) for the async REST path
+        else:
+            loader = self.pdf_reader.get_loader(temp_file_path)
+            documents = self.pdf_reader.get_documents(loader)
         return documents
 
     def read_txt(self, temp_file_path: str):
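A short sketch of the two read_pdf paths this hunk introduces, assuming the FileReader and PDFReader classes from this file are importable; the logger and the file path are placeholders:

import logging

logger = logging.getLogger(__name__)
llama_reader = FileReader(logger=logger, kind="llama")    # parses via the LlamaParse API
plain_reader = FileReader(logger=logger, kind="pymupdf")  # falls back to PDFReader's loader
documents = plain_reader.read_pdf("lecture01.pdf")        # hypothetical local file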
@@ -179,7 +321,6 @@ class ChunkProcessor:
             "https://dl4ds.github.io/sp2024/lectures/",
             "https://dl4ds.github.io/sp2024/schedule/",
         )  # For any additional metadata
-
         with ThreadPoolExecutor() as executor:
             executor.map(
                 self.process_file,
@@ -250,11 +391,18 @@ class ChunkProcessor:
 
     def process_file(self, file_path, file_index, file_reader, addl_metadata):
         file_name = os.path.basename(file_path)
+        storage_dir = os.path.join(os.getcwd(), self.config["vectorstore"]["data_path"])
+        local_path = os.path.join(storage_dir, file_name)
+
+        if not os.path.exists(local_path):
+            local_path = FileReader.download_pdf_from_url(pdf_url=file_path)
+
         if file_name in self.document_data:
+            self.logger.info(f"File {file_name} already processed")
             return
 
         file_type = file_name.split(".")[-1].lower()
-        self.logger.info(f"Reading file {file_index + 1}: {file_path}")
+        self.logger.info(f"Reading file {file_index + 1}: {local_path}")
 
         read_methods = {
             "pdf": file_reader.read_pdf,
@@ -268,9 +416,9 @@ class ChunkProcessor:
             return
 
         try:
-            documents = read_methods[file_type](file_path)
+            documents = read_methods[file_type](local_path)
             self.process_documents(
-                documents, file_path, file_type, "file", addl_metadata
+                documents, local_path, file_type, "file", addl_metadata
             )
         except Exception as e:
             self.logger.error(f"Error processing file {file_name}: {str(e)}")
@@ -330,7 +478,7 @@ class ChunkProcessor:
 
 class DataLoader:
     def __init__(self, config, logger=None):
-        self.file_reader = FileReader(logger=logger)
+        self.file_reader = FileReader(logger=logger, kind=config["llm_params"]["pdf_reader"])
         self.chunk_processor = ChunkProcessor(config, logger=logger)
 
     def get_chunks(self, uploaded_files, weblinks):
@@ -348,10 +496,15 @@ if __name__ == "__main__":
     with open("../code/modules/config/config.yml", "r") as f:
         config = yaml.safe_load(f)
 
+    STORAGE_DIR = os.path.join(BASE_DIR, config["vectorstore"]["data_path"])
+    uploaded_files = [
+        os.path.join(STORAGE_DIR, file) for file in os.listdir(STORAGE_DIR) if file != "urls.txt"
+    ]
+
     data_loader = DataLoader(config, logger=logger)
     document_chunks, document_names, documents, document_metadata = (
         data_loader.get_chunks(
-            [],
+            uploaded_files,
             ["https://dl4ds.github.io/sp2024/"],
         )
     )
 
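If the asynchronous REST path is preferred over parser.load_data, the comment in read_pdf suggests wrapping the coroutine with asyncio.run. A minimal sketch, assuming the LlamaParser class from this commit (and its API-key constants) is importable; note that make_request fetches its argument with requests.get, so _parse should be given a downloadable URL (the one below is hypothetical):

import asyncio

parser = LlamaParser()
# _parse uploads the PDF, polls the parsing job until it reports SUCCESS,
# then wraps the returned Markdown in a single langchain Document.
documents = asyncio.run(parser._parse("https://example.com/lecture01.pdf"))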