Farid Karimli committed on
Commit c68577f · 1 Parent(s): 3ff5066
code/modules/data_loader.py DELETED
@@ -1,462 +0,0 @@
- import os
- import re
- import bs4
- from urllib.parse import urljoin
- import asyncio
- import requests
- import pysrt
- import PyPDF2
- from langchain_community.document_loaders import (
-     PyMuPDFLoader,
-     Docx2txtLoader,
-     YoutubeLoader,
-     WebBaseLoader,
-     TextLoader,
- )
- import html2text
- import tempfile
- from langchain_community.document_loaders import UnstructuredMarkdownLoader
- from llama_parse import LlamaParse
- from langchain.schema import Document
- import logging
- from langchain.text_splitter import RecursiveCharacterTextSplitter
- from langchain_experimental.text_splitter import SemanticChunker
- from langchain_openai.embeddings import OpenAIEmbeddings
- from ragatouille import RAGPretrainedModel
- from langchain.chains import LLMChain
- from langchain.llms import OpenAI
- from langchain import PromptTemplate
-
- # support running both as a package module and as a standalone script
- try:
-     from modules.helpers import get_lecture_metadata
-     from modules.constants import OPENAI_API_KEY, LLAMA_CLOUD_API_KEY
- except ImportError:
-     from helpers import get_lecture_metadata
-     from constants import OPENAI_API_KEY, LLAMA_CLOUD_API_KEY
-
- logger = logging.getLogger(__name__)
- BASE_DIR = os.getcwd()
- STORAGE_DIR = os.path.join(BASE_DIR, "storage", "data")
-
- class PDFReader:
-     def __init__(self):
-         pass
-
-     def get_loader(self, pdf_path):
-         loader = PyMuPDFLoader(pdf_path)
-         return loader
-
-     def get_documents(self, loader):
-         return loader.load()
-
-
- class LlamaParser:
-     def __init__(self):
-         self.GPT_API_KEY = OPENAI_API_KEY
-         self.LLAMA_CLOUD_API_KEY = LLAMA_CLOUD_API_KEY
-         self.parse_url = "https://api.cloud.llamaindex.ai/api/parsing/upload"
-         self.headers = {
-             'Accept': 'application/json',
-             # use the key from constants rather than a hard-coded token
-             'Authorization': f'Bearer {LLAMA_CLOUD_API_KEY}'
-         }
-         self.parser = LlamaParse(
-             api_key=LLAMA_CLOUD_API_KEY,
-             result_type="markdown",
-             verbose=True,
-             language="en",
-             gpt4o_mode=False,
-             # gpt4o_api_key=OPENAI_API_KEY,
-             parsing_instruction="The provided documents are PDFs of lecture slides of deep learning material. They contain LaTeX equations, images, and text. The goal is to extract the text, images and equations from the slides and convert them to markdown format. The markdown should be clean and easy to read, and any math equation should be converted to LaTeX, between $$. For images, give a description and if you can, a source."
-         )
-
-     def parse(self, pdf_path):
-         pdf_name = os.path.basename(pdf_path)
-         logger.info(f"Processing PDF: {pdf_name}. Path: {pdf_path}")
-
-         path = os.path.join(STORAGE_DIR, pdf_name)
-         if os.path.exists(path):
-             pdf_path = os.path.join(STORAGE_DIR, path)
-         else:
-             pdf_path = FileReader.download_pdf_from_url(pdf_url=pdf_path)
-
-         documents = self.parser.load_data(pdf_path)
-         documents = [document.to_langchain_format() for document in documents]
-         print(documents)
-
-         os.remove(pdf_path)
-         return documents
-
-     def make_request(self, pdf_url):
-         payload = {
-             "gpt4o_mode": "false",
-             "parsing_instruction": "The provided document is a PDF of lecture slides of deep learning material. They contain LaTeX equations, images, and text. The goal is to extract the text, images and equations from the slides and convert them to markdown format. The markdown should be clean and easy to read, and any math equation should be converted to LaTeX, between $$. For images, give a description and if you can, a source.",
-         }
-
-         files = [
-             ('file', ('file', requests.get(pdf_url).content, 'application/octet-stream'))
-         ]
-
-         response = requests.request(
-             "POST", self.parse_url, headers=self.headers, data=payload, files=files)
-
-         return response.json()['id'], response.json()['status']
-
-     async def get_result(self, job_id):
-         url = f"https://api.cloud.llamaindex.ai/api/parsing/job/{job_id}/result/markdown"
-
-         response = requests.request("GET", url, headers=self.headers, data={})
-
-         return response.json()['markdown']
-
-     async def _parse(self, pdf_path):
-         job_id, status = self.make_request(pdf_path)
-         print(f"Job ID: {job_id}", f"Status: {status}")
-
-         while status != "SUCCESS":
-             url = f"https://api.cloud.llamaindex.ai/api/parsing/job/{job_id}"
-             response = requests.request("GET", url, headers=self.headers, data={})
-             status = response.json()["status"]
-
-             print(status)
-
-         result = await self.get_result(job_id)
-
-         documents = [
-             Document(
-                 page_content=result,
-                 metadata={"source": pdf_path}
-             )
-         ]
-
-         return documents
-
-     # async entry point that delegates to the job-polling parser above
-     async def parse_async(self, pdf_path):
-         return await self._parse(pdf_path)
-
- class HTMLReader:
-     def __init__(self):
-         pass
-
-     def read_url(self, url):
-         response = requests.get(url)
-         if response.status_code == 200:
-             return response.text
-         else:
-             logger.warning(f"Failed to download HTML from URL: {url}")
-             return None
-
-     def check_links(self, base_url, html_content):
-         soup = bs4.BeautifulSoup(html_content, "html.parser")
-         for link in soup.find_all("a"):
-             href = link.get("href")
-
-             if not href or href.startswith("#"):
-                 continue
-             elif not href.startswith("https"):
-                 href = href.replace("http", "https")
-
-             absolute_url = urljoin(base_url, href)
-             link['href'] = absolute_url
-
-             resp = requests.head(absolute_url)
-             if resp.status_code != 200:
-                 logger.warning(f"Link {absolute_url} is broken")
-                 logger.warning(f"Status code: {resp.status_code}")
-
-         return str(soup)
-
-     def html_to_md(self, url, html_content):
-         html_processed = self.check_links(url, html_content)
-         markdown_content = html2text.html2text(html_processed)
-         return markdown_content
-
-     def read_html(self, url):
-         html_content = self.read_url(url)
-         if html_content:
-             return self.html_to_md(url, html_content)
-         else:
-             return None
-
-
- class FileReader:
-     def __init__(self, kind):
-         self.kind = kind
-         if kind == "llama":
-             self.pdf_reader = LlamaParser()
-         else:
-             self.pdf_reader = PDFReader()
-         self.web_reader = HTMLReader()
-
-     def extract_text_from_pdf(self, pdf_path):
-         text = ""
-         with open(pdf_path, "rb") as file:
-             reader = PyPDF2.PdfReader(file)
-             num_pages = len(reader.pages)
-             for page_num in range(num_pages):
-                 page = reader.pages[page_num]
-                 text += page.extract_text()
-         return text
-
-     @staticmethod
-     def download_pdf_from_url(pdf_url):
-         response = requests.get(pdf_url)
-         if response.status_code == 200:
-             with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
-                 temp_file.write(response.content)
-                 temp_file_path = temp_file.name
-             return temp_file_path
-         else:
-             print("Failed to download PDF from URL:", pdf_url)
-             return None
-
-     def read_pdf(self, temp_file_path: str):
-         if self.kind == "llama":
-             # documents = asyncio.run(self.pdf_reader.parse(temp_file_path))
-             documents = self.pdf_reader.parse(temp_file_path)
-         else:
-             loader = self.pdf_reader.get_loader(temp_file_path)
-             documents = self.pdf_reader.get_documents(loader)
-         return documents
-
-     def read_txt(self, temp_file_path: str):
-         loader = TextLoader(temp_file_path, autodetect_encoding=True)
-         return loader.load()
-
-     def read_docx(self, temp_file_path: str):
-         loader = Docx2txtLoader(temp_file_path)
-         return loader.load()
-
-     def read_srt(self, temp_file_path: str):
-         subs = pysrt.open(temp_file_path)
-         text = ""
-         for sub in subs:
-             text += sub.text
-         return [Document(page_content=text)]
-
-     def read_youtube_transcript(self, url: str):
-         loader = YoutubeLoader.from_youtube_url(
-             url, add_video_info=True, language=["en"], translation="en"
-         )
-         return loader.load()
-
-     def read_html(self, url: str):
-         return [Document(page_content=self.web_reader.read_html(url))]
-
-
- class ChunkProcessor:
-     def __init__(self, config):
-         self.config = config
-
-         if config["splitter_options"]["use_splitter"]:
-             if config["splitter_options"]["split_by_token"]:
-                 self.splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
-                     chunk_size=config["splitter_options"]["chunk_size"],
-                     chunk_overlap=config["splitter_options"]["chunk_overlap"],
-                     separators=config["splitter_options"]["chunk_separators"],
-                     disallowed_special=(),
-                 )
-             else:
-                 self.splitter = RecursiveCharacterTextSplitter(
-                     chunk_size=config["splitter_options"]["chunk_size"],
-                     chunk_overlap=config["splitter_options"]["chunk_overlap"],
-                     separators=config["splitter_options"]["chunk_separators"],
-                     disallowed_special=(),
-                 )
-         else:
-             self.splitter = None
-         logger.info("ChunkProcessor instance created")
-
-     def remove_delimiters(self, document_chunks: list):
-         for chunk in document_chunks:
-             for delimiter in self.config["splitter_options"]["delimiters_to_remove"]:
-                 chunk.page_content = re.sub(delimiter, " ", chunk.page_content)
-         return document_chunks
-
-     def remove_chunks(self, document_chunks: list):
-         front = self.config["splitter_options"]["front_chunk_to_remove"]
-         end = self.config["splitter_options"]["last_chunks_to_remove"]
-         for _ in range(front):
-             del document_chunks[0]
-         for _ in range(end):
-             document_chunks.pop()
-         logger.info(f"\tNumber of pages after skipping: {len(document_chunks)}")
-         return document_chunks
-
-     def process_chunks(
-         self, documents, file_type="txt", source="", page=0, metadata={}
-     ):
-         documents = [Document(page_content=documents, source=source, page=page)]
-         if file_type == "txt":
-             document_chunks = self.splitter.split_documents(documents)
-         elif file_type == "pdf":
-             document_chunks = documents  # Full page for now
-
-         # add the source and page number back to the metadata
-         for chunk in document_chunks:
-             chunk.metadata["source"] = source
-             chunk.metadata["page"] = page
-
-             # add the metadata extracted from the document
-             for key, value in metadata.items():
-                 chunk.metadata[key] = value
-
-         if self.config["splitter_options"]["remove_leftover_delimiters"]:
-             document_chunks = self.remove_delimiters(document_chunks)
-         if self.config["splitter_options"]["remove_chunks"]:
-             document_chunks = self.remove_chunks(document_chunks)
-
-         return document_chunks
-
-     def get_chunks(self, file_reader, uploaded_files, weblinks):
-         self.document_chunks_full = []
-         self.parent_document_names = []
-         self.child_document_names = []
-         self.documents = []
-         self.document_metadata = []
-
-         lecture_metadata = get_lecture_metadata(
-             "https://dl4ds.github.io/sp2024/lectures/",
-             "https://dl4ds.github.io/sp2024/schedule/",
-         )  # TODO: Use more efficiently
-
-         for file_index, file_path in enumerate(uploaded_files):
-             file_name = os.path.basename(file_path)
-             file_type = file_name.split(".")[-1].lower()
-
-             # try:
-             if file_type == "pdf":
-                 documents = file_reader.read_pdf(file_path)
-             elif file_type == "txt":
-                 documents = file_reader.read_txt(file_path)
-             elif file_type == "docx":
-                 documents = file_reader.read_docx(file_path)
-             elif file_type == "srt":
-                 documents = file_reader.read_srt(file_path)
-             else:
-                 logger.warning(f"Unsupported file type: {file_type}")
-                 continue
-
-             # full_text = ""
-             # for doc in documents:
-             #     full_text += doc.page_content
-             #     break  # getting only first page for now
-
-             # extracted_metadata = self.extract_metadata(full_text)
-
-             for doc in documents:
-                 page_num = doc.metadata.get("page", 0)
-                 self.documents.append(doc.page_content)
-                 self.document_metadata.append({"source": file_path, "page": page_num})
-                 if "lecture" in file_path.lower():
-                     metadata = lecture_metadata.get(file_path, {})
-                     metadata["source_type"] = "lecture"
-                     self.document_metadata[-1].update(metadata)
-                 else:
-                     metadata = {"source_type": "other"}
-
-                 self.child_document_names.append(f"{file_name}_{page_num}")
-
-                 self.parent_document_names.append(file_name)
-                 if self.config["embedding_options"]["db_option"] not in ["RAGatouille"]:
-                     document_chunks = self.process_chunks(
-                         self.documents[-1],
-                         file_type,
-                         source=file_path,
-                         page=page_num,
-                         metadata=metadata,
-                     )
-                     self.document_chunks_full.extend(document_chunks)
-
-             # except Exception as e:
-             #     logger.error(f"Error processing file {file_name}: {str(e)}")
-
-         self.process_weblinks(file_reader, weblinks)
-
-         logger.info(
-             f"Total document chunks extracted: {len(self.document_chunks_full)}"
-         )
-         return (
-             self.document_chunks_full,
-             self.child_document_names,
-             self.documents,
-             self.document_metadata,
-         )
-
-     def process_weblinks(self, file_reader, weblinks):
-         if weblinks[0] != "":
-             logger.info(f"Splitting weblinks: total of {len(weblinks)}")
-
-             for link_index, link in enumerate(weblinks):
-                 try:
-                     logger.info(f"\tSplitting link {link_index + 1} : {link}")
-                     if "youtube" in link:
-                         documents = file_reader.read_youtube_transcript(link)
-                     else:
-                         documents = file_reader.read_html(link)
-                     print(f"Link: {link}")
-                     print(documents)
-                     for doc in documents:
-                         page_num = doc.metadata.get("page", 0)
-                         self.documents.append(doc.page_content)
-                         self.document_metadata.append(
-                             {"source": link, "page": page_num}
-                         )
-                         self.child_document_names.append(f"{link}")
-
-                         self.parent_document_names.append(link)
-                         if self.config["embedding_options"]["db_option"] not in [
-                             "RAGatouille"
-                         ]:
-                             document_chunks = self.process_chunks(
-                                 self.documents[-1],
-                                 "txt",
-                                 source=link,
-                                 page=0,
-                                 metadata={"source_type": "webpage"},
-                             )
-                             self.document_chunks_full.extend(document_chunks)
-                 except Exception as e:
-                     logger.error(
-                         f"Error splitting link {link_index + 1} : {link}: {str(e)}"
-                     )
-
-
- class DataLoader:
-     def __init__(self, config):
-         if config["llm_params"]["pdf_reader"] == "llama":
-             if LLAMA_CLOUD_API_KEY is None or OPENAI_API_KEY is None:
-                 raise ValueError(
-                     "Please set the LLAMA_CLOUD_API_KEY and GPT4o_API_KEY environment variables"
-                 )
-
-         self.file_reader = FileReader(kind=config["llm_params"]["pdf_reader"])
-         self.chunk_processor = ChunkProcessor(config)
-
-     def get_chunks(self, uploaded_files, weblinks):
-         return self.chunk_processor.get_chunks(
-             self.file_reader, uploaded_files, weblinks
-         )
-
-
- if __name__ == "__main__":
-     # read the config.yml file
-     import yaml
-     import os
-     BASE_DIR = os.path.dirname(os.path.abspath(__file__))
-
-     with open(os.path.join(BASE_DIR, "../", "config.yml"), "r") as f:
-         config = yaml.safe_load(f)
-
-     # create ChunkProcessor and FileReader instances
-     chunk_processor = ChunkProcessor(config)
-     file_reader = FileReader(kind=config["llm_params"]["pdf_reader"])
-
-     weblinks = ["https://dl4ds.github.io/sp2024/"]
-
-     uploaded_files = []
-
-     # get document chunks
-     document_chunks, child_document_names, documents, document_metadata = chunk_processor.get_chunks(
-         file_reader, uploaded_files, weblinks
-     )
-
-     print(document_chunks)
 
code/modules/dataloader/data_loader.py CHANGED
@@ -102,7 +102,6 @@ class LlamaParser:

    async def _parse(self, pdf_path):
        job_id, status = self.make_request(pdf_path)
-         # print(f"Job ID: {job_id}", f"Status: {status}")

        while status != "SUCCESS":
            url = f"https://api.cloud.llamaindex.ai/api/parsing/job/{job_id}"
@@ -398,7 +397,6 @@ class ChunkProcessor:
        local_path = FileReader.download_pdf_from_url(pdf_url=file_path)

        if file_name in self.document_data:
-             print(f"File {file_name} already processed")
            return

        file_type = file_name.split(".")[-1].lower()
 
code/modules/dataloader/webpage_crawler.py CHANGED
@@ -66,7 +66,6 @@ class WebpageCrawler:
        )
        for link in unchecked_links:
            dict_links[link] = "Checked"
-             print(f"Checked: {link}")
        dict_links.update(
            {
                link: "Not-checked"
 