XThomasBU committed on
Commit 30045eb · 2 Parent(s): 1d7972f 0958f93

Merge pull request #39 from DL4DS/text_extraction


SCRUM-57: LlamaParser patch, DataLoader restructure, and more

code/.chainlit/config.toml CHANGED
@@ -23,7 +23,7 @@ allow_origins = ["*"]
 unsafe_allow_html = false

 # Process and display mathematical expressions. This can clash with "$" characters in messages.
-latex = false
+latex = true

 # Automatically tag threads with the current chat profile (if a chat profile is used)
 auto_tag_thread = true
@@ -85,31 +85,34 @@ custom_meta_image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/f/
 # custom_build = "./public/build"

 [UI.theme]
-default = "light"
+default = "dark"
 #layout = "wide"
 #font_family = "Inter, sans-serif"
 # Override default MUI light theme. (Check theme.ts)
 [UI.theme.light]
-background = "#FAFAFA"
-paper = "#FFFFFF"
+#background = "#FAFAFA"
+#paper = "#FFFFFF"

 [UI.theme.light.primary]
-main = "#b22222" # Brighter shade of red
-dark = "#8b0000" # Darker shade of the brighter red
-light = "#ff6347" # Lighter shade of the brighter red
+#main = "#F80061"
+#dark = "#980039"
+#light = "#FFE7EB"
 [UI.theme.light.text]
-primary = "#212121"
-secondary = "#616161"
+#primary = "#212121"
+#secondary = "#616161"
+
 # Override default MUI dark theme. (Check theme.ts)
 [UI.theme.dark]
-background = "#1C1C1C" # Slightly lighter dark background color
-paper = "#2A2A2A" # Slightly lighter dark paper color
+#background = "#FAFAFA"
+#paper = "#FFFFFF"

 [UI.theme.dark.primary]
-main = "#89CFF0" # Primary color
-dark = "#3700B3" # Dark variant of primary color
-light = "#CFBCFF" # Lighter variant of primary color
-
+#main = "#F80061"
+#dark = "#980039"
+#light = "#FFE7EB"
+[UI.theme.dark.text]
+#primary = "#EEEEEE"
+#secondary = "#BDBDBD"

 [meta]
-generated_by = "1.1.302"
+generated_by = "1.1.304"
code/main.py CHANGED
@@ -173,4 +173,6 @@ async def main(message):
     answer_with_sources, source_elements, sources_dict = get_sources(res, answer)
     processor._process(message.content, answer, sources_dict)

+    answer_with_sources = answer_with_sources.replace("$$", "$")
+
     await cl.Message(content=answer_with_sources, elements=source_elements).send()
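
The added lines normalize math delimiters before the reply is sent: with latex = true now enabled in .chainlit/config.toml, collapsing "$$ ... $$" to "$ ... $" presumably matches what the Chainlit LaTeX renderer expects. A standalone sketch of the substitution on a made-up answer string (illustrative values only):

# Illustrative only: a hypothetical answer string with display-math delimiters.
answer_with_sources = r"The MSE loss is $$L = \frac{1}{N}\sum_{i}(y_i - \hat{y}_i)^2$$ (source [1])."

# Same normalization as the added line in main.py.
answer_with_sources = answer_with_sources.replace("$$", "$")

print(answer_with_sources)
# -> The MSE loss is $L = \frac{1}{N}\sum_{i}(y_i - \hat{y}_i)^2$ (source [1]).
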
code/modules/config/config.yml CHANGED
@@ -34,7 +34,7 @@ llm_params:
   local_llm_params:
     model: 'tiny-llama'
     temperature: 0.7
-  pdf_reader: 'llama' # str [llama, pymupdf]
+  pdf_reader: 'llama' # str [llama, pymupdf, gpt]

 chat_logging:
   log_chat: False # bool
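
The gpt option is only documented in the comment here; the readers actually added in this PR are the PyMuPDF-backed PDFReader and LlamaParser. A sketch of how the setting might be dispatched; the config key path and the factory function are assumptions for illustration, not code from this PR:

# Hypothetical factory keyed on the pdf_reader setting; the key path is an assumption.
from modules.dataloader.pdf_readers.base import PDFReader
from modules.dataloader.pdf_readers.llama import LlamaParser


def build_pdf_reader(config: dict):
    kind = config["llm_params"]["pdf_reader"]  # 'llama', 'pymupdf', or 'gpt'
    if kind == "llama":
        return LlamaParser()
    if kind == "pymupdf":
        return PDFReader()
    # 'gpt' appears in the comment, but no GPT-based reader is added in this PR.
    raise NotImplementedError(f"No PDF reader available for pdf_reader={kind!r}")
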
code/modules/config/constants.py CHANGED
@@ -15,7 +15,9 @@ opening_message = f"Hey, What Can I Help You With?\n\nYou can me ask me question
 # Prompt Templates

 openai_prompt_template = """Use the following pieces of information to answer the user's question.
-You are an intelligent chatbot designed to help students with questions regarding the course. Render math equations in LaTeX format between $$ signs, and explain the parameters and variables in the equations.
+You are an intelligent chatbot designed to help students with questions regarding the course.
+Render math equations in LaTeX format between $ or $$ signs, stick to the parameter and variable icons found in your context.
+Be sure to explain the parameters and variables in the equations.
 If you don't know the answer, just say that you don't know.

 Context: {context}
@@ -26,8 +28,9 @@ Helpful answer:
 """

 openai_prompt_template_with_history = """Use the following pieces of information to answer the user's question.
-You are an intelligent chatbot designed to help students with questions regarding the course. Render math equations in LaTeX format between $$ signs.
-
+You are an intelligent chatbot designed to help students with questions regarding the course.
+Render math equations in LaTeX format between $ or $$ signs, stick to the parameter and variable icons found in your context.
+Be sure to explain the parameters and variables in the equations.
 If you don't know the answer, just say that you don't know, don't try to make up an answer.

 Use the history to answer the question if you can.
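
The template shown keeps a {context} placeholder, and the "Helpful answer:" tail in the second hunk header suggests a {question} placeholder further down that is not shown here. Assuming only those two placeholders, the template can be filled with plain str.format; a minimal sketch with stand-in values:

# Sketch only: fill the template with stand-in retrieval output. Assumes the unshown
# tail of the template contains a {question} placeholder alongside {context}.
from modules.config.constants import openai_prompt_template

prompt = openai_prompt_template.format(
    context=r"Gradient descent: $\theta \leftarrow \theta - \eta \nabla_\theta L(\theta)$, where $\eta$ is the learning rate.",
    question="What role does the learning rate play in gradient descent?",
)
print(prompt)
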
code/modules/dataloader/data_loader.py CHANGED
@@ -25,101 +25,19 @@ import html2text
 import bs4
 import tempfile
 import PyPDF2
+from modules.dataloader.pdf_readers.base import PDFReader
+from modules.dataloader.pdf_readers.llama import LlamaParser

 try:
-    from modules.dataloader.helpers import get_metadata
+    from modules.dataloader.helpers import get_metadata, download_pdf_from_url
     from modules.config.constants import OPENAI_API_KEY, LLAMA_CLOUD_API_KEY
-
-
 except:
-    from dataloader.helpers import get_metadata
+    from dataloader.helpers import get_metadata, download_pdf_from_url
     from config.constants import OPENAI_API_KEY, LLAMA_CLOUD_API_KEY

 logger = logging.getLogger(__name__)
 BASE_DIR = os.getcwd()

-class PDFReader:
-    def __init__(self):
-        pass
-
-    def get_loader(self, pdf_path):
-        loader = PyMuPDFLoader(pdf_path)
-        return loader
-
-    def get_documents(self, loader):
-        return loader.load()
-
-class LlamaParser:
-    def __init__(self):
-        self.GPT_API_KEY = OPENAI_API_KEY
-        self.LLAMA_CLOUD_API_KEY = LLAMA_CLOUD_API_KEY
-        self.parse_url = "https://api.cloud.llamaindex.ai/api/parsing/upload"
-        self.headers = {
-            'Accept': 'application/json',
-            'Authorization': f'Bearer {LLAMA_CLOUD_API_KEY}'
-        }
-        self.parser = LlamaParse(
-            api_key=LLAMA_CLOUD_API_KEY,
-            result_type="markdown",
-            verbose=True,
-            language="en",
-            gpt4o_mode=False,
-            # gpt4o_api_key=OPENAI_API_KEY,
-            parsing_instruction="The provided documents are PDFs of lecture slides of deep learning material. They contain LaTeX equations, images, and text. The goal is to extract the text, images and equations from the slides and convert them to markdown format. The markdown should be clean and easy to read, and any math equation should be converted to LaTeX, between $$. For images, give a description and if you can, a source."
-        )
-
-    def parse(self, pdf_path):
-        pdf_name = os.path.basename(pdf_path)
-
-        documents = self.parser.load_data(pdf_path)
-        documents = [document.to_langchain_format() for document in documents]
-
-        os.remove(pdf_path) # cleanup, just in case
-        return documents
-
-    def make_request(self, pdf_url):
-        payload = {
-            "gpt4o_mode": "false",
-            "parsing_instruction": "The provided document is a PDF of lecture slides of deep learning material. They contain LaTeX equations, images, and text. The goal is to extract the text, images and equations from the slides and convert them to markdown format. The markdown should be clean and easy to read, and any math equation should be converted to LaTeX, between $$. For images, give a description and if you can, a source.",
-        }
-
-        files = [
-            ('file', ('file', requests.get(pdf_url).content, 'application/octet-stream'))
-        ]
-
-        response = requests.request(
-            "POST", self.parse_url, headers=self.headers, data=payload, files=files)
-
-        return response.json()['id'], response.json()['status']
-
-    async def get_result(self, job_id):
-        url = f"https://api.cloud.llamaindex.ai/api/parsing/job/{job_id}/result/markdown"
-
-        response = requests.request("GET", url, headers=self.headers, data={})
-
-        return response.json()['markdown']
-
-    async def _parse(self, pdf_path):
-        job_id, status = self.make_request(pdf_path)
-
-        while status != "SUCCESS":
-            url = f"https://api.cloud.llamaindex.ai/api/parsing/job/{job_id}"
-            response = requests.request("GET", url, headers=self.headers, data={})
-            status = response.json()["status"]
-
-        result = await self.get_result(job_id)
-
-        documents = [
-            Document(
-                page_content=result,
-                metadata={"source": pdf_path}
-            )
-        ]
-
-        return documents
-
-    async def _parse(self, pdf_path):
-        return await self._parse(pdf_path)

 class HTMLReader:
     def __init__(self):
@@ -186,18 +104,6 @@ class FileReader:
            text += page.extract_text()
        return text

-    @staticmethod
-    def download_pdf_from_url(pdf_url):
-        response = requests.get(pdf_url)
-        if response.status_code == 200:
-            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
-                temp_file.write(response.content)
-                temp_file_path = temp_file.name
-            return temp_file_path
-        else:
-            self.logger.error(f"Failed to download PDF from URL: {pdf_url}")
-            return None
-
     def read_pdf(self, temp_file_path: str):
         if self.kind == "llama":
             documents = self.pdf_reader.parse(temp_file_path) # asyncio.run(self.pdf_reader.parse(temp_file_path)) if using async
@@ -383,22 +289,17 @@ class ChunkProcessor:
         )
         self.document_chunks_full.extend(document_chunks)

+        print(f"Processed {file_path}. File_data: {file_data}")
         self.document_data[file_path] = file_data
         self.document_metadata[file_path] = file_metadata

     def process_file(self, file_path, file_index, file_reader, addl_metadata):
         file_name = os.path.basename(file_path)
-        storage_dir = os.path.join(os.getcwd(), self.config["vectorstore"]["data_path"])
-        local_path = os.path.join(storage_dir, file_name)
-
-        if not os.path.exists(local_path):
-            local_path = FileReader.download_pdf_from_url(pdf_url=file_path)

         if file_name in self.document_data:
             return

-        file_type = file_name.split(".")[-1].lower()
-        self.logger.info(f"Reading file {file_index + 1}: {local_path}")
+        file_type = file_name.split(".")[-1]

         read_methods = {
             "pdf": file_reader.read_pdf,
@@ -412,9 +313,10 @@
             return

         try:
-            documents = read_methods[file_type](local_path)
+            documents = read_methods[file_type](file_path)
+
             self.process_documents(
-                documents, local_path, file_type, "file", addl_metadata
+                documents, file_path, file_type, "file", addl_metadata
             )
         except Exception as e:
             self.logger.error(f"Error processing file {file_name}: {str(e)}")
@@ -500,10 +402,11 @@ if __name__ == "__main__":
     data_loader = DataLoader(config, logger=logger)
     document_chunks, document_names, documents, document_metadata = (
         data_loader.get_chunks(
-            uploaded_files,
-            ["https://dl4ds.github.io/sp2024/"],
+            ["https://dl4ds.github.io/sp2024/static_files/lectures/05_loss_functions_v2.pdf"],
+            [],
         )
     )

-    print(document_names)
+    print(document_names[:5])
     print(len(document_chunks))
+
code/modules/dataloader/helpers.py CHANGED
@@ -1,7 +1,7 @@
 import requests
 from bs4 import BeautifulSoup
-from tqdm import tqdm
-
+from urllib.parse import urlparse
+import tempfile

 def get_urls_from_file(file_path: str):
     """
@@ -106,3 +106,23 @@ def get_metadata(lectures_url, schedule_url):
             continue

     return lecture_metadata
+
+
+def download_pdf_from_url(pdf_url):
+    """
+    Function to temporarily download a PDF file from a URL and return the local file path.
+
+    Args:
+        pdf_url (str): The URL of the PDF file to download.
+
+    Returns:
+        str: The local file path of the downloaded PDF file.
+    """
+    response = requests.get(pdf_url)
+    if response.status_code == 200:
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
+            temp_file.write(response.content)
+            temp_file_path = temp_file.name
+        return temp_file_path
+    else:
+        return None
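
Because download_pdf_from_url uses NamedTemporaryFile(delete=False), the temporary file persists until the caller removes it (as the old LlamaParser.parse did with os.remove). A small usage sketch; the URL is the lecture PDF used in the data_loader __main__ block:

import os
from modules.dataloader.helpers import download_pdf_from_url

url = "https://dl4ds.github.io/sp2024/static_files/lectures/05_loss_functions_v2.pdf"
temp_path = download_pdf_from_url(url)  # local .pdf path, or None on a non-200 response

if temp_path is not None:
    print(f"Downloaded {os.path.getsize(temp_path)} bytes to {temp_path}")
    os.remove(temp_path)  # delete=False means cleanup is the caller's job
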
code/modules/dataloader/pdf_readers/base.py ADDED
@@ -0,0 +1,14 @@
+from langchain_community.document_loaders import PyMuPDFLoader
+
+
+class PDFReader:
+    def __init__(self):
+        pass
+
+    def get_loader(self, pdf_path):
+        loader = PyMuPDFLoader(pdf_path)
+        return loader
+
+    def parse(self, pdf_path):
+        loader = self.get_loader(pdf_path)
+        return loader.load()
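
PDFReader.parse simply delegates to LangChain's PyMuPDFLoader, which should yield roughly one Document per page with loader-populated metadata. A usage sketch with a placeholder path:

from modules.dataloader.pdf_readers.base import PDFReader

reader = PDFReader()
docs = reader.parse("lecture_slides.pdf")  # placeholder local path

print(len(docs))             # roughly one Document per page
print(docs[0].metadata)      # PyMuPDFLoader metadata (source, page, ...)
print(docs[0].page_content[:100])
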
code/modules/dataloader/pdf_readers/llama.py ADDED
@@ -0,0 +1,92 @@
+import os
+import requests
+from llama_parse import LlamaParse
+from langchain.schema import Document
+from modules.config.constants import OPENAI_API_KEY, LLAMA_CLOUD_API_KEY
+from modules.dataloader.helpers import download_pdf_from_url
+
+
+
+class LlamaParser:
+    def __init__(self):
+        self.GPT_API_KEY = OPENAI_API_KEY
+        self.LLAMA_CLOUD_API_KEY = LLAMA_CLOUD_API_KEY
+        self.parse_url = "https://api.cloud.llamaindex.ai/api/parsing/upload"
+        self.headers = {
+            'Accept': 'application/json',
+            'Authorization': f'Bearer {LLAMA_CLOUD_API_KEY}'
+        }
+        self.parser = LlamaParse(
+            api_key=LLAMA_CLOUD_API_KEY,
+            result_type="markdown",
+            verbose=True,
+            language="en",
+            gpt4o_mode=False,
+            # gpt4o_api_key=OPENAI_API_KEY,
+            parsing_instruction="The provided documents are PDFs of lecture slides of deep learning material. They contain LaTeX equations, images, and text. The goal is to extract the text, images and equations from the slides. The markdown should be clean and easy to read, and any math equation should be converted to LaTeX format, between $ signs. For images, if you can, give a description and a source."
+        )
+
+    def parse(self, pdf_path):
+        if not os.path.exists(pdf_path):
+            pdf_path = download_pdf_from_url(pdf_path)
+
+        documents = self.parser.load_data(pdf_path)
+        document = [document.to_langchain_format() for document in documents][0]
+
+        content = document.page_content
+        pages = content.split("\n---\n")
+        pages = [page.strip() for page in pages]
+
+        documents = [
+            Document(
+                page_content=page,
+                metadata={"source": pdf_path, "page": i}
+            ) for i, page in enumerate(pages)
+        ]
+
+        return documents
+
+    def make_request(self, pdf_url):
+        payload = {
+            "gpt4o_mode": "false",
+            "parsing_instruction": "The provided document is a PDF of lecture slides of deep learning material. They contain LaTeX equations, images, and text. The goal is to extract the text, images and equations from the slides and convert them to markdown format. The markdown should be clean and easy to read, and any math equation should be converted to LaTeX, between $$. For images, give a description and if you can, a source.",
+        }
+
+        files = [
+            ('file', ('file', requests.get(pdf_url).content, 'application/octet-stream'))
+        ]
+
+        response = requests.request(
+            "POST", self.parse_url, headers=self.headers, data=payload, files=files)
+
+        return response.json()['id'], response.json()['status']
+
+    async def get_result(self, job_id):
+        url = f"https://api.cloud.llamaindex.ai/api/parsing/job/{job_id}/result/markdown"
+
+        response = requests.request("GET", url, headers=self.headers, data={})
+
+        return response.json()['markdown']
+
+    async def _parse(self, pdf_path):
+        job_id, status = self.make_request(pdf_path)
+
+        while status != "SUCCESS":
+            url = f"https://api.cloud.llamaindex.ai/api/parsing/job/{job_id}"
+            response = requests.request("GET", url, headers=self.headers, data={})
+            status = response.json()["status"]
+
+        result = await self.get_result(job_id)
+
+        documents = [
+            Document(
+                page_content=result,
+                metadata={"source": pdf_path}
+            )
+        ]
+
+        return documents
+
+    async def _parse(self, pdf_path):
+        return await self._parse(pdf_path)
+
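
FileReader.read_pdf uses the synchronous LlamaParser.parse path above (the async _parse/make_request route is kept but appears unused here). parse accepts a local path or a URL, since non-existent paths are fetched with download_pdf_from_url, and it splits the returned markdown on "\n---\n" into per-page Documents. A usage sketch, assuming LLAMA_CLOUD_API_KEY is configured in modules/config/constants.py:

from modules.dataloader.pdf_readers.llama import LlamaParser

parser = LlamaParser()  # reads LLAMA_CLOUD_API_KEY via modules.config.constants

# A URL works as well as a local path; the lecture PDF below is the one used in
# the data_loader __main__ block.
docs = parser.parse("https://dl4ds.github.io/sp2024/static_files/lectures/05_loss_functions_v2.pdf")

for doc in docs[:3]:
    print(doc.metadata["page"], doc.page_content[:60].replace("\n", " "))
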