Farid Karimli committed on
Commit
638bffe
·
1 Parent(s): 9a544d2

LLaMa parser fix

Browse files
code/.chainlit/config.toml CHANGED
@@ -23,7 +23,7 @@ allow_origins = ["*"]
23
  unsafe_allow_html = false
24
 
25
  # Process and display mathematical expressions. This can clash with "$" characters in messages.
26
- latex = false
27
 
28
  # Automatically tag threads with the current chat profile (if a chat profile is used)
29
  auto_tag_thread = true
@@ -85,31 +85,34 @@ custom_meta_image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/f/
85
  # custom_build = "./public/build"
86
 
87
  [UI.theme]
88
- default = "light"
89
  #layout = "wide"
90
  #font_family = "Inter, sans-serif"
91
  # Override default MUI light theme. (Check theme.ts)
92
  [UI.theme.light]
93
- background = "#FAFAFA"
94
- paper = "#FFFFFF"
95
 
96
  [UI.theme.light.primary]
97
- main = "#b22222" # Brighter shade of red
98
- dark = "#8b0000" # Darker shade of the brighter red
99
- light = "#ff6347" # Lighter shade of the brighter red
100
  [UI.theme.light.text]
101
- primary = "#212121"
102
- secondary = "#616161"
 
103
  # Override default MUI dark theme. (Check theme.ts)
104
  [UI.theme.dark]
105
- background = "#1C1C1C" # Slightly lighter dark background color
106
- paper = "#2A2A2A" # Slightly lighter dark paper color
107
 
108
  [UI.theme.dark.primary]
109
- main = "#89CFF0" # Primary color
110
- dark = "#3700B3" # Dark variant of primary color
111
- light = "#CFBCFF" # Lighter variant of primary color
112
-
 
 
113
 
114
  [meta]
115
- generated_by = "1.1.302"
 
23
  unsafe_allow_html = false
24
 
25
  # Process and display mathematical expressions. This can clash with "$" characters in messages.
26
+ latex = true
27
 
28
  # Automatically tag threads with the current chat profile (if a chat profile is used)
29
  auto_tag_thread = true
 
85
  # custom_build = "./public/build"
86
 
87
  [UI.theme]
88
+ default = "dark"
89
  #layout = "wide"
90
  #font_family = "Inter, sans-serif"
91
  # Override default MUI light theme. (Check theme.ts)
92
  [UI.theme.light]
93
+ #background = "#FAFAFA"
94
+ #paper = "#FFFFFF"
95
 
96
  [UI.theme.light.primary]
97
+ #main = "#F80061"
98
+ #dark = "#980039"
99
+ #light = "#FFE7EB"
100
  [UI.theme.light.text]
101
+ #primary = "#212121"
102
+ #secondary = "#616161"
103
+
104
  # Override default MUI dark theme. (Check theme.ts)
105
  [UI.theme.dark]
106
+ #background = "#FAFAFA"
107
+ #paper = "#FFFFFF"
108
 
109
  [UI.theme.dark.primary]
110
+ #main = "#F80061"
111
+ #dark = "#980039"
112
+ #light = "#FFE7EB"
113
+ [UI.theme.dark.text]
114
+ #primary = "#EEEEEE"
115
+ #secondary = "#BDBDBD"
116
 
117
  [meta]
118
+ generated_by = "1.1.304"
code/main.py CHANGED
@@ -173,4 +173,6 @@ async def main(message):
173
  answer_with_sources, source_elements, sources_dict = get_sources(res, answer)
174
  processor._process(message.content, answer, sources_dict)
175
 
 
 
176
  await cl.Message(content=answer_with_sources, elements=source_elements).send()
 
173
  answer_with_sources, source_elements, sources_dict = get_sources(res, answer)
174
  processor._process(message.content, answer, sources_dict)
175
 
176
+ answer_with_sources = answer_with_sources.replace("$$", "$")
177
+
178
  await cl.Message(content=answer_with_sources, elements=source_elements).send()
code/modules/config/config.yml CHANGED
@@ -7,7 +7,7 @@ vectorstore:
7
  data_path: '../storage/data' # str
8
  url_file_path: '../storage/data/urls.txt' # str
9
  expand_urls: True # bool
10
- db_option : 'FAISS' # str [FAISS, Chroma, RAGatouille, RAPTOR]
11
  db_path : '../vectorstores' # str
12
  model : 'sentence-transformers/all-MiniLM-L6-v2' # str [sentence-transformers/all-MiniLM-L6-v2, text-embedding-ada-002]
13
  search_top_k : 3 # int
@@ -32,7 +32,7 @@ llm_params:
32
  local_llm_params:
33
  model: 'tiny-llama'
34
  temperature: 0.7
35
- pdf_reader: 'llama' # str [llama, pymupdf]
36
 
37
  chat_logging:
38
  log_chat: False # bool
 
7
  data_path: '../storage/data' # str
8
  url_file_path: '../storage/data/urls.txt' # str
9
  expand_urls: True # bool
10
+ db_option : 'RAGatouille' # str [FAISS, Chroma, RAGatouille, RAPTOR]
11
  db_path : '../vectorstores' # str
12
  model : 'sentence-transformers/all-MiniLM-L6-v2' # str [sentence-transformers/all-MiniLM-L6-v2, text-embedding-ada-002]
13
  search_top_k : 3 # int
 
32
  local_llm_params:
33
  model: 'tiny-llama'
34
  temperature: 0.7
35
+ pdf_reader: 'llama' # str [llama, pymupdf, gpt]
36
 
37
  chat_logging:
38
  log_chat: False # bool
code/modules/config/constants.py CHANGED
@@ -15,7 +15,9 @@ opening_message = f"Hey, What Can I Help You With?\n\nYou can me ask me question
15
  # Prompt Templates
16
 
17
  openai_prompt_template = """Use the following pieces of information to answer the user's question.
18
- You are an intelligent chatbot designed to help students with questions regarding the course. Render math equations in LaTeX format between $$ signs, and explain the parameters and variables in the equations.
 
 
19
  If you don't know the answer, just say that you don't know.
20
 
21
  Context: {context}
@@ -26,8 +28,9 @@ Helpful answer:
26
  """
27
 
28
  openai_prompt_template_with_history = """Use the following pieces of information to answer the user's question.
29
- You are an intelligent chatbot designed to help students with questions regarding the course. Render math equations in LaTeX format between $$ signs.
30
-
 
31
  If you don't know the answer, just say that you don't know, don't try to make up an answer.
32
 
33
  Use the history to answer the question if you can.
 
15
  # Prompt Templates
16
 
17
  openai_prompt_template = """Use the following pieces of information to answer the user's question.
18
+ You are an intelligent chatbot designed to help students with questions regarding the course.
19
+ Render math equations in LaTeX format between $ or $$ signs, stick to the parameter and variable icons found in your context.
20
+ Be sure to explain the parameters and variables in the equations.
21
  If you don't know the answer, just say that you don't know.
22
 
23
  Context: {context}
 
28
  """
29
 
30
  openai_prompt_template_with_history = """Use the following pieces of information to answer the user's question.
31
+ You are an intelligent chatbot designed to help students with questions regarding the course.
32
+ Render math equations in LaTeX format between $ or $$ signs, stick to the parameter and variable icons found in your context.
33
+ Be sure to explain the parameters and variables in the equations.
34
  If you don't know the answer, just say that you don't know, don't try to make up an answer.
35
 
36
  Use the history to answer the question if you can.
code/modules/dataloader/data_loader.py CHANGED
@@ -27,12 +27,10 @@ import tempfile
27
  import PyPDF2
28
 
29
  try:
30
- from modules.dataloader.helpers import get_metadata
31
  from modules.config.constants import OPENAI_API_KEY, LLAMA_CLOUD_API_KEY
32
-
33
-
34
  except:
35
- from dataloader.helpers import get_metadata
36
  from config.constants import OPENAI_API_KEY, LLAMA_CLOUD_API_KEY
37
 
38
  logger = logging.getLogger(__name__)
@@ -51,6 +49,7 @@ class PDFReader:
51
 
52
  class LlamaParser:
53
  def __init__(self):
 
54
  self.GPT_API_KEY = OPENAI_API_KEY
55
  self.LLAMA_CLOUD_API_KEY = LLAMA_CLOUD_API_KEY
56
  self.parse_url = "https://api.cloud.llamaindex.ai/api/parsing/upload"
@@ -65,16 +64,30 @@ class LlamaParser:
65
  language="en",
66
  gpt4o_mode=False,
67
  # gpt4o_api_key=OPENAI_API_KEY,
68
- parsing_instruction="The provided documents are PDFs of lecture slides of deep learning material. They contain LaTeX equations, images, and text. The goal is to extract the text, images and equations from the slides and convert them to markdown format. The markdown should be clean and easy to read, and any math equation should be converted to LaTeX, between $$. For images, give a description and if you can, a source."
69
  )
70
 
71
  def parse(self, pdf_path):
72
  pdf_name = os.path.basename(pdf_path)
73
 
 
 
 
 
74
  documents = self.parser.load_data(pdf_path)
75
- documents = [document.to_langchain_format() for document in documents]
 
 
 
 
 
 
 
 
 
 
 
76
 
77
- os.remove(pdf_path) # cleanup, just in case
78
  return documents
79
 
80
  def make_request(self, pdf_url):
@@ -186,18 +199,6 @@ class FileReader:
186
  text += page.extract_text()
187
  return text
188
 
189
- @staticmethod
190
- def download_pdf_from_url(pdf_url):
191
- response = requests.get(pdf_url)
192
- if response.status_code == 200:
193
- with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
194
- temp_file.write(response.content)
195
- temp_file_path = temp_file.name
196
- return temp_file_path
197
- else:
198
- self.logger.error(f"Failed to download PDF from URL: {pdf_url}")
199
- return None
200
-
201
  def read_pdf(self, temp_file_path: str):
202
  if self.kind == "llama":
203
  documents = self.pdf_reader.parse(temp_file_path) # asyncio.run(self.pdf_reader.parse(temp_file_path)) if using async
@@ -383,22 +384,17 @@ class ChunkProcessor:
383
  )
384
  self.document_chunks_full.extend(document_chunks)
385
 
 
386
  self.document_data[file_path] = file_data
387
  self.document_metadata[file_path] = file_metadata
388
 
389
  def process_file(self, file_path, file_index, file_reader, addl_metadata):
390
  file_name = os.path.basename(file_path)
391
- storage_dir = os.path.join(os.getcwd(), self.config["vectorstore"]["data_path"])
392
- local_path = os.path.join(storage_dir, file_name)
393
-
394
- if not os.path.exists(local_path):
395
- local_path = FileReader.download_pdf_from_url(pdf_url=file_path)
396
 
397
  if file_name in self.document_data:
398
  return
399
 
400
- file_type = file_name.split(".")[-1].lower()
401
- self.logger.info(f"Reading file {file_index + 1}: {local_path}")
402
 
403
  read_methods = {
404
  "pdf": file_reader.read_pdf,
@@ -412,9 +408,10 @@ class ChunkProcessor:
412
  return
413
 
414
  try:
415
- documents = read_methods[file_type](local_path)
 
416
  self.process_documents(
417
- documents, local_path, file_type, "file", addl_metadata
418
  )
419
  except Exception as e:
420
  self.logger.error(f"Error processing file {file_name}: {str(e)}")
@@ -500,10 +497,11 @@ if __name__ == "__main__":
500
  data_loader = DataLoader(config, logger=logger)
501
  document_chunks, document_names, documents, document_metadata = (
502
  data_loader.get_chunks(
503
- uploaded_files,
504
- ["https://dl4ds.github.io/sp2024/"],
505
  )
506
  )
507
 
508
- print(document_names)
509
  print(len(document_chunks))
 
 
27
  import PyPDF2
28
 
29
  try:
30
+ from modules.dataloader.helpers import get_metadata, download_pdf_from_url
31
  from modules.config.constants import OPENAI_API_KEY, LLAMA_CLOUD_API_KEY
 
 
32
  except:
33
+ from dataloader.helpers import get_metadata, download_pdf_from_url
34
  from config.constants import OPENAI_API_KEY, LLAMA_CLOUD_API_KEY
35
 
36
  logger = logging.getLogger(__name__)
 
49
 
50
  class LlamaParser:
51
  def __init__(self):
52
+ print("Initializing LlamaParser")
53
  self.GPT_API_KEY = OPENAI_API_KEY
54
  self.LLAMA_CLOUD_API_KEY = LLAMA_CLOUD_API_KEY
55
  self.parse_url = "https://api.cloud.llamaindex.ai/api/parsing/upload"
 
64
  language="en",
65
  gpt4o_mode=False,
66
  # gpt4o_api_key=OPENAI_API_KEY,
67
+ parsing_instruction="The provided documents are PDFs of lecture slides of deep learning material. They contain LaTeX equations, images, and text. The goal is to extract the text, images and equations from the slides. The markdown should be clean and easy to read, and any math equation should be converted to LaTeX format, between $ signs. For images, if you can, give a description and a source."
68
  )
69
 
70
  def parse(self, pdf_path):
71
  pdf_name = os.path.basename(pdf_path)
72
 
73
+ if not os.path.exists(pdf_path):
74
+ logger.warning(f"File {pdf_name} does not exist locally, installing temporarily...")
75
+ pdf_path = download_pdf_from_url(pdf_path)
76
+
77
  documents = self.parser.load_data(pdf_path)
78
+ document = [document.to_langchain_format() for document in documents][0]
79
+
80
+ content = document.page_content
81
+ pages = content.split("\n---\n")
82
+ pages = [page.strip() for page in pages]
83
+
84
+ documents = [
85
+ Document(
86
+ page_content=page,
87
+ metadata={"source": pdf_path, "page": i}
88
+ ) for i, page in enumerate(pages)
89
+ ]
90
 
 
91
  return documents
92
 
93
  def make_request(self, pdf_url):
 
199
  text += page.extract_text()
200
  return text
201
 
 
 
 
 
 
 
 
 
 
 
 
 
202
  def read_pdf(self, temp_file_path: str):
203
  if self.kind == "llama":
204
  documents = self.pdf_reader.parse(temp_file_path) # asyncio.run(self.pdf_reader.parse(temp_file_path)) if using async
 
384
  )
385
  self.document_chunks_full.extend(document_chunks)
386
 
387
+ print(f"Processed {file_path}. File_data: {file_data}")
388
  self.document_data[file_path] = file_data
389
  self.document_metadata[file_path] = file_metadata
390
 
391
  def process_file(self, file_path, file_index, file_reader, addl_metadata):
392
  file_name = os.path.basename(file_path)
 
 
 
 
 
393
 
394
  if file_name in self.document_data:
395
  return
396
 
397
+ file_type = file_name.split(".")[-1]
 
398
 
399
  read_methods = {
400
  "pdf": file_reader.read_pdf,
 
408
  return
409
 
410
  try:
411
+ documents = read_methods[file_type](file_path)
412
+
413
  self.process_documents(
414
+ documents, file_path, file_type, "file", addl_metadata
415
  )
416
  except Exception as e:
417
  self.logger.error(f"Error processing file {file_name}: {str(e)}")
 
497
  data_loader = DataLoader(config, logger=logger)
498
  document_chunks, document_names, documents, document_metadata = (
499
  data_loader.get_chunks(
500
+ ["https://dl4ds.github.io/sp2024/static_files/lectures/05_loss_functions_v2.pdf"],
501
+ [],
502
  )
503
  )
504
 
505
+ print(document_names[:5])
506
  print(len(document_chunks))
507
+
code/modules/dataloader/helpers.py CHANGED
@@ -1,7 +1,7 @@
1
  import requests
2
  from bs4 import BeautifulSoup
3
- from tqdm import tqdm
4
-
5
 
6
  def get_urls_from_file(file_path: str):
7
  """
@@ -106,3 +106,23 @@ def get_metadata(lectures_url, schedule_url):
106
  continue
107
 
108
  return lecture_metadata
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import requests
2
  from bs4 import BeautifulSoup
3
+ from urllib.parse import urlparse
4
+ import tempfile
5
 
6
  def get_urls_from_file(file_path: str):
7
  """
 
106
  continue
107
 
108
  return lecture_metadata
109
+
110
+
111
+ def download_pdf_from_url(pdf_url):
112
+ """
113
+ Function to temporarily download a PDF file from a URL and return the local file path.
114
+
115
+ Args:
116
+ pdf_url (str): The URL of the PDF file to download.
117
+
118
+ Returns:
119
+ str: The local file path of the downloaded PDF file, or None if the response status code is not 200.
120
+ """
121
+ response = requests.get(pdf_url)
122
+ if response.status_code == 200:
123
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
124
+ temp_file.write(response.content)
125
+ temp_file_path = temp_file.name
126
+ return temp_file_path
127
+ else:
128
+ return None