XThomasBU committed on
Commit
b2c9100
·
1 Parent(s): f0018f2

Added 'date' to metadata

Browse files
code/modules/data_loader.py CHANGED
@@ -180,7 +180,8 @@ class ChunkProcessor:
180
  self.document_metadata = []
181
 
182
  lecture_metadata = get_lecture_metadata(
183
- "https://dl4ds.github.io/sp2024/lectures/"
 
184
  ) # TODO: Use more efficiently
185
 
186
  for file_index, file_path in enumerate(uploaded_files):
 
180
  self.document_metadata = []
181
 
182
  lecture_metadata = get_lecture_metadata(
183
+ "https://dl4ds.github.io/sp2024/lectures/",
184
+ "https://dl4ds.github.io/sp2024/schedule/",
185
  ) # TODO: Use more efficiently
186
 
187
  for file_index, file_path in enumerate(uploaded_files):
code/modules/helpers.py CHANGED
@@ -152,6 +152,7 @@ def get_sources(res, answer):
152
  lecture_tldr = source_metadata.get("tldr", "N/A")
153
  lecture_recording = source_metadata.get("lecture_recording", "N/A")
154
  suggested_readings = source_metadata.get("suggested_readings", "N/A")
 
155
 
156
  source_type = source_metadata.get("source_type", "N/A")
157
 
@@ -165,6 +166,7 @@ def get_sources(res, answer):
165
  "lecture_tldr": lecture_tldr,
166
  "lecture_recording": lecture_recording,
167
  "suggested_readings": suggested_readings,
 
168
  "source_type": source_type,
169
  }
170
  else:
@@ -206,6 +208,7 @@ def get_sources(res, answer):
206
  full_answer += f"\nSource: {source_data['url']}\n"
207
  full_answer += f"Page: {source_data['page']}\n"
208
  full_answer += f"Type: {source_data['source_type']}\n"
 
209
  full_answer += f"TL;DR: {source_data['lecture_tldr']}\n"
210
  full_answer += f"Lecture Recording: {source_data['lecture_recording']}\n"
211
  full_answer += f"Suggested Readings: {source_data['suggested_readings']}\n"
@@ -213,18 +216,42 @@ def get_sources(res, answer):
213
  return full_answer, source_elements
214
 
215
 
216
- def get_lecture_metadata(schedule_url):
217
  """
218
- Function to get the lecture metadata from the schedule URL.
219
  """
220
  lecture_metadata = {}
221
 
 
 
 
 
222
  # Get the main schedule page content
223
- r = requests.get(schedule_url)
224
- soup = BeautifulSoup(r.text, "html.parser")
225
 
226
  # Find all lecture blocks
227
- lecture_blocks = soup.find_all("div", class_="lecture-container")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
228
 
229
  for block in lecture_blocks:
230
  try:
@@ -237,6 +264,9 @@ def get_lecture_metadata(schedule_url):
237
  # Extract the link to the slides
238
  slides_link_tag = block.find("a", title="Download slides")
239
  slides_link = slides_link_tag["href"].strip() if slides_link_tag else None
 
 
 
240
 
241
  # Extract the link to the lecture recording
242
  recording_link_tag = block.find("a", title="Download lecture recording")
@@ -257,9 +287,12 @@ def get_lecture_metadata(schedule_url):
257
  else:
258
  suggested_readings = "No specific readings provided."
259
 
 
 
 
260
  # Add to the dictionary
261
- slides_link = f"https://dl4ds.github.io{slides_link}"
262
  lecture_metadata[slides_link] = {
 
263
  "tldr": tldr,
264
  "title": title,
265
  "lecture_recording": recording_link,
 
152
  lecture_tldr = source_metadata.get("tldr", "N/A")
153
  lecture_recording = source_metadata.get("lecture_recording", "N/A")
154
  suggested_readings = source_metadata.get("suggested_readings", "N/A")
155
+ date = source_metadata.get("date", "N/A")
156
 
157
  source_type = source_metadata.get("source_type", "N/A")
158
 
 
166
  "lecture_tldr": lecture_tldr,
167
  "lecture_recording": lecture_recording,
168
  "suggested_readings": suggested_readings,
169
+ "date": date,
170
  "source_type": source_type,
171
  }
172
  else:
 
208
  full_answer += f"\nSource: {source_data['url']}\n"
209
  full_answer += f"Page: {source_data['page']}\n"
210
  full_answer += f"Type: {source_data['source_type']}\n"
211
+ full_answer += f"Date: {source_data['date']}\n"
212
  full_answer += f"TL;DR: {source_data['lecture_tldr']}\n"
213
  full_answer += f"Lecture Recording: {source_data['lecture_recording']}\n"
214
  full_answer += f"Suggested Readings: {source_data['suggested_readings']}\n"
 
216
  return full_answer, source_elements
217
 
218
 
219
+ def get_lecture_metadata(lectures_url, schedule_url):
220
  """
221
+ Function to get the lecture metadata from the lectures and schedule URLs.
222
  """
223
  lecture_metadata = {}
224
 
225
+ # Get the main lectures page content
226
+ r_lectures = requests.get(lectures_url)
227
+ soup_lectures = BeautifulSoup(r_lectures.text, "html.parser")
228
+
229
  # Get the main schedule page content
230
+ r_schedule = requests.get(schedule_url)
231
+ soup_schedule = BeautifulSoup(r_schedule.text, "html.parser")
232
 
233
  # Find all lecture blocks
234
+ lecture_blocks = soup_lectures.find_all("div", class_="lecture-container")
235
+
236
+ # Create a mapping from slides link to date
237
+ date_mapping = {}
238
+ schedule_rows = soup_schedule.find_all("li", class_="table-row-lecture")
239
+ for row in schedule_rows:
240
+ try:
241
+ date = (
242
+ row.find("div", {"data-label": "Date"}).get_text(separator=" ").strip()
243
+ )
244
+ description_div = row.find("div", {"data-label": "Description"})
245
+ slides_link_tag = description_div.find("a", title="Download slides")
246
+ slides_link = slides_link_tag["href"].strip() if slides_link_tag else None
247
+ slides_link = (
248
+ f"https://dl4ds.github.io{slides_link}" if slides_link else None
249
+ )
250
+ if slides_link:
251
+ date_mapping[slides_link] = date
252
+ except Exception as e:
253
+ print(f"Error processing schedule row: {e}")
254
+ continue
255
 
256
  for block in lecture_blocks:
257
  try:
 
264
  # Extract the link to the slides
265
  slides_link_tag = block.find("a", title="Download slides")
266
  slides_link = slides_link_tag["href"].strip() if slides_link_tag else None
267
+ slides_link = (
268
+ f"https://dl4ds.github.io{slides_link}" if slides_link else None
269
+ )
270
 
271
  # Extract the link to the lecture recording
272
  recording_link_tag = block.find("a", title="Download lecture recording")
 
287
  else:
288
  suggested_readings = "No specific readings provided."
289
 
290
+ # Get the date from the schedule
291
+ date = date_mapping.get(slides_link, "No date available")
292
+
293
  # Add to the dictionary
 
294
  lecture_metadata[slides_link] = {
295
+ "date": date,
296
  "tldr": tldr,
297
  "title": title,
298
  "lecture_recording": recording_link,