XThomasBU
committed on
Commit
·
b2c9100
1
Parent(s):
f0018f2
Added 'date' to metadata
Browse files
- code/modules/data_loader.py +2 -1
- code/modules/helpers.py +39 -6
code/modules/data_loader.py
CHANGED
@@ -180,7 +180,8 @@ class ChunkProcessor:
|
|
180 |
self.document_metadata = []
|
181 |
|
182 |
lecture_metadata = get_lecture_metadata(
|
183 |
-
"https://dl4ds.github.io/sp2024/lectures/"
|
|
|
184 |
) # TODO: Use more efficiently
|
185 |
|
186 |
for file_index, file_path in enumerate(uploaded_files):
|
|
|
180 |
self.document_metadata = []
|
181 |
|
182 |
lecture_metadata = get_lecture_metadata(
|
183 |
+
"https://dl4ds.github.io/sp2024/lectures/",
|
184 |
+
"https://dl4ds.github.io/sp2024/schedule/",
|
185 |
) # TODO: Use more efficiently
|
186 |
|
187 |
for file_index, file_path in enumerate(uploaded_files):
|
code/modules/helpers.py
CHANGED
@@ -152,6 +152,7 @@ def get_sources(res, answer):
|
|
152 |
lecture_tldr = source_metadata.get("tldr", "N/A")
|
153 |
lecture_recording = source_metadata.get("lecture_recording", "N/A")
|
154 |
suggested_readings = source_metadata.get("suggested_readings", "N/A")
|
|
|
155 |
|
156 |
source_type = source_metadata.get("source_type", "N/A")
|
157 |
|
@@ -165,6 +166,7 @@ def get_sources(res, answer):
|
|
165 |
"lecture_tldr": lecture_tldr,
|
166 |
"lecture_recording": lecture_recording,
|
167 |
"suggested_readings": suggested_readings,
|
|
|
168 |
"source_type": source_type,
|
169 |
}
|
170 |
else:
|
@@ -206,6 +208,7 @@ def get_sources(res, answer):
|
|
206 |
full_answer += f"\nSource: {source_data['url']}\n"
|
207 |
full_answer += f"Page: {source_data['page']}\n"
|
208 |
full_answer += f"Type: {source_data['source_type']}\n"
|
|
|
209 |
full_answer += f"TL;DR: {source_data['lecture_tldr']}\n"
|
210 |
full_answer += f"Lecture Recording: {source_data['lecture_recording']}\n"
|
211 |
full_answer += f"Suggested Readings: {source_data['suggested_readings']}\n"
|
@@ -213,18 +216,42 @@ def get_sources(res, answer):
|
|
213 |
return full_answer, source_elements
|
214 |
|
215 |
|
216 |
-
def get_lecture_metadata(schedule_url):
|
217 |
"""
|
218 |
-
Function to get the lecture metadata from the schedule
|
219 |
"""
|
220 |
lecture_metadata = {}
|
221 |
|
|
|
|
|
|
|
|
|
222 |
# Get the main schedule page content
|
223 |
-
|
224 |
-
|
225 |
|
226 |
# Find all lecture blocks
|
227 |
-
lecture_blocks =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
228 |
|
229 |
for block in lecture_blocks:
|
230 |
try:
|
@@ -237,6 +264,9 @@ def get_lecture_metadata(schedule_url):
|
|
237 |
# Extract the link to the slides
|
238 |
slides_link_tag = block.find("a", title="Download slides")
|
239 |
slides_link = slides_link_tag["href"].strip() if slides_link_tag else None
|
|
|
|
|
|
|
240 |
|
241 |
# Extract the link to the lecture recording
|
242 |
recording_link_tag = block.find("a", title="Download lecture recording")
|
@@ -257,9 +287,12 @@ def get_lecture_metadata(schedule_url):
|
|
257 |
else:
|
258 |
suggested_readings = "No specific readings provided."
|
259 |
|
|
|
|
|
|
|
260 |
# Add to the dictionary
|
261 |
-
slides_link = f"https://dl4ds.github.io{slides_link}"
|
262 |
lecture_metadata[slides_link] = {
|
|
|
263 |
"tldr": tldr,
|
264 |
"title": title,
|
265 |
"lecture_recording": recording_link,
|
|
|
152 |
lecture_tldr = source_metadata.get("tldr", "N/A")
|
153 |
lecture_recording = source_metadata.get("lecture_recording", "N/A")
|
154 |
suggested_readings = source_metadata.get("suggested_readings", "N/A")
|
155 |
+
date = source_metadata.get("date", "N/A")
|
156 |
|
157 |
source_type = source_metadata.get("source_type", "N/A")
|
158 |
|
|
|
166 |
"lecture_tldr": lecture_tldr,
|
167 |
"lecture_recording": lecture_recording,
|
168 |
"suggested_readings": suggested_readings,
|
169 |
+
"date": date,
|
170 |
"source_type": source_type,
|
171 |
}
|
172 |
else:
|
|
|
208 |
full_answer += f"\nSource: {source_data['url']}\n"
|
209 |
full_answer += f"Page: {source_data['page']}\n"
|
210 |
full_answer += f"Type: {source_data['source_type']}\n"
|
211 |
+
full_answer += f"Date: {source_data['date']}\n"
|
212 |
full_answer += f"TL;DR: {source_data['lecture_tldr']}\n"
|
213 |
full_answer += f"Lecture Recording: {source_data['lecture_recording']}\n"
|
214 |
full_answer += f"Suggested Readings: {source_data['suggested_readings']}\n"
|
|
|
216 |
return full_answer, source_elements
|
217 |
|
218 |
|
219 |
+
def get_lecture_metadata(lectures_url, schedule_url):
|
220 |
"""
|
221 |
+
Function to get the lecture metadata from the lectures and schedule URLs.
|
222 |
"""
|
223 |
lecture_metadata = {}
|
224 |
|
225 |
+
# Get the main lectures page content
|
226 |
+
r_lectures = requests.get(lectures_url)
|
227 |
+
soup_lectures = BeautifulSoup(r_lectures.text, "html.parser")
|
228 |
+
|
229 |
# Get the main schedule page content
|
230 |
+
r_schedule = requests.get(schedule_url)
|
231 |
+
soup_schedule = BeautifulSoup(r_schedule.text, "html.parser")
|
232 |
|
233 |
# Find all lecture blocks
|
234 |
+
lecture_blocks = soup_lectures.find_all("div", class_="lecture-container")
|
235 |
+
|
236 |
+
# Create a mapping from slides link to date
|
237 |
+
date_mapping = {}
|
238 |
+
schedule_rows = soup_schedule.find_all("li", class_="table-row-lecture")
|
239 |
+
for row in schedule_rows:
|
240 |
+
try:
|
241 |
+
date = (
|
242 |
+
row.find("div", {"data-label": "Date"}).get_text(separator=" ").strip()
|
243 |
+
)
|
244 |
+
description_div = row.find("div", {"data-label": "Description"})
|
245 |
+
slides_link_tag = description_div.find("a", title="Download slides")
|
246 |
+
slides_link = slides_link_tag["href"].strip() if slides_link_tag else None
|
247 |
+
slides_link = (
|
248 |
+
f"https://dl4ds.github.io{slides_link}" if slides_link else None
|
249 |
+
)
|
250 |
+
if slides_link:
|
251 |
+
date_mapping[slides_link] = date
|
252 |
+
except Exception as e:
|
253 |
+
print(f"Error processing schedule row: {e}")
|
254 |
+
continue
|
255 |
|
256 |
for block in lecture_blocks:
|
257 |
try:
|
|
|
264 |
# Extract the link to the slides
|
265 |
slides_link_tag = block.find("a", title="Download slides")
|
266 |
slides_link = slides_link_tag["href"].strip() if slides_link_tag else None
|
267 |
+
slides_link = (
|
268 |
+
f"https://dl4ds.github.io{slides_link}" if slides_link else None
|
269 |
+
)
|
270 |
|
271 |
# Extract the link to the lecture recording
|
272 |
recording_link_tag = block.find("a", title="Download lecture recording")
|
|
|
287 |
else:
|
288 |
suggested_readings = "No specific readings provided."
|
289 |
|
290 |
+
# Get the date from the schedule
|
291 |
+
date = date_mapping.get(slides_link, "No date available")
|
292 |
+
|
293 |
# Add to the dictionary
|
|
|
294 |
lecture_metadata[slides_link] = {
|
295 |
+
"date": date,
|
296 |
"tldr": tldr,
|
297 |
"title": title,
|
298 |
"lecture_recording": recording_link,
|