|
import requests |
|
from bs4 import BeautifulSoup |
|
from urllib.parse import urlparse |
|
import tempfile |
|
|
|
def get_urls_from_file(file_path: str) -> list:
    """Read a list of URLs from a text file, one URL per line.

    Leading and trailing whitespace is stripped from every line. Lines that
    are empty after stripping are skipped, so callers never receive an empty
    string in place of a URL.

    Args:
        file_path (str): Path of the text file to read.

    Returns:
        list: The non-empty, whitespace-stripped lines, in file order.
    """
    with open(file_path, "r") as f:
        # Iterate the file lazily rather than materialising it with
        # readlines(); blank lines are dropped by the truthiness filter.
        stripped_lines = (line.strip() for line in f)
        return [url for url in stripped_lines if url]
|
|
|
|
|
def get_base_url(url):
    """Return the ``scheme://netloc/`` root of *url*.

    The path, query and fragment are discarded; only the scheme and network
    location reported by ``urlparse`` are kept, followed by a trailing slash.
    """
    parts = urlparse(url)
    return "{}://{}/".format(parts.scheme, parts.netloc)
|
|
|
|
|
def _slides_url_from(container):
    """Return the absolute slides URL linked inside *container*, or None.

    Looks for an ``<a title="Download slides">`` tag and prefixes its href
    with the course site origin. Both the schedule and the lectures page go
    through this helper so the URLs they produce are built identically and
    can be matched against each other as dictionary keys.
    """
    link_tag = container.find("a", title="Download slides")
    href = link_tag["href"].strip() if link_tag else None
    return f"https://dl4ds.github.io{href}" if href else None


def _schedule_dates(soup_schedule):
    """Map each slides URL found on the schedule page to its date text."""
    date_mapping = {}
    for row in soup_schedule.find_all("li", class_="table-row-lecture"):
        # Best effort: a malformed row is reported and skipped so it does not
        # prevent the remaining rows from being read.
        try:
            date = (
                row.find("div", {"data-label": "Date"}).get_text(separator=" ").strip()
            )
            description_div = row.find("div", {"data-label": "Description"})
            slides_link = _slides_url_from(description_div)
            if slides_link:
                date_mapping[slides_link] = date
        except Exception as e:
            print(f"Error processing schedule row: {e}")
            continue
    return date_mapping


def _lecture_entry(block, date_mapping):
    """Extract ``(slides_link, metadata_dict)`` from one lecture container.

    Raises whatever the underlying lookups raise (for example
    ``AttributeError`` when the title or tl;dr element is missing); the
    caller decides how to handle a malformed block.
    """
    title = block.find("span", style="font-weight: bold;").text.strip()
    tldr = block.find("strong", text="tl;dr:").next_sibling.strip()
    slides_link = _slides_url_from(block)

    recording_link_tag = block.find("a", title="Download lecture recording")
    recording_link = (
        recording_link_tag["href"].strip() if recording_link_tag else None
    )

    # Fall back to a fixed message when either the heading or the list that
    # should follow it is absent.
    suggested_readings = "No specific readings provided."
    readings_heading = block.find("p", text="Suggested Readings:")
    if readings_heading:
        readings_list = readings_heading.find_next_sibling("ul")
        if readings_list:
            suggested_readings = readings_list.get_text(separator="\n").strip()

    metadata = {
        "date": date_mapping.get(slides_link, "No date available"),
        "tldr": tldr,
        "title": title,
        "lecture_recording": recording_link,
        "suggested_readings": suggested_readings,
    }
    return slides_link, metadata


def get_metadata(lectures_url, schedule_url, timeout=30):
    """Collect per-lecture metadata from the lectures and schedule pages.

    The lectures page supplies the title, tl;dr, recording link and suggested
    readings of each lecture; the schedule page supplies the date, matched to
    a lecture through the URL of its slides.

    Args:
        lectures_url (str): URL of the page containing the
            ``div.lecture-container`` blocks.
        schedule_url (str): URL of the page containing the
            ``li.table-row-lecture`` rows.
        timeout (float): Timeout in seconds handed to ``requests.get`` for
            each page, so a stalled server cannot block the caller forever.

    Returns:
        dict: Maps each lecture's absolute slides URL to a dict with the keys
        ``"date"``, ``"tldr"``, ``"title"``, ``"lecture_recording"`` and
        ``"suggested_readings"``. A lecture without a slides link is stored
        under the key ``None``, so several such lectures overwrite each other.

    Raises:
        requests.exceptions.RequestException: If either page cannot be
            fetched (including when the timeout expires).
    """
    r_lectures = requests.get(lectures_url, timeout=timeout)
    soup_lectures = BeautifulSoup(r_lectures.text, "html.parser")

    r_schedule = requests.get(schedule_url, timeout=timeout)
    soup_schedule = BeautifulSoup(r_schedule.text, "html.parser")

    date_mapping = _schedule_dates(soup_schedule)

    lecture_metadata = {}
    for block in soup_lectures.find_all("div", class_="lecture-container"):
        # Best effort: a lecture block missing an expected element is
        # reported and skipped rather than aborting the whole scrape.
        try:
            slides_link, metadata = _lecture_entry(block, date_mapping)
            lecture_metadata[slides_link] = metadata
        except Exception as e:
            print(f"Error processing block: {e}")
            continue

    return lecture_metadata
|
|
|
|
|
def download_pdf_from_url(pdf_url, timeout=30):
    """
    Download a PDF from a URL into a temporary file and return its local path.

    The temporary file is created with ``delete=False``, so it is NOT removed
    automatically; the caller is responsible for deleting it when done.

    Args:
        pdf_url (str): The URL of the PDF file to download.
        timeout (float): Timeout in seconds handed to ``requests.get`` so a
            stalled server cannot block the caller forever.

    Returns:
        str | None: The local file path of the downloaded PDF file, or None
        when the server answers with any status code other than 200.

    Raises:
        requests.exceptions.RequestException: If the request itself fails
            (including when the timeout expires).
    """
    response = requests.get(pdf_url, timeout=timeout)
    if response.status_code != 200:
        return None

    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
        temp_file.write(response.content)
    # The file is closed (and therefore flushed) here; the name stays valid
    # because delete=False keeps the file on disk.
    return temp_file.name
|
|