import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
from urllib.parse import urlparse
import chainlit as cl
from langchain import PromptTemplate

try:
    from modules.constants import *
except ImportError:
    from constants import *

"""
Ref: https://python.plainenglish.io/scraping-the-subpages-on-a-website-ea2d4e3db113
"""

class WebpageCrawler:
    def __init__(self):
        # Tracks relative hrefs that have already been seen and resolved
        self.dict_href_links = {}

    def getdata(self, url):
        r = requests.get(url)
        return r.text

    def url_exists(self, url):
        try:
            response = requests.head(url)
            return response.status_code == 200
        except requests.ConnectionError:
            return False

    def get_links(self, website_link, base_url=None):
        if base_url is None:
            base_url = website_link
        html_data = self.getdata(website_link)
        soup = BeautifulSoup(html_data, "html.parser")
        list_links = []
        for link in soup.find_all("a", href=True):
            link["href"] = link["href"].strip()

            # Absolute links that stay within the crawled site
            if str(link["href"]).startswith(str(website_link)):
                list_links.append(link["href"])

            # Relative links: resolve them against the base URL and keep them
            # only if the resolved page actually exists
            if str(link["href"]).startswith("/"):
                if link["href"] not in self.dict_href_links:
                    print(link["href"])
                    self.dict_href_links[link["href"]] = None
                    link_with_www = base_url + link["href"][1:]
                    if self.url_exists(link_with_www):
                        print("adjusted link =", link_with_www)
                        list_links.append(link_with_www)

        dict_links = dict.fromkeys(list_links, "Not-checked")
        return dict_links

    def get_subpage_links(self, l, base_url):
        for link in tqdm(l):
            print("checking link:", link)
            if not link.endswith("/"):
                # Not a directory-style URL, so do not crawl it for subpages
                l[link] = "Checked"
                dict_links_subpages = {}
            else:
                # Crawl the page for new links only if it has not been checked yet
                if l[link] == "Not-checked":
                    dict_links_subpages = self.get_links(link, base_url)
                    l[link] = "Checked"
                else:
                    dict_links_subpages = {}
            # Merge newly discovered links into the running dictionary
            l = {**dict_links_subpages, **l}
        return l

    def get_all_pages(self, url, base_url):
        dict_links = {url: "Not-checked"}
        self.dict_href_links = {}
        counter, counter2 = None, 0
        while counter != 0:
            counter2 += 1
            dict_links2 = self.get_subpage_links(dict_links, base_url)
            # Count how many links are still unvisited; stop when none remain
            counter = sum(value == "Not-checked" for value in dict_links2.values())
            dict_links = dict_links2
        checked_urls = [
            url for url, status in dict_links.items() if status == "Checked"
        ]
        return checked_urls

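# Example usage of the crawler (a minimal sketch; the course URL below is
# illustrative, not necessarily the one this project crawls):
#
#   crawler = WebpageCrawler()
#   start_url = "https://dl4ds.github.io/sp2024/"
#   pages = crawler.get_all_pages(start_url, get_base_url(start_url))
#   print(pages)
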
def get_urls_from_file(file_path: str):
    """
    Function to get URLs from a file, one URL per line.
    """
    with open(file_path, "r") as f:
        urls = f.readlines()
    urls = [url.strip() for url in urls]
    return urls

def get_base_url(url):
    """
    Function to get the base URL (scheme and host) of a full URL.
    """
    parsed_url = urlparse(url)
    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}/"
    return base_url

def get_prompt(config):
    if config["llm_params"]["use_history"]:
        if config["llm_params"]["llm_loader"] == "local_llm":
            custom_prompt_template = tinyllama_prompt_template_with_history
        elif config["llm_params"]["llm_loader"] == "openai":
            custom_prompt_template = openai_prompt_template_with_history
        prompt = PromptTemplate(
            template=custom_prompt_template,
            input_variables=["context", "chat_history", "question"],
        )
    else:
        if config["llm_params"]["llm_loader"] == "local_llm":
            custom_prompt_template = tinyllama_prompt_template
        elif config["llm_params"]["llm_loader"] == "openai":
            custom_prompt_template = openai_prompt_template
        prompt = PromptTemplate(
            template=custom_prompt_template,
            input_variables=["context", "question"],
        )
    return prompt

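# Sketch of the config shape this function expects, inferred from the keys it
# reads (the full schema lives in the project's config file):
#
#   config = {"llm_params": {"use_history": True, "llm_loader": "openai"}}
#   prompt = get_prompt(config)
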
def get_sources(res, answer):
    source_elements = []
    source_dict = {}

    for idx, source in enumerate(res["source_documents"]):
        source_metadata = source.metadata
        url = source_metadata["source"]
        score = source_metadata.get("score", "N/A")
        page = source_metadata.get("page", 1)

        lecture_tldr = source_metadata.get("tldr", "N/A")
        lecture_recording = source_metadata.get("lecture_recording", "N/A")
        suggested_readings = source_metadata.get("suggested_readings", "N/A")
        date = source_metadata.get("date", "N/A")

        source_type = source_metadata.get("source_type", "N/A")

        # Group retrieved chunks by source URL and page
        url_name = f"{url}_{page}"
        if url_name not in source_dict:
            source_dict[url_name] = {
                "text": source.page_content,
                "url": url,
                "score": score,
                "page": page,
                "lecture_tldr": lecture_tldr,
                "lecture_recording": lecture_recording,
                "suggested_readings": suggested_readings,
                "date": date,
                "source_type": source_type,
            }
        else:
            source_dict[url_name]["text"] += f"\n\n{source.page_content}"

    full_answer = "**Answer:**\n"
    full_answer += answer

    full_answer += "\n\n**Sources:**\n"
    for idx, (url_name, source_data) in enumerate(source_dict.items()):
        full_answer += f"\nSource {idx + 1} (Score: {source_data['score']}): {source_data['url']}\n"

        name = f"Source {idx + 1} Text\n"
        full_answer += name
        source_elements.append(cl.Text(name=name, content=source_data["text"]))

        # Attach the PDF itself, opened at the cited page, when the source is a PDF
        if source_data["url"].lower().endswith(".pdf"):
            name = f"Source {idx + 1} PDF\n"
            full_answer += name
            pdf_url = f"{source_data['url']}#page={source_data['page'] + 1}"
            source_elements.append(cl.Pdf(name=name, url=pdf_url))

    full_answer += "\n**Metadata:**\n"
    for url_name, source_data in source_dict.items():
        full_answer += f"\nSource: {source_data['url']}\n"
        full_answer += f"Page: {source_data['page']}\n"
        full_answer += f"Type: {source_data['source_type']}\n"
        full_answer += f"Date: {source_data['date']}\n"
        full_answer += f"TL;DR: {source_data['lecture_tldr']}\n"
        full_answer += f"Lecture Recording: {source_data['lecture_recording']}\n"
        full_answer += f"Suggested Readings: {source_data['suggested_readings']}\n"

    return full_answer, source_elements

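# Expected input (a sketch only): `res` is the result of a LangChain retrieval
# chain run with `return_source_documents=True`; the exact call and key names
# depend on the chain used elsewhere in this project:
#
#   res = chain({"query": question})
#   full_answer, elements = get_sources(res, res["result"])
#   await cl.Message(content=full_answer, elements=elements).send()
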
def get_lecture_metadata(lectures_url, schedule_url):
    """
    Function to get the lecture metadata from the lectures and schedule URLs.
    """
    lecture_metadata = {}

    # Fetch the lectures page
    r_lectures = requests.get(lectures_url)
    soup_lectures = BeautifulSoup(r_lectures.text, "html.parser")

    # Fetch the schedule page
    r_schedule = requests.get(schedule_url)
    soup_schedule = BeautifulSoup(r_schedule.text, "html.parser")

    # Each lecture is rendered as a "lecture-container" block
    lecture_blocks = soup_lectures.find_all("div", class_="lecture-container")

    # Build a mapping from slides link to lecture date using the schedule page
    date_mapping = {}
    schedule_rows = soup_schedule.find_all("li", class_="table-row-lecture")
    for row in schedule_rows:
        try:
            date = (
                row.find("div", {"data-label": "Date"}).get_text(separator=" ").strip()
            )
            description_div = row.find("div", {"data-label": "Description"})
            slides_link_tag = description_div.find("a", title="Download slides")
            slides_link = slides_link_tag["href"].strip() if slides_link_tag else None
            slides_link = (
                f"https://dl4ds.github.io{slides_link}" if slides_link else None
            )
            if slides_link:
                date_mapping[slides_link] = date
        except Exception as e:
            print(f"Error processing schedule row: {e}")
            continue

    for block in lecture_blocks:
        try:
            # Lecture title
            title = block.find("span", style="font-weight: bold;").text.strip()

            # Lecture tl;dr
            tldr = block.find("strong", text="tl;dr:").next_sibling.strip()

            # Slides link, made absolute against the course site
            slides_link_tag = block.find("a", title="Download slides")
            slides_link = slides_link_tag["href"].strip() if slides_link_tag else None
            slides_link = (
                f"https://dl4ds.github.io{slides_link}" if slides_link else None
            )

            # Lecture recording link
            recording_link_tag = block.find("a", title="Download lecture recording")
            recording_link = (
                recording_link_tag["href"].strip() if recording_link_tag else None
            )

            # Suggested readings, if present
            suggested_readings_tag = block.find("p", text="Suggested Readings:")
            if suggested_readings_tag:
                suggested_readings = suggested_readings_tag.find_next_sibling("ul")
                if suggested_readings:
                    suggested_readings = suggested_readings.get_text(
                        separator="\n"
                    ).strip()
                else:
                    suggested_readings = "No specific readings provided."
            else:
                suggested_readings = "No specific readings provided."

            # Look up the lecture date from the schedule mapping
            date = date_mapping.get(slides_link, "No date available")

            # Key the metadata by the slides link
            lecture_metadata[slides_link] = {
                "date": date,
                "tldr": tldr,
                "title": title,
                "lecture_recording": recording_link,
                "suggested_readings": suggested_readings,
            }
        except Exception as e:
            print(f"Error processing block: {e}")
            continue

    return lecture_metadata
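# Example usage (a sketch; the paths below are assumptions based on the
# hard-coded "https://dl4ds.github.io" prefix above, not verified endpoints):
#
#   metadata = get_lecture_metadata(
#       "https://dl4ds.github.io/sp2024/lectures/",
#       "https://dl4ds.github.io/sp2024/schedule/",
#   )
#   for slides_link, info in metadata.items():
#       print(info["title"], info["date"])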