Spaces:
Running
Running
from seleliumdriver import WebScraper | |
from bs4 import BeautifulSoup | |
import time | |
import requests | |
def get_links(url: str):
    """Collect the article links listed on an archive page.

    The page is loaded through the project's ``WebScraper`` browser wrapper
    and parsed with BeautifulSoup. Every anchor with the class string
    ``tit ng-binding ng-scope`` inside the ``j-archive-article`` container
    contributes one absolute ``https://www.aimspress.com`` URL.

    Args:
        url: Address of the archive page to scan.

    Returns:
        A non-empty list of absolute article URLs, in page order.

    Raises:
        ValueError: If the article container is missing or holds no
            matching links.
    """
    browser = WebScraper("huggingface", hidden=True)
    try:
        browser.get(url)
        # The wait is required before reading the HTML; presumably it gives
        # the client-side rendering time to finish -- confirm before tuning.
        time.sleep(5)
        pagehtml = browser.get_html()
    finally:
        # Always release the browser, even when loading the page fails.
        browser.close_browser()

    full_page = BeautifulSoup(pagehtml, "lxml")
    articles = full_page.find("div", {"class": "j-archive-article"})
    if articles is None:
        # Without the container there is nothing to scan; report it the same
        # way as an empty result instead of failing on ``None``.
        raise ValueError("Invalid url found")

    output = [
        "https://www.aimspress.com" + link.get("href")
        for link in articles.find_all("a", {"class": "tit ng-binding ng-scope"})
        if link.get("href")  # anchors without a target cannot form a URL
    ]
    if not output:
        raise ValueError("Invalid url found")
    return output
def save(dt):
    """Write ``str(dt)`` to ``data.html`` in the current working directory.

    An existing ``data.html`` is overwritten. A confirmation line is printed
    once the file has been written.

    Args:
        dt: Any object; its ``str()`` representation is what gets stored.
    """
    # Explicit UTF-8: with the platform default encoding (e.g. cp1252 on
    # Windows) non-ASCII characters in the content raise UnicodeEncodeError.
    with open("data.html", "w", encoding="utf-8") as op:
        op.write(str(dt))
    print("Done saved")
def get_author_details(url: str, timeout: float = 30.0):
    """Scrape name, e-mail and address of the contactable authors of an article.

    The page at *url* is downloaded with ``requests`` and parsed with
    BeautifulSoup. Authors are the ``<li>`` items of the
    ``article-author clear`` list; an author without a ``com-mail`` link is
    skipped.

    Two ways of finding the address are tried, in order:

    1. The author's ``com-num`` link carries comma-separated 1-based indices
       in ``data-tagval`` that point into the ``lostOf`` entries of the
       ``about-author`` list. The first index selects the address.
    2. If that fails, the e-mail and address are read from the text after the
       first ``:`` in the ``title`` attributes of the ``com-mail`` and
       ``com-user`` links. When no address can be read, *url* is stored
       instead.

    Args:
        url: Address of the article page.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of dicts with the keys ``"Name"``, ``"Email"`` and
        ``"Address"``, one per author that has a mail link. The list is
        empty when the page has no author list.
    """
    # The context manager closes the session's connections; the timeout keeps
    # an unresponsive server from blocking the caller forever.
    with requests.Session() as session:
        response = session.get(url, timeout=timeout)
    page = BeautifulSoup(response.text, "lxml")

    authors = page.find("ul", {"class": "article-author clear"})
    if authors is None:
        return []

    about = page.find("ul", {"class": "about-author"})
    # With no affiliation list every index lookup below fails, which routes
    # each author through the title-based fallback.
    affiliations = (
        [entry.text.strip() for entry in about.find_all("div", {"class": "lostOf"})]
        if about is not None
        else []
    )

    output = []
    for author in authors.find_all("li"):
        mail_link = author.find("a", {"class": "com-mail"})
        if mail_link is None:
            continue  # only authors with a mail link are reported
        author_name = author.find("a", {"type": "authors.authorNameEn"}).text.strip()
        mail = mail_link.get("href").split(":", 1)[1].strip()

        try:
            tagval = author.find("a", {"class": "com-num"}).get("data-tagval")
            # Every index is converted so that a malformed entry still
            # triggers the fallback below.
            indices = [int(part) - 1 for part in tagval.split(",")]
            # NOTE(review): the original loop also had an "&"-joining branch
            # for several affiliations but stopped after the first index, so
            # only the first affiliation was ever used. The source's
            # indentation was lost; confirm whether joining was intended.
            address = affiliations[indices[0]]
        except (AttributeError, TypeError, ValueError, IndexError):
            # Fallback: the details are in the links' ``title`` attributes.
            try:
                mail = mail_link.get("title").split(":", 1)[1].strip()
            except (AttributeError, IndexError):
                pass  # no usable title: keep the address taken from href
            try:
                address = (
                    author.find("a", {"class": "com-user"})
                    .get("title")
                    .split(":", 1)[1]
                    .strip()
                )
            except (AttributeError, IndexError):
                address = url

        output.append(
            {
                "Name": author_name,
                "Email": mail,
                "Address": address,
            }
        )
    return output