PrmScrp / aiimsscrapper.py
H4CK3R-5M4CK3R
Added auto notification and removed multiple address
8fbf770
raw
history blame
2.6 kB
from seleliumdriver import WebScraper
from bs4 import BeautifulSoup
import time
import requests
def get_links(url: str, wait_seconds: float = 5):
    """Collect the article links listed on an archive page.

    The page is fetched through a hidden ``WebScraper`` browser, its HTML is
    parsed with BeautifulSoup, and every ``<a class="tit ng-binding ng-scope">``
    inside the ``<div class="j-archive-article">`` container is turned into an
    absolute ``https://www.aimspress.com`` URL.

    Args:
        url: Address of the archive page to scan.
        wait_seconds: Seconds to pause between requesting the page and reading
            its HTML. Defaults to 5, the delay that used to be hard-coded.

    Returns:
        A non-empty list of absolute article URLs, in page order.

    Raises:
        ValueError: If the article container is missing or holds no usable
            article links.
    """
    base_url = "https://www.aimspress.com"

    browser = WebScraper("huggingface", hidden=True)
    try:
        browser.get(url)
        # The original author notes this pause is required before the HTML can
        # be read (presumably the listing is filled in client-side; the ng-*
        # class names hint at that -- TODO confirm).
        time.sleep(wait_seconds)
        pagehtml = browser.get_html()
    finally:
        # Always release the browser, even when loading the page fails.
        browser.close_browser()

    full_page = BeautifulSoup(pagehtml, "lxml")
    articles = full_page.find("div", {"class": "j-archive-article"})
    if articles is None:
        # No article container at all: report it the same way as an empty
        # listing instead of failing with an AttributeError on None.
        raise ValueError("Invalid url found")

    output = []
    for link in articles.find_all("a", {"class": "tit ng-binding ng-scope"}):
        href = link.get("href")
        if href:  # anchors without an href cannot be turned into a URL
            output.append(base_url + href)

    if not output:
        raise ValueError("Invalid url found")
    return output
def save(dt):
    """Dump *dt* to ``data.html`` in the current working directory.

    The value is converted with ``str()`` and any existing file is
    overwritten. The file is always written as UTF-8 so that scraped pages
    containing non-ASCII characters do not fail on platforms whose default
    encoding cannot represent them.

    Args:
        dt: Any object; its ``str()`` representation is written.
    """
    with open("data.html", "w", encoding="utf-8") as op:
        op.write(str(dt))
    print("Done saved")
def _text_after_colon(value):
    """Return the stripped text after the first ':' in *value*, else None."""
    if not value or ":" not in value:
        return None
    return value.split(":", 1)[1].strip()


def _first_affiliation(author, affiliations):
    """Return the first affiliation referenced by the author's com-num tag, else None."""
    num_tag = author.find("a", {"class": "com-num"})
    tag_value = num_tag.get("data-tagval") if num_tag is not None else None
    if not tag_value:
        return None
    try:
        # data-tagval holds 1-based, comma-separated positions into the
        # affiliation list; every entry must be numeric for the tag to count.
        indices = [int(part) - 1 for part in tag_value.split(",")]
    except ValueError:
        return None
    first = indices[0]  # only the first referenced affiliation is reported
    if not 0 <= first < len(affiliations):
        # Reject out-of-range positions, including "0", which would otherwise
        # wrap around to the last affiliation.
        return None
    return affiliations[first]


def get_author_details(url: str, timeout: float = 30):
    """Scrape name, e-mail and address of the authors of one article page.

    The page is downloaded with ``requests`` and parsed with BeautifulSoup.
    Each ``<li>`` of the ``<ul class="article-author clear">`` list that
    carries a ``com-mail`` anchor yields one record; list items without a
    name anchor or without a usable e-mail link are skipped.

    The address is the first entry of the ``about-author`` list referenced by
    the author's ``com-num`` tag. When that cannot be resolved, the e-mail and
    address are read from the ``title`` attributes of the ``com-mail`` and
    ``com-user`` anchors instead, and *url* is used as the address if even
    that is unavailable.

    Args:
        url: Address of the article page.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        A list of ``{"Name", "Email", "Address"}`` dictionaries, empty when the
        page has no author list or no author with an e-mail.
    """
    # The session is closed as soon as the response has been downloaded.
    with requests.session() as browser:
        data = browser.get(url, timeout=timeout)

    full_page = BeautifulSoup(data.text, "lxml")
    authors = full_page.find("ul", {"class": "article-author clear"})
    if authors is None:
        return []

    author_about = full_page.find("ul", {"class": "about-author"})
    if author_about is None:
        # No affiliation list: every author goes through the title fallback.
        authors_about = []
    else:
        authors_about = [
            d.text.strip()
            for d in author_about.find_all("div", {"class": "lostOf"})
        ]

    output = []
    for author in authors.find_all("li"):
        name_tag = author.find("a", {"type": "authors.authorNameEn"})
        mail_tag = author.find("a", {"class": "com-mail"})
        if name_tag is None or mail_tag is None:
            continue
        author_name = name_tag.text.strip()

        # href is expected to look like "mailto:someone@example.org".
        mail = _text_after_colon(mail_tag.get("href"))
        if mail is None:
            continue

        address = _first_affiliation(author, authors_about)
        if address is None:
            # Fallback: prefer the values embedded in the title attributes,
            # but keep the href e-mail when the title carries none.
            title_mail = _text_after_colon(mail_tag.get("title"))
            if title_mail is not None:
                mail = title_mail
            user_tag = author.find("a", {"class": "com-user"})
            if user_tag is not None:
                address = _text_after_colon(user_tag.get("title"))
            if address is None:
                address = url

        output.append(
            {
                "Name": author_name,
                "Email": mail,
                "Address": address,
            }
        )
    return output