Spaces:

pryanshusharma
/

PrmScrp

Running

PrmScrp / aiimsscrapper.py

H4CK3R-5M4CK3R

Added auto notification and removed multiple address

8fbf770 6 months ago

2.6 kB

	from seleliumdriver import WebScraper
	from bs4 import BeautifulSoup
	import time
	import requests

	def get_links(url: str, wait_seconds: float = 5):
	    """Collect article links from a listing page loaded in a browser.

	    The page is opened with ``WebScraper``, its HTML is parsed, and every
	    ``<a class="tit ng-binding ng-scope">`` found inside the
	    ``<div class="j-archive-article">`` container is turned into an
	    absolute ``https://www.aimspress.com`` URL.

	    Args:
	        url: Address of the listing page to open.
	        wait_seconds: Seconds to wait after navigation before reading the
	            HTML. Defaults to the 5 seconds the scraper has always used.

	    Returns:
	        A non-empty list of absolute article URLs, in document order.

	    Raises:
	        ValueError: If the article container is missing or no article
	            links were found on the page.
	    """
	    # NOTE(review): the meaning of the "huggingface" argument is defined by
	    # the project's WebScraper class, which is not visible here.
	    browser = WebScraper("huggingface", hidden=True)
	    try:
	        browser.get(url)
	        # The wait is required before the HTML is read; presumably the
	        # links are rendered by page scripts after load - TODO confirm.
	        time.sleep(wait_seconds)
	        page_html = browser.get_html()
	    finally:
	        # Always release the browser, even when navigation or reading fails.
	        browser.close_browser()

	    page = BeautifulSoup(page_html, "lxml")
	    container = page.find("div", {"class": "j-archive-article"})
	    if container is None:
	        # Report a missing container the same way as an empty result
	        # instead of failing with AttributeError on None.
	        raise ValueError("Invalid url found")

	    anchors = container.find_all("a", {"class": "tit ng-binding ng-scope"})
	    # Anchors without an href cannot be turned into a URL, so skip them.
	    links = [
	        "https://www.aimspress.com" + anchor.get("href")
	        for anchor in anchors
	        if anchor.get("href")
	    ]
	    if not links:
	        raise ValueError("Invalid url found")
	    return links

	def save(dt):
	    """Write ``str(dt)`` to ``data.html`` in the current working directory.

	    Any existing ``data.html`` is overwritten. The file is written as UTF-8
	    so that non-ASCII text is stored the same way on every platform instead
	    of depending on the locale's default encoding.

	    Args:
	        dt: Object to dump; it is converted with ``str()`` before writing.
	    """
	    with open("data.html", "w", encoding="utf-8") as out_file:
	        out_file.write(str(dt))
	    # Report only after the file has been closed (and therefore flushed).
	    print("Done saved")

	def get_author_details(url: str, timeout: float = 30.0):
	    """Extract name, e-mail and address of the authors that expose an e-mail.

	    The article page is downloaded with ``requests`` and parsed. Each
	    ``<li>`` of the ``<ul class="article-author clear">`` list that has an
	    ``<a class="com-mail">`` anchor yields one record; list items without
	    that anchor are skipped.

	    The address is looked up in two ways:

	    1. The ``data-tagval`` attribute of the ``com-num`` anchor holds
	       comma-separated 1-based numbers that index the ``lostOf`` entries
	       of the ``<ul class="about-author">`` list. Only the first number
	       is used.
	    2. If that lookup fails, the e-mail and address are read from the text
	       after the first ``:`` in the ``title`` attributes of the
	       ``com-mail`` and ``com-user`` anchors. When no address can be read,
	       ``url`` is stored as the address.

	    Args:
	        url: Address of the article page.
	        timeout: Seconds to wait for the HTTP response before
	            ``requests`` gives up; prevents the call from hanging forever.

	    Returns:
	        A list of dicts with the keys ``"Name"``, ``"Email"`` and
	        ``"Address"``; empty when the page has no author list.
	    """
	    # Errors the lookups below raise when the markup is not in the expected
	    # shape (missing tag -> AttributeError, missing attribute ->
	    # AttributeError/TypeError, bad number -> ValueError, bad index ->
	    # IndexError).
	    lookup_errors = (AttributeError, IndexError, TypeError, ValueError)

	    # The session is closed as soon as the page has been downloaded.
	    with requests.session() as session:
	        response = session.get(url, timeout=timeout)
	    page = BeautifulSoup(response.text, "lxml")

	    author_list = page.find("ul", {"class": "article-author clear"})
	    if author_list is None:
	        return []

	    about_list = page.find("ul", {"class": "about-author"})
	    if about_list is None:
	        # No affiliation list: every author goes through the fallback path.
	        affiliations = []
	    else:
	        affiliations = [
	            entry.text.strip()
	            for entry in about_list.find_all("div", {"class": "lostOf"})
	        ]

	    output = []
	    for author in author_list.find_all("li"):
	        name_tag = author.find("a", {"type": "authors.authorNameEn"})
	        mail_tag = author.find("a", {"class": "com-mail"})
	        if name_tag is None or mail_tag is None:
	            # Only authors with both a name and an e-mail anchor are kept.
	            continue

	        author_name = name_tag.text.strip()
	        mail = _text_after_colon(mail_tag.get("href"))

	        try:
	            tag_value = author.find("a", {"class": "com-num"}).get("data-tagval")
	            # All numbers are parsed so that a malformed value still
	            # triggers the fallback, but only the first one is used.
	            indices = [int(part) - 1 for part in tag_value.split(",")]
	            address = affiliations[indices[0]]
	        except lookup_errors:
	            # Fallback: details are stored in the anchors' title attributes.
	            title_mail = _text_after_colon(mail_tag.get("title"))
	            if title_mail is not None:
	                mail = title_mail
	            user_tag = author.find("a", {"class": "com-user"})
	            address = None
	            if user_tag is not None:
	                address = _text_after_colon(user_tag.get("title"))
	            if address is None:
	                address = url

	        if mail is None:
	            # Neither href nor title contained a usable e-mail value.
	            continue

	        output.append(
	            {
	                "Name": author_name,
	                "Email": mail,
	                "Address": address,
	            }
	        )
	    return output


	def _text_after_colon(value):
	    """Return the stripped text after the first ':' in *value*, else None."""
	    if not isinstance(value, str):
	        return None
	    _, separator, remainder = value.partition(":")
	    return remainder.strip() if separator else None