# PrmScrp / amsscrapper.py
import requests
from bs4 import BeautifulSoup
from sheets import ExcelAutomator


def get_headers(data: str) -> dict:
    """Convert raw request/response headers from a string into a dict.

    Args:
        data (str): Headers as a plain string (for example, in Firefox you can
            right-click a request in the network tab and copy the request or
            response headers).

    Returns:
        dict: The parsed headers.
    """
    out = {}
    for line in data.strip().split("\n"):
        if ":" not in line:
            continue  # skip blank or malformed lines
        key, value = line.split(":", 1)
        key = key.strip()
        value = value.strip()
        # Normalise the literal strings "none"/"true"/"false" to Python values.
        if value.lower() == "none":
            value = None
        elif value.lower() == "true":
            value = True
        elif value.lower() == "false":
            value = False
        out[key] = value
    return out
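

# A minimal, illustrative use of get_headers; the header lines below are
# invented for the example and are not taken from the original scraper.
#
#     get_headers("Host: www.ams.org\nAccept: text/html\nDNT: true")
#     -> {"Host": "www.ams.org", "Accept": "text/html", "DNT": True}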


def getlinks(url: str) -> list:
    """Collect the article links listed on an AMS journal issue page.

    The issue URL follows the pattern
    https://www.ams.org/journals/jams/{year}-{volume}-{issue}/home.html?active=allissues
    """
    browser = requests.Session()
    data = browser.get(url)
    fullPage = BeautifulSoup(data.text, "lxml")
    article = fullPage.find("article", {"class": "contentList"})
    output = []
    base = url.split("home.html", 1)[0]
    for entry in article.find_all("dl"):
        # Each <dl> describes one article; its <dt><a> holds the relative href.
        output.append(f'{base}{entry.find("dt").find("a").get("href")}')
    return output
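

# Illustrative only: getlinks returns fully qualified article URLs built from
# the issue page's base URL plus each relative href; the values below are
# hypothetical placeholders, not real output.
#
#     getlinks("https://www.ams.org/journals/jams/2023-36-1/home.html?active=allissues")
#     -> ["https://www.ams.org/journals/jams/2023-36-1/<relative-article-href>", ...]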


def get_authors(url: str) -> list:
    """Scrape author name, email and affiliation from an AMS article page."""
    browser = requests.Session()
    data = browser.get(url)
    fullPage = BeautifulSoup(data.text, "lxml")
    details = fullPage.find("section", {"id": "additionalinformation"})
    output = []
    if details is None:
        return output
    author_name = None
    email = None
    address = None
    for item in details.find_all("li"):
        if item.find("strong"):
            # A new author entry starts here, so flush the previous one first.
            if author_name is not None:
                output.append({"Name": author_name, "Email": email, "Address": address})
                email = None
                address = None
            author_name = item.text.strip()
        elif "Email:" in item.text:
            email = item.text.split(":", 1)[1].strip()
        elif "Affiliation:" in item.text:
            address = item.text.split(":", 1)[1].strip()
    # Flush the last author, which the loop above would otherwise drop.
    if author_name is not None:
        output.append({"Name": author_name, "Email": email, "Address": address})
    return output
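

# A minimal sketch of how these helpers could be wired together; the issue URL
# below is a hypothetical example of the pattern noted in getlinks, and the
# ExcelAutomator export step is omitted because its interface is not shown here.
if __name__ == "__main__":
    issue_url = "https://www.ams.org/journals/jams/2023-36-1/home.html?active=allissues"
    for link in getlinks(issue_url):
        for author in get_authors(link):
            print(author["Name"], author["Email"], author["Address"])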