# PrmScrp / amsscrapper.py
import requests
from bs4 import BeautifulSoup
from sheets import ExcelAutomator


def get_headers(data: str) -> dict:
    """Convert raw request/response headers from a string into a dict.

    Args:
        data (str): Headers as a plain string (for example, in Firefox you can
            right-click a request in the network tab and copy the request or
            response headers).

    Returns:
        dict: The parsed headers.
    """
    out = {}
    for line in data.strip().split("\n"):
        if ":" not in line:
            continue  # skip blank or malformed lines
        key, value = line.split(":", 1)
        key = key.strip()
        value = value.strip()
        # Normalise the literal strings "none"/"true"/"false" to Python values.
        if value.lower() == "none":
            value = None
        elif value.lower() == "true":
            value = True
        elif value.lower() == "false":
            value = False
        out[key] = value
    return out
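

# A minimal, illustrative use of get_headers; the header lines below are
# invented for the example and are not taken from the original scraper.
#
#     get_headers("Host: www.ams.org\nAccept: text/html\nDNT: true")
#     -> {"Host": "www.ams.org", "Accept": "text/html", "DNT": True}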


def getlinks(url: str) -> list:
    """Collect the article links listed on an AMS journal issue page.

    The issue URL follows the pattern
    https://www.ams.org/journals/jams/{year}-{volume}-{issue}/home.html?active=allissues
    """
    browser = requests.Session()
    data = browser.get(url)
    fullPage = BeautifulSoup(data.text, "lxml")
    article = fullPage.find("article", {"class": "contentList"})
    output = []
    base = url.split("home.html", 1)[0]
    for entry in article.find_all("dl"):
        # Each <dl> describes one article; its <dt><a> holds the relative href.
        output.append(f'{base}{entry.find("dt").find("a").get("href")}')
    return output
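

# Illustrative only: getlinks returns fully qualified article URLs built from
# the issue page's base URL plus each relative href; the values below are
# hypothetical placeholders, not real output.
#
#     getlinks("https://www.ams.org/journals/jams/2023-36-1/home.html?active=allissues")
#     -> ["https://www.ams.org/journals/jams/2023-36-1/<relative-article-href>", ...]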


def get_authors(url: str) -> list:
    """Scrape author name, email and affiliation from an AMS article page."""
    browser = requests.Session()
    data = browser.get(url)
    fullPage = BeautifulSoup(data.text, "lxml")
    details = fullPage.find("section", {"id": "additionalinformation"})
    output = []
    if details is None:
        return output
    author_name = None
    email = None
    address = None
    for item in details.find_all("li"):
        if item.find("strong"):
            # A new author entry starts here, so flush the previous one first.
            if author_name is not None:
                output.append({"Name": author_name, "Email": email, "Address": address})
                email = None
                address = None
            author_name = item.text.strip()
        elif "Email:" in item.text:
            email = item.text.split(":", 1)[1].strip()
        elif "Affiliation:" in item.text:
            address = item.text.split(":", 1)[1].strip()
    # Flush the last author, which the loop above would otherwise drop.
    if author_name is not None:
        output.append({"Name": author_name, "Email": email, "Address": address})
    return output
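

# A minimal sketch of how these helpers could be wired together; the issue URL
# below is a hypothetical example of the pattern noted in getlinks, and the
# ExcelAutomator export step is omitted because its interface is not shown here.
if __name__ == "__main__":
    issue_url = "https://www.ams.org/journals/jams/2023-36-1/home.html?active=allissues"
    for link in getlinks(issue_url):
        for author in get_authors(link):
            print(author["Name"], author["Email"], author["Address"])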