PrmScrp / degruyterscrapper.py
H4CK3R-5M4CK3R
Scrpr
57273d8
raw
history blame
2.14 kB
import requests
from bs4 import BeautifulSoup
def get_headers(data: str) -> dict:
    """Convert a raw header string into a dictionary.

    Args:
        data (str): Headers as a newline-separated "Key: Value" string
            (e.g. copied from the browser dev tools via
            "Copy Request Headers" / "Copy Response Headers").

    Returns:
        dict: Mapping of header names to values. Values equal to "none",
            "true" or "false" (case-insensitive) are converted to ``None``,
            ``True`` and ``False`` respectively; everything else stays a
            stripped string.
    """
    out = {}
    for line in data.strip().split("\n"):
        # Skip blank lines and lines with no "key: value" separator,
        # which would otherwise raise an IndexError.
        if ":" not in line:
            continue
        key, _, value = line.partition(":")
        key = key.strip()
        value = value.strip()
        lowered = value.lower()
        if lowered == "none":
            value = None
        elif lowered == "true":
            value = True
        elif lowered == "false":
            value = False
        out[key] = value
    return out
def getLinks(url: str) -> list:
    """Collect absolute article URLs from a degruyter.com issue page.

    Args:
        url (str): URL of the issue's HTML table-of-contents page, e.g.
            "https://www.degruyter.com/journal/key/fca/<volume>/<issue>/html".

    Returns:
        list: Absolute links to each research-paper article found on the
            page; empty if the research-paper section is not present.
    """
    browser = requests.session()
    data = browser.get(url)
    fullPage = BeautifulSoup(data.text, "lxml")
    links = fullPage.find("div", {"id" : "issue-subject-group-researchpaper"})
    # find() returns None when the section is missing; calling findAll on
    # None would raise AttributeError, so bail out with an empty result.
    if links is None:
        return []
    output = []
    for container in links.findAll("div", {"class" : "text-container"}):
        anchor = container.find("a", {"class" : "issueContentsArticleLink linkHoverDark d-inline-block"})
        if anchor is None:
            continue  # text container without an article link — skip it
        output.append(f"https://www.degruyter.com{anchor.get('href')}")
    return output
def get_author_details(url: str) -> list:
    """Scrape author contact details from a degruyter.com article page.

    Args:
        url (str): Absolute URL of the article page.

    Returns:
        list: One dict per author with a non-empty email address, with keys
            "Name", "Email" and "Address". Authors without an email are
            skipped; empty if the contributors block is missing.
    """
    browser = requests.session()
    data = browser.get(url)
    page = BeautifulSoup(data.text, "lxml")
    authors = page.find("ul", {"class" : "contributors list-unstyled mb-2"})
    # Pages without a contributors list would otherwise raise
    # AttributeError when iterating findAll on None.
    if authors is None:
        return []
    output = []
    for author in authors.findAll("span", {"class" : "contributor"}):
        popdown = author.find("contributor-popdown")
        if popdown is None:
            continue  # this author has no contact widget
        # Tag.get() returns None when the attribute is absent; default to ""
        # so .strip() cannot crash.
        email = (popdown.get("email") or "").strip()
        if not email:
            continue  # only keep authors with a published email address
        output.append(
            {
                "Name" : author.text.strip(),
                "Email" : email,
                "Address" : (popdown.get("affiliations") or "").strip()
            }
        )
    return output