PrmScrp / degruyterscrapper.py
pryanshusharma's picture
Update degruyterscrapper.py
a73bd7d verified
import requests
from bs4 import BeautifulSoup
def get_headers(data: str) -> dict:
    """Convert a raw block of HTTP headers into a dict.

    Args:
        data (str): The headers as a string, one ``Name: value`` pair per
            line (for example, in Firefox you can use "Copy Request Headers"
            or "Copy Response Headers" and paste the result here).

    Returns:
        dict: Mapping of header name to header value. Values whose text is
            ``none``, ``true`` or ``false`` (case-insensitive) are converted
            to ``None``, ``True`` and ``False``; every other value stays a
            stripped string. Lines that contain no ``:`` (blank lines, an
            HTTP request/status line, empty input) are skipped.
    """
    # Textual literals that are mapped to their Python counterparts.
    literals = {"none": None, "true": True, "false": False}

    out = {}
    for line in data.strip().split("\n"):
        # Split on the first colon only, so values such as "host:8080" or
        # URLs keep their own colons.
        key, separator, value = line.partition(":")
        if not separator:
            # Not a "Name: value" line; indexing the split result here used
            # to raise IndexError.
            continue
        value = value.strip()
        out[key.strip()] = literals.get(value.lower(), value)
    return out
def getLinks(url: str) -> list:
    """Collect the article links found on the page at ``url``.

    The page is first searched for the ``issue-subject-group-researchpaper``
    block; each ``text-container`` inside it contributes one article link.
    If that structure is missing or incomplete, every article link anchor
    found anywhere on the page is used instead.

    Args:
        url (str): Address of the page to download and scan.

    Returns:
        list: Absolute ``https://www.degruyter.com...`` URLs, one per
            article link found.

    Raises:
        AttributeError: If neither lookup finds any article link.
    """
    base_url = "https://www.degruyter.com"
    link_class = "issueContentsArticleLink linkHoverDark d-inline-block"

    # The context manager closes the session's connections once the page
    # has been downloaded; the response body is already in memory.
    with requests.session() as browser:
        data = browser.get(url)
    fullPage = BeautifulSoup(data.text, "lxml")

    try:
        group = fullPage.find("div", {"id": "issue-subject-group-researchpaper"})
        output = []
        # ``find`` returns None when nothing matches, so a missing group or
        # a container without the expected anchor raises AttributeError.
        for container in group.find_all("div", {"class": "text-container"}):
            href = container.find("a", {"class": link_class}).get("href")
            output.append(f"{base_url}{href}")
    except AttributeError:
        anchors = fullPage.find_all("a", {"class": link_class})
        if not anchors:
            raise AttributeError("Not found") from None
        # Rebuild the list from scratch: links gathered before the failure
        # are matched again by this page-wide search and would otherwise be
        # returned twice.
        output = [f"{base_url}{anchor.get('href')}" for anchor in anchors]
    return output
def get_author_details(url: str) -> list:
    """Extract name, email and affiliation of the authors listed at ``url``.

    Only contributors that expose a non-empty email address are returned;
    contributors without a ``contributor-popdown`` element or without an
    email are skipped.

    Args:
        url (str): Address of the article page to download and scan.

    Returns:
        list: One dict per author with the keys ``"Name"``, ``"Email"`` and
            ``"Address"``. Empty when the page has no contributors list or
            no contributor has an email.
    """
    # The context manager closes the session's connections once the page
    # has been downloaded; the response body is already in memory.
    with requests.session() as browser:
        data = browser.get(url)
    page = BeautifulSoup(data.text, "lxml")

    contributors = page.find("ul", {"class": "contributors list-unstyled mb-2"})
    if contributors is None:
        # No contributors list on this page: nothing to report.
        return []

    output = []
    for author in contributors.find_all("span", {"class": "contributor"}):
        popdown = author.find("contributor-popdown")
        if popdown is None:
            continue
        # ``get`` returns None for a missing attribute; treat that the same
        # as an empty value instead of failing on ``.strip()``.
        email = (popdown.get("email") or "").strip()
        if not email:
            continue
        output.append(
            {
                "Name": author.text.strip(),
                "Email": email,
                # The full affiliations string is kept as-is (not split on ";").
                "Address": (popdown.get("affiliations") or "").strip(),
            }
        )
    return output