Spaces:
Running
Running
import requests | |
from bs4 import BeautifulSoup | |
def get_headers(data: str) -> dict:
    """Parse a raw block of HTTP headers into a dictionary.

    Each line of the form ``Name: value`` becomes one entry. Only the first
    colon separates the name from the value, so values that contain colons
    (URLs, timestamps) are kept intact. Lines without a colon -- blank lines
    or a leading request/status line such as ``GET / HTTP/1.1`` -- are
    skipped instead of raising ``IndexError``.

    Args:
        data (str): Header text, one header per line (for example the
            request or response headers copied from a browser's network
            panel).

    Returns:
        dict: Header names mapped to their values. The values ``none``,
        ``true`` and ``false`` (case-insensitive) are converted to ``None``,
        ``True`` and ``False``; every other value stays a stripped string.
        If a name occurs more than once, the last occurrence wins.
    """
    # Textual values that are converted to their Python counterparts.
    literals = {"none": None, "true": True, "false": False}

    out = {}
    for line in data.strip().split("\n"):
        key, sep, value = line.partition(":")
        if not sep:
            # No colon at all: blank line or request/status line.
            continue
        value = value.strip()
        out[key.strip()] = literals.get(value.lower(), value)
    return out
def getLinks(url: str) -> list:
    """Collect the article links found on the page at ``url``.

    The page is fetched and parsed, then searched in two passes:

    1. Inside the ``div`` whose id is ``issue-subject-group-researchpaper``,
       the article anchor of every ``text-container`` div is used.
    2. If that div is missing, or one of its containers has no article
       anchor, every article anchor on the whole page is used instead.

    Args:
        url (str): Address of the page to scan.

    Returns:
        list: Absolute ``https://www.degruyter.com...`` URLs, in page order.

    Raises:
        AttributeError: If the fallback pass finds no article anchor
            ("Not found").
    """
    base_url = "https://www.degruyter.com"
    anchor_attrs = {"class": "issueContentsArticleLink linkHoverDark d-inline-block"}

    # The context manager releases the session's pooled connections.
    with requests.session() as browser:
        page_html = browser.get(url).text
    page = BeautifulSoup(page_html, "lxml")

    group = page.find("div", {"id": "issue-subject-group-researchpaper"})
    if group is not None:
        try:
            return [
                f"{base_url}{box.find('a', anchor_attrs).get('href')}"
                for box in group.find_all("div", {"class": "text-container"})
            ]
        except AttributeError:
            # A container without the expected anchor: discard the partial
            # result and fall back to the page-wide scan below, so no link
            # is reported twice.
            pass

    anchors = page.find_all("a", anchor_attrs)
    if not anchors:
        raise AttributeError("Not found")
    return [f"{base_url}{anchor.get('href')}" for anchor in anchors]
def get_author_details(url: str) -> list:
    """Extract name, email and affiliation of the contributors on a page.

    The page at ``url`` is fetched and parsed. Every ``span.contributor``
    inside the ``contributors list-unstyled mb-2`` list is inspected; its
    ``contributor-popdown`` element supplies the ``email`` and
    ``affiliations`` attributes. Contributors without a popdown element or
    without a non-empty email are skipped.

    Args:
        url (str): Address of the page to scan.

    Returns:
        list: One dict per contributor with the keys ``"Name"``, ``"Email"``
        and ``"Address"``. The list is empty when the page has no
        contributors list or no contributor with an email.
    """
    # The context manager releases the session's pooled connections.
    with requests.session() as browser:
        page_html = browser.get(url).text
    page = BeautifulSoup(page_html, "lxml")

    contributors = page.find("ul", {"class": "contributors list-unstyled mb-2"})
    if contributors is None:
        return []

    output = []
    for author in contributors.find_all("span", {"class": "contributor"}):
        popdown = author.find("contributor-popdown")
        if popdown is None:
            continue

        # A missing attribute comes back as None; treat it like an empty one.
        email = (popdown.get("email") or "").strip()
        if not email:
            continue

        output.append(
            {
                "Name": author.text.strip(),
                "Email": email,
                # The complete affiliations string is kept as-is.
                "Address": (popdown.get("affiliations") or "").strip(),
            }
        )
    return output