Spaces:

pryanshusharma
/

PrmScrp

Sleeping

File size: 2,476 Bytes

import requests
from bs4 import BeautifulSoup

def get_headers(data: str) -> dict:
    """Convert a raw block of HTTP headers into a dictionary.

    Each line is split at its first colon into a header name and a value,
    both stripped of surrounding whitespace. Values spelling ``none``,
    ``true`` or ``false`` (case-insensitive) are converted to ``None``,
    ``True`` and ``False``; every other value stays a string.

    Lines that contain no colon (blank lines, or a request/status line such
    as ``GET / HTTP/1.1``) are skipped rather than raising ``IndexError``.

    Args:
        data (str): The headers as text, one ``Name: value`` pair per line
            (for example the request or response headers copied from the
            Firefox network inspector).

    Returns:
        dict: Mapping of header name to its converted value. When a name
            appears more than once, the last occurrence wins.
    """
    # Textual spellings that are mapped to real Python objects.
    literals = {"none": None, "true": True, "false": False}

    out = {}
    for line in data.split("\n"):
        key, separator, value = line.partition(":")
        if not separator:
            # No colon on this line, so there is no header to extract.
            continue
        value = value.strip()
        out[key.strip()] = literals.get(value.lower(), value)
    return out

def getLinks(url: str) -> list:
    """Collect the article links found on an issue page.

    The page at ``url`` is downloaded and parsed. Links are first looked up
    inside the ``issue-subject-group-researchpaper`` section, one per
    ``text-container`` block. If that section is missing, or one of its
    blocks has no article link, every article link on the whole page is
    used instead.

    Args:
        url (str): Address of the issue page to scan.

    Returns:
        list: Absolute ``https://www.degruyter.com`` URLs, in page order.

    Raises:
        AttributeError: If the fallback search finds no article link.
    """
    article_link = {"class": "issueContentsArticleLink linkHoverDark d-inline-block"}

    # The context manager closes the session's connections once the page
    # has been downloaded.
    with requests.session() as browser:
        data = browser.get(url)
    fullPage = BeautifulSoup(data.text, "lxml")

    anchors = None
    section = fullPage.find("div", {"id": "issue-subject-group-researchpaper"})
    if section is not None:
        anchors = []
        for container in section.find_all("div", {"class": "text-container"}):
            anchor = container.find("a", article_link)
            if anchor is None:
                # Unexpected layout: drop the partial result so the fallback
                # below does not produce duplicate links.
                anchors = None
                break
            anchors.append(anchor)

    if anchors is None:
        anchors = fullPage.find_all("a", article_link)
        if len(anchors) < 1:
            raise AttributeError("Not found")

    # Anchors without an href would otherwise yield a bogus "...comNone" URL.
    return [
        f"https://www.degruyter.com{anchor.get('href')}"
        for anchor in anchors
        if anchor.get("href")
    ]

def get_author_details(url: str) -> list:
    """Extract name, e-mail and affiliation of the authors of an article.

    The page at ``url`` is downloaded and parsed. Every ``contributor``
    entry of the contributors list is inspected; its details are read from
    the attributes of the nested ``contributor-popdown`` element. Authors
    without that element or without an e-mail address are left out.

    Args:
        url (str): Address of the article page.

    Returns:
        list: One dict per author with the keys ``"Name"``, ``"Email"`` and
            ``"Address"``. The address is the full ``affiliations``
            attribute, or an empty string when the attribute is absent.

    Raises:
        AttributeError: If the page contains no contributors list.
    """
    # The context manager closes the session's connections once the page
    # has been downloaded.
    with requests.session() as browser:
        data = browser.get(url)
    page = BeautifulSoup(data.text, "lxml")

    contributors = page.find("ul", {"class": "contributors list-unstyled mb-2"})
    if contributors is None:
        # Same exception type the implicit None access used to raise, but
        # with an explicit message.
        raise AttributeError("Not found")

    output = []
    for author in contributors.find_all("span", {"class": "contributor"}):
        popdown = author.find("contributor-popdown")
        if popdown is None:
            continue

        # A missing attribute is treated like an empty one.
        email = (popdown.get("email") or "").strip()
        if not email:
            continue

        output.append(
            {
                "Name": author.text.strip(),
                "Email": email,
                "Address": (popdown.get("affiliations") or "").strip(),
            }
        )
    return output