Spaces:
Sleeping
Sleeping
File size: 3,271 Bytes
57273d8 5cde1d2 57273d8 5cde1d2 57273d8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 |
import requests
from bs4 import BeautifulSoup
import json
def get_headers(data: str) -> dict:
    """Parse a copied request/response header block into a dict.

    Args:
        data (str): Headers as a multi-line string, one "Key: Value" pair
            per line (e.g. copied from Firefox's network inspector via
            "Copy Request Headers" / "Copy Response Headers").

    Returns:
        dict: Header names mapped to their values. Values equal to
            "none"/"true"/"false" (case-insensitive) are converted to
            None/True/False; every other value stays a string.
    """
    out = {}
    for line in data.strip().split("\n"):
        # partition() splits on the FIRST colon only, so values that
        # themselves contain ":" (URLs, timestamps) stay intact.
        key, sep, value = line.partition(":")
        if not sep:
            # Blank line or no "Key: Value" separator — skip instead of
            # crashing (the previous split()[1] raised IndexError here).
            continue
        key = key.strip()
        value = value.strip()
        # Convenience conversion of literal boolean/None values,
        # preserved from the original behavior.
        lowered = value.lower()
        if lowered == "none":
            value = None
        elif lowered == "true":
            value = True
        elif lowered == "false":
            value = False
        out[key] = value
    return out
def get_all_articals_link(url: str) -> list:
    """Collect article hrefs from a Springer "volumes and issues" page.

    Args:
        url (str): Issue page URL, e.g.
            "https://link.springer.com/journal/208/volumes-and-issues/62-1".

    Returns:
        list: The href of the first <a> in each <li> of the page's
            <ol class="u-list-reset"> listing; empty list when the
            listing is absent (layout change or blocked request).
    """
    headers = """
    User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0
    Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8
    Accept-Language: en-US,en;q=0.5
    Accept-Encoding: gzip, deflate, br
    Referer: https://link.springer.com/journal/208/volumes-and-issues
    Alt-Used: link.springer.com
    Connection: keep-alive
    Upgrade-Insecure-Requests: 1
    Sec-Fetch-Dest: document
    Sec-Fetch-Mode: navigate
    Sec-Fetch-Site: same-origin
    Sec-Fetch-User: ?1
    Sec-GPC: 1
    TE: trailers
    """
    head = get_headers(headers)
    # Context manager closes the session (and its pooled connections)
    # even if the request raises; the old code leaked it.
    with requests.Session() as browser:
        data = browser.get(url, headers=head)
    fullpage = BeautifulSoup(data.text, "lxml")
    orderlist = fullpage.find("ol", {"class": "u-list-reset"})
    all_links = []
    if orderlist is None:
        # No article listing on this page — previously this crashed
        # with AttributeError on .findAll.
        return all_links
    for item in orderlist.findAll("li"):
        anchor = item.find("a")
        if anchor:
            all_links.append(anchor.get("href"))
    return all_links
def get_authors(url: str) -> list:
    """Extract authors that list an email address from a Springer article page.

    Reads the page's JSON-LD metadata (<script type="application/ld+json">)
    and keeps only authors whose record contains an 'email' key.

    Args:
        url (str): Article page URL on link.springer.com.

    Returns:
        list: Dicts of the form {"Name": ..., "Email": ..., "Address": ...};
            "Address" is "" when no affiliation address is present.
    """
    headers = """
    User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0
    Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8
    Accept-Language: en-US,en;q=0.5
    Accept-Encoding: gzip, deflate, br
    Referer: https://link.springer.com/journal/208/volumes-and-issues
    Alt-Used: link.springer.com
    Connection: keep-alive
    Upgrade-Insecure-Requests: 1
    Sec-Fetch-Dest: document
    Sec-Fetch-Mode: navigate
    Sec-Fetch-Site: same-origin
    Sec-Fetch-User: ?1
    Sec-GPC: 1
    TE: trailers
    """
    head = get_headers(headers)
    # Context manager closes the session even on error; the old code leaked it.
    with requests.Session() as browser:
        data = browser.get(url, headers=head)
    main_page = BeautifulSoup(data.text, "lxml")
    json_data = main_page.find("script", {"type": "application/ld+json"}).text
    json_data = json.loads(json_data)
    authors = json_data['mainEntity']['author']
    output = []
    for author in authors:
        if 'email' not in author:
            continue
        try:
            address = author['affiliation'][0]['address']['name']
        except (KeyError, IndexError, TypeError):
            # Affiliation data is optional/irregular in the JSON-LD;
            # narrowed from a bare `except:` that swallowed everything.
            address = ""
        output.append(
            {
                "Name" : author['name'],
                'Email' : author['email'],
                'Address' : address
            }
        )
    return output
|