import os
from requests import session
from bs4 import BeautifulSoup
import base64
import urllib.parse
import traceback
import json
from sheets import ExcelAutomator
req = session()

def get_headers(data: str) -> dict:
    """Parse a raw header block (one "Key: Value" pair per line) into a dict."""
    data = data.strip()
    data = data.split("\n")
    out = {}
    for dt in data:
        if not dt.strip():
            # Skip blank lines so a stray empty line cannot crash the split below.
            continue
        key = dt.split(":", 1)[0].strip()
        value = dt.split(":", 1)[1].strip()
        if value.lower() == "none":
            value = None
        elif value.lower() == "true":
            value = True
        elif value.lower() == "false":
            value = False
        out[key] = value
    return out

def get_email_from_encoding(encoded_str):
    """Decode an "encoded-e-address" payload (base64 -> URL-decode -> JSON) and return
    the e-mail address, or None if it cannot be extracted."""
    try:
        base64_decoded = base64.b64decode(encoded_str).decode('utf-8')
        url_decoded = urllib.parse.unquote(base64_decoded)
        decoded_json = json.loads(url_decoded)
        try:
            if decoded_json["#name"] == 'e-address':
                if decoded_json['$']['type'] == 'email':
                    if 'href' in decoded_json['$']:
                        if 'mailto:' in decoded_json['$']['href']:
                            return decoded_json['$']['href'].replace("mailto:", "")
                        else:
                            return None
                    else:
                        return decoded_json['_']
                else:
                    return None
            else:
                return None
        except Exception as e:
            # Unexpected payload shape: dump it for inspection before aborting.
            with open("jsondata.json", "w") as op:
                json.dump(decoded_json, op)
            print(f"ERROR : {e},\n---------------------------------------------------------\n{traceback.format_exc()}\n\n---------------------------------------------------------")
            exit()
    except Exception:
        return None
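
# Illustrative sketch (not part of the original scraper): get_email_from_encoding above
# expects JSON that has been URL-encoded and then base64-encoded, carrying the "#name",
# "$" and "_" keys it reads. The sample payload and this helper are hypothetical and only
# reverse those steps so the decoder can be exercised in isolation.
def _demo_email_decoding():
    sample = {
        "#name": "e-address",
        "$": {"type": "email", "href": "mailto:jane.doe@example.org"},
        "_": "jane.doe@example.org",
    }
    # Reverse the decoding pipeline: JSON -> URL-encode -> base64.
    encoded = base64.b64encode(urllib.parse.quote(json.dumps(sample)).encode("utf-8")).decode("utf-8")
    print(get_email_from_encoding(encoded))  # expected output: jane.doe@example.org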

def run(volume: int, last_artical_name: str = None) -> tuple:
    """Fetch the table of contents of one journal volume and collect its author details.

    Args:
        volume (int): Volume number to fetch.
        last_artical_name (str, optional): Title of the previously fetched page, used to
            detect when a non-existent volume redirects to an already-seen page.

    Returns:
        tuple: (author data, page title)
    """
    headers = """
    User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0
    Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8
    Accept-Language: en-US,en;q=0.5
    Accept-Encoding: gzip, deflate, br
    Connection: keep-alive
    Upgrade-Insecure-Requests: 1
    Sec-Fetch-Dest: document
    Sec-Fetch-Mode: navigate
    Sec-Fetch-Site: none
    Sec-Fetch-User: ?1
    Sec-GPC: 1
    """
    headers = get_headers(headers)
    # Build the volume's table-of-contents URL for the target journal.
    url = f"https://www.sciencedirect.com/journal/advances-in-applied-mathematics/vol/{volume}/suppl/C"
    data = req.get(url, headers=headers)
    artical_links = []
    fullpage = BeautifulSoup(data.text, "lxml")
    if fullpage.title.string.strip() == last_artical_name:
        # Same page title as last time: this volume does not exist / we were redirected.
        return None, fullpage.title.string.strip()
    for link in fullpage.find_all("a", {"class": "anchor article-content-title u-margin-xs-top u-margin-s-bottom anchor-default"}):
        artical_links.append("https://www.sciencedirect.com" + link.get("href"))
    print(f"Total articles found : {len(artical_links)}")
    n = 1
    auth = []
    print(f"Getting all articles from - {fullpage.title.string}")
    for li in artical_links:
        print(f"Fetching data of article {n}")
        authors = stage_two(li)
        auth.extend(authors)
        n += 1
    return auth, fullpage.title.string.strip()

def stage_two(url: str) -> list:
    """Fetch a single article page and extract author names, e-mails and affiliations."""
    headers = """
    User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0
    Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8
    Accept-Language: en-US,en;q=0.5
    Accept-Encoding: gzip, deflate, br
    Connection: keep-alive
    Upgrade-Insecure-Requests: 1
    Sec-Fetch-Dest: document
    Sec-Fetch-Mode: navigate
    Sec-Fetch-Site: none
    Sec-Fetch-User: ?1
    Sec-GPC: 1
    """
    headers = get_headers(headers)
    data = req.get(url, headers=headers)
    page = BeautifulSoup(data.text, "lxml")
    # The article metadata is embedded in the page as a JSON <script> block.
    json_data = page.find("script", {"type": "application/json"})
    json_data = json.loads(json_data.text.strip())
    authors_detail = []
    address = json_data['authors']['affiliations']
    n = 1
    if len(json_data['authors']['content']) < 1:
        return authors_detail
    if '$$' not in json_data['authors']['content'][0]:
        # Unexpected structure: dump the payload for inspection and stop.
        with open("jsondata.json", "w") as op:
            json.dump(json_data, op, indent=4)
        print("ERROR Check jsondata file")
        exit()
    address = "Not Found"
    addr = []
    authr = []
    for author in json_data['authors']['content'][0]['$$']:
        if author['#name'] == 'author':
            # Author entry: assemble the display name and decode the e-mail address.
            author_name = " "
            for au in author['$$']:
                if au['#name'] == 'given-name' or au['#name'] == 'name':
                    author_name = au['_'] + author_name
                if au['#name'] == 'surname':
                    author_name = f"{author_name}{au['_']}"
                if au['#name'] == 'encoded-e-address':
                    email = get_email_from_encoding(au['__encoded'])
                    if email:
                        authr.append(
                            {
                                'Name': author_name,
                                'Email': email
                            }
                        )
        if author['#name'] == 'affiliation':
            # Affiliation entry: concatenate its text fragments into one address string.
            for cor in author['$$']:
                if '_' in cor:
                    if address == "Not Found":
                        address = cor['_']
                    else:
                        address = f"{address} {cor['_']}"
            addr.append(address)
    output = []
    for aut in authr:
        try:
            address = addr[authr.index(aut)]
        except IndexError:
            address = "Not Found"
        if address == "Not Found":
            address = ""
        output.append(
            {
                'Name': aut['Name'],
                'Email': aut['Email'],
                'Address': address
            }
        )
    return output

def get_author_info_specific(vol: int) -> list:
    print(f"Getting details of volume {vol}")
    data, page_title = run(vol)
    return data

def get_author_info_in_range(from_vol: int, to_vol: int) -> list:
    allAuthors = []
    last_page_title = None
    for i in range(from_vol, to_vol + 1):
        print(f"Getting data of vol {i}")
        try:
            data, page_title = run(i, last_page_title)
            if last_page_title == page_title:
                # The page title did not change, so volume i does not exist: stop here.
                print(f"All issues covered of vol {i}, changing volume")
                print("--------------------------------------------------------------------------")
                break
            else:
                last_page_title = page_title
                allAuthors.extend(data)
                print(f"Data received, total authors : {len(allAuthors)}")
        except Exception:
            print(f"ERROR : {traceback.format_exc()}")
            print(f"All issues covered of vol {i}")
            print("--------------------------------------------------------------------------")
            break
    return allAuthors
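
# Minimal usage sketch (an assumption, not shown in this part of the file): crawl a small
# range of volumes and dump the collected authors to JSON. The module imports ExcelAutomator
# from sheets, presumably to export these rows to a spreadsheet, but its interface is not
# shown here, so plain JSON output is used instead. Call _demo_crawl() manually to try it.
def _demo_crawl(from_vol: int = 1, to_vol: int = 3, out_path: str = "authors.json"):
    authors = get_author_info_in_range(from_vol, to_vol)
    with open(out_path, "w") as fp:
        json.dump(authors, fp, indent=4)
    print(f"Saved {len(authors)} author records to {out_path}")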