import os
import base64
import json
import traceback
import urllib.parse

from bs4 import BeautifulSoup
from requests import session

from sheets import ExcelAutomator

req = session()


def get_headers(data: str) -> dict:
    """Parse a copy-pasted block of "Key: Value" request headers into a dict."""
    out = {}
    for line in data.strip().split("\n"):
        key, value = line.split(":", 1)
        key = key.strip()
        value = value.strip()
        if value.lower() == "none":
            value = None
        elif value.lower() == "true":
            value = True
        elif value.lower() == "false":
            value = False
        out[key] = value
    return out


def get_email_from_encoding(encoded_str):
    """Decode ScienceDirect's base64 + URL-encoded author payload and extract the e-mail."""
    base64_decoded = base64.b64decode(encoded_str).decode("utf-8")
    url_decoded = urllib.parse.unquote(base64_decoded)
    decoded_json = json.loads(url_decoded)
    try:
        if decoded_json["#name"] != "e-address":
            return None
        if decoded_json["$"]["type"] != "email":
            return None
        if "href" in decoded_json["$"]:
            if "mailto:" in decoded_json["$"]["href"]:
                return decoded_json["$"]["href"].replace("mailto:", "")
            return None
        return decoded_json["_"]
    except Exception as e:
        # Dump the unexpected payload for inspection before aborting.
        with open("jsondata.json", "w") as op:
            json.dump(decoded_json, op)
        print(
            f"ERROR : {e},\n"
            "---------------------------------------------------------\n"
            f"{traceback.format_exc()}\n\n"
            "---------------------------------------------------------"
        )
        exit()


def run(volume: int, issue: int, last_artical_name: str = None) -> tuple:
    """Fetch one journal issue page and collect author data from every article in it.

    Args:
        volume (int): Pass the volume number
        issue (int): Pass the issue number
        last_artical_name (str): Title of the previously scraped issue page; a repeated
            title means there are no more issues in this volume.

    Returns:
        tuple: (author data, page title)
    """
    headers = """
    User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0
    Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8
    Accept-Language: en-US,en;q=0.5
    Accept-Encoding: gzip, deflate, br
    Connection: keep-alive
    Upgrade-Insecure-Requests: 1
    Sec-Fetch-Dest: document
    Sec-Fetch-Mode: navigate
    Sec-Fetch-Site: none
    Sec-Fetch-User: ?1
    Sec-GPC: 1
    """
    headers = get_headers(headers)
    url = f"https://www.sciencedirect.com/journal/acta-mathematica-scientia/vol/{volume}/issue/{issue}"
    data = req.get(url, headers=headers)

    fullpage = BeautifulSoup(data.text, "lxml")
    # If the page title matches the previous issue's title, ScienceDirect served the
    # same page again: this volume has no further issues.
    if fullpage.title.string.strip() == last_artical_name:
        return None, fullpage.title.string.strip()

    artical_links = []
    for link in fullpage.find_all(
        "a",
        {"class": "anchor article-content-title u-margin-xs-top u-margin-s-bottom anchor-default"},
    ):
        artical_links.append("https://www.sciencedirect.com" + link.get("href"))
    print(f"Total articles found : {len(artical_links)}")

    auth = []
    print(f"Getting all articles from - {fullpage.title.string}")
    for n, li in enumerate(artical_links, start=1):
        print(f"Fetching data of article {n}")
        authors = stage_two(li)
        auth.extend(authors)
    return auth, fullpage.title.string.strip()


def stage_two(url: str) -> list:
    """Fetch a single article page and return its authors with e-mail and address."""
    headers = """
    User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0
    Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8
    Accept-Language: en-US,en;q=0.5
    Accept-Encoding: gzip, deflate, br
    Connection: keep-alive
    Upgrade-Insecure-Requests: 1
    Sec-Fetch-Dest: document
    Sec-Fetch-Mode: navigate
    Sec-Fetch-Site: none
    Sec-Fetch-User: ?1
    Sec-GPC: 1
    """
    headers = get_headers(headers)
    data = req.get(url, headers=headers)
    page = BeautifulSoup(data.text, "lxml")

    # Article metadata (authors, affiliations, encoded e-mails) is embedded as JSON.
    json_data = page.find("script", {"type": "application/json"})
    json_data = json.loads(json_data.text.strip())

    authors_detail = []
    if len(json_data["authors"]["content"]) < 1:
        return authors_detail
    if "$$" not in json_data["authors"]["content"][0]:
        # Unexpected structure: dump the JSON so it can be inspected, then abort.
        with open("jsondata.json", "w") as op:
            json.dump(json_data, op, indent=4)
        print("ERROR Check jsondata file")
        exit()

    address = "Not Found"
    addr = []
    authr = []
    for author in json_data["authors"]["content"][0]["$$"]:
        if author["#name"] == "author":
            # Build the author's full name and look for an encoded e-mail address.
            author_name = " "
            email = None  # reset per author so a previous author's e-mail is not reused
            for au in author["$$"]:
                if au["#name"] == "given-name" or au["#name"] == "name":
                    author_name = au["_"] + author_name
                if au["#name"] == "surname":
                    author_name = f"{author_name}{au['_']}"
                if au["#name"] == "encoded-e-address":
                    email = get_email_from_encoding(au["__encoded"])
            if email:
                authr.append({"Name": author_name, "Email": email})
            else:
                continue
        if author["#name"] == "affiliation":
            for cor in author["$$"]:
                if "_" in cor:
                    if address == "Not Found":
                        address = cor["_"]
                    else:
                        address = f"{address} {cor['_']}"
            addr.append(address)

    output = []
    for aut in authr:
        try:
            address = addr[authr.index(aut)]
        except IndexError:
            address = "Not Found"
        if address == "Not Found":
            # Fall back to the article URL when no affiliation text was found.
            address = url
        output.append({"Name": aut["Name"], "Email": aut["Email"], "Address": address})
    return output


def get_author_info_specific(vol: int, issue: int) -> list:
    print(f"Getting detail of volume {vol} and issue {issue}")
    data, page_title = run(vol, issue)
    return data


def get_author_info_in_range(from_vol: int, to_vol: int) -> list:
    allAuthors = []
    last_page_title = None
    for i in range(from_vol, to_vol + 1):
        print(f"Getting data of vol {i}")
        d = 1
        while True:
            try:
                data, page_title = run(i, d, last_page_title)
                if last_page_title == page_title:
                    # The issue page title repeated, so this volume has no more issues.
                    print(f"All issues covered of vol {i} changing volume")
                    print("--------------------------------------------------------------------------")
                    break
                else:
                    last_page_title = page_title
                allAuthors.extend(data)
                print(f"Issue {d} data received, total authors : {len(allAuthors)}")
            except Exception:
                print(f"ERROR : {traceback.format_exc()}")
                print(f"All issues covered of vol {i}")
                print("--------------------------------------------------------------------------")
                break
            d += 1
    return allAuthors
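

# --- Round-trip sketch for get_email_from_encoding (illustrative only, not part of
# the original scraper). ScienceDirect serves author e-mails as JSON that has been
# URL-encoded and then base64-encoded; this helper builds a synthetic payload in that
# shape, encodes it the same way, and checks that the decoder returns the address.
# The e-mail address below is made up. Call it manually (e.g. from a REPL) if needed.
def _demo_email_decoding() -> None:
    sample = {
        "#name": "e-address",
        "$": {"type": "email", "href": "mailto:jane.doe@example.com"},
    }
    encoded = base64.b64encode(
        urllib.parse.quote(json.dumps(sample)).encode("utf-8")
    ).decode("ascii")
    decoded = get_email_from_encoding(encoded)
    print(decoded)  # expected: jane.doe@example.com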
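

# --- Usage sketch (an assumption, not from the original file): drives the range
# scraper end to end. The volume range is hypothetical, and the export step is left
# commented out because the ExcelAutomator interface from the local `sheets` module
# is not shown in this file.
if __name__ == "__main__":
    authors = get_author_info_in_range(40, 42)
    print(f"Collected {len(authors)} author records")
    # Assumed export step; adapt to the real ExcelAutomator API in sheets.py:
    # writer = ExcelAutomator("authors.xlsx")
    # writer.save(authors)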