from requests import session
from bs4 import BeautifulSoup
import base64
import urllib.parse
import traceback
import json
from sheets import ExcelAutomator

# One shared session so cookies persist across all requests.
req = session()

def get_headers(data: str) -> dict:
    """Parse a newline-separated "Key: Value" block into a headers dict."""
    out = {}
    for line in data.strip().split("\n"):
        if not line.strip():
            continue
        key, value = line.split(":", 1)
        key, value = key.strip(), value.strip()
        if value.lower() == "none":
            value = None
        elif value.lower() == "true":
            value = True
        elif value.lower() == "false":
            value = False
        out[key] = value
    return out
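
# For example, get_headers("Accept: text/html\nSec-GPC: 1") yields
# {"Accept": "text/html", "Sec-GPC": "1"}.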

def get_email_from_encoding(encoded_str: str):
    """Decode the base64 + URL-encoded e-address blob embedded in the page
    and return the email address, or None if the entry is not an email."""
    decoded_json = None
    try:
        base64_decoded = base64.b64decode(encoded_str).decode('utf-8')
        url_decoded = urllib.parse.unquote(base64_decoded)
        decoded_json = json.loads(url_decoded)
        if decoded_json["#name"] != 'e-address':
            return None
        if decoded_json['$']['type'] != 'email':
            return None
        if 'href' in decoded_json['$']:
            if 'mailto:' in decoded_json['$']['href']:
                return decoded_json['$']['href'].replace("mailto:", "")
            return None
        return decoded_json['_']
    except Exception as e:
        # Dump whatever was decoded so the failure can be inspected offline.
        with open("jsondata.json", "w") as op:
            json.dump(decoded_json, op)
        print(f"ERROR : {e},\n---------------------------------------------------------\n{traceback.format_exc()}\n\n---------------------------------------------------------")
        exit(1)
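
# Round-trip sketch with made-up data (not a real ScienceDirect payload):
# the blob is built the same way the site encodes it (JSON, URL-encode,
# then base64), so get_email_from_encoding can decode it back.
#
#   payload = {"#name": "e-address",
#              "$": {"type": "email", "href": "mailto:jane.doe@example.com"}}
#   blob = base64.b64encode(
#       urllib.parse.quote(json.dumps(payload)).encode()).decode()
#   get_email_from_encoding(blob)  # -> "jane.doe@example.com"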

def run(volume: int, issue: int, last_article_name: str = None) -> tuple:
    """This function helps to get the details from the issue page.

    Args:
        volume (int): Pass the volume number
        issue (int): Pass the issue number
        last_article_name (str, optional): Title of the previously scraped
            page, used to detect that a volume has no more issues.

    Returns:
        tuple : It includes auth data and page title
    """
    headers = """
    User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0
    Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8
    Accept-Language: en-US,en;q=0.5
    Accept-Encoding: gzip, deflate, br
    Connection: keep-alive
    Upgrade-Insecure-Requests: 1
    Sec-Fetch-Dest: document
    Sec-Fetch-Mode: navigate
    Sec-Fetch-Site: none
    Sec-Fetch-User: ?1
    Sec-GPC: 1
    """
    headers = get_headers(headers)
    url = f"https://www.sciencedirect.com/journal/acta-mathematica-scientia/vol/{volume}/issue/{issue}"
    data = req.get(url, headers=headers)
    article_links = []
    fullpage = BeautifulSoup(data.text, "lxml")
    # An unchanged page title means the requested issue does not exist,
    # so the caller can stop iterating over this volume.
    if fullpage.title.string.strip() == last_article_name:
        return None, fullpage.title.string.strip()
    for link in fullpage.find_all("a", {"class": "anchor article-content-title u-margin-xs-top u-margin-s-bottom anchor-default"}):
        article_links.append("https://www.sciencedirect.com" + link.get("href"))
    print(f"Total articles found : {len(article_links)}")
    auth = []
    print(f"Getting all articles from - {fullpage.title.string}")
    for n, li in enumerate(article_links, start=1):
        print(f"Fetching data of article {n}")
        auth.extend(stage_two(li))
    return auth, fullpage.title.string.strip()
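
# For example, run(39, 1) fetches .../vol/39/issue/1 and returns a tuple of
# (author dicts collected via stage_two, the issue page title); volume 39 is
# just an illustrative value.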

def stage_two(url: str) -> list:
    """Fetch a single article page and return its authors with email and address."""
    headers = """
    User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0
    Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8
    Accept-Language: en-US,en;q=0.5
    Accept-Encoding: gzip, deflate, br
    Connection: keep-alive
    Upgrade-Insecure-Requests: 1
    Sec-Fetch-Dest: document
    Sec-Fetch-Mode: navigate
    Sec-Fetch-Site: none
    Sec-Fetch-User: ?1
    Sec-GPC: 1
    """
    headers = get_headers(headers)
    data = req.get(url, headers=headers)
    page = BeautifulSoup(data.text, "lxml")
    # The article metadata lives in an embedded JSON <script> block.
    json_data = page.find("script", {"type": "application/json"})
    json_data = json.loads(json_data.text.strip())
    authors_detail = []
    if len(json_data['authors']['content']) < 1:
        return authors_detail
    if '$$' not in json_data['authors']['content'][0]:
        with open("jsondata.json", "w") as op:
            json.dump(json_data, op, indent=4)
        print("ERROR Check jsondata file")
        exit(1)
    addr = []
    authr = []
    for author in json_data['authors']['content'][0]['$$']:
        if author['#name'] == 'author':
            author_name = " "
            email = None  # reset per author so an email never leaks to the next author
            for au in author['$$']:
                if au['#name'] == 'given-name' or au['#name'] == 'name':
                    author_name = au['_'] + author_name
                if au['#name'] == 'surname':
                    author_name = f"{author_name}{au['_']}"
                if au['#name'] == 'encoded-e-address':
                    email = get_email_from_encoding(au['__encoded'])
            # Only authors who expose an email address are kept.
            if email:
                authr.append({'Name': author_name.strip(), 'Email': email})
        if author['#name'] == 'affiliation':
            # Join this affiliation's text fragments into a single address.
            parts = [cor['_'] for cor in author['$$'] if '_' in cor]
            addr.append(" ".join(parts) if parts else "Not Found")
    output = []
    # Pair authors with affiliations by position; fall back to the article URL
    # when no affiliation text was found.
    for i, aut in enumerate(authr):
        try:
            address = addr[i]
        except IndexError:
            address = "Not Found"
        if address == "Not Found":
            address = url
        output.append({'Name': aut['Name'], 'Email': aut['Email'], 'Address': address})
    return output
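
# Illustrative shape of stage_two's return value (made-up names):
#   [{'Name': 'Jane Doe', 'Email': 'jane.doe@example.com',
#     'Address': 'Dept. of Mathematics, Some University'}]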

def get_author_info_specific(vol: int, issue: int) -> list:
    print(f"Getting detail of volume {vol} and issue {issue}")
    data, _ = run(vol, issue)
    return data

def get_author_info_in_range(from_vol: int, to_vol: int) -> list:
    all_authors = []
    last_page_title = None
    for i in range(from_vol, to_vol + 1):
        print(f"Getting data of vol {i}")
        d = 1
        while True:
            try:
                data, page_title = run(i, d, last_page_title)
                if last_page_title == page_title:
                    print(f"All issues covered of vol {i}, changing volume")
                    print("--------------------------------------------------------------------------")
                    break
                last_page_title = page_title
                all_authors.extend(data)
                print(f"Issue {d} data received, total authors : {len(all_authors)}")
            except Exception:
                print(f"ERROR : {traceback.format_exc()}")
                print(f"All issues covered of vol {i}")
                print("--------------------------------------------------------------------------")
                break
            d += 1
    return all_authors
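
# Minimal usage sketch. The volume range below is arbitrary, and results are
# dumped to JSON here because ExcelAutomator's interface is not shown in this
# file; swap in the spreadsheet export if that is the intended sink.
if __name__ == "__main__":
    authors = get_author_info_in_range(39, 41)
    with open("authors.json", "w") as fp:
        json.dump(authors, fp, indent=4)
    print(f"Saved {len(authors)} author records to authors.json")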