PrmScrp / sciencedirect.py
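"""Scraper for ScienceDirect journal issue pages: walks every article in an
issue and collects each author's name, e-mail address, and affiliation."""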
import os
from requests import session
from bs4 import BeautifulSoup
import base64
import urllib.parse
import traceback
import json
from sheets import ExcelAutomator
req = session()
def get_headers(data: str) -> dict:
    """Parse a raw "Key: Value" header block (one header per line) into a dict."""
    data = data.strip()
    data = data.split("\n")
    out = {}
    for dt in data:
        key = dt.split(":", 1)[0].strip()
        value = dt.split(":", 1)[1].strip()
        # Convert the literal strings "none"/"true"/"false" to their Python values
        if value.lower() == "none":
            value = None
        elif value.lower() == "true":
            value = True
        elif value.lower() == "false":
            value = False
        out[key] = value
    return out
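# Illustrative example (made-up header lines, not from the original script):
#   get_headers("Accept: text/html\nSec-GPC: 1\nX-Example-Flag: true")
#   -> {"Accept": "text/html", "Sec-GPC": "1", "X-Example-Flag": True}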
def get_email_from_encoding(encoded_str):
    """Decode ScienceDirect's base64 + URL-encoded "e-address" blob and return the email, if any."""
    base64_decoded = base64.b64decode(encoded_str).decode('utf-8')
    url_decoded = urllib.parse.unquote(base64_decoded)
    decoded_json = json.loads(url_decoded)
    try:
        if decoded_json["#name"] == 'e-address':
            if decoded_json['$']['type'] == 'email':
                if 'href' in decoded_json['$']:
                    if 'mailto:' in decoded_json['$']['href']:
                        return decoded_json['$']['href'].replace("mailto:", "")
                    else:
                        return None
                else:
                    # No href attribute, so the address is stored as plain text
                    return decoded_json['_']
            else:
                return None
        else:
            return None
    except Exception as e:
        # Dump the decoded payload for inspection before bailing out
        with open("jsondata.json", "w") as op:
            json.dump(decoded_json, op)
        print(f"ERROR : {e},\n---------------------------------------------------------\n{traceback.format_exc()}\n\n---------------------------------------------------------")
        exit()
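# Illustrative shape of the decoded payload this function expects (field names are
# taken from the checks above; the concrete address is made up):
#   {"#name": "e-address", "$": {"type": "email", "href": "mailto:jane.doe@example.org"}}
#   -> "jane.doe@example.org"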
def run(volume: int, issue: int, last_artical_name: str = None) -> tuple:
    """This function helps to get the details from the issue page.

    Args:
        volume (int): Pass the volume number
        issue (int): Pass the issue number
        last_artical_name (str, optional): Title of the previously scraped issue page,
            used to detect that no new issue exists.

    Returns:
        tuple: It includes author data and the page title
    """
    headers = """
    User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0
    Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8
    Accept-Language: en-US,en;q=0.5
    Accept-Encoding: gzip, deflate, br
    Connection: keep-alive
    Upgrade-Insecure-Requests: 1
    Sec-Fetch-Dest: document
    Sec-Fetch-Mode: navigate
    Sec-Fetch-Site: none
    Sec-Fetch-User: ?1
    Sec-GPC: 1
    """
    headers = get_headers(headers)
    # Issue page of the target journal
    url = f"https://www.sciencedirect.com/journal/acta-mathematica-scientia/vol/{volume}/issue/{issue}"
    data = req.get(url, headers=headers)
    artical_links = []
    fullpage = BeautifulSoup(str(data.text), "lxml")
    # The same title as last time means this issue does not exist, so stop here
    if fullpage.title.string.strip() == last_artical_name:
        return None, fullpage.title.string.strip()
    for link in fullpage.findAll("a", {"class": "anchor article-content-title u-margin-xs-top u-margin-s-bottom anchor-default"}):
        artical_links.append("https://www.sciencedirect.com" + link.get("href"))
    print(f"Total articles found : {len(artical_links)}")
    n = 1
    auth = []
    print(f"Getting all articles from - {fullpage.title.string}")
    for li in artical_links:
        print(f"Fetching data of article {n}")
        authors = stage_two(li)
        auth.extend(authors)
        n += 1
    return auth, fullpage.title.string.strip()
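# Illustrative call (the volume and issue numbers are made up):
#   authors, title = run(43, 1)
# `authors` is a list of {'Name', 'Email', 'Address'} dicts, or None when the page
# title matches last_artical_name (i.e. there was no new issue to scrape).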
def stage_two(url: str) -> list:
    """Fetch a single article page and collect its authors' names, emails, and addresses."""
    headers = """
    User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0
    Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8
    Accept-Language: en-US,en;q=0.5
    Accept-Encoding: gzip, deflate, br
    Connection: keep-alive
    Upgrade-Insecure-Requests: 1
    Sec-Fetch-Dest: document
    Sec-Fetch-Mode: navigate
    Sec-Fetch-Site: none
    Sec-Fetch-User: ?1
    Sec-GPC: 1
    """
    headers = get_headers(headers)
    data = req.get(url, headers=headers)
    page = BeautifulSoup(data.text, "lxml")
    # Author and affiliation data is embedded in the page as an application/json <script> block
    json_data = page.find("script", {"type": "application/json"})
    json_data = json.loads(json_data.text.strip())
    authors_detail = []
    address = json_data['authors']['affiliations']
    n = 1
    if len(json_data['authors']['content']) < 1:
        return authors_detail
    if '$$' not in json_data['authors']['content'][0]:
        # Unexpected structure: dump the JSON for inspection and stop
        with open("jsondata.json", "w") as op:
            json.dump(json_data, op, indent=4)
        print("ERROR Check jsondata file")
        exit()
    address = "Not Found"
    addr = []
    authr = []
    email = None
    for author in json_data['authors']['content'][0]['$$']:
        if author['#name'] == 'author':
            # It's author data: build the name and look for an encoded email address
            author_name = " "
            for au in author['$$']:
                if au['#name'] == 'given-name' or au['#name'] == 'name':
                    author_name = au['_'] + author_name
                if au['#name'] == 'surname':
                    author_name = f"{author_name}{au['_']}"
                if au['#name'] == 'encoded-e-address':
                    email = get_email_from_encoding(au['__encoded'])
            if email:
                authr.append(
                    {
                        'Name' : author_name,
                        'Email' : email
                    }
                )
            else:
                continue
        if author['#name'] == 'affiliation':
            # Concatenate every text fragment of the affiliation into one address string
            for cor in author['$$']:
                if '_' in cor:
                    if address == "Not Found":
                        address = cor['_']
                    else:
                        address = f"{address} {cor['_']}"
            addr.append(address)
    output = []
    for aut in authr:
        # Pair each author with the affiliation at the same position, if there is one
        try:
            address = addr[authr.index(aut)]
        except:
            address = "Not Found"
        if address == "Not Found":
            address = url
        output.append(
            {
                'Name' : aut['Name'],
                'Email' : aut['Email'],
                'Address' : address
            }
        )
    return output
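# Illustrative return value (made-up data):
#   [{'Name': 'Jane Doe', 'Email': 'jane.doe@example.org',
#     'Address': 'School of Mathematics, Example University'}]
# When no affiliation text is found for an author, 'Address' falls back to the article URL.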
def get_author_info_specific(vol: int, issue: int) -> list:
    print(f"Getting details of volume {vol} and issue {issue}")
    data, page_title = run(vol, issue)
    return data
def get_author_info_in_range(from_vol: int, to_vol: int) -> list:
    allAuthors = []
    last_page_title = None
    for i in range(from_vol, to_vol + 1):
        print(f"Getting data of vol {i}")
        d = 1
        while True:
            try:
                data, page_title = run(i, d, last_page_title)
                if last_page_title == page_title:
                    # The issue page title repeated, so there are no more issues in this volume
                    print(f"All issues covered of vol {i}, changing volume")
                    print("--------------------------------------------------------------------------")
                    break
                else:
                    last_page_title = page_title
                    allAuthors.extend(data)
                    print(f"Issue {d} data received, total authors : {len(allAuthors)}")
            except Exception as e:
                print(f"ERROR : {traceback.format_exc()}")
                print(f"All issues covered of vol {i}")
                print("--------------------------------------------------------------------------")
                break
            d += 1
    return allAuthors
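# Hypothetical driver (not part of the original script): shows one way to run the
# range scraper and persist the result. The volume range and output filename are
# assumptions; ExcelAutomator is imported above but its API is not shown here,
# so this sketch writes plain JSON instead.
if __name__ == "__main__":
    authors = get_author_info_in_range(40, 43)  # assumed volume range
    with open("authors.json", "w") as fp:
        json.dump(authors, fp, indent=4)
    print(f"Saved {len(authors)} author records to authors.json")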