PrmScrp / sciencedirect.py
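"""Scraper for ScienceDirect journal issue pages: walks every article in an
issue and collects each author's name, e-mail address, and affiliation."""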
import os
from requests import session
from bs4 import BeautifulSoup
import base64
import urllib.parse
import traceback
import json
from sheets import ExcelAutomator
req = session()
def get_headers(data: str) -> dict:
    """Parse a raw "Key: Value" header block (one header per line) into a dict."""
    data = data.strip()
    data = data.split("\n")
    out = {}
    for dt in data:
        key = dt.split(":", 1)[0].strip()
        value = dt.split(":", 1)[1].strip()
        # Convert the literal strings "none"/"true"/"false" to their Python values
        if value.lower() == "none":
            value = None
        elif value.lower() == "true":
            value = True
        elif value.lower() == "false":
            value = False
        out[key] = value
    return out
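# Illustrative example (made-up header lines, not from the original script):
#   get_headers("Accept: text/html\nSec-GPC: 1\nX-Example-Flag: true")
#   -> {"Accept": "text/html", "Sec-GPC": "1", "X-Example-Flag": True}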
def get_email_from_encoding(encoded_str):
    """Decode ScienceDirect's base64 + URL-encoded "e-address" blob and return the email, if any."""
    base64_decoded = base64.b64decode(encoded_str).decode('utf-8')
    url_decoded = urllib.parse.unquote(base64_decoded)
    decoded_json = json.loads(url_decoded)
    try:
        if decoded_json["#name"] == 'e-address':
            if decoded_json['$']['type'] == 'email':
                if 'href' in decoded_json['$']:
                    if 'mailto:' in decoded_json['$']['href']:
                        return decoded_json['$']['href'].replace("mailto:", "")
                    else:
                        return None
                else:
                    # No href attribute, so the address is stored as plain text
                    return decoded_json['_']
            else:
                return None
        else:
            return None
    except Exception as e:
        # Dump the decoded payload for inspection before bailing out
        with open("jsondata.json", "w") as op:
            json.dump(decoded_json, op)
        print(f"ERROR : {e},\n---------------------------------------------------------\n{traceback.format_exc()}\n\n---------------------------------------------------------")
        exit()
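# Illustrative shape of the decoded payload this function expects (field names are
# taken from the checks above; the concrete address is made up):
#   {"#name": "e-address", "$": {"type": "email", "href": "mailto:jane.doe@example.org"}}
#   -> "jane.doe@example.org"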
def run(volume: int, issue: int, last_artical_name: str = None) -> tuple:
    """This function helps to get the details from the issue page.

    Args:
        volume (int): Pass the volume number
        issue (int): Pass the issue number
        last_artical_name (str, optional): Title of the previously scraped issue page,
            used to detect that no new issue exists.

    Returns:
        tuple: It includes author data and the page title
    """
    headers = """
    User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0
    Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8
    Accept-Language: en-US,en;q=0.5
    Accept-Encoding: gzip, deflate, br
    Connection: keep-alive
    Upgrade-Insecure-Requests: 1
    Sec-Fetch-Dest: document
    Sec-Fetch-Mode: navigate
    Sec-Fetch-Site: none
    Sec-Fetch-User: ?1
    Sec-GPC: 1
    """
    headers = get_headers(headers)
    # Issue page of the target journal
    url = f"https://www.sciencedirect.com/journal/acta-mathematica-scientia/vol/{volume}/issue/{issue}"
    data = req.get(url, headers=headers)
    artical_links = []
    fullpage = BeautifulSoup(str(data.text), "lxml")
    # The same title as last time means this issue does not exist, so stop here
    if fullpage.title.string.strip() == last_artical_name:
        return None, fullpage.title.string.strip()
    for link in fullpage.findAll("a", {"class": "anchor article-content-title u-margin-xs-top u-margin-s-bottom anchor-default"}):
        artical_links.append("https://www.sciencedirect.com" + link.get("href"))
    print(f"Total articles found : {len(artical_links)}")
    n = 1
    auth = []
    print(f"Getting all articles from - {fullpage.title.string}")
    for li in artical_links:
        print(f"Fetching data of article {n}")
        authors = stage_two(li)
        auth.extend(authors)
        n += 1
    return auth, fullpage.title.string.strip()
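# Illustrative call (the volume and issue numbers are made up):
#   authors, title = run(43, 1)
# `authors` is a list of {'Name', 'Email', 'Address'} dicts, or None when the page
# title matches last_artical_name (i.e. there was no new issue to scrape).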
def stage_two(url: str) -> list:
    """Fetch a single article page and collect its authors' names, emails, and addresses."""
    headers = """
    User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0
    Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8
    Accept-Language: en-US,en;q=0.5
    Accept-Encoding: gzip, deflate, br
    Connection: keep-alive
    Upgrade-Insecure-Requests: 1
    Sec-Fetch-Dest: document
    Sec-Fetch-Mode: navigate
    Sec-Fetch-Site: none
    Sec-Fetch-User: ?1
    Sec-GPC: 1
    """
    headers = get_headers(headers)
    data = req.get(url, headers=headers)
    page = BeautifulSoup(data.text, "lxml")
    # Author and affiliation data is embedded in the page as an application/json <script> block
    json_data = page.find("script", {"type": "application/json"})
    json_data = json.loads(json_data.text.strip())
    authors_detail = []
    address = json_data['authors']['affiliations']
    n = 1
    if len(json_data['authors']['content']) < 1:
        return authors_detail
    if '$$' not in json_data['authors']['content'][0]:
        # Unexpected structure: dump the JSON for inspection and stop
        with open("jsondata.json", "w") as op:
            json.dump(json_data, op, indent=4)
        print("ERROR Check jsondata file")
        exit()
    address = "Not Found"
    addr = []
    authr = []
    email = None
    for author in json_data['authors']['content'][0]['$$']:
        if author['#name'] == 'author':
            # It's author data: build the name and look for an encoded email address
            author_name = " "
            for au in author['$$']:
                if au['#name'] == 'given-name' or au['#name'] == 'name':
                    author_name = au['_'] + author_name
                if au['#name'] == 'surname':
                    author_name = f"{author_name}{au['_']}"
                if au['#name'] == 'encoded-e-address':
                    email = get_email_from_encoding(au['__encoded'])
            if email:
                authr.append(
                    {
                        'Name' : author_name,
                        'Email' : email
                    }
                )
            else:
                continue
        if author['#name'] == 'affiliation':
            # Concatenate every text fragment of the affiliation into one address string
            for cor in author['$$']:
                if '_' in cor:
                    if address == "Not Found":
                        address = cor['_']
                    else:
                        address = f"{address} {cor['_']}"
            addr.append(address)
    output = []
    for aut in authr:
        # Pair each author with the affiliation at the same position, if there is one
        try:
            address = addr[authr.index(aut)]
        except:
            address = "Not Found"
        if address == "Not Found":
            address = url
        output.append(
            {
                'Name' : aut['Name'],
                'Email' : aut['Email'],
                'Address' : address
            }
        )
    return output
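# Illustrative return value (made-up data):
#   [{'Name': 'Jane Doe', 'Email': 'jane.doe@example.org',
#     'Address': 'School of Mathematics, Example University'}]
# When no affiliation text is found for an author, 'Address' falls back to the article URL.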
def get_author_info_specific(vol: int, issue: int) -> list:
    print(f"Getting details of volume {vol} and issue {issue}")
    data, page_title = run(vol, issue)
    return data
def get_author_info_in_range(from_vol: int, to_vol: int) -> list:
    allAuthors = []
    last_page_title = None
    for i in range(from_vol, to_vol + 1):
        print(f"Getting data of vol {i}")
        d = 1
        while True:
            try:
                data, page_title = run(i, d, last_page_title)
                if last_page_title == page_title:
                    # The issue page title repeated, so there are no more issues in this volume
                    print(f"All issues covered of vol {i}, changing volume")
                    print("--------------------------------------------------------------------------")
                    break
                else:
                    last_page_title = page_title
                    allAuthors.extend(data)
                    print(f"Issue {d} data received, total authors : {len(allAuthors)}")
            except Exception as e:
                print(f"ERROR : {traceback.format_exc()}")
                print(f"All issues covered of vol {i}")
                print("--------------------------------------------------------------------------")
                break
            d += 1
    return allAuthors
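# Hypothetical driver (not part of the original script): shows one way to run the
# range scraper and persist the result. The volume range and output filename are
# assumptions; ExcelAutomator is imported above but its API is not shown here,
# so this sketch writes plain JSON instead.
if __name__ == "__main__":
    authors = get_author_info_in_range(40, 43)  # assumed volume range
    with open("authors.json", "w") as fp:
        json.dump(authors, fp, indent=4)
    print(f"Saved {len(authors)} author records to authors.json")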