Spaces:

pryanshusharma
/

PrmScrp

Sleeping

App Files Files Community

PrmScrp / springerscrapper.py

pryanshusharma

Update springerscrapper.py

5cde1d2 verified 7 months ago

raw

history blame

3.27 kB

	import requests
	from bs4 import BeautifulSoup
	import json

	def get_headers(data: str) -> dict:
	    """Parse a raw block of HTTP headers into a dictionary.

	    The input is the kind of text a browser gives you when you copy the
	    request or response headers: one ``Name: value`` pair per line.  Each
	    line is split on its first colon and both sides are stripped of
	    surrounding whitespace.

	    Args:
	        data (str): Header block, one ``Name: value`` pair per line.

	    Returns:
	        dict: Mapping of header name to value.  The literal values
	        ``none``, ``true`` and ``false`` (case-insensitive) are converted
	        to ``None``, ``True`` and ``False``; every other value stays a
	        string.  Blank lines and lines without a colon are skipped.
	    """
	    # Literal values that are turned into Python objects instead of strings.
	    literals = {"none": None, "true": True, "false": False}

	    out = {}
	    for line in data.strip().split("\n"):
	        key, sep, value = line.partition(":")
	        if not sep:
	            # No colon on this line (blank or malformed), so there is no
	            # value to read; skip it rather than fail on the whole block.
	            continue
	        value = value.strip()
	        out[key.strip()] = literals.get(value.lower(), value)
	    return out

	def get_all_articals_link(url: str) -> list:
	    """Collect the article links listed on a Springer volume/issue page.

	    Downloads ``url`` with browser-like request headers, parses the HTML
	    and looks up the first ``<ol>`` element carrying the class
	    ``u-list-reset``.  For every ``<li>`` inside it that contains an
	    ``<a>``, the ``href`` of the first ``<a>`` is collected.

	    Args:
	        url (str): Address of the issue page, e.g.
	            ``https://link.springer.com/journal/208/volumes-and-issues/{volume}-{issue}``.

	    Returns:
	        list: The ``href`` values in page order (an entry is ``None`` when
	        the ``<a>`` has no ``href`` attribute).  Empty when the page has no
	        matching ``<ol>`` element.
	    """
	    # Browser-like headers.  The Accept value ends with the ``*/*`` wildcard
	    # media range.
	    # NOTE(review): ``br`` is advertised in Accept-Encoding; confirm that a
	    # Brotli decoder is installed wherever this runs, otherwise drop it.
	    headers = """
	    User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0
	    Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8
	    Accept-Language: en-US,en;q=0.5
	    Accept-Encoding: gzip, deflate, br
	    Referer: https://link.springer.com/journal/208/volumes-and-issues
	    Alt-Used: link.springer.com
	    Connection: keep-alive
	    Upgrade-Insecure-Requests: 1
	    Sec-Fetch-Dest: document
	    Sec-Fetch-Mode: navigate
	    Sec-Fetch-Site: same-origin
	    Sec-Fetch-User: ?1
	    Sec-GPC: 1
	    TE: trailers
	    """
	    head = get_headers(headers)

	    # Use the session as a context manager so its connections are released
	    # as soon as the page has been downloaded.
	    with requests.session() as browser:
	        data = browser.get(url, headers=head)

	    fullpage = BeautifulSoup(data.text, "lxml")
	    orderlist = fullpage.find("ol", {"class": "u-list-reset"})
	    if orderlist is None:
	        # The page has no article list (e.g. an error page): nothing to collect.
	        return []

	    all_links = []
	    for item in orderlist.find_all("li"):
	        anchor = item.find("a")
	        if anchor is None:
	            continue
	        all_links.append(anchor.get("href"))
	    return all_links

	def get_authors(url: str) -> list:
	    """Return the authors with an e-mail address listed on an article page.

	    Downloads ``url`` with browser-like request headers, reads the page's
	    first ``<script type="application/ld+json">`` block and walks
	    ``mainEntity.author`` in that JSON document.  Authors without an
	    ``email`` key are ignored.

	    Args:
	        url (str): Address of the article page.

	    Returns:
	        list: One dict per author that has an e-mail, with the keys
	        ``"Name"``, ``"Email"`` and ``"Address"``.  ``"Address"`` is the
	        ``address.name`` of the author's first affiliation, or ``""`` when
	        that is not available.  Empty when the page carries no JSON-LD
	        block or the block has no ``mainEntity.author`` entry.
	    """
	    # Browser-like headers.  The Accept value ends with the ``*/*`` wildcard
	    # media range.
	    # NOTE(review): ``br`` is advertised in Accept-Encoding; confirm that a
	    # Brotli decoder is installed wherever this runs, otherwise drop it.
	    headers = """
	    User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0
	    Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8
	    Accept-Language: en-US,en;q=0.5
	    Accept-Encoding: gzip, deflate, br
	    Referer: https://link.springer.com/journal/208/volumes-and-issues
	    Alt-Used: link.springer.com
	    Connection: keep-alive
	    Upgrade-Insecure-Requests: 1
	    Sec-Fetch-Dest: document
	    Sec-Fetch-Mode: navigate
	    Sec-Fetch-Site: same-origin
	    Sec-Fetch-User: ?1
	    Sec-GPC: 1
	    TE: trailers
	    """
	    head = get_headers(headers)

	    # Use the session as a context manager so its connections are released
	    # as soon as the page has been downloaded.
	    with requests.session() as browser:
	        data = browser.get(url, headers=head)

	    main_page = BeautifulSoup(data.text, "lxml")
	    script = main_page.find("script", {"type": "application/ld+json"})
	    if script is None:
	        # No JSON-LD metadata on this page, so there is nothing to extract.
	        return []

	    json_data = json.loads(script.text)
	    try:
	        authors = json_data["mainEntity"]["author"]
	    except (KeyError, TypeError):
	        return []
	    if isinstance(authors, dict):
	        # A single author given as one object rather than a list of objects.
	        authors = [authors]

	    output = []
	    for author in authors:
	        if "email" not in author:
	            continue
	        try:
	            address = author["affiliation"][0]["address"]["name"]
	        except (KeyError, IndexError, TypeError):
	            # Affiliation or address details are missing for this author.
	            address = ""
	        output.append(
	            {
	                "Name": author["name"],
	                "Email": author["email"],
	                "Address": address,
	            }
	        )
	    return output