PrmScrp / springerscrapper.py
pryanshusharma's picture
Update springerscrapper.py
5cde1d2 verified
raw
history blame
3.27 kB
import requests
from bs4 import BeautifulSoup
import json
def get_headers(data: str) -> dict:
    """Convert a raw header block (plain text) into a dictionary.

    The input is expected to look like what a browser produces with
    "copy request headers" / "copy response headers": one
    ``Name: value`` pair per line.

    Args:
        data (str): Header block with one ``Name: value`` pair per line.
            Leading/trailing whitespace around the block, the names and the
            values is ignored. Lines that contain no ``:`` (including blank
            lines) are skipped.

    Returns:
        dict: Mapping of header name to header value. Values that read
            ``none``, ``true`` or ``false`` (compared case-insensitively)
            are converted to ``None``, ``True`` and ``False``; every other
            value is kept as a string.
    """
    # Textual values that are turned into Python literals.
    literals = {"none": None, "true": True, "false": False}

    out = {}
    for line in data.strip().split("\n"):
        # Split on the first colon only, so values such as URLs
        # ("https://...") keep their own colons.
        key, sep, value = line.partition(":")
        if not sep:
            # Blank or malformed line: there is nothing to store.
            continue
        value = value.strip()
        out[key.strip()] = literals.get(value.lower(), value)
    return out
def get_all_articals_link(url: str, timeout: float = 30.0) -> list:
    """Collect the links found in the article list of a journal issue page.

    Example URL shape (from the original code comment):
    ``https://link.springer.com/journal/208/volumes-and-issues/{volume}-{issue}``

    Args:
        url (str): Address of the page to fetch.
        timeout (float): Value passed to ``requests`` as the request timeout,
            so a stalled server cannot block the caller forever.

    Returns:
        list: The ``href`` of the first ``<a>`` inside every ``<li>`` of the
            first ``<ol class="u-list-reset">`` on the page, in page order.
            Items without an ``<a>`` are skipped. An empty list is returned
            when the page has no such ``<ol>``.
    """
    # Browser-like request headers, parsed into a dict by get_headers().
    # NOTE(review): Accept-Encoding advertises "br"; confirm the runtime can
    # decode Brotli responses, otherwise data.text may come back unreadable.
    headers = """
User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8
Accept-Language: en-US,en;q=0.5
Accept-Encoding: gzip, deflate, br
Referer: https://link.springer.com/journal/208/volumes-and-issues
Alt-Used: link.springer.com
Connection: keep-alive
Upgrade-Insecure-Requests: 1
Sec-Fetch-Dest: document
Sec-Fetch-Mode: navigate
Sec-Fetch-Site: same-origin
Sec-Fetch-User: ?1
Sec-GPC: 1
TE: trailers
"""
    head = get_headers(headers)

    # The context manager closes the session (and its pooled connections)
    # once the page has been downloaded.
    with requests.session() as browser:
        data = browser.get(url, headers=head, timeout=timeout)

    fullpage = BeautifulSoup(data.text, "lxml")
    orderlist = fullpage.find("ol", {"class": "u-list-reset"})
    if orderlist is None:
        # No matching list on the page: nothing to collect.
        return []

    all_links = []
    for item in orderlist.find_all("li"):
        anchor = item.find("a")
        if anchor is None:
            continue
        all_links.append(anchor.get("href"))
    return all_links
def _authors_with_email(ld_json) -> list:
    """Build Name/Email/Address records for author entries that have an email."""
    main_entity = ld_json.get("mainEntity") if isinstance(ld_json, dict) else None
    authors = main_entity.get("author", []) if isinstance(main_entity, dict) else []
    if isinstance(authors, dict):
        # A single author may be given as one object instead of a list.
        authors = [authors]

    output = []
    for author in authors:
        if not isinstance(author, dict) or "email" not in author:
            continue
        try:
            address = author["affiliation"][0]["address"]["name"]
        except (KeyError, IndexError, TypeError):
            # Affiliation/address is optional; fall back to an empty string.
            address = ""
        output.append(
            {
                "Name": author.get("name", ""),
                "Email": author["email"],
                "Address": address,
            }
        )
    return output


def get_authors(url: str, timeout: float = 30.0) -> list:
    """Fetch a page and return its authors that list an email address.

    The author data is read from the first
    ``<script type="application/ld+json">`` element of the page, under
    ``mainEntity`` -> ``author``.

    Args:
        url (str): Address of the page to fetch.
        timeout (float): Value passed to ``requests`` as the request timeout,
            so a stalled server cannot block the caller forever.

    Returns:
        list: One dict per author entry that contains an ``email`` key, with
            the keys ``"Name"``, ``"Email"`` and ``"Address"``. ``"Address"``
            is the name of the first affiliation's address, or ``""`` when
            that information is absent. An empty list is returned when the
            page has no ld+json script or no author data.

    Raises:
        json.JSONDecodeError: If the ld+json script content is not valid JSON.
    """
    # Browser-like request headers, parsed into a dict by get_headers().
    # NOTE(review): Accept-Encoding advertises "br"; confirm the runtime can
    # decode Brotli responses, otherwise data.text may come back unreadable.
    headers = """
User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8
Accept-Language: en-US,en;q=0.5
Accept-Encoding: gzip, deflate, br
Referer: https://link.springer.com/journal/208/volumes-and-issues
Alt-Used: link.springer.com
Connection: keep-alive
Upgrade-Insecure-Requests: 1
Sec-Fetch-Dest: document
Sec-Fetch-Mode: navigate
Sec-Fetch-Site: same-origin
Sec-Fetch-User: ?1
Sec-GPC: 1
TE: trailers
"""
    head = get_headers(headers)

    # The context manager closes the session (and its pooled connections)
    # once the page has been downloaded.
    with requests.session() as browser:
        data = browser.get(url, headers=head, timeout=timeout)

    main_page = BeautifulSoup(data.text, "lxml")
    script = main_page.find("script", {"type": "application/ld+json"})
    if script is None:
        # No structured metadata on the page: no authors to report.
        return []

    return _authors_with_email(json.loads(script.text))