PrmScrp / springerscrapper.py
import requests
from bs4 import BeautifulSoup
import json
from sheets import ExcelAutomator


def get_headers(data: str) -> dict:
    """Convert a raw header string into a dict usable by requests.

    Args:
        data (str): The headers as a string. In Firefox you can open the
            network tab, right-click a request and choose "Copy Request
            Headers" (or "Copy Response Headers"), then pass the copied
            text here.

    Returns:
        dict: The headers as key/value pairs.
    """
    data = data.strip()
    out = {}
    for line in data.split("\n"):
        if ":" not in line:
            continue  # skip blank or malformed lines
        key, _, value = line.partition(":")
        key = key.strip()
        value = value.strip()
        # Interpret the literal strings "none"/"true"/"false" as Python values.
        if value.lower() == "none":
            value = None
        elif value.lower() == "true":
            value = True
        elif value.lower() == "false":
            value = False
        out[key] = value
    return out
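
# Illustrative example (not called by the scraper itself): a header block
# copied from the browser's network tab parses into a requests-compatible
# dict, e.g.
#   get_headers("Host: link.springer.com\nAccept: text/html")
#   -> {"Host": "link.springer.com", "Accept": "text/html"}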

def get_all_articals_link(url: str) -> list:
    """Return the article links listed on a Springer volumes-and-issues page."""
    browser = requests.session()
    # url = f"https://link.springer.com/journal/208/volumes-and-issues/{volume}-{issue}"
headers = """
User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8
Accept-Language: en-US,en;q=0.5
Accept-Encoding: gzip, deflate, br
Referer: https://link.springer.com/journal/208/volumes-and-issues
Alt-Used: link.springer.com
Connection: keep-alive
Upgrade-Insecure-Requests: 1
Sec-Fetch-Dest: document
Sec-Fetch-Mode: navigate
Sec-Fetch-Site: same-origin
Sec-Fetch-User: ?1
Sec-GPC: 1
TE: trailers
"""
    head = get_headers(headers)
    data = browser.get(url, headers=head)
    fullpage = BeautifulSoup(data.text, "lxml")
    orderlist = fullpage.find("ol", {"class": "u-list-reset"})
    allLinks = []
    if orderlist is None:
        # Layout changed or the request was blocked; return an empty list.
        return allLinks
    for dt in orderlist.find_all("li"):
        link = dt.find("a")
        if not link:
            continue
        allLinks.append(link.get("href"))
    return allLinks
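
# Illustrative usage (the volume/issue in this URL is only an assumed example,
# following the commented-out pattern above):
#   links = get_all_articals_link(
#       "https://link.springer.com/journal/208/volumes-and-issues/387-1"
#   )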

def get_authors(url: str) -> list:
    """Return name, e-mail and affiliation for each author that lists an e-mail."""
    browser = requests.session()
headers = """
User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8
Accept-Language: en-US,en;q=0.5
Accept-Encoding: gzip, deflate, br
Referer: https://link.springer.com/journal/208/volumes-and-issues
Alt-Used: link.springer.com
Connection: keep-alive
Upgrade-Insecure-Requests: 1
Sec-Fetch-Dest: document
Sec-Fetch-Mode: navigate
Sec-Fetch-Site: same-origin
Sec-Fetch-User: ?1
Sec-GPC: 1
TE: trailers
"""
    head = get_headers(headers)
    data = browser.get(url, headers=head)
    main_page = BeautifulSoup(data.text, "lxml")
    script_tag = main_page.find("script", {"type": "application/ld+json"})
    output = []
    if script_tag is None:
        # No structured (JSON-LD) metadata on this page.
        return output
    json_data = json.loads(script_tag.text)
    authors = json_data['mainEntity']['author']
    for author in authors:
        # Keep only authors with a published e-mail address
        # (usually the corresponding author).
        if 'email' in author:
            output.append(
                {
                    "Name": author['name'],
                    "Email": author['email'],
                    "Address": ", ".join(
                        item['address']['name']
                        for item in author['affiliation']
                        if 'address' in item and 'name' in item['address']
                    ),
                }
            )
    return output
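

if __name__ == "__main__":
    # Minimal end-to-end sketch. The issue URL below is only an assumed
    # example following the pattern referenced above; writing the results out
    # with ExcelAutomator is omitted here because its interface is not shown
    # in this file.
    issue_url = "https://link.springer.com/journal/208/volumes-and-issues/387-1"
    for link in get_all_articals_link(issue_url):
        if not link.startswith("http"):
            link = "https://link.springer.com" + link  # hrefs may be relative
        for author in get_authors(link):
            print(author["Name"], author["Email"], author["Address"])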