# NOTE(review): stray "Spaces:/Sleeping" lines removed — extraction artifact, not Python code.
import requests | |
from bs4 import BeautifulSoup | |
import json | |
from sheets import ExcelAutomator | |
def get_headers(data: str) -> dict:
    """Parse a raw multi-line header string into a dict.

    Intended for header blocks copied from browser dev tools (e.g. Firefox's
    "Copy Request Headers"), one ``Key: Value`` pair per line.

    Args:
        data (str): Headers as a multi-line string.

    Returns:
        dict: Header names mapped to values.  Values spelling "none", "true"
            or "false" (case-insensitive) are coerced to ``None``, ``True``
            and ``False`` respectively; everything else stays a string.
    """
    out = {}
    for line in data.strip().split("\n"):
        # Skip blank or malformed lines (no ":" separator) instead of
        # crashing with an IndexError as the original split-based code did.
        if ":" not in line:
            continue
        key, _, value = line.partition(":")
        key = key.strip()
        value = value.strip()
        if value.lower() == "none":
            value = None
        elif value.lower() == "true":
            value = True
        elif value.lower() == "false":
            value = False
        out[key] = value
    return out
def get_all_articals_link(url: str) -> list:
    """Scrape a Springer volumes-and-issues page and collect article links.

    Args:
        url (str): URL of the journal volume/issue listing page, e.g.
            ``https://link.springer.com/journal/208/volumes-and-issues/62-1``.

    Returns:
        list: ``href`` strings of every linked ``<li>`` entry in the issue
            list.  Empty if the listing ``<ol>`` is not found (layout change
            or blocked request).
    """
    browser = requests.session()
    # Browser-like headers so Springer serves the normal HTML page.
    headers = """
    User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0
    Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8
    Accept-Language: en-US,en;q=0.5
    Accept-Encoding: gzip, deflate, br
    Referer: https://link.springer.com/journal/208/volumes-and-issues
    Alt-Used: link.springer.com
    Connection: keep-alive
    Upgrade-Insecure-Requests: 1
    Sec-Fetch-Dest: document
    Sec-Fetch-Mode: navigate
    Sec-Fetch-Site: same-origin
    Sec-Fetch-User: ?1
    Sec-GPC: 1
    TE: trailers
    """
    head = get_headers(headers)
    data = browser.get(url, headers=head)
    fullpage = BeautifulSoup(data.text, "lxml")
    orderlist = fullpage.find("ol", {"class": "u-list-reset"})
    all_links = []
    # Guard: without the expected <ol> there is nothing to iterate.
    if orderlist is None:
        return all_links
    # find_all is the modern BS4 name; findAll is a deprecated alias.
    for item in orderlist.find_all("li"):
        anchor = item.find("a")
        if not anchor:
            continue
        all_links.append(anchor.get("href"))
    return all_links
def get_authors(url: str) -> list:
    """Extract corresponding-author details from a Springer article page.

    Reads the page's ``application/ld+json`` metadata block and keeps only
    authors that expose an email address.

    Args:
        url (str): URL of a single article page.

    Returns:
        list: Dicts with keys ``Name``, ``Email`` and ``Address`` (comma-joined
            affiliation address names).  Empty if the ld+json block is missing.
    """
    browser = requests.session()
    # Browser-like headers so Springer serves the normal HTML page.
    headers = """
    User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0
    Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8
    Accept-Language: en-US,en;q=0.5
    Accept-Encoding: gzip, deflate, br
    Referer: https://link.springer.com/journal/208/volumes-and-issues
    Alt-Used: link.springer.com
    Connection: keep-alive
    Upgrade-Insecure-Requests: 1
    Sec-Fetch-Dest: document
    Sec-Fetch-Mode: navigate
    Sec-Fetch-Site: same-origin
    Sec-Fetch-User: ?1
    Sec-GPC: 1
    TE: trailers
    """
    head = get_headers(headers)
    data = browser.get(url, headers=head)
    main_page = BeautifulSoup(data.text, "lxml")
    script_tag = main_page.find("script", {"type": "application/ld+json"})
    # Guard: page without structured metadata would otherwise raise
    # AttributeError on .text.
    if script_tag is None:
        return []
    json_data = json.loads(script_tag.text)
    authors = json_data['mainEntity']['author']
    output = []
    for author in authors:
        if 'email' not in author:
            continue
        output.append(
            {
                "Name": author['name'],
                'Email': author['email'],
                # Some author records omit 'affiliation' entirely; .get
                # avoids a KeyError and yields an empty address string.
                'Address': ", ".join(
                    item['address']['name']
                    for item in author.get('affiliation', [])
                    if 'address' in item and 'name' in item['address']
                ),
            }
        )
    return output