import json

import requests
from bs4 import BeautifulSoup

# Not referenced by the functions below; kept because other code may rely on
# this module-level import.
from sheets import ExcelAutomator

# Seconds to wait for the server before giving up. Without a timeout,
# ``requests`` can block indefinitely on an unresponsive host.
_REQUEST_TIMEOUT = 30

# Header values that are converted from text to Python literals by
# ``get_headers`` (matched case-insensitively).
_HEADER_LITERALS = {"none": None, "true": True, "false": False}

# Browser-like request headers shared by every page fetch in this module.
# NOTE(review): "br" is advertised in Accept-Encoding; confirm the runtime can
# decode brotli responses, otherwise consider dropping it.
_RAW_REQUEST_HEADERS = """
User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8
Accept-Language: en-US,en;q=0.5
Accept-Encoding: gzip, deflate, br
Referer: https://link.springer.com/journal/208/volumes-and-issues
Alt-Used: link.springer.com
Connection: keep-alive
Upgrade-Insecure-Requests: 1
Sec-Fetch-Dest: document
Sec-Fetch-Mode: navigate
Sec-Fetch-Site: same-origin
Sec-Fetch-User: ?1
Sec-GPC: 1
TE: trailers
"""


def get_headers(data: str) -> dict:
    """Convert a block of raw ``Key: Value`` header lines into a dict.

    The text can be pasted straight from a browser (for example Firefox's
    "Copy Request Headers" / "Copy Response Headers").

    Args:
        data (str): Header text, one ``Key: Value`` pair per line.

    Returns:
        dict: Header names mapped to their values. The values ``none``,
        ``true`` and ``false`` (any case) become ``None``, ``True`` and
        ``False``; every other value stays a stripped string. If a key
        appears more than once, the last occurrence wins.
    """
    out = {}
    for line in data.splitlines():
        # Only the first ":" separates key from value; values such as URLs
        # may contain further colons.
        key, separator, value = line.partition(":")
        if not separator:
            # Blank line or a line without a colon (e.g. the
            # "GET /path HTTP/1.1" request line) - nothing to parse.
            continue
        value = value.strip()
        out[key.strip()] = _HEADER_LITERALS.get(value.lower(), value)
    return out


def _fetch_soup(url: str) -> BeautifulSoup:
    """Download ``url`` with the shared headers and parse it with lxml."""
    # The context manager closes the session (and its pooled connections)
    # even when the request raises.
    with requests.Session() as browser:
        response = browser.get(
            url,
            headers=get_headers(_RAW_REQUEST_HEADERS),
            timeout=_REQUEST_TIMEOUT,
        )
        # Fail loudly on 4xx/5xx instead of trying to parse an error page.
        response.raise_for_status()
        return BeautifulSoup(response.text, "lxml")


def get_all_articals_link(url: str) -> list:
    """Collect the article links listed on a volume/issue page.

    Args:
        url (str): Page to scrape, e.g.
            ``https://link.springer.com/journal/208/volumes-and-issues/<volume>-<issue>``.

    Returns:
        list: The ``href`` of the first link found in every ``<li>`` of the
        ``<ol class="u-list-reset">`` element, in page order. Empty when the
        page contains no such list.

    Raises:
        requests.RequestException: If the request fails, times out or the
            server answers with an error status.
    """
    fullpage = _fetch_soup(url)
    orderlist = fullpage.find("ol", {"class": "u-list-reset"})
    if orderlist is None:
        return []

    all_links = []
    for item in orderlist.find_all("li"):
        # href=True skips anchors that carry no link target.
        anchor = item.find("a", href=True)
        if anchor is not None:
            all_links.append(anchor.get("href"))
    return all_links


def get_authors(url: str) -> list:
    """Extract the authors that have an e-mail address from an article page.

    The data is read from the page's ``application/ld+json`` script block
    (``mainEntity`` -> ``author``).

    Args:
        url (str): Article page to scrape.

    Returns:
        list: One dict per author that has an ``email`` entry, with the keys
        ``Name``, ``Email`` and ``Address`` (the affiliation address names
        joined by ``", "``). Empty when the page has no ld+json block or no
        author information.

    Raises:
        requests.RequestException: If the request fails, times out or the
            server answers with an error status.
        json.JSONDecodeError: If the ld+json block is not valid JSON.
    """
    main_page = _fetch_soup(url)
    script = main_page.find("script", {"type": "application/ld+json"})
    if script is None:
        return []

    json_data = json.loads(script.text)
    authors = json_data.get("mainEntity", {}).get("author", [])
    if isinstance(authors, dict):
        # JSON-LD permits a single object where a list is expected.
        authors = [authors]

    output = []
    for author in authors:
        if "email" not in author:
            continue
        address = ", ".join(
            item["address"]["name"]
            # Authors without any affiliation simply get an empty address.
            for item in author.get("affiliation", [])
            if "address" in item and "name" in item["address"]
        )
        output.append(
            {
                "Name": author.get("name", ""),
                "Email": author["email"],
                "Address": address,
            }
        )
    return output