import requests
from bs4 import BeautifulSoup
from sheets import ExcelAutomator      # local helper module for spreadsheet output
from seleliumdriver import WebScraper  # local Selenium wrapper module

browser = requests.session()
def save(data: str):
    """Dump the fetched HTML to data.html for debugging."""
    with open("data.html", "w") as op:
        op.write(str(data))
def get_headers(data: str) -> dict:
    """Parse raw request/response headers from a string into a dict.

    Args:
        data (str): The headers as a string. In Firefox's network tools you
            can right-click a request, copy the request or response headers,
            and pass them here; this function converts them into a dict.

    Returns:
        dict: The parsed headers.
    """
    data = data.strip()
    data = data.split("\n")
    out = {}
    for dt in data:
        if ":" not in dt:  # skip blank or malformed lines
            continue
        key = dt.split(":", 1)[0].strip()
        value = dt.split(":", 1)[1].strip()
        # Normalise literal "none"/"true"/"false" strings to Python values
        if value.lower() == "none":
            value = None
        elif value.lower() == "true":
            value = True
        elif value.lower() == "false":
            value = False
        out[key] = value
    return out
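
# Illustrative example of the conversion get_headers performs (hypothetical
# input, not taken from a real capture):
#   get_headers("Host: example.com\nConnection: keep-alive")
#   -> {"Host": "example.com", "Connection": "keep-alive"}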
def get_links(url: str, issue: int) -> list:
    """Collect article links from a journal issue's table-of-contents page.

    Note: `issue` is currently unused; the caller passes the full toc URL
    (see the commented-out URL template below).
    """
    headers = """
    User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0
    Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8
    Accept-Language: en-US,en;q=0.5
    Accept-Encoding: gzip, deflate, br
    Alt-Used: onlinelibrary.wiley.com
    Connection: keep-alive
    Cookie: MAID=5SmfjugKsbanANmqY7QFTQ==; MACHINE_LAST_SEEN=2024-07-13T00%3A17%3A54.434-07%3A00; osano_consentmanager_uuid=a09cf48c-a316-44da-a630-b284fe618561; osano_consentmanager=wtFVO73sxrqPK1QjgWvz2PRznZ_IuLc6ARcv2t0_pFtepafXHZgrg-S478uJo9AvbIWsu3sbpgmvaCKL_zNkJQZzpvdHNzGX6NQQ6cwL_c09p-7H9gmYq7lFeOBlGJxYVbwgVIa5TZDqtpLjvla4iYf-rEyPZ0zXi8nZVVY5aCRrKBkWiIYkwIWvpeVeBLepXirD0RkYCGg-O2PWE000CQi4kWVXGTOkNMFqFOSQ-tthQqpC7pvT9AeCAodC2z6CeM6tTjz3TNmp8sTxikwwT4jzZ9HRy76gqQjb8g==; kndctr_1B6E34B85282A0AC0A490D44_AdobeOrg_identity=CiY4MDg5NTE5MTAxMTg2NDkzMzQzMTI2OTY5MjMzMTU3OTYwODc1N1ITCM6izY3mMRABGAEqBElORDEwAPAB5cnS14oy; Hm_lvt_953dddc9c7bea72022e3bd3ba7782e7b=1720765103,1720855080; AMCV_1B6E34B85282A0AC0A490D44%40AdobeOrg=MCMID|80895191011864933431269692331579608757; JSESSIONID=90BFBDCF8874DBB2B708D37ACC4172DD; __cf_bm=FgCtBcokrG75eoj6.nqj2jTcbcl.vtSPGArq4iAYwYk-1720855074-1.0.1.1-OCKWcrDvKtyaaNLld1aBjaFFwZLoLHauSzJ0NEZFn1JLYK4G4lqmaTMEE50PAzZCReTc13aRgLNyLlqu6JOllleWjBRMQr5vc3YjxJ4kdPs; kndctr_1B6E34B85282A0AC0A490D44_AdobeOrg_cluster=ind1; cf_clearance=B0r0CEgCWVP2M5CKvRhRTvIW8MyIJM2WBVS14NsHxxE-1720855079-1.0.1.1-CqrZHd19zoe3QCemtBtqxsHiVLXILmnPkb9RjSG2yHndhy.XZzt14jGpjymiEPzjA0nFP7xw1hU6xsXIz6UDSg; Hm_lpvt_953dddc9c7bea72022e3bd3ba7782e7b=1720855160; HMACCOUNT=C851A9F6625CC221; randomizeUser=0.5543043437474287
    Upgrade-Insecure-Requests: 1
    Sec-Fetch-Dest: document
    Sec-Fetch-Mode: navigate
    Sec-Fetch-Site: none
    Sec-Fetch-User: ?1
    Sec-GPC: 1
    TE: trailers
    """
    # url = f"https://onlinelibrary.wiley.com/toc/14679590/{year}/{volume}/{issue}"
    data = browser.get(url, headers=get_headers(headers))
    full_page = BeautifulSoup(data.text, "lxml")
    issue_links = []
    for link in full_page.find_all("a", {"class": "issue-item__title visitable"}):
        issue_links.append(f'https://onlinelibrary.wiley.com{link.get("href")}')
    return issue_links
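
# Illustrative call (URL shape follows the commented-out template above; the
# volume/issue numbers here are hypothetical):
#   get_links("https://onlinelibrary.wiley.com/toc/14679590/2024/75/3", issue=3)
#   -> ["https://onlinelibrary.wiley.com/doi/...", ...]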
def decode_email(encoded_str):
    """Decode an email address obfuscated by Cloudflare's scrape-shield."""
    key = int(encoded_str[:2], 16)  # first two hex digits are the XOR key
    encoded_bytes = bytes.fromhex(encoded_str[2:])  # remaining pairs are XOR-ed bytes
    decoded_email = ''.join(chr(byte ^ key) for byte in encoded_bytes)
    return decoded_email
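
# Minimal worked example of the XOR scheme (hypothetical input): with key
# 0x42, the letter "a" (0x61) is stored as 0x61 ^ 0x42 == 0x23, so
#   decode_email("4223")  ->  "a"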
def get_details(url: str):
    """Scrape each author's name, email, and address from an article page."""
    driver = WebScraper(browser="firefox", hidden=False)
    driver.get(url)
    data = driver.get_html()
    full_page = BeautifulSoup(data, "lxml")
    author_detail = full_page.find("div", {"class": "accordion-tabbed"})
    output = []
    save(full_page)  # keep a copy of the rendered page for debugging
    for author in author_detail.find_all("span", {"class": "accordion-tabbed__tab-mobile accordion__closed"}):
        author_name = author.find("p", {"class": "author-name"}).text.strip()
        # Skip authors without a Cloudflare-obfuscated email span
        if author.find("span", {"class": "__cf_email__"}) is None:
            continue
        email = decode_email(author.find("span", {"class": "__cf_email__"}).get("data-cfemail"))
        address = author.find("p", {"class": "author-name"}).find_next("p").text.strip()
        output.append(
            {
                "Name": author_name,
                "Email": email,
                "Address": address,
            }
        )
    return output
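
# A minimal end-to-end usage sketch. Assumptions: the toc URL below is only a
# placeholder, and results are printed rather than written out, since the
# exact API of the local ExcelAutomator helper is not shown in this file.
if __name__ == "__main__":
    issue_url = "https://onlinelibrary.wiley.com/toc/14679590/2024/75/3"  # hypothetical issue
    for article_url in get_links(issue_url, issue=3):
        for author in get_details(article_url):
            print(author["Name"], author["Email"], author["Address"])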