Spaces:
Sleeping
Sleeping
import requests | |
from bs4 import BeautifulSoup | |
from sheets import ExcelAutomator | |
from seleliumdriver import WebScraper | |
def save(data: str):
    """Dump ``data`` into ``data.html`` in the current working directory.

    An existing ``data.html`` is overwritten.

    Args:
        data (str): Content to write. It is passed through ``str()`` first,
            so non-string objects (e.g. a parsed page) are accepted too.
    """
    # Explicit UTF-8: scraped pages routinely contain characters that a
    # platform default encoding (e.g. cp1252) cannot represent.
    with open("data.html", "w", encoding="utf-8") as op:
        op.write(str(data))
def get_headers(data: str) -> dict:
    """Convert a raw block of HTTP headers into a dict.

    Args:
        data (str): Headers as text, one ``Name: value`` pair per line (for
            example what Firefox puts on the clipboard when you copy the
            request or response headers).

    Returns:
        dict: Header names mapped to their values. The literal values
            ``none``, ``true`` and ``false`` (any casing) become ``None``,
            ``True`` and ``False``; every other value stays a stripped
            string. Lines that contain no ``:`` (blank lines, a request or
            status line) are skipped.
    """
    literals = {"none": None, "true": True, "false": False}
    out = {}
    for line in data.strip().split("\n"):
        # Split on the first colon only, so values such as URLs keep theirs.
        key, sep, value = line.partition(":")
        if not sep:
            # Not a "Name: value" line; previously this raised IndexError.
            continue
        value = value.strip()
        out[key.strip()] = literals.get(value.lower(), value)
    return out
def get_links(url: str) -> list:
    """Collect the article links found on a rendered listing page.

    The page is opened in a visible Firefox session, its HTML is written to
    disk through ``save`` and every ``<a>`` whose class attribute is
    ``issue-item__title visitable`` is turned into an absolute
    ``onlinelibrary.wiley.com`` URL.

    Args:
        url (str): Address of the page to load.

    Returns:
        list: Absolute link strings, in document order. Anchors without an
            ``href`` are skipped.
    """
    from urllib.parse import urljoin

    base_url = "https://onlinelibrary.wiley.com"

    # NOTE(review): the WebScraper instance is never closed here; its
    # shutdown API is not visible from this file - confirm and release it.
    browser = WebScraper(browser="firefox", hidden=False)
    browser.get(url)

    # Take one snapshot so the saved file is exactly what gets parsed.
    html = browser.get_html()
    save(html)
    full_page = BeautifulSoup(html, "lxml")

    anchors = full_page.find_all("a", {"class": "issue-item__title visitable"})
    # urljoin leaves already-absolute hrefs intact and fixes slash-less ones;
    # anchors lacking an href used to produce a bogus "...comNone" entry.
    return [urljoin(base_url, a.get("href")) for a in anchors if a.get("href")]
def decode_email(encoded_str):
    """Decode an obfuscated e-mail address taken from a ``data-cfemail`` attribute.

    Args:
        encoded_str (str): Hex string whose first byte is the XOR key and
            whose remaining bytes are the address XOR-ed with that key.

    Returns:
        str: The decoded address.

    Raises:
        ValueError: If ``encoded_str`` is empty or is not valid hexadecimal.
    """
    key = int(encoded_str[:2], 16)
    raw = bytes(byte ^ key for byte in bytes.fromhex(encoded_str[2:]))
    try:
        # Non-ASCII addresses arrive as UTF-8 bytes; mapping each byte to a
        # character on its own would garble them.
        return raw.decode("utf-8")
    except UnicodeDecodeError:
        # Not valid UTF-8: fall back to the one-character-per-byte mapping.
        return raw.decode("latin-1")
def get_details(url: str) -> list:
    """Extract name, e-mail and address for the authors listed on a page.

    The page is opened in a visible Firefox session, the parsed document is
    written to disk through ``save``, and every author entry inside the
    ``accordion-tabbed`` block that carries an obfuscated e-mail is decoded
    with ``decode_email``.

    Args:
        url (str): Address of the page to load.

    Returns:
        list: One dict per author with the keys ``"Name"``, ``"Email"`` and
            ``"Address"``. Authors without a name or an e-mail are skipped;
            the list is empty when the page has no author block.
    """
    # NOTE(review): the WebScraper instance is never closed here; its
    # shutdown API is not visible from this file - confirm and release it.
    driver = WebScraper(browser="firefox", hidden=False)
    driver.get(url)
    full_page = BeautifulSoup(driver.get_html(), "lxml")
    save(full_page)

    author_detail = full_page.find("div", {"class": "accordion-tabbed"})
    if author_detail is None:
        # No author block on this page; previously this raised AttributeError.
        return []

    output = []
    author_tabs = author_detail.find_all(
        "span", {"class": "accordion-tabbed__tab-mobile accordion__closed"}
    )
    for author in author_tabs:
        name_tag = author.find("p", {"class": "author-name"})
        email_tag = author.find("span", {"class": "__cf_email__"})
        if name_tag is None or email_tag is None:
            continue
        encoded = email_tag.get("data-cfemail")
        if not encoded:
            continue

        # The address is taken from the first <p> following the name.
        address_tag = name_tag.find_next("p")
        output.append(
            {
                "Name": name_tag.text.strip(),
                "Email": decode_email(encoded),
                'Address': address_tag.text.strip() if address_tag is not None else "",
            }
        )
    return output